@roosterburton
Nothing nefarious going on, I had a couple of things I was looking for:
- Find out how popular my courses are in the Chinese library (it turns out they account for only 2% of the total views)
- Create a simple spreadsheet for librarian purposes; navigating the website and wading through potentially 1000+ courses to find, for example, untagged courses is just too slow and annoying
- gather some statistics about the library and get data on total duration, word counts, popularity by level etc
It turns out that for many stats you actually need to comb through all lessons in all courses to get the relevant data, and that turned out to be really slow for larger libraries. Additionally, a lot of data is just not accurate, e.g. word counts; for that you need to call "counters" on top of everything else, but at that point I lost interest - just too annoying.
Here is a simplified example without the lesson stuff. Also, you don't need any special privileges; the data is accessible to all users who have added the language.
Summary
import requests
import csv
# LingQ API token sent in the Authorization header; replace the placeholder
# with your own token.
KEY = "12345"
# Language whose library is exported; it is used in the API path, in the
# generated course links and in the output filename.
language_code = "es"
base_url = "https://www.lingq.com"
# Headers sent with every API request.
# "accept-encoding" is deliberately not set by hand: requests advertises only
# the encodings it can actually decode. Forcing "br" makes the response
# unreadable when no Brotli package is installed.
headers = {
    "accept": "application/json",
    "content-type": "application/json",
    "Authorization": f"Token {KEY}",
}
# Columns written to the CSV, in output order. Kept in one place so the row
# dictionaries and the CSV header cannot drift apart.
_CSV_FIELDS = (
    "link",
    "title",
    "level",
    "lessonsCount",
    "duration",
    "roses",
    "shared_by",
    "sharer_role",
    "tags",
    "status",
)

# Seconds to wait on a single request before giving up; without a timeout a
# stalled connection would hang the script forever.
_REQUEST_TIMEOUT = 30


def _fetch_all_courses():
    """Download every page of course search results and return them as one list.

    Paging stops when the response has no "next" link. If a page cannot be
    fetched (network error or non-200 status), a message is printed and the
    results collected so far are returned.
    """
    search_url = f"{base_url}/api/v3/{language_code}/search/"
    # A list value is encoded by requests as a repeated query parameter
    # (level=1&level=2&...), matching the original hand-built query string.
    query = {
        "level": [1, 2, 3, 4, 5, 6],
        "sortBy": "mostLiked",
        "type": "collection",
    }
    courses = []
    page = 1
    # One session lets consecutive page requests reuse the same connection.
    with requests.Session() as session:
        while True:
            try:
                response = session.get(
                    search_url,
                    params={**query, "page": page},
                    headers=headers,
                    timeout=_REQUEST_TIMEOUT,
                )
            except requests.RequestException as exc:
                # Keep what was already downloaded instead of crashing.
                print(f"Failed to fetch data from page {page}. Error: {exc}")
                break
            if response.status_code != 200:
                print(
                    f"Failed to fetch data from page {page}. Status code: {response.status_code}"
                )
                break
            payload = response.json()
            # "or []" also covers an explicit null in the payload.
            courses.extend(payload.get("results") or [])
            if not payload.get("next"):
                print("No further pages")
                break
            page += 1
            print("going to page: ", page)
    return courses


def _course_to_row(course):
    """Map one course dictionary from the API onto a CSV row dictionary."""
    course_id = course.get("id")
    return {
        "link": f"{base_url}/uni/learn/{language_code}/web/library/course/{course_id}",
        "title": course.get("title"),
        "level": course.get("level"),
        "lessonsCount": course.get("lessonsCount"),
        "duration": course.get("duration"),
        "roses": course.get("rosesCount"),
        "shared_by": course.get("sharedByName"),
        "sharer_role": course.get("sharedByRole"),
        # NOTE(review): if the API returns tags as a list, the CSV cell will
        # hold its Python repr - confirm whether a joined string is wanted.
        "tags": course.get("tags", ""),
        "status": course.get("status", ""),
    }


def fetch_and_save_to_csv():
    """Export the course library of ``language_code`` to a CSV file.

    Fetches all course search results, keeps a fixed set of columns per
    course, sorts the rows by rose count (highest first) and writes them to
    ``<language_code>_library.csv`` in the current working directory.
    """
    rows = [_course_to_row(course) for course in _fetch_all_courses()]

    # A course without a rose count (missing or None) sorts as 0; comparing
    # None with an int would otherwise raise TypeError.
    rows.sort(key=lambda row: row["roses"] or 0, reverse=True)

    output_filename = f"{language_code}_library.csv"
    # newline="" is what the csv module expects for files it writes to.
    with open(output_filename, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=_CSV_FIELDS)
        writer.writeheader()
        writer.writerows(rows)
    print(f"Data saved to {output_filename}.")
# Run the export only when the file is executed as a script, not on import.
if __name__ == "__main__":
    fetch_and_save_to_csv()