| | import requests |
| | from bs4 import BeautifulSoup |
| | from tqdm import tqdm |
| |
|
| |
|
def get_urls_from_file(file_path: str):
    """Read URLs from a text file that lists one URL per line.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with the whitespace-stripped lines of the file, in file
        order. Lines that are empty after stripping (blank separators,
        a trailing newline at end of file) are skipped so callers never
        receive an empty string as a URL.
    """
    # Explicit encoding keeps decoding independent of the platform locale.
    with open(file_path, "r", encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        return [url for url in stripped_lines if url]
| |
|
| |
|
def get_base_url(url):
    """Return the scheme-and-host root of ``url`` with a trailing slash.

    Args:
        url: URL string to reduce.

    Returns:
        The string ``"<scheme>://<netloc>/"`` built from the parsed URL;
        the path, query and fragment of ``url`` are dropped.
    """
    # Imported here so the function does not depend on a module-level
    # import of urlparse being present.
    from urllib.parse import urlparse

    parsed_url = urlparse(url)
    return f"{parsed_url.scheme}://{parsed_url.netloc}/"
| |
|
| |
|
def _absolute_slides_link(container, site_root="https://dl4ds.github.io"):
    """Return the absolute URL of the "Download slides" link in ``container``.

    Returns None when ``container`` has no such anchor or its href is empty.
    """
    from urllib.parse import urljoin

    anchor = container.find("a", title="Download slides")
    if anchor is None:
        return None
    href = anchor["href"].strip()
    # urljoin yields the same result as plain concatenation for
    # root-relative hrefs ("/path/x.pdf") and additionally handles hrefs
    # that are already absolute or lack the leading slash.
    return urljoin(site_root, href) if href else None


def get_metadata(lectures_url, schedule_url, *, timeout=30):
    """Collect per-lecture metadata from the lectures and schedule pages.

    Both pages are downloaded and parsed as HTML. The schedule page
    supplies a date for each slides link; the lectures page supplies the
    remaining fields. Rows or lecture blocks that cannot be parsed are
    reported with ``print`` and skipped.

    Args:
        lectures_url: URL of the page containing the
            ``div.lecture-container`` blocks.
        schedule_url: URL of the page containing the
            ``li.table-row-lecture`` rows.
        timeout: Seconds passed to ``requests.get`` for each download, so
            an unresponsive server cannot block the call indefinitely.

    Returns:
        A dict keyed by the absolute slides link of each lecture. Each
        value is a dict with the keys ``"date"``, ``"tldr"``, ``"title"``,
        ``"lecture_recording"`` and ``"suggested_readings"``.
    """
    lecture_metadata = {}

    # Download and parse the main lectures page.
    r_lectures = requests.get(lectures_url, timeout=timeout)
    soup_lectures = BeautifulSoup(r_lectures.text, "html.parser")

    # Download and parse the schedule page.
    r_schedule = requests.get(schedule_url, timeout=timeout)
    soup_schedule = BeautifulSoup(r_schedule.text, "html.parser")

    lecture_blocks = soup_lectures.find_all("div", class_="lecture-container")

    # Map each slides link found on the schedule page to its date text.
    date_mapping = {}
    for row in soup_schedule.find_all("li", class_="table-row-lecture"):
        try:
            date = (
                row.find("div", {"data-label": "Date"}).get_text(separator=" ").strip()
            )
            description_div = row.find("div", {"data-label": "Description"})
            slides_link = _absolute_slides_link(description_div)
            if slides_link:
                date_mapping[slides_link] = date
        except Exception as e:
            # Best effort: one malformed row must not abort the whole scrape.
            print(f"Error processing schedule row: {e}")
            continue

    for block in lecture_blocks:
        try:
            title = block.find("span", style="font-weight: bold;").text.strip()

            # The summary is the text node directly after the "tl;dr:" label.
            # `string=` is the current spelling of the deprecated `text=`.
            tldr = block.find("strong", string="tl;dr:").next_sibling.strip()

            slides_link = _absolute_slides_link(block)

            recording_link_tag = block.find("a", title="Download lecture recording")
            recording_link = (
                recording_link_tag["href"].strip() if recording_link_tag else None
            )

            # Readings live in the <ul> that follows the heading paragraph;
            # fall back to a fixed message when either element is missing.
            suggested_readings = "No specific readings provided."
            readings_heading = block.find("p", string="Suggested Readings:")
            if readings_heading:
                readings_list = readings_heading.find_next_sibling("ul")
                if readings_list:
                    suggested_readings = readings_list.get_text(
                        separator="\n"
                    ).strip()

            date = date_mapping.get(slides_link, "No date available")

            # NOTE(review): lectures without a slides link all share the key
            # None, so only the last such lecture is kept — confirm intended.
            lecture_metadata[slides_link] = {
                "date": date,
                "tldr": tldr,
                "title": title,
                "lecture_recording": recording_link,
                "suggested_readings": suggested_readings,
            }
        except Exception as e:
            # Best effort: skip lecture blocks that lack an expected element.
            print(f"Error processing block: {e}")
            continue

    return lecture_metadata
| |
|