"""Scraper for The Batch (deeplearning.ai): collects article links per tag category
with Selenium, parses each article with requests/BeautifulSoup, and saves the
results to a JSON file."""

import json
import os
import time
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager

BASE_TAG_URL = "https://www.deeplearning.ai/the-batch/tag/"
VALID_CATEGORIES = [
    "letters", "data-points", "research", "business",
    "science", "culture", "hardware", "ai-careers",
]


def initialize_driver():
    """Create a headless Chrome driver managed by webdriver_manager."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    return driver


def load_all_articles(driver, url):
    """Open a category page and collect all article links, paginating until exhausted.

    The 'letters' category uses an 'Older Posts' button that navigates to a new URL;
    every other category uses a 'Load More' button that appends articles in place.
    """
    wait = WebDriverWait(driver, 20)
    driver.get(url)
    time.sleep(3)
    category = url.split('/')[-2]
    all_articles_links = set()

    if category == "letters":
        last_url = ""
        while True:
            current_links = get_article_links_from_page(driver)
            all_articles_links.update(current_links)
            print(f"Collected {len(current_links)} articles on the current page in '{category}'")
            try:
                older_button = wait.until(
                    EC.element_to_be_clickable((By.CLASS_NAME, "justify-self-end"))
                )
                driver.execute_script("arguments[0].scrollIntoView({block: 'end'});", older_button)
                time.sleep(1)
                older_button.click()
                print(f"Clicked 'Older Posts' in '{category}'...")
                time.sleep(2)
                # If the URL did not change, the click had no effect and pagination is done.
                current_url = driver.current_url
                if current_url == last_url:
                    print("The URL did not change after the click; stopping the 'Older Posts' pagination.")
                    break
                last_url = current_url
            except (TimeoutException, NoSuchElementException):
                print("There is no 'Older Posts' button. Moving on to the next category.")
                break
    else:
        while True:
            current_links = get_article_links_from_page(driver)
            all_articles_links.update(current_links)
            print(f"Collected {len(current_links)} articles on the current page in '{category}'")
            try:
                load_more_button = wait.until(
                    EC.element_to_be_clickable((By.CLASS_NAME, "buttons_secondary__8o9u6"))
                )
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", load_more_button)
                time.sleep(1)
                # Click via JavaScript to avoid interception by overlapping elements.
                driver.execute_script("arguments[0].click();", load_more_button)
                print(f"Clicked 'Load More' in '{category}'...")
                time.sleep(2)
            except (TimeoutException, NoSuchElementException):
                print(f"The 'Load More' button is unavailable or missing in '{category}'. Moving to the next category.")
                break

    return list(all_articles_links)


def get_article_links_from_page(driver):
    """Extract article URLs from the currently rendered page, skipping tag and issue pages."""
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    all_links = set()
    for a in soup.find_all("a", href=True):
        href = a['href']
        if href.startswith("/the-batch/") and not href.startswith("/the-batch/tag/"):
            full_url = "https://www.deeplearning.ai" + href
            if "issue" not in full_url:
                all_links.add(full_url)
    return list(all_links)


def get_article_links():
    """Collect unique article links across all categories using a single driver."""
    driver = initialize_driver()
    all_links = set()
    for category in VALID_CATEGORIES:
        url = f"{BASE_TAG_URL}{category}/"
        print(f"Loading the category: {url}")
        category_links = load_all_articles(driver, url)
        print(f"Found {len(category_links)} articles in category '{category}'")
        all_links.update(category_links)
    driver.quit()
    return list(all_links)


def parse_article(url, max_retries=3, delay=2):
    """Download and parse a single article, retrying with backoff on failure.

    Returns a dict with title, description, image URL, publication date, body text,
    and source URL, or None if all attempts fail.
    """
    attempts = 0
    while attempts < max_retries:
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            # The <h1> holds the title; an optional nested <span> holds the description.
            h1 = soup.find("h1")
            title = h1.get_text(strip=True) if h1 else ""
            description = ""
            if h1:
                span = h1.find("span")
                if span:
                    description = span.get_text(strip=True)
                    span.extract()
                    title = h1.get_text(strip=True)

            image_tag = soup.find("meta", attrs={"property": "og:image"})
            image_url = image_tag["content"] if image_tag else None

            # Normalize the publication date to YYYY-MM-DD; fall back to the raw value.
            date_meta = soup.find("meta", attrs={"property": "article:published_time"})
            date_str = ""
            if date_meta:
                try:
                    date_raw = date_meta["content"]
                    date_str = datetime.fromisoformat(date_raw.split("T")[0]).strftime("%Y-%m-%d")
                except Exception:
                    date_str = date_meta["content"]

            # The article body lives in a div with the "prose--styled" class.
            content = ""
            main_content = soup.find("div", class_="prose--styled")
            if main_content:
                paragraphs = main_content.find_all(["p", "li"])
                content_lines = [p.get_text(strip=True) for p in paragraphs]
                content = "\n".join(content_lines)

            # Politeness delay between successful requests.
            time.sleep(delay)
            return {
                "title": title.strip(),
                "description": description.strip(),
                "image_url": image_url,
                "date": date_str,
                "content": content.strip(),
                "source_url": url,
            }
        except Exception as e:
            attempts += 1
            print(f"Error parsing URL {url} (attempt {attempts}/{max_retries}): {e}")
            time.sleep(delay * attempts)

    print(f"Article skipped due to repeated errors: {url}")
    return None


def run_parser_and_save_to_json(output_filename="data/articles_export.json"):
    """Collect links, parse every article, save to JSON, then filter out image-only entries."""
    print("Starting to collect article links...")
    all_article_urls = get_article_links()
    print(f"{len(all_article_urls)} unique article links collected.")

    parsed_articles = []
    print("\nStarting to parse article content...")
    for i, url in enumerate(all_article_urls):
        print(f"Parsing article {i + 1}/{len(all_article_urls)}: {url}")
        article_data = parse_article(url)
        if article_data:
            parsed_articles.append(article_data)

    print(f"\nParsing completed. {len(parsed_articles)} articles collected.")
    with open(output_filename, "w", encoding="utf-8") as f:
        json.dump(parsed_articles, f, ensure_ascii=False, indent=4)
    print(f"All articles are saved in '{output_filename}'")

    # Filter out entries whose content is just an "[image]" placeholder.
    print("\nStarting to filter articles...")
    try:
        with open(output_filename, "r", encoding="utf-8") as f:
            articles_to_filter = json.load(f)
    except FileNotFoundError:
        print(f"File '{output_filename}' not found for filtering.")
        articles_to_filter = []

    initial_count = len(articles_to_filter)
    filtered_articles = [a for a in articles_to_filter if a.get("content") != "[image]"]
    filtered_count = len(filtered_articles)
    print(f"Articles before filtering: {initial_count}")
    print(f"Articles after filtering: {filtered_count}")

    with open(output_filename, "w", encoding="utf-8") as f:
        json.dump(filtered_articles, f, ensure_ascii=False, indent=4)
    print(f"Filtered articles saved in '{output_filename}'")


if __name__ == "__main__":
    os.makedirs("data", exist_ok=True)
    run_parser_and_save_to_json()