# scripts/scrape_html_to_md.py
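# Crawl a docsify-rendered course site with Playwright, convert each page's
# article HTML to Markdown with YAML front matter, and record per-page
# metadata in a JSON index.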
import os
import json
import re
from datetime import datetime

from markdownify import markdownify as md
from playwright.sync_api import sync_playwright
BASE_URL = "https://tds.s-anand.net/#/2025-01/"
BASE_ORIGIN = "https://tds.s-anand.net"
OUTPUT_DIR = "markdown_files"
METADATA_FILE = "metadata.json"
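
# Crawl state shared across recursive calls: URLs already visited, plus one
# metadata record for every page saved to disk.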
visited = set()
metadata = []
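
# Make a page title safe to use as a filename: replace characters that are
# invalid on common filesystems with underscores, and spaces likewise.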
def sanitize_filename(title):
    return re.sub(r'[\\/*?:"<>|]', "_", title).strip().replace(" ", "_")
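
# Collect the unique same-origin links on the current page; docsify serves
# every page through a '/#/' hash route, so filtering on that fragment keeps
# the crawl inside the site.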
def extract_all_internal_links(page):
    links = page.eval_on_selector_all("a[href]", "els => els.map(el => el.href)")
    return list(set(link for link in links if BASE_ORIGIN in link and '/#/' in link))
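
# docsify renders each route client-side into <article id="main"
# class="markdown-section">, so wait for that node before reading its HTML.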
def wait_for_article_and_get_html(page):
    page.wait_for_selector("article.markdown-section#main", timeout=10000)
    return page.inner_html("article.markdown-section#main")
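
# Depth-first crawl: save the current page as Markdown with front matter,
# record its metadata, then recurse into every internal link found on it.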
def crawl_page(page, url):
    if url in visited:
        return
    visited.add(url)
    print(f"Visiting: {url}")

    try:
        page.goto(url, wait_until="domcontentloaded")
        page.wait_for_timeout(1000)  # give docsify a moment to render the route
        html = wait_for_article_and_get_html(page)
    except Exception as e:
        print(f"Error: {e}")
        return

    title = page.title().split(" - ")[0].strip() or f"page_{len(visited)}"
    filename = sanitize_filename(title)
    filepath = os.path.join(OUTPUT_DIR, f"{filename}.md")
    markdown = md(html)

    with open(filepath, "w", encoding="utf-8") as f:
        f.write(
            f'---\ntitle: "{title}"\noriginal_url: "{url}"\n'
            f'downloaded_at: "{datetime.now().isoformat()}"\n---\n\n'
        )
        f.write(markdown)

    metadata.append({
        "title": title,
        "filename": f"{filename}.md",
        "original_url": url,
        "downloaded_at": datetime.now().isoformat(),
    })

    links = extract_all_internal_links(page)
    for link in links:
        crawl_page(page, link)
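
# Launch headless Chromium, crawl everything reachable from BASE_URL, then
# write the collected metadata index.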
def main():
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        context = browser.new_context()
        page = context.new_page()

        crawl_page(page, BASE_URL)

        with open(METADATA_FILE, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2)

        print(f"Completed. {len(metadata)} pages saved.")
        browser.close()
if __name__ == "__main__":
    main()
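
# To run (assuming the dependencies and a Playwright browser are installed):
#   pip install playwright markdownify
#   playwright install chromium
#   python scripts/scrape_html_to_md.py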