""" download_classics.py ==================== Curated classical text downloader for MicroGPT Trivium/Quadrivium training corpus. Reads source_manifest.json and fetches each enabled source: - Project Gutenberg : downloads plain .txt files directly - MIT Internet Classics Archive: downloads HTML, strips markup with BeautifulSoup Downloaded files land in pipeline/inbox/ so the main pipeline picks them up. A download_log.json is written alongside this script recording every run. Usage ----- python download_classics.py # download all enabled sources python download_classics.py --list # list all available sources python download_classics.py --dry-run # show what would download without doing it python download_classics.py --art logic # download only logic texts Requirements ------------ pip install requests beautifulsoup4 """ import argparse import json import logging import re import sys import time from datetime import datetime, timezone from pathlib import Path # --------------------------------------------------------------------------- # Paths # --------------------------------------------------------------------------- SCRIPT_DIR = Path(__file__).resolve().parent MANIFEST_PATH = SCRIPT_DIR / "source_manifest.json" INBOX_DIR = SCRIPT_DIR.parent / "inbox" LOG_PATH = SCRIPT_DIR / "download_log.json" # --------------------------------------------------------------------------- # HTTP configuration # --------------------------------------------------------------------------- HEADERS = { "User-Agent": "MicroGPT-Classics-Downloader/1.0", "Accept": "text/html,text/plain,*/*", } REQUEST_TIMEOUT = 30 # seconds per HTTP request INTER_REQUEST_DELAY = 1.0 # seconds between downloads (rate-limit courtesy) # --------------------------------------------------------------------------- # Logging # --------------------------------------------------------------------------- logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)-8s %(message)s", datefmt="%H:%M:%S", ) 
log = logging.getLogger("download_classics") # --------------------------------------------------------------------------- # Lazy imports with helpful error messages # --------------------------------------------------------------------------- def _require_requests(): try: import requests return requests except ImportError: log.error("'requests' is not installed. Run: pip install requests") sys.exit(1) def _require_bs4(): try: from bs4 import BeautifulSoup return BeautifulSoup except ImportError: log.error("'beautifulsoup4' is not installed. Run: pip install beautifulsoup4") sys.exit(1) # --------------------------------------------------------------------------- # Manifest loading # --------------------------------------------------------------------------- def load_manifest() -> dict: if not MANIFEST_PATH.exists(): log.error("Manifest not found: %s", MANIFEST_PATH) sys.exit(1) with MANIFEST_PATH.open(encoding="utf-8") as fh: manifest = json.load(fh) return manifest def filter_sources(sources: list[dict], *, art: str | None = None, enabled_only: bool = True) -> list[dict]: result = sources if enabled_only: result = [s for s in result if s.get("enabled", True)] if art: result = [s for s in result if s.get("art", "").lower() == art.lower()] return result # --------------------------------------------------------------------------- # Text cleaning helpers # --------------------------------------------------------------------------- def _strip_gutenberg_boilerplate(text: str) -> str: """Remove Project Gutenberg header and footer legalese from raw .txt files.""" # The header ends at a line matching "*** START OF..." 
    # ...or similar; the footer starts at a matching "*** END OF..." line.
    # Two variants of each pattern cover files with and without a space
    # after the asterisks.
    start_markers = [
        r"\*\*\* START OF (THE|THIS) PROJECT GUTENBERG",
        r"\*\*\*START OF (THE|THIS) PROJECT GUTENBERG",
    ]
    end_markers = [
        r"\*\*\* END OF (THE|THIS) PROJECT GUTENBERG",
        r"\*\*\*END OF (THE|THIS) PROJECT GUTENBERG",
    ]
    lines = text.splitlines()
    start_idx = 0          # default: keep from the top if no header marker is found
    end_idx = len(lines)   # default: keep to the bottom if no footer marker is found
    # NOTE(review): the inner `break` only exits the pattern loop, so each
    # scan keeps overwriting its index and the LAST matching line wins.
    # That is harmless if each file carries a single START/END pair, but if a
    # marker string recurs inside the body the later occurrence is used —
    # confirm that is intended.
    for i, line in enumerate(lines):
        for pat in start_markers:
            if re.search(pat, line, re.IGNORECASE):
                start_idx = i + 1  # body begins on the line after the marker
                break
    for i, line in enumerate(lines):
        for pat in end_markers:
            if re.search(pat, line, re.IGNORECASE):
                end_idx = i        # body ends on the line before the marker
                break
    body = lines[start_idx:end_idx]
    return "\n".join(body).strip()


def _extract_mit_classics_text(html: str, source_id: str) -> str:
    """
    Strip MIT Classics HTML down to the main prose body.
    MIT Classics pages wrap the actual text inside a fairly simple layout:
    navigation links at top, a
    or body paragraphs in the middle, and
    a small footer. We grab everything inside , remove