# pre-punctuation-processor / sources / download_classics.py
# LisaMegaWatts's picture
# Upload sources/download_classics.py with huggingface_hub
# b9e58da verified
"""
download_classics.py
====================
Curated classical text downloader for MicroGPT Trivium/Quadrivium training corpus.
Reads source_manifest.json and fetches each enabled source:
- Project Gutenberg : downloads plain .txt files directly
- MIT Internet Classics Archive: downloads HTML, strips markup with BeautifulSoup
Downloaded files land in pipeline/inbox/ so the main pipeline picks them up.
A download_log.json is written alongside this script recording every run.
Usage
-----
python download_classics.py # download all enabled sources
python download_classics.py --list # list all available sources
python download_classics.py --dry-run # show what would download without doing it
python download_classics.py --art logic # download only logic texts
Requirements
------------
pip install requests beautifulsoup4
"""
import argparse
import json
import logging
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
# All paths are anchored to this script's directory so the tool behaves the
# same regardless of the caller's current working directory.
SCRIPT_DIR = Path(__file__).resolve().parent
MANIFEST_PATH = SCRIPT_DIR / "source_manifest.json"  # catalogue of sources to fetch
INBOX_DIR = SCRIPT_DIR.parent / "inbox"  # downloads land here for the main pipeline
LOG_PATH = SCRIPT_DIR / "download_log.json"  # append-only record of every run
# ---------------------------------------------------------------------------
# HTTP configuration
# ---------------------------------------------------------------------------
# Identifying User-Agent; some archives reject the default requests UA.
HEADERS = {
    "User-Agent": "MicroGPT-Classics-Downloader/1.0",
    "Accept": "text/html,text/plain,*/*",
}
REQUEST_TIMEOUT = 30  # seconds per HTTP request
INTER_REQUEST_DELAY = 1.0  # seconds between downloads (rate-limit courtesy)
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-8s %(message)s",
    datefmt="%H:%M:%S",
)
log = logging.getLogger("download_classics")
# ---------------------------------------------------------------------------
# Lazy imports with helpful error messages
# ---------------------------------------------------------------------------
def _require_requests():
    """Return the ``requests`` module, exiting with install advice if missing."""
    try:
        import requests as _requests
    except ImportError:
        log.error("'requests' is not installed. Run: pip install requests")
        sys.exit(1)
    return _requests
def _require_bs4():
    """Return the ``BeautifulSoup`` class, exiting with install advice if missing."""
    try:
        from bs4 import BeautifulSoup as _BeautifulSoup
    except ImportError:
        log.error("'beautifulsoup4' is not installed. Run: pip install beautifulsoup4")
        sys.exit(1)
    return _BeautifulSoup
# ---------------------------------------------------------------------------
# Manifest loading
# ---------------------------------------------------------------------------
def load_manifest() -> dict:
    """Parse and return source_manifest.json, aborting if the file is absent."""
    if not MANIFEST_PATH.exists():
        log.error("Manifest not found: %s", MANIFEST_PATH)
        sys.exit(1)
    return json.loads(MANIFEST_PATH.read_text(encoding="utf-8"))
def filter_sources(sources: list[dict], *, art: str | None = None, enabled_only: bool = True) -> list[dict]:
result = sources
if enabled_only:
result = [s for s in result if s.get("enabled", True)]
if art:
result = [s for s in result if s.get("art", "").lower() == art.lower()]
return result
# ---------------------------------------------------------------------------
# Text cleaning helpers
# ---------------------------------------------------------------------------
def _strip_gutenberg_boilerplate(text: str) -> str:
"""Remove Project Gutenberg header and footer legalese from raw .txt files."""
# The header ends at a line matching "*** START OF..." or similar
start_markers = [
r"\*\*\* START OF (THE|THIS) PROJECT GUTENBERG",
r"\*\*\*START OF (THE|THIS) PROJECT GUTENBERG",
]
end_markers = [
r"\*\*\* END OF (THE|THIS) PROJECT GUTENBERG",
r"\*\*\*END OF (THE|THIS) PROJECT GUTENBERG",
]
lines = text.splitlines()
start_idx = 0
end_idx = len(lines)
for i, line in enumerate(lines):
for pat in start_markers:
if re.search(pat, line, re.IGNORECASE):
start_idx = i + 1
break
for i, line in enumerate(lines):
for pat in end_markers:
if re.search(pat, line, re.IGNORECASE):
end_idx = i
break
body = lines[start_idx:end_idx]
return "\n".join(body).strip()
def _extract_mit_classics_text(html: str, source_id: str) -> str:
    """
    Reduce an MIT Classics HTML page to plain prose.

    The pages use a simple layout: navigation links at the top, the text
    itself inside a <pre> block or ordinary body paragraphs, and a small
    footer.  Scripts, styles, the head, and nav blocks are discarded; bare
    image-only anchors are dropped when falling back to body extraction.
    (``source_id`` is accepted for interface parity with the downloaders
    but is not currently used here.)
    """
    BeautifulSoup = _require_bs4()
    soup = BeautifulSoup(html, "html.parser")
    # Discard non-content elements wholesale.
    for noise in soup(["script", "style", "head", "nav"]):
        noise.decompose()
    # MIT Classics usually wraps the text in a single <pre> block.
    pre_block = soup.find("pre")
    if pre_block:
        raw = pre_block.get_text(separator="\n")
    else:
        container = soup.find("body") or soup
        # Drop navigation anchors that wrap only an image; textual links stay.
        for anchor in container.find_all("a"):
            if anchor.find("img"):
                anchor.decompose()
        raw = container.get_text(separator="\n")
    # Squash runs of three-plus newlines down to a single blank line.
    return re.sub(r"\n{3,}", "\n\n", raw).strip()
# ---------------------------------------------------------------------------
# Downloaders
# ---------------------------------------------------------------------------
def _download_gutenberg(source: dict, requests) -> str:
    """Fetch a plain-text Gutenberg file, decode it, and strip boilerplate.

    Raises ValueError when the stripped body looks implausibly small,
    which usually means boilerplate detection misfired.
    """
    url = source["url"]
    log.info(" GET %s", url)
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Gutenberg serves UTF-8 but sometimes declares latin-1; try UTF-8
    # (tolerating a BOM) first, then fall back to latin-1, which cannot fail.
    try:
        body = response.content.decode("utf-8-sig")
    except UnicodeDecodeError:
        body = response.content.decode("latin-1")
    stripped = _strip_gutenberg_boilerplate(body)
    if len(stripped) < 1000:
        raise ValueError(
            f"Gutenberg body suspiciously short ({len(stripped)} chars) for {source['id']}; "
            "boilerplate stripping may have failed"
        )
    return stripped
def _download_mit_classics(source: dict, requests) -> str:
    """Fetch an MIT Classics text file (.mb.txt) or HTML page.

    Plain-text URLs are returned as-is (stripped); HTML pages go through
    the BeautifulSoup extractor.  Raises ValueError on implausibly short
    results.
    """
    url = source["url"]
    log.info(" GET %s", url)
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    decoded = response.content.decode("utf-8", errors="replace")
    # .mb.txt / .txt endpoints are already plain text - no HTML parsing needed.
    if url.endswith((".mb.txt", ".txt")):
        text = decoded.strip()
    else:
        text = _extract_mit_classics_text(decoded, source["id"])
    if len(text) < 500:
        raise ValueError(
            f"MIT Classics body suspiciously short ({len(text)} chars) for {source['id']}"
        )
    return text
def _download_ia(source: dict, requests) -> str:
    """Fetch plain text from the Internet Archive via sources.ia_search.

    Uses the manifest's "identifier" field (falling back to "id"); the
    ``requests`` argument is unused here but kept for downloader-interface
    parity.  Raises ValueError on a missing identifier or a tiny body.
    """
    from sources.ia_search import get_ia_text

    identifier = source.get("identifier") or source.get("id")
    if not identifier:
        raise ValueError(f"No 'identifier' field for IA source: {source}")
    log.info(" Fetching IA text for: %s", identifier)
    text = get_ia_text(identifier)
    if len(text) < 1000:
        raise ValueError(
            f"IA body suspiciously short ({len(text)} chars) for {identifier}"
        )
    return text
# Dispatch table: manifest "source_type" value -> downloader callable.
# Each downloader takes (source_dict, requests_module) and returns clean text.
DOWNLOADER_MAP = {
    "gutenberg": _download_gutenberg,
    "mit_classics": _download_mit_classics,
    "internet_archive": _download_ia,
}
# ---------------------------------------------------------------------------
# Core download logic
# ---------------------------------------------------------------------------
def download_source(source: dict, requests, dry_run: bool = False) -> dict:
    """
    Download a single source and save it to inbox/.

    Parameters
    ----------
    source : manifest entry; must carry "id" and "filename".  "url" is
        optional — internet_archive entries may supply only "identifier".
    requests : the imported requests module (lazy-imported once in main()).
    dry_run : when True, report what would happen and perform no I/O.

    Returns a result dict suitable for inclusion in download_log.json.
    """
    source_id = source["id"]
    filename = source["filename"]
    dest = INBOX_DIR / filename
    result = {
        "id": source_id,
        "filename": filename,
        # .get(): internet_archive sources may legitimately omit "url"
        # (their downloader works from "identifier"), so a plain key
        # lookup here would raise KeyError before the download even ran.
        "url": source.get("url"),
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "status": None,
        "bytes": 0,
        "error": None,
    }
    if dry_run:
        log.info("[DRY-RUN] Would download: %s → %s", source_id, dest)
        result["status"] = "dry_run"
        return result
    source_type = source.get("source_type", "gutenberg")
    downloader = DOWNLOADER_MAP.get(source_type)
    if downloader is None:
        result["status"] = "skipped"
        result["error"] = f"Unknown source_type '{source_type}'"
        log.warning(" Skipping %s: %s", source_id, result["error"])
        return result
    try:
        text = downloader(source, requests)
        INBOX_DIR.mkdir(parents=True, exist_ok=True)
        dest.write_text(text, encoding="utf-8")
        byte_count = dest.stat().st_size
        result["status"] = "ok"
        result["bytes"] = byte_count
        log.info(" Saved %s (%s bytes)", dest.name, f"{byte_count:,}")
    except Exception as exc:
        # Record the failure but keep the run going for the other sources.
        result["status"] = "error"
        result["error"] = str(exc)
        log.error(" FAILED %s: %s", source_id, exc)
    return result
# ---------------------------------------------------------------------------
# Log persistence
# ---------------------------------------------------------------------------
def load_download_log() -> list:
    """Return previously logged entries, or [] when the log is absent or corrupt."""
    if not LOG_PATH.exists():
        return []
    try:
        return json.loads(LOG_PATH.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, OSError):
        # A damaged or unreadable log is treated as empty rather than fatal.
        return []
def save_download_log(entries: list) -> None:
    """Serialize *entries* to download_log.json (pretty-printed, UTF-8)."""
    serialized = json.dumps(entries, indent=2, ensure_ascii=False)
    LOG_PATH.write_text(serialized, encoding="utf-8")
# ---------------------------------------------------------------------------
# CLI presentation helpers
# ---------------------------------------------------------------------------
CATEGORY_COLOURS = {
"trivium": "\033[96m", # cyan
"quadrivium": "\033[93m", # yellow
"bridging": "\033[95m", # magenta
}
RESET = "\033[0m"
BOLD = "\033[1m"
def _coloured(text: str, colour: str) -> str:
"""Apply ANSI colour if stdout is a TTY."""
if sys.stdout.isatty():
return f"{colour}{text}{RESET}"
return text
def print_source_list(sources: list[dict]) -> None:
    """Pretty-print the full catalogue of sources, grouped and sorted by art."""
    grouped: dict[str, list[dict]] = {}
    for src in sources:
        grouped.setdefault(src.get("art", "unknown"), []).append(src)
    total_words = sum(src.get("estimated_words", 0) for src in sources)
    enabled_count = sum(1 for src in sources if src.get("enabled", True))
    print(f"\n{BOLD}MicroGPT Classical Corpus - {len(sources)} sources "
          f"({enabled_count} enabled, ~{total_words:,} words){RESET}\n")
    for art in sorted(grouped):
        art_sources = grouped[art]
        # Heading colour comes from the category of the art's first entry.
        colour = CATEGORY_COLOURS.get(art_sources[0].get("category", ""), "")
        heading = _coloured(f"[{art.upper()}]", BOLD + colour)
        print(f" {heading}")
        for src in art_sources:
            marker = " " if src.get("enabled", True) else " [DISABLED] "
            words = src.get("estimated_words", 0)
            print(
                f" {marker}{src['author']}: {src['title']}"
                f" ({words:,} words) -> {src['filename']}"
            )
    print()
def print_summary(results: list[dict]) -> None:
    """Print counts, total size, and any failure details for a download run."""
    succeeded = [entry for entry in results if entry["status"] == "ok"]
    errored = [entry for entry in results if entry["status"] == "error"]
    passed_over = [entry for entry in results if entry["status"] in ("skipped", "dry_run")]
    size = sum(entry["bytes"] for entry in succeeded)
    divider = "-" * 60
    print(f"\n{divider}")
    print(f" Downloaded : {len(succeeded)}")
    print(f" Failed : {len(errored)}")
    print(f" Skipped : {len(passed_over)}")
    print(f" Total size : {size / 1024:.1f} KB ({size:,} bytes)")
    if errored:
        print(f"\n {BOLD}Failures:{RESET}")
        for entry in errored:
            print(f" - {entry['id']}: {entry['error']}")
    print(f"{divider}\n")
# ---------------------------------------------------------------------------
# Main entry point
# ---------------------------------------------------------------------------
def main() -> None:
    """CLI entry point: parse args, select sources, download, and log results."""
    parser = argparse.ArgumentParser(
        description="Download classical Trivium/Quadrivium texts for MicroGPT training.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--list",
        action="store_true",
        help="List all available sources and exit",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be downloaded without actually downloading",
    )
    parser.add_argument(
        "--art",
        metavar="ART",
        help="Download only sources for the given art (e.g. logic, rhetoric, geometry)",
    )
    parser.add_argument(
        "--include-disabled",
        action="store_true",
        help="Include sources marked enabled=false in the manifest",
    )
    args = parser.parse_args()

    # Load manifest
    manifest = load_manifest()
    all_sources = manifest.get("sources", [])

    # --list just prints and exits
    if args.list:
        print_source_list(all_sources)
        return

    # Determine which sources to process
    enabled_only = not args.include_disabled
    sources = filter_sources(all_sources, art=args.art, enabled_only=enabled_only)
    if not sources:
        filter_desc = f" with art='{args.art}'" if args.art else ""
        log.warning("No enabled sources found%s. Use --list to see all.", filter_desc)
        return

    if args.dry_run:
        log.info("DRY RUN - nothing will be written")
    # Print what we intend to do
    art_desc = f" (art={args.art})" if args.art else ""
    log.info(
        "Processing %d source(s)%s - inbox: %s",
        len(sources), art_desc, INBOX_DIR
    )

    requests = _require_requests()
    if not args.dry_run:
        _require_bs4()  # validate early so we don't fail mid-run

    results: list[dict] = []
    for i, source in enumerate(sources, start=1):
        log.info(
            "[%d/%d] %s - %s (%s)",
            i, len(sources),
            source["id"],
            source["title"],
            # .get(): keep consistent with download_source(), which defaults
            # a missing source_type to "gutenberg" instead of raising
            # KeyError — previously this log line crashed first.
            source.get("source_type", "gutenberg"),
        )
        result = download_source(source, requests, dry_run=args.dry_run)
        results.append(result)
        # Rate-limit courtesy: pause between requests, but not after the last.
        if i < len(sources) and not args.dry_run:
            time.sleep(INTER_REQUEST_DELAY)

    # Persist log (dry runs leave no trace on disk)
    if not args.dry_run:
        log_entries = load_download_log()
        log_entries.extend(results)
        save_download_log(log_entries)
        log.info("Download log updated: %s", LOG_PATH)

    print_summary(results)


if __name__ == "__main__":
    main()