LisaMegaWatts committed on
Commit
b9e58da
·
verified ·
1 Parent(s): 1b33d1c

Upload sources/download_classics.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. sources/download_classics.py +485 -0
sources/download_classics.py ADDED
@@ -0,0 +1,485 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ download_classics.py
3
+ ====================
4
+ Curated classical text downloader for MicroGPT Trivium/Quadrivium training corpus.
5
+
6
+ Reads source_manifest.json and fetches each enabled source:
7
+ - Project Gutenberg : downloads plain .txt files directly
8
+ - MIT Internet Classics Archive: downloads HTML, strips markup with BeautifulSoup
9
+
10
+ Downloaded files land in pipeline/inbox/ so the main pipeline picks them up.
11
+ A download_log.json is written alongside this script recording every run.
12
+
13
+ Usage
14
+ -----
15
+ python download_classics.py # download all enabled sources
16
+ python download_classics.py --list # list all available sources
17
+ python download_classics.py --dry-run # show what would download without doing it
18
+ python download_classics.py --art logic # download only logic texts
19
+
20
+ Requirements
21
+ ------------
22
+ pip install requests beautifulsoup4
23
+ """
24
+
25
+ import argparse
26
+ import json
27
+ import logging
28
+ import re
29
+ import sys
30
+ import time
31
+ from datetime import datetime, timezone
32
+ from pathlib import Path
33
+
34
+ # ---------------------------------------------------------------------------
35
+ # Paths
36
+ # ---------------------------------------------------------------------------
37
+
38
# Resolve all paths relative to this script so the tool works from any CWD.
SCRIPT_DIR = Path(__file__).resolve().parent
MANIFEST_PATH = SCRIPT_DIR / "source_manifest.json"  # catalogue of sources to fetch
INBOX_DIR = SCRIPT_DIR.parent / "inbox"  # pipeline pickup directory for downloads
LOG_PATH = SCRIPT_DIR / "download_log.json"  # per-run download history
42
+
43
+ # ---------------------------------------------------------------------------
44
+ # HTTP configuration
45
+ # ---------------------------------------------------------------------------
46
+
47
# Polite client defaults shared by every HTTP request this script makes.
HEADERS = {
    "User-Agent": "MicroGPT-Classics-Downloader/1.0",
    "Accept": "text/html,text/plain,*/*",
}
REQUEST_TIMEOUT = 30  # seconds per HTTP request
INTER_REQUEST_DELAY = 1.0  # seconds between downloads (rate-limit courtesy)
53
+
54
+ # ---------------------------------------------------------------------------
55
+ # Logging
56
+ # ---------------------------------------------------------------------------
57
+
58
# Console logging: terse time-only timestamps, level-aligned messages.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-8s %(message)s",
    datefmt="%H:%M:%S",
)
log = logging.getLogger("download_classics")
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # Lazy imports with helpful error messages
67
+ # ---------------------------------------------------------------------------
68
+
69
def _require_requests():
    """Import and return the ``requests`` module, exiting with an install hint if absent."""
    try:
        import requests
    except ImportError:
        log.error("'requests' is not installed. Run: pip install requests")
        sys.exit(1)
    return requests
76
+
77
+
78
def _require_bs4():
    """Import and return ``BeautifulSoup``, exiting with an install hint if absent."""
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        log.error("'beautifulsoup4' is not installed. Run: pip install beautifulsoup4")
        sys.exit(1)
    return BeautifulSoup
85
+
86
+
87
+ # ---------------------------------------------------------------------------
88
+ # Manifest loading
89
+ # ---------------------------------------------------------------------------
90
+
91
def load_manifest() -> dict:
    """Parse and return source_manifest.json; exit with an error if it is missing."""
    if not MANIFEST_PATH.exists():
        log.error("Manifest not found: %s", MANIFEST_PATH)
        sys.exit(1)
    return json.loads(MANIFEST_PATH.read_text(encoding="utf-8"))
98
+
99
+
100
+ def filter_sources(sources: list[dict], *, art: str | None = None, enabled_only: bool = True) -> list[dict]:
101
+ result = sources
102
+ if enabled_only:
103
+ result = [s for s in result if s.get("enabled", True)]
104
+ if art:
105
+ result = [s for s in result if s.get("art", "").lower() == art.lower()]
106
+ return result
107
+
108
+
109
+ # ---------------------------------------------------------------------------
110
+ # Text cleaning helpers
111
+ # ---------------------------------------------------------------------------
112
+
113
+ def _strip_gutenberg_boilerplate(text: str) -> str:
114
+ """Remove Project Gutenberg header and footer legalese from raw .txt files."""
115
+ # The header ends at a line matching "*** START OF..." or similar
116
+ start_markers = [
117
+ r"\*\*\* START OF (THE|THIS) PROJECT GUTENBERG",
118
+ r"\*\*\*START OF (THE|THIS) PROJECT GUTENBERG",
119
+ ]
120
+ end_markers = [
121
+ r"\*\*\* END OF (THE|THIS) PROJECT GUTENBERG",
122
+ r"\*\*\*END OF (THE|THIS) PROJECT GUTENBERG",
123
+ ]
124
+
125
+ lines = text.splitlines()
126
+ start_idx = 0
127
+ end_idx = len(lines)
128
+
129
+ for i, line in enumerate(lines):
130
+ for pat in start_markers:
131
+ if re.search(pat, line, re.IGNORECASE):
132
+ start_idx = i + 1
133
+ break
134
+
135
+ for i, line in enumerate(lines):
136
+ for pat in end_markers:
137
+ if re.search(pat, line, re.IGNORECASE):
138
+ end_idx = i
139
+ break
140
+
141
+ body = lines[start_idx:end_idx]
142
+ return "\n".join(body).strip()
143
+
144
+
145
def _extract_mit_classics_text(html: str, source_id: str) -> str:
    """
    Reduce an MIT Classics HTML page to plain prose.

    The site usually wraps the text in a <pre> block; when one is present its
    text is used directly. Otherwise we fall back to the <body> (or the whole
    document), dropping image-only anchor links, which are navigation chrome.
    Script/style/head/nav elements are always removed, and runs of three or
    more newlines are collapsed to a single blank line.

    Note: *source_id* is accepted for interface symmetry with the other
    extractors; it is not used in the extraction itself.
    """
    BeautifulSoup = _require_bs4()
    soup = BeautifulSoup(html, "html.parser")

    # Remove non-content elements wholesale.
    for noise in soup(["script", "style", "head", "nav"]):
        noise.decompose()

    pre_block = soup.find("pre")
    if pre_block:
        raw = pre_block.get_text(separator="\n")
    else:
        container = soup.find("body") or soup
        # Anchors wrapping images are nav buttons; text links stay inline.
        for anchor in container.find_all("a"):
            if anchor.find("img"):
                anchor.decompose()
        raw = container.get_text(separator="\n")

    # Collapse excessive blank lines (3+ newlines -> one blank line).
    return re.sub(r"\n{3,}", "\n\n", raw).strip()
179
+
180
+
181
+ # ---------------------------------------------------------------------------
182
+ # Downloaders
183
+ # ---------------------------------------------------------------------------
184
+
185
def _download_gutenberg(source: dict, requests) -> str:
    """Fetch a Gutenberg plain-text file and return its body minus PG boilerplate.

    Raises ValueError when the stripped body is implausibly small, which
    usually means the START/END markers were not found and stripping failed.
    """
    url = source["url"]
    log.info(" GET %s", url)
    resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()

    # Prefer UTF-8 (BOM-tolerant); some older Gutenberg files are Latin-1.
    try:
        decoded = resp.content.decode("utf-8-sig")
    except UnicodeDecodeError:
        decoded = resp.content.decode("latin-1")

    body = _strip_gutenberg_boilerplate(decoded)
    if len(body) < 1000:
        raise ValueError(
            f"Gutenberg body suspiciously short ({len(body)} chars) for {source['id']}; "
            "boilerplate stripping may have failed"
        )
    return body
205
+
206
+
207
def _download_mit_classics(source: dict, requests) -> str:
    """Fetch an MIT Classics source: plain .txt is used as-is, HTML is stripped.

    Raises ValueError when the resulting text is implausibly small.
    """
    url = source["url"]
    log.info(" GET %s", url)
    resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()

    decoded = resp.content.decode("utf-8", errors="replace")
    # .mb.txt (and plain .txt) files need no HTML parsing at all.
    if url.endswith((".mb.txt", ".txt")):
        text = decoded.strip()
    else:
        text = _extract_mit_classics_text(decoded, source["id"])

    if len(text) < 500:
        raise ValueError(
            f"MIT Classics body suspiciously short ({len(text)} chars) for {source['id']}"
        )
    return text
226
+
227
+
228
def _download_ia(source: dict, requests) -> str:
    """Fetch plain text for an Internet Archive item via sources.ia_search.

    *requests* is accepted only to match the common downloader signature; the
    IA helper manages its own HTTP. Raises ValueError when the identifier is
    missing or the returned text is implausibly small.
    """
    from sources.ia_search import get_ia_text

    identifier = source.get("identifier") or source.get("id")
    if not identifier:
        raise ValueError(f"No 'identifier' field for IA source: {source}")

    log.info(" Fetching IA text for: %s", identifier)
    text = get_ia_text(identifier)

    if len(text) < 1000:
        raise ValueError(
            f"IA body suspiciously short ({len(text)} chars) for {identifier}"
        )
    return text
244
+
245
+
246
# Maps manifest "source_type" values to fetch implementations. Every entry is
# a callable taking (source, requests) and returning cleaned plain text.
DOWNLOADER_MAP = {
    "gutenberg": _download_gutenberg,
    "mit_classics": _download_mit_classics,
    "internet_archive": _download_ia,
}
251
+
252
+
253
+ # ---------------------------------------------------------------------------
254
+ # Core download logic
255
+ # ---------------------------------------------------------------------------
256
+
257
def download_source(source: dict, requests, dry_run: bool = False) -> dict:
    """
    Fetch one manifest entry into inbox/ and describe the outcome.

    The returned dict is one entry of download_log.json: id, filename, url,
    UTC timestamp, status ("ok" / "error" / "skipped" / "dry_run"), byte
    count written, and an error string when something went wrong. All
    downloader exceptions are caught here so one failure never aborts a run.
    """
    source_id = source["id"]
    filename = source["filename"]
    dest = INBOX_DIR / filename

    result = {
        "id": source_id,
        "filename": filename,
        "url": source["url"],
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "status": None,
        "bytes": 0,
        "error": None,
    }

    if dry_run:
        log.info("[DRY-RUN] Would download: %s → %s", source_id, dest)
        result["status"] = "dry_run"
        return result

    source_type = source.get("source_type", "gutenberg")
    fetch = DOWNLOADER_MAP.get(source_type)
    if fetch is None:
        result["status"] = "skipped"
        result["error"] = f"Unknown source_type '{source_type}'"
        log.warning(" Skipping %s: %s", source_id, result["error"])
        return result

    try:
        text = fetch(source, requests)
        INBOX_DIR.mkdir(parents=True, exist_ok=True)
        dest.write_text(text, encoding="utf-8")
        size = dest.stat().st_size
        result["status"] = "ok"
        result["bytes"] = size
        log.info(" Saved %s (%s bytes)", dest.name, f"{size:,}")
    except Exception as exc:  # boundary: record the failure, keep the run going
        result["status"] = "error"
        result["error"] = str(exc)
        log.error(" FAILED %s: %s", source_id, exc)

    return result
304
+
305
+
306
+ # ---------------------------------------------------------------------------
307
+ # Log persistence
308
+ # ---------------------------------------------------------------------------
309
+
310
def load_download_log() -> list:
    """Return prior download-log entries, or [] if the log is missing or unreadable."""
    if not LOG_PATH.exists():
        return []
    try:
        return json.loads(LOG_PATH.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, OSError):
        # A corrupt or unreadable log is treated as empty rather than fatal.
        return []
318
+
319
+
320
def save_download_log(entries: list) -> None:
    """Overwrite download_log.json with *entries* as pretty-printed JSON."""
    LOG_PATH.write_text(
        json.dumps(entries, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
323
+
324
+
325
+ # ---------------------------------------------------------------------------
326
+ # CLI presentation helpers
327
+ # ---------------------------------------------------------------------------
328
+
329
# ANSI escape sequences for terminal output; applied only when stdout is a
# TTY (see _coloured). Keys match the manifest "category" field.
CATEGORY_COLOURS = {
    "trivium": "\033[96m",  # cyan
    "quadrivium": "\033[93m",  # yellow
    "bridging": "\033[95m",  # magenta
}
RESET = "\033[0m"
BOLD = "\033[1m"
336
+
337
+
338
+ def _coloured(text: str, colour: str) -> str:
339
+ """Apply ANSI colour if stdout is a TTY."""
340
+ if sys.stdout.isatty():
341
+ return f"{colour}{text}{RESET}"
342
+ return text
343
+
344
+
345
def print_source_list(sources: list[dict]) -> None:
    """Print the full catalogue grouped by art, with totals and enabled markers."""
    by_art: dict[str, list[dict]] = {}
    for src in sources:
        by_art.setdefault(src.get("art", "unknown"), []).append(src)

    total_words = sum(src.get("estimated_words", 0) for src in sources)
    enabled_count = sum(1 for src in sources if src.get("enabled", True))

    print(f"\n{BOLD}MicroGPT Classical Corpus - {len(sources)} sources "
          f"({enabled_count} enabled, ~{total_words:,} words){RESET}\n")

    for art in sorted(by_art):
        group = by_art[art]
        # Colour each art heading by its first source's category.
        colour = CATEGORY_COLOURS.get(group[0].get("category", ""), "")
        label = _coloured(f"[{art.upper()}]", BOLD + colour)
        print(f" {label}")
        for src in group:
            marker = " " if src.get("enabled", True) else " [DISABLED] "
            words = src.get("estimated_words", 0)
            print(
                f" {marker}{src['author']}: {src['title']}"
                f" ({words:,} words) -> {src['filename']}"
            )
        print()
370
+
371
+
372
def print_summary(results: list[dict]) -> None:
    """Print counts, total bytes written, and any failure details for a run."""
    ok = [r for r in results if r["status"] == "ok"]
    failed = [r for r in results if r["status"] == "error"]
    skipped = [r for r in results if r["status"] in ("skipped", "dry_run")]
    total_bytes = sum(r["bytes"] for r in ok)

    print(f"\n{'-' * 60}")
    print(f" Downloaded : {len(ok)}")
    print(f" Failed : {len(failed)}")
    print(f" Skipped : {len(skipped)}")
    print(f" Total size : {total_bytes / 1024:.1f} KB ({total_bytes:,} bytes)")

    if failed:
        print(f"\n {BOLD}Failures:{RESET}")
        for entry in failed:
            print(f" - {entry['id']}: {entry['error']}")
    print(f"{'-' * 60}\n")
391
+
392
+
393
+ # ---------------------------------------------------------------------------
394
+ # Main entry point
395
+ # ---------------------------------------------------------------------------
396
+
397
def main() -> None:
    """CLI entry point: parse arguments, select sources, download, log, summarize.

    Exit paths: --list prints the catalogue and returns; an empty selection
    warns and returns; otherwise every selected source is attempted, results
    are appended to download_log.json (unless --dry-run), and a summary is
    printed.
    """
    parser = argparse.ArgumentParser(
        description="Download classical Trivium/Quadrivium texts for MicroGPT training.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--list",
        action="store_true",
        help="List all available sources and exit",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be downloaded without actually downloading",
    )
    parser.add_argument(
        "--art",
        metavar="ART",
        help="Download only sources for the given art (e.g. logic, rhetoric, geometry)",
    )
    parser.add_argument(
        "--include-disabled",
        action="store_true",
        help="Include sources marked enabled=false in the manifest",
    )
    args = parser.parse_args()

    # Load manifest
    manifest = load_manifest()
    all_sources = manifest.get("sources", [])

    # --list just prints and exits
    if args.list:
        print_source_list(all_sources)
        return

    # Determine which sources to process
    enabled_only = not args.include_disabled
    sources = filter_sources(all_sources, art=args.art, enabled_only=enabled_only)

    if not sources:
        filter_desc = f" with art='{args.art}'" if args.art else ""
        log.warning("No enabled sources found%s. Use --list to see all.", filter_desc)
        return

    if args.dry_run:
        log.info("DRY RUN - nothing will be written")

    # Print what we intend to do
    art_desc = f" (art={args.art})" if args.art else ""
    log.info(
        "Processing %d source(s)%s - inbox: %s",
        len(sources), art_desc, INBOX_DIR
    )

    requests = _require_requests()
    if not args.dry_run:
        _require_bs4()  # validate early so we don't fail mid-run

    results: list[dict] = []
    for i, source in enumerate(sources, start=1):
        log.info(
            "[%d/%d] %s - %s (%s)",
            i, len(sources),
            source["id"],
            source["title"],
            # BUG FIX: use the same default as download_source so a manifest
            # entry without "source_type" no longer raises KeyError here.
            source.get("source_type", "gutenberg"),
        )

        result = download_source(source, requests, dry_run=args.dry_run)
        results.append(result)

        # Rate-limit: wait between requests (skip after last one)
        if i < len(sources) and not args.dry_run:
            time.sleep(INTER_REQUEST_DELAY)

    # Persist log
    if not args.dry_run:
        log_entries = load_download_log()
        log_entries.extend(results)
        save_download_log(log_entries)
        log.info("Download log updated: %s", LOG_PATH)

    print_summary(results)
482
+
483
+
484
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()