"""
Internet Archive search and text retrieval for the text processing pipeline.

Provides search, metadata, and text download capabilities for IA's
vast library of digitized classical texts.

Usage:
    from sources.ia_search import search_ia, get_ia_text, get_ia_formats

    results = search_ia("aristotle philosophy", rows=10)
    text = get_ia_text("aristotlemetaphysi00markup")
"""
|
|
import logging
import re
from urllib.parse import quote, quote_plus
|
|
# Logger named after this module; every function below reports through it.
logger = logging.getLogger(__name__)
|
|
| |
| |
| |
|
|
def _require_requests():
    """Import the third-party ``requests`` package on demand and return it.

    The import happens at call time rather than at module import time, so
    a missing ``requests`` installation only surfaces when a network
    function is actually used.

    Returns:
        The imported ``requests`` module.

    Raises:
        ImportError: If ``requests`` cannot be imported. An error with
            install instructions is logged before the exception is
            re-raised unchanged.
    """
    try:
        import requests as requests_module
    except ImportError:
        logger.error("'requests' is not installed. Run: pip install requests")
        raise
    else:
        return requests_module
|
|
|
|
| |
| |
| |
|
|
# Internet Archive endpoints: full-text search, per-item metadata, and
# raw file download.
IA_SEARCH_URL = "https://archive.org/advancedsearch.php"
IA_METADATA_URL = "https://archive.org/metadata"
IA_DOWNLOAD_URL = "https://archive.org/download"

# Headers attached to every outgoing request made by this module.
HEADERS = {
    "User-Agent": "PhilosophyCorpus-Pipeline/1.0",
    "Accept": "application/json,text/plain,*/*",
}

# Value passed as ``timeout=`` on every request (seconds).
REQUEST_TIMEOUT = 30
|
|
| |
# Subject terms behind each filter key accepted by ``search_ia``.
_SUBJECT_TERMS = {
    "philosophy": ("philosophy", "philosophical"),
    "mathematics": ("mathematics", "geometry", "arithmetic"),
    "rhetoric": ("rhetoric", "oratory"),
    "logic": ("logic", "reasoning", "dialectic"),
    "ethics": ("ethics", "moral"),
    "metaphysics": ("metaphysics", "ontology"),
    "politics": ("politics", "political"),
    "classical": ("classical", "ancient", "greek", "roman", "latin"),
}

# Filter key -> ready-made query clause of the form
# ``subject:(term1 OR term2 ...)``.
SUBJECT_FILTERS = {
    key: "subject:(" + " OR ".join(terms) + ")"
    for key, terms in _SUBJECT_TERMS.items()
}
|
|
|
|
| |
| |
| |
|
|
def search_ia(
    query: str,
    subject: str | None = None,
    rows: int = 25,
    page: int = 1,
) -> list[dict]:
    """Search Internet Archive for texts, most-downloaded first.

    The request is always restricted to ``mediatype:texts`` and sorted by
    download count, descending.

    Args:
        query: Search query string. It is wrapped in parentheses before
            being combined with the other clauses, so an ``OR`` inside it
            cannot escape the media-type or subject restriction. A blank
            query is omitted rather than sent as an empty clause.
        subject: Optional key into ``SUBJECT_FILTERS`` (e.g. 'philosophy',
            'mathematics'). An unknown key is ignored with a warning.
        rows: Number of results to return.
        page: Page number for pagination.

    Returns:
        List of result dicts with keys: identifier, title, creator, date,
        description, downloads, language. ``description`` is shortened
        via ``_truncate(..., 200)``.

    Raises:
        ImportError: If ``requests`` is not installed.
        requests.HTTPError: If the search endpoint returns an error status.
    """
    requests = _require_requests()

    clauses = []
    user_query = (query or "").strip()
    if user_query:
        # Group the user's expression so boolean operators inside it do
        # not change how the filters below are applied.
        clauses.append(f"({user_query})")
    clauses.append("mediatype:texts")

    if subject:
        subject_clause = SUBJECT_FILTERS.get(subject)
        if subject_clause is not None:
            clauses.append(subject_clause)
        else:
            logger.warning(
                "Unknown subject filter %r ignored (known: %s)",
                subject,
                ", ".join(sorted(SUBJECT_FILTERS)),
            )

    full_query = " AND ".join(clauses)

    params = {
        "q": full_query,
        "fl[]": ["identifier", "title", "creator", "date",
                 "description", "downloads", "language"],
        "sort[]": "downloads desc",
        "rows": rows,
        "page": page,
        "output": "json",
    }

    logger.info("Searching IA: %s", full_query)

    resp = requests.get(
        IA_SEARCH_URL,
        params=params,
        headers=HEADERS,
        timeout=REQUEST_TIMEOUT,
    )
    resp.raise_for_status()

    docs = resp.json().get("response", {}).get("docs", [])

    # Normalise every hit to the same set of keys, filling in defaults for
    # fields the response left out.
    results = [
        {
            "identifier": doc.get("identifier", ""),
            "title": doc.get("title", "Unknown"),
            "creator": doc.get("creator", "Unknown"),
            "date": doc.get("date", ""),
            "description": _truncate(doc.get("description", ""), 200),
            "downloads": doc.get("downloads", 0),
            "language": doc.get("language", ""),
        }
        for doc in docs
    ]

    logger.info("Found %d results", len(results))
    return results
|
|
|
|
| |
| |
| |
|
|
def get_ia_formats(identifier: str) -> list[dict]:
    """List the text-related files available for an IA item.

    Queries ``{IA_METADATA_URL}/{identifier}/files`` and keeps the entries
    whose ``format`` is one of DjVuTXT, Text, Plain Text or PDF, or whose
    file name ends in ``.txt``.

    Args:
        identifier: Internet Archive item identifier.

    Returns:
        List of dicts with keys: name, format, size. ``size`` is passed
        through as reported by the API, defaulting to ``"0"`` when absent.
        The list is empty when the response has no ``result`` entries.

    Raises:
        ImportError: If ``requests`` is not installed.
        requests.HTTPError: If the metadata endpoint returns an error status.
    """
    requests = _require_requests()

    # Percent-encode the identifier so characters such as "/", "?" or "#"
    # cannot change which URL is requested.
    url = f"{IA_METADATA_URL}/{quote(identifier, safe='')}/files"
    resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()

    # ``or []`` also covers a response where "result" is present but null.
    files = resp.json().get("result") or []

    text_formats = {"DjVuTXT", "Text", "Plain Text", "PDF"}
    relevant = []
    for entry in files:
        fmt = entry.get("format", "")
        name = entry.get("name", "")
        # "*_djvu.txt" names are already covered by the ".txt" suffix test.
        if fmt not in text_formats and not name.endswith(".txt"):
            continue
        relevant.append({
            "name": name,
            "format": fmt,
            "size": entry.get("size", "0"),
        })

    return relevant
|
|
|
|
| |
| |
| |
|
|
# A download is only accepted when its whitespace-stripped body is longer
# than this many characters; shorter responses are treated as unusable.
_MIN_TEXT_CHARS = 500


def _fetch_usable_text(requests, url: str, failure_msg: str, *failure_args) -> str | None:
    """Download ``url`` and return its text, or ``None`` if it is unusable.

    A response is unusable when the request raises, the status is not 200,
    or the stripped body is not longer than ``_MIN_TEXT_CHARS``. On a
    request error, ``failure_msg`` is logged at debug level with
    ``failure_args`` followed by the exception as its arguments.
    """
    try:
        resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        logger.debug(failure_msg, *failure_args, exc)
        return None

    if resp.status_code != 200:
        return None

    # Read the property once: ``Response.text`` decodes the body on every
    # access, which is wasteful for book-sized downloads.
    body = resp.text
    if len(body.strip()) <= _MIN_TEXT_CHARS:
        return None
    return body


def get_ia_text(identifier: str) -> str:
    """Download the best available plain text for an IA item.

    Tries in order:
    1. {id}_djvu.txt (OCR-derived plain text — most common)
    2. Every other ``.txt`` file listed by ``get_ia_formats``, in the
       order returned

    Non-``.txt`` entries (e.g. PDF) are never downloaded; they only appear
    in the error message when nothing usable was found.

    Args:
        identifier: Internet Archive item identifier.

    Returns:
        The full text as a string.

    Raises:
        ValueError: If no text could be retrieved.
        ImportError: If ``requests`` is not installed.
        requests.HTTPError: If the file listing needed for step 2 cannot
            be fetched.
    """
    requests = _require_requests()

    # Percent-encode path components so unusual characters in identifiers
    # or file names ("#", "?", "%", spaces) still address the right file.
    item_path = quote(identifier, safe="")
    djvu_name = f"{identifier}_djvu.txt"

    djvu_url = f"{IA_DOWNLOAD_URL}/{item_path}/{quote(djvu_name)}"
    logger.info("Trying DjVu text: %s", djvu_url)
    text = _fetch_usable_text(requests, djvu_url, "DjVu text failed: %s")
    if text is not None:
        logger.info("Got DjVu text: %d chars", len(text))
        return text

    formats = get_ia_formats(identifier)
    for entry in formats:
        name = entry["name"]
        # The DjVu file was already attempted above.
        if not name.endswith(".txt") or name == djvu_name:
            continue
        txt_url = f"{IA_DOWNLOAD_URL}/{item_path}/{quote(name)}"
        logger.info("Trying alternate text: %s", txt_url)
        text = _fetch_usable_text(
            requests, txt_url, "Alternate text failed (%s): %s", name
        )
        if text is not None:
            logger.info("Got text from %s: %d chars", name, len(text))
            return text

    raise ValueError(
        f"No plain text available for IA item '{identifier}'. "
        f"Available formats: {[f['name'] for f in formats]}"
    )
|
|
|
|
| |
| |
| |
|
|
| def _truncate(text: str | list, max_len: int) -> str: |
| """Truncate text (or join list) to max_len characters.""" |
| if isinstance(text, list): |
| text = " ".join(text) |
| if not isinstance(text, str): |
| text = str(text) if text else "" |
| if len(text) > max_len: |
| return text[:max_len] + "..." |
| return text |
|
|