Spaces:

LisaMegaWatts
/

pre-punctuation-processor

Sleeping

App Files Files Community

LisaMegaWatts committed on Feb 20

Commit

1b33d1c

verified ·

1 Parent(s): 2e20c4a

Upload sources/ia_search.py with huggingface_hub

Browse files

Files changed (1) hide show

sources/ia_search.py +228 -0

sources/ia_search.py ADDED Viewed

	@@ -0,0 +1,228 @@

+"""
+Internet Archive search and text retrieval for the text processing pipeline.
+Provides search, metadata, and text download capabilities for IA's
+vast library of digitized classical texts.
+Usage:
+    from sources.ia_search import search_ia, get_ia_text, get_ia_formats
+    results = search_ia("aristotle philosophy", rows=10)
+    text = get_ia_text("aristotlemetaphysi00markup")
+"""
+import logging
+import re
+from urllib.parse import quote_plus
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Lazy imports
+# ---------------------------------------------------------------------------
+def _require_requests():
+    try:
+        import requests
+        return requests
+    except ImportError:
+        logger.error("'requests' is not installed. Run: pip install requests")
+        raise
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+IA_SEARCH_URL = "https://archive.org/advancedsearch.php"
+IA_METADATA_URL = "https://archive.org/metadata"
+IA_DOWNLOAD_URL = "https://archive.org/download"
+HEADERS = {
+    "User-Agent": "PhilosophyCorpus-Pipeline/1.0",
+    "Accept": "application/json,text/plain,*/*",
+}
+REQUEST_TIMEOUT = 30
+# Subject filters for philosophical texts
+SUBJECT_FILTERS = {
+    "philosophy": "subject:(philosophy OR philosophical)",
+    "mathematics": "subject:(mathematics OR geometry OR arithmetic)",
+    "rhetoric": "subject:(rhetoric OR oratory)",
+    "logic": "subject:(logic OR reasoning OR dialectic)",
+    "ethics": "subject:(ethics OR moral)",
+    "metaphysics": "subject:(metaphysics OR ontology)",
+    "politics": "subject:(politics OR political)",
+    "classical": "subject:(classical OR ancient OR greek OR roman OR latin)",
+}
+# ---------------------------------------------------------------------------
+# Search
+# ---------------------------------------------------------------------------
+def search_ia(
+    query: str,
+    subject: str | None = None,
+    rows: int = 25,
+    page: int = 1,
+) -> list[dict]:
+    """Search Internet Archive for texts.
+    Args:
+        query: Search query string.
+        subject: Optional subject filter key (e.g., 'philosophy', 'mathematics').
+        rows: Number of results to return.
+        page: Page number for pagination.
+    Returns:
+        List of result dicts with keys: identifier, title, creator, date,
+        description, downloads.
+    """
+    requests = _require_requests()
+    # Build query
+    parts = [query, "mediatype:texts"]
+    if subject and subject in SUBJECT_FILTERS:
+        parts.append(SUBJECT_FILTERS[subject])
+    full_query = " AND ".join(parts)
+    params = {
+        "q": full_query,
+        "fl[]": ["identifier", "title", "creator", "date",
+                 "description", "downloads", "language"],
+        "sort[]": "downloads desc",
+        "rows": rows,
+        "page": page,
+        "output": "json",
+    }
+    logger.info("Searching IA: %s", full_query)
+    resp = requests.get(
+        IA_SEARCH_URL,
+        params=params,
+        headers=HEADERS,
+        timeout=REQUEST_TIMEOUT,
+    )
+    resp.raise_for_status()
+    data = resp.json()
+    docs = data.get("response", {}).get("docs", [])
+    results = []
+    for doc in docs:
+        results.append({
+            "identifier": doc.get("identifier", ""),
+            "title": doc.get("title", "Unknown"),
+            "creator": doc.get("creator", "Unknown"),
+            "date": doc.get("date", ""),
+            "description": _truncate(doc.get("description", ""), 200),
+            "downloads": doc.get("downloads", 0),
+            "language": doc.get("language", ""),
+        })
+    logger.info("Found %d results", len(results))
+    return results
+# ---------------------------------------------------------------------------
+# Metadata and format discovery
+# ---------------------------------------------------------------------------
+def get_ia_formats(identifier: str) -> list[dict]:
+    """List available file formats for an IA item.
+    Returns list of dicts with keys: name, format, size.
+    """
+    requests = _require_requests()
+    url = f"{IA_METADATA_URL}/{identifier}/files"
+    resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
+    resp.raise_for_status()
+    data = resp.json()
+    files = data.get("result", [])
+    # Filter to text-relevant formats
+    text_formats = {"DjVuTXT", "Text", "Plain Text", "PDF"}
+    relevant = []
+    for f in files:
+        fmt = f.get("format", "")
+        if fmt in text_formats or f.get("name", "").endswith((".txt", "_djvu.txt")):
+            relevant.append({
+                "name": f.get("name", ""),
+                "format": fmt,
+                "size": f.get("size", "0"),
+            })
+    return relevant
+# ---------------------------------------------------------------------------
+# Text retrieval
+# ---------------------------------------------------------------------------
+def get_ia_text(identifier: str) -> str:
+    """Download the best available plain text for an IA item.
+    Tries in order:
+    1. {id}_djvu.txt (OCR-derived plain text — most common)
+    2. Any .txt file in the item
+    3. Falls back to the first available text format
+    Returns:
+        The full text as a string.
+    Raises:
+        ValueError: If no text could be retrieved.
+    """
+    requests = _require_requests()
+    # Strategy 1: Try the standard DjVu text file
+    djvu_url = f"{IA_DOWNLOAD_URL}/{identifier}/{identifier}_djvu.txt"
+    logger.info("Trying DjVu text: %s", djvu_url)
+    try:
+        resp = requests.get(djvu_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
+        if resp.status_code == 200 and len(resp.text.strip()) > 500:
+            logger.info("Got DjVu text: %d chars", len(resp.text))
+            return resp.text
+    except Exception as exc:
+        logger.debug("DjVu text failed: %s", exc)
+    # Strategy 2: Check metadata for any .txt file
+    formats = get_ia_formats(identifier)
+    for f in formats:
+        name = f["name"]
+        if name.endswith(".txt") and name != f"{identifier}_djvu.txt":
+            txt_url = f"{IA_DOWNLOAD_URL}/{identifier}/{name}"
+            logger.info("Trying alternate text: %s", txt_url)
+            try:
+                resp = requests.get(txt_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
+                if resp.status_code == 200 and len(resp.text.strip()) > 500:
+                    logger.info("Got text from %s: %d chars", name, len(resp.text))
+                    return resp.text
+            except Exception as exc:
+                logger.debug("Alternate text failed (%s): %s", name, exc)
+    raise ValueError(
+        f"No plain text available for IA item '{identifier}'. "
+        f"Available formats: {[f['name'] for f in formats]}"
+    )
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+def _truncate(text: str | list, max_len: int) -> str:
+    """Truncate text (or join list) to max_len characters."""
+    if isinstance(text, list):
+        text = " ".join(text)
+    if not isinstance(text, str):
+        text = str(text) if text else ""
+    if len(text) > max_len:
+        return text[:max_len] + "..."
+    return text