# LisaMegaWatts's picture
# Upload sources/ia_search.py with huggingface_hub
# 1b33d1c verified
"""
Internet Archive search and text retrieval for the text processing pipeline.

Provides search, metadata, and text download capabilities for IA's
vast library of digitized classical texts.

Usage:
    from sources.ia_search import search_ia, get_ia_text, get_ia_formats

    results = search_ia("aristotle philosophy", rows=10)
    text = get_ia_text("aristotlemetaphysi00markup")
"""
import logging
import re
from urllib.parse import quote_plus
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Lazy imports
# ---------------------------------------------------------------------------
def _require_requests():
    """Import the ``requests`` package on demand and return the module.

    The import is deferred to call time, so this module can be imported
    even when ``requests`` is not installed.

    Returns:
        The imported ``requests`` module.

    Raises:
        ImportError: If ``requests`` cannot be imported. An install hint is
            logged first and the original exception is re-raised unchanged.
    """
    try:
        import requests as requests_module
    except ImportError:
        logger.error("'requests' is not installed. Run: pip install requests")
        raise
    else:
        return requests_module
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Base URLs of the archive.org endpoints used by this module:
# advanced search, item metadata, and file download respectively.
IA_SEARCH_URL = "https://archive.org/advancedsearch.php"
IA_METADATA_URL = "https://archive.org/metadata"
IA_DOWNLOAD_URL = "https://archive.org/download"
# Headers sent with every HTTP request issued by this module.
HEADERS = {
    "User-Agent": "PhilosophyCorpus-Pipeline/1.0",
    "Accept": "application/json,text/plain,*/*",
}
# Passed as ``timeout=`` to every ``requests.get`` call in this module
# (requests interprets the value as seconds).
REQUEST_TIMEOUT = 30
# Subject filters for philosophical texts.
# Maps the keys accepted by ``search_ia(subject=...)`` to the query clause
# that is ANDed onto the user's query when that key is selected.
SUBJECT_FILTERS = {
    "philosophy": "subject:(philosophy OR philosophical)",
    "mathematics": "subject:(mathematics OR geometry OR arithmetic)",
    "rhetoric": "subject:(rhetoric OR oratory)",
    "logic": "subject:(logic OR reasoning OR dialectic)",
    "ethics": "subject:(ethics OR moral)",
    "metaphysics": "subject:(metaphysics OR ontology)",
    "politics": "subject:(politics OR political)",
    "classical": "subject:(classical OR ancient OR greek OR roman OR latin)",
}
# ---------------------------------------------------------------------------
# Search
# ---------------------------------------------------------------------------
def search_ia(
    query: str,
    subject: str | None = None,
    rows: int = 25,
    page: int = 1,
) -> list[dict]:
    """Search Internet Archive for texts.

    The user query is combined with a ``mediatype:texts`` restriction and,
    optionally, one of the predefined subject filters. Results are requested
    sorted by download count, descending.

    Args:
        query: Search query string. May be empty, in which case only the
            media-type (and optional subject) restriction is applied.
        subject: Optional subject filter key (e.g., 'philosophy',
            'mathematics'). Keys not present in ``SUBJECT_FILTERS`` are
            ignored with a logged warning.
        rows: Number of results to return.
        page: Page number for pagination.

    Returns:
        List of result dicts with keys: identifier, title, creator, date,
        description (shortened via ``_truncate`` with a limit of 200),
        downloads, language.

    Raises:
        ImportError: If ``requests`` is not installed.
        requests.HTTPError: If the search endpoint answers with an error
            status (raised by ``raise_for_status``).
    """
    requests = _require_requests()

    # Build query
    clauses = []
    user_query = query.strip() if query else ""
    if user_query:
        # Parenthesize the user's query: without grouping, a query such as
        # "plato OR aristotle" would bind as "plato OR (aristotle AND
        # mediatype:texts)" and escape the restrictions added below.
        clauses.append(f"({user_query})")
    clauses.append("mediatype:texts")
    if subject:
        subject_clause = SUBJECT_FILTERS.get(subject)
        if subject_clause is None:
            logger.warning(
                "Unknown subject filter %r ignored (known: %s)",
                subject,
                ", ".join(sorted(SUBJECT_FILTERS)),
            )
        else:
            clauses.append(subject_clause)
    full_query = " AND ".join(clauses)

    params = {
        "q": full_query,
        "fl[]": ["identifier", "title", "creator", "date",
                 "description", "downloads", "language"],
        "sort[]": "downloads desc",
        "rows": rows,
        "page": page,
        "output": "json",
    }
    logger.info("Searching IA: %s", full_query)
    resp = requests.get(
        IA_SEARCH_URL,
        params=params,
        headers=HEADERS,
        timeout=REQUEST_TIMEOUT,
    )
    resp.raise_for_status()
    data = resp.json()
    docs = data.get("response", {}).get("docs", [])

    # Normalize each hit so callers always see the same keys, even when the
    # archive omits a field for a given item.
    results = [
        {
            "identifier": doc.get("identifier", ""),
            "title": doc.get("title", "Unknown"),
            "creator": doc.get("creator", "Unknown"),
            "date": doc.get("date", ""),
            "description": _truncate(doc.get("description", ""), 200),
            "downloads": doc.get("downloads", 0),
            "language": doc.get("language", ""),
        }
        for doc in docs
    ]
    logger.info("Found %d results", len(results))
    return results
# ---------------------------------------------------------------------------
# Metadata and format discovery
# ---------------------------------------------------------------------------
def get_ia_formats(identifier: str) -> list[dict]:
    """Return the text-relevant files listed for an IA item.

    Fetches ``{IA_METADATA_URL}/{identifier}/files`` and keeps the entries
    whose ``format`` is one of DjVuTXT, Text, Plain Text or PDF, or whose
    ``name`` ends in ``.txt``.

    Args:
        identifier: Internet Archive item identifier.

    Returns:
        List of dicts with keys: name, format, size. Missing fields default
        to ``""`` (name, format) and ``"0"`` (size).

    Raises:
        ImportError: If ``requests`` is not installed.
        requests.HTTPError: If the metadata endpoint answers with an error
            status (raised by ``raise_for_status``).
    """
    http = _require_requests()
    response = http.get(
        f"{IA_METADATA_URL}/{identifier}/files",
        headers=HEADERS,
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    entries = response.json().get("result", [])

    # Filter to text-relevant formats
    wanted_formats = {"DjVuTXT", "Text", "Plain Text", "PDF"}
    text_suffixes = (".txt", "_djvu.txt")

    def _is_text_relevant(entry):
        if entry.get("format", "") in wanted_formats:
            return True
        return entry.get("name", "").endswith(text_suffixes)

    return [
        {
            "name": entry.get("name", ""),
            "format": entry.get("format", ""),
            "size": entry.get("size", "0"),
        }
        for entry in entries
        if _is_text_relevant(entry)
    ]
# ---------------------------------------------------------------------------
# Text retrieval
# ---------------------------------------------------------------------------
def _fetch_ia_text(requests, url: str, min_chars: int = 500) -> str | None:
    """Download *url* and return its text, or None if it is not usable.

    A response is usable when its status is 200 and the body, ignoring
    surrounding whitespace, is longer than *min_chars* characters. Network
    errors are logged at debug level and reported as None so the caller can
    move on to the next candidate.
    """
    try:
        resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        logger.debug("Text download failed (%s): %s", url, exc)
        return None
    if resp.status_code != 200:
        logger.debug("Text download failed (%s): HTTP %s", url, resp.status_code)
        return None

    # When a text/* response declares no charset, requests decodes it as
    # ISO-8859-1, which garbles UTF-8 bodies. In that case try UTF-8 first
    # and only fall back to requests' own decoding if the bytes are not
    # valid UTF-8.
    if "charset" in resp.headers.get("Content-Type", "").lower():
        text = resp.text
    else:
        try:
            text = resp.content.decode("utf-8")
        except UnicodeDecodeError:
            text = resp.text

    if len(text.strip()) <= min_chars:
        logger.debug("Text too short to be useful (%s): %d chars", url, len(text))
        return None
    return text


def get_ia_text(identifier: str) -> str:
    """Download the best available plain text for an IA item.

    Tries in order:
    1. {id}_djvu.txt (OCR-derived plain text — most common)
    2. Any other .txt file listed by ``get_ia_formats`` for the item

    A candidate is accepted only if it downloads with HTTP 200 and contains
    more than 500 non-whitespace-trimmed characters.

    Args:
        identifier: Internet Archive item identifier.

    Returns:
        The full text as a string.

    Raises:
        ValueError: If no text could be retrieved.
        ImportError: If ``requests`` is not installed.
        requests.HTTPError: If the metadata lookup performed by
            ``get_ia_formats`` answers with an error status.
    """
    from urllib.parse import quote

    requests = _require_requests()

    # Strategy 1: Try the standard DjVu text file
    djvu_name = f"{identifier}_djvu.txt"
    djvu_url = f"{IA_DOWNLOAD_URL}/{identifier}/{djvu_name}"
    logger.info("Trying DjVu text: %s", djvu_url)
    text = _fetch_ia_text(requests, djvu_url)
    if text is not None:
        logger.info("Got DjVu text: %d chars", len(text))
        return text

    # Strategy 2: Check metadata for any .txt file
    formats = get_ia_formats(identifier)
    for entry in formats:
        name = entry["name"]
        # The DjVu file was already attempted above.
        if not name.endswith(".txt") or name == djvu_name:
            continue
        # File names may contain spaces, '#' or '?'; percent-encode them so
        # they stay part of the URL path ('/' is kept for nested files).
        txt_url = f"{IA_DOWNLOAD_URL}/{identifier}/{quote(name)}"
        logger.info("Trying alternate text: %s", txt_url)
        text = _fetch_ia_text(requests, txt_url)
        if text is not None:
            logger.info("Got text from %s: %d chars", name, len(text))
            return text

    raise ValueError(
        f"No plain text available for IA item '{identifier}'. "
        f"Available formats: {[f['name'] for f in formats]}"
    )
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _truncate(text: str | list, max_len: int) -> str:
    """Normalize *text* to a string and shorten it to *max_len* characters.

    Args:
        text: A string, or a list whose items are joined with single spaces.
            List items are converted with ``str`` and ``None`` items are
            skipped, so mixed-type lists do not raise. Any other falsy value
            (e.g. ``None``) becomes ``""``; other objects are converted with
            ``str``.
        max_len: Maximum number of characters kept from the text.

    Returns:
        The text unchanged if it is at most *max_len* characters long,
        otherwise its first *max_len* characters followed by ``"..."``
        (so a shortened result is ``max_len + 3`` characters long).
    """
    if isinstance(text, list):
        # str() each item: a plain " ".join() raises TypeError as soon as
        # the list holds anything that is not a string.
        text = " ".join(str(item) for item in text if item is not None)
    elif not isinstance(text, str):
        text = str(text) if text else ""
    if len(text) > max_len:
        return text[:max_len] + "..."
    return text