Spaces:

LisaMegaWatts
/

pre-punctuation-processor

Sleeping

App Files Files Community

pre-punctuation-processor / sources /ia_search.py

LisaMegaWatts

Upload sources/ia_search.py with huggingface_hub

1b33d1c verified about 2 months ago

raw

history blame contribute delete

7.38 kB

	"""
	Internet Archive search and text retrieval for the text processing pipeline.

	Provides search, metadata, and text download capabilities for IA's
	vast library of digitized classical texts.

	Usage:
	    from sources.ia_search import search_ia, get_ia_text, get_ia_formats

	    results = search_ia("aristotle philosophy", rows=10)
	    text = get_ia_text("aristotlemetaphysi00markup")
	"""

	import logging
	import re
	from urllib.parse import quote_plus

	logger = logging.getLogger(__name__)

	# ---------------------------------------------------------------------------
	# Lazy imports
	# ---------------------------------------------------------------------------

	def _require_requests():
	try:
	import requests
	return requests
	except ImportError:
	logger.error("'requests' is not installed. Run: pip install requests")
	raise


	# ---------------------------------------------------------------------------
	# Constants
	# ---------------------------------------------------------------------------

	# Internet Archive endpoints used by the search, metadata and download helpers.
	IA_SEARCH_URL = "https://archive.org/advancedsearch.php"
	IA_METADATA_URL = "https://archive.org/metadata"
	IA_DOWNLOAD_URL = "https://archive.org/download"

	# Headers sent with every request made by this module.
	HEADERS = {
	    "User-Agent": "PhilosophyCorpus-Pipeline/1.0",
	    # "*/*" is the catch-all media range; a bare "/" is not a valid one.
	    "Accept": "application/json,text/plain,*/*",
	}
	# Timeout (in seconds) passed to every requests.get() call.
	REQUEST_TIMEOUT = 30

	# Subject filters for philosophical texts.
	# Maps the short key accepted by search_ia(subject=...) to the query clause
	# that is AND-ed onto the user's search.
	SUBJECT_FILTERS = {
	    "philosophy": "subject:(philosophy OR philosophical)",
	    "mathematics": "subject:(mathematics OR geometry OR arithmetic)",
	    "rhetoric": "subject:(rhetoric OR oratory)",
	    "logic": "subject:(logic OR reasoning OR dialectic)",
	    "ethics": "subject:(ethics OR moral)",
	    "metaphysics": "subject:(metaphysics OR ontology)",
	    "politics": "subject:(politics OR political)",
	    "classical": "subject:(classical OR ancient OR greek OR roman OR latin)",
	}


	# ---------------------------------------------------------------------------
	# Search
	# ---------------------------------------------------------------------------

	def search_ia(
	    query: str,
	    subject: str | None = None,
	    rows: int = 25,
	    page: int = 1,
	) -> list[dict]:
	    """Search Internet Archive for texts.

	    The request is always restricted to ``mediatype:texts`` and sorted by
	    download count, descending.

	    Args:
	        query: Search query string. It is wrapped in parentheses before
	            being combined with the other clauses; a blank query is
	            dropped so only the restrictions are sent.
	        subject: Optional subject filter key (e.g., 'philosophy',
	            'mathematics'). Keys not present in SUBJECT_FILTERS are ignored
	            with a warning.
	        rows: Number of results to return.
	        page: Page number for pagination.

	    Returns:
	        List of result dicts with keys: identifier, title, creator, date,
	        description, downloads, language. ``description`` is shortened to
	        200 characters (plus "...") via ``_truncate``.

	    Raises:
	        ImportError: If ``requests`` is not installed.
	        requests.HTTPError: If the search endpoint answers with an error
	            status (raised by ``raise_for_status``).
	    """
	    requests = _require_requests()

	    # Build the query as a list of clauses joined with AND.
	    clauses = []
	    user_query = (query or "").strip()
	    if user_query:
	        # Group the user's text so an "OR" inside it cannot escape the
	        # mediatype/subject restrictions through operator precedence.
	        clauses.append(f"({user_query})")
	    clauses.append("mediatype:texts")
	    if subject:
	        if subject in SUBJECT_FILTERS:
	            clauses.append(SUBJECT_FILTERS[subject])
	        else:
	            logger.warning("Unknown subject filter %r ignored", subject)

	    full_query = " AND ".join(clauses)

	    params = {
	        "q": full_query,
	        "fl[]": [
	            "identifier", "title", "creator", "date",
	            "description", "downloads", "language",
	        ],
	        "sort[]": "downloads desc",
	        "rows": rows,
	        "page": page,
	        "output": "json",
	    }

	    logger.info("Searching IA: %s", full_query)

	    resp = requests.get(
	        IA_SEARCH_URL,
	        params=params,
	        headers=HEADERS,
	        timeout=REQUEST_TIMEOUT,
	    )
	    resp.raise_for_status()

	    docs = resp.json().get("response", {}).get("docs", [])

	    # Normalise every hit to the same set of keys, filling in defaults for
	    # fields the response leaves out.
	    results = [
	        {
	            "identifier": doc.get("identifier", ""),
	            "title": doc.get("title", "Unknown"),
	            "creator": doc.get("creator", "Unknown"),
	            "date": doc.get("date", ""),
	            "description": _truncate(doc.get("description", ""), 200),
	            "downloads": doc.get("downloads", 0),
	            "language": doc.get("language", ""),
	        }
	        for doc in docs
	    ]

	    logger.info("Found %d results", len(results))
	    return results


	# ---------------------------------------------------------------------------
	# Metadata and format discovery
	# ---------------------------------------------------------------------------

	def get_ia_formats(identifier: str) -> list[dict]:
	    """List available text-relevant files for an IA item.

	    Fetches ``{IA_METADATA_URL}/{identifier}/files`` and keeps every file
	    whose ``format`` is one of DjVuTXT, Text, Plain Text or PDF, or whose
	    name ends in ``.txt``.

	    Args:
	        identifier: Internet Archive item identifier.

	    Returns:
	        List of dicts with keys: name, format, size. Missing ``name`` and
	        ``format`` default to "", missing ``size`` to the string "0".

	    Raises:
	        ImportError: If ``requests`` is not installed.
	        requests.HTTPError: If the metadata endpoint answers with an error
	            status (raised by ``raise_for_status``).
	    """
	    requests = _require_requests()

	    url = f"{IA_METADATA_URL}/{identifier}/files"
	    resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
	    resp.raise_for_status()

	    # "or []" also covers a payload whose "result" is present but null.
	    files = resp.json().get("result") or []

	    # Filter to text-relevant formats. A name ending in "_djvu.txt" also ends
	    # in ".txt", so a single suffix check covers both.
	    text_formats = {"DjVuTXT", "Text", "Plain Text", "PDF"}
	    return [
	        {
	            "name": entry.get("name", ""),
	            "format": entry.get("format", ""),
	            "size": entry.get("size", "0"),
	        }
	        for entry in files
	        if entry.get("format", "") in text_formats
	        or entry.get("name", "").endswith(".txt")
	    ]


	# ---------------------------------------------------------------------------
	# Text retrieval
	# ---------------------------------------------------------------------------

	# A download is accepted only when its whitespace-stripped body is longer
	# than this many characters (threshold carried over from the inline checks).
	_MIN_TEXT_CHARS = 500


	def _fetch_usable_text(requests, url: str) -> str | None:
	    """Download ``url`` and return its body, or None if it is not usable.

	    A response is usable when its status code is 200 and its body, with
	    surrounding whitespace stripped, is longer than ``_MIN_TEXT_CHARS``.
	    Exceptions raised by ``requests.get`` propagate to the caller.
	    """
	    resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
	    if resp.status_code != 200:
	        return None
	    body = resp.text  # read the body once and reuse it
	    return body if len(body.strip()) > _MIN_TEXT_CHARS else None


	def get_ia_text(identifier: str) -> str:
	    """Download the best available plain text for an IA item.

	    Tries in order:
	        1. {id}_djvu.txt (OCR-derived plain text — most common)
	        2. Any other .txt file listed by ``get_ia_formats``

	    A candidate is accepted only if it downloads with status 200 and
	    contains more than 500 characters after stripping whitespace. A
	    request failure on one candidate is logged and the next one is tried.

	    Args:
	        identifier: Internet Archive item identifier.

	    Returns:
	        The full text as a string.

	    Raises:
	        ValueError: If no text could be retrieved.
	        ImportError: If ``requests`` is not installed.
	        Exception: Errors raised by ``get_ia_formats`` (for example an
	            HTTP error on the metadata lookup) propagate unchanged.
	    """
	    requests = _require_requests()

	    # Strategy 1: Try the standard DjVu text file
	    djvu_name = f"{identifier}_djvu.txt"
	    djvu_url = f"{IA_DOWNLOAD_URL}/{identifier}/{djvu_name}"
	    logger.info("Trying DjVu text: %s", djvu_url)
	    try:
	        text = _fetch_usable_text(requests, djvu_url)
	    except requests.RequestException as exc:
	        logger.debug("DjVu text failed: %s", exc)
	    else:
	        if text is not None:
	            logger.info("Got DjVu text: %d chars", len(text))
	            return text

	    # Strategy 2: Check metadata for any other .txt file
	    formats = get_ia_formats(identifier)
	    for entry in formats:
	        name = entry["name"]
	        # Skip non-text files and the DjVu file already tried above.
	        if not name.endswith(".txt") or name == djvu_name:
	            continue
	        txt_url = f"{IA_DOWNLOAD_URL}/{identifier}/{name}"
	        logger.info("Trying alternate text: %s", txt_url)
	        try:
	            text = _fetch_usable_text(requests, txt_url)
	        except requests.RequestException as exc:
	            logger.debug("Alternate text failed (%s): %s", name, exc)
	            continue
	        if text is not None:
	            logger.info("Got text from %s: %d chars", name, len(text))
	            return text

	    raise ValueError(
	        f"No plain text available for IA item '{identifier}'. "
	        f"Available formats: {[f['name'] for f in formats]}"
	    )


	# ---------------------------------------------------------------------------
	# Helpers
	# ---------------------------------------------------------------------------

	def _truncate(text: str \| list, max_len: int) -> str:
	"""Truncate text (or join list) to max_len characters."""
	if isinstance(text, list):
	text = " ".join(text)
	if not isinstance(text, str):
	text = str(text) if text else ""
	if len(text) > max_len:
	return text[:max_len] + "..."
	return text