LisaMegaWatts committed on
Commit
1b33d1c
·
verified ·
1 Parent(s): 2e20c4a

Upload sources/ia_search.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. sources/ia_search.py +228 -0
sources/ia_search.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Internet Archive search and text retrieval for the text processing pipeline.
3
+
4
+ Provides search, metadata, and text download capabilities for IA's
5
+ vast library of digitized classical texts.
6
+
7
+ Usage:
8
+ from sources.ia_search import search_ia, get_ia_text, get_ia_formats
9
+
10
+ results = search_ia("aristotle philosophy", rows=10)
11
+ text = get_ia_text("aristotlemetaphysi00markup")
12
+ """
13
+
14
+ import logging
15
+ import re
16
+ from urllib.parse import quote_plus
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ # ---------------------------------------------------------------------------
21
+ # Lazy imports
22
+ # ---------------------------------------------------------------------------
23
+
24
def _require_requests():
    """Import the third-party ``requests`` package at call time and return it.

    Returns:
        The imported ``requests`` module.

    Raises:
        ImportError: If ``requests`` cannot be imported. An install hint is
            logged before the original error is re-raised.
    """
    try:
        import requests
    except ImportError:
        logger.error("'requests' is not installed. Run: pip install requests")
        raise
    else:
        return requests
31
+
32
+
33
+ # ---------------------------------------------------------------------------
34
+ # Constants
35
+ # ---------------------------------------------------------------------------
36
+
37
# Internet Archive endpoints used by this module.
IA_SEARCH_URL = "https://archive.org/advancedsearch.php"
IA_METADATA_URL = "https://archive.org/metadata"
IA_DOWNLOAD_URL = "https://archive.org/download"

# Headers sent with every request.
HEADERS = {
    "User-Agent": "PhilosophyCorpus-Pipeline/1.0",
    "Accept": "application/json,text/plain,*/*",
}

# Passed as ``timeout=`` to every ``requests.get`` call.
REQUEST_TIMEOUT = 30

# Subject filters for philosophical texts: maps a short key to a
# ``subject:(term OR term ...)`` query clause built from the terms below.
SUBJECT_FILTERS = {
    key: f"subject:({' OR '.join(terms)})"
    for key, terms in (
        ("philosophy", ("philosophy", "philosophical")),
        ("mathematics", ("mathematics", "geometry", "arithmetic")),
        ("rhetoric", ("rhetoric", "oratory")),
        ("logic", ("logic", "reasoning", "dialectic")),
        ("ethics", ("ethics", "moral")),
        ("metaphysics", ("metaphysics", "ontology")),
        ("politics", ("politics", "political")),
        ("classical", ("classical", "ancient", "greek", "roman", "latin")),
    )
}
58
+
59
+
60
+ # ---------------------------------------------------------------------------
61
+ # Search
62
+ # ---------------------------------------------------------------------------
63
+
64
def _build_ia_query(query: str, subject: str | None, filters: dict) -> str:
    """Join the user query, the texts-only restriction and any subject filter."""
    clauses = []

    user_query = (query or "").strip()
    if user_query:
        # Group the user's query so that an OR inside it cannot escape the
        # AND-ed mediatype/subject restrictions through operator precedence.
        clauses.append(f"({user_query})")
    # An empty query is simply omitted instead of producing " AND mediatype:texts".
    clauses.append("mediatype:texts")

    if subject:
        subject_clause = filters.get(subject)
        if subject_clause is None:
            # Still ignored (as before), but no longer silently.
            logger.warning(
                "Unknown subject filter %r ignored; known keys: %s",
                subject,
                ", ".join(sorted(filters)),
            )
        else:
            clauses.append(subject_clause)

    return " AND ".join(clauses)


def search_ia(
    query: str,
    subject: str | None = None,
    rows: int = 25,
    page: int = 1,
) -> list[dict]:
    """Search Internet Archive for texts.

    The query is always restricted to ``mediatype:texts`` and results are
    requested sorted by download count, descending.

    Args:
        query: Search query string. It is wrapped in parentheses before being
            combined with the other clauses; an empty/blank query is omitted.
        subject: Optional subject filter key (e.g., 'philosophy',
            'mathematics'). Keys not present in ``SUBJECT_FILTERS`` are
            ignored with a warning.
        rows: Number of results to return.
        page: Page number for pagination.

    Returns:
        List of result dicts with keys: identifier, title, creator, date,
        description, downloads, language. ``description`` is passed through
        ``_truncate`` with a limit of 200.

    Raises:
        ImportError: If ``requests`` is not installed.
        requests.HTTPError: If the search endpoint returns an error status
            (via ``raise_for_status``).
    """
    requests = _require_requests()

    full_query = _build_ia_query(query, subject, SUBJECT_FILTERS)

    params = {
        "q": full_query,
        "fl[]": ["identifier", "title", "creator", "date",
                 "description", "downloads", "language"],
        "sort[]": "downloads desc",
        "rows": rows,
        "page": page,
        "output": "json",
    }

    logger.info("Searching IA: %s", full_query)

    resp = requests.get(
        IA_SEARCH_URL,
        params=params,
        headers=HEADERS,
        timeout=REQUEST_TIMEOUT,
    )
    resp.raise_for_status()

    docs = resp.json().get("response", {}).get("docs", [])

    # Normalize each hit to a fixed set of keys with defaults for missing fields.
    results = [
        {
            "identifier": doc.get("identifier", ""),
            "title": doc.get("title", "Unknown"),
            "creator": doc.get("creator", "Unknown"),
            "date": doc.get("date", ""),
            "description": _truncate(doc.get("description", ""), 200),
            "downloads": doc.get("downloads", 0),
            "language": doc.get("language", ""),
        }
        for doc in docs
    ]

    logger.info("Found %d results", len(results))
    return results
128
+
129
+
130
+ # ---------------------------------------------------------------------------
131
+ # Metadata and format discovery
132
+ # ---------------------------------------------------------------------------
133
+
134
def get_ia_formats(identifier: str) -> list[dict]:
    """List available text-relevant files for an IA item.

    Fetches ``{IA_METADATA_URL}/{identifier}/files`` and keeps entries whose
    ``format`` is one of DjVuTXT, Text, Plain Text or PDF, or whose name ends
    with ``.txt``.

    Args:
        identifier: The IA item identifier.

    Returns:
        List of dicts with keys: name, format, size. Missing fields default
        to ``""`` (name, format) and ``"0"`` (size).

    Raises:
        ImportError: If ``requests`` is not installed.
        requests.HTTPError: If the metadata endpoint returns an error status
            (via ``raise_for_status``).
    """
    from urllib.parse import quote

    requests = _require_requests()

    # Percent-encode the identifier so it always stays a single path segment
    # (no "/", "?" or "#" leaking into the URL structure).
    item = quote(identifier, safe="")
    url = f"{IA_METADATA_URL}/{item}/files"
    resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()

    # "result" may be absent or null; treat both as "no files".
    files = resp.json().get("result") or []

    # Filter to text-relevant formats
    text_formats = {"DjVuTXT", "Text", "Plain Text", "PDF"}
    relevant = []
    for entry in files:
        name = entry.get("name", "")
        fmt = entry.get("format", "")
        # "*_djvu.txt" already ends with ".txt", so one suffix check suffices.
        if fmt in text_formats or name.endswith(".txt"):
            relevant.append({
                "name": name,
                "format": fmt,
                "size": entry.get("size", "0"),
            })

    return relevant
161
+
162
+
163
+ # ---------------------------------------------------------------------------
164
+ # Text retrieval
165
+ # ---------------------------------------------------------------------------
166
+
167
def _fetch_ia_text(requests, url: str, min_chars: int) -> str | None:
    """Fetch *url* and return its decoded body, or ``None`` if it is unusable.

    A body is unusable when the status is not 200 or the stripped text is not
    longer than *min_chars*. Network errors propagate as
    ``requests.RequestException`` so the caller can log which strategy failed.
    """
    resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    if resp.status_code != 200:
        return None

    text = None
    if "charset" not in resp.headers.get("Content-Type", "").lower():
        # Without a declared charset, requests decodes text/* responses as
        # ISO-8859-1, which garbles UTF-8 text. Try strict UTF-8 first and
        # fall back to requests' own decoding if the bytes are not UTF-8.
        try:
            text = resp.content.decode("utf-8")
        except UnicodeDecodeError:
            text = None
    if text is None:
        text = resp.text

    return text if len(text.strip()) > min_chars else None


def get_ia_text(identifier: str, *, min_chars: int = 500) -> str:
    """Download the best available plain text for an IA item.

    Tries in order:
    1. {id}_djvu.txt (OCR-derived plain text — most common)
    2. Any other .txt file listed by ``get_ia_formats`` for the item

    A download is accepted only if it returns HTTP 200 and its stripped text
    is longer than ``min_chars``; otherwise the next candidate is tried.

    Args:
        identifier: The IA item identifier.
        min_chars: Minimum stripped length a download must exceed to be
            accepted (default 500).

    Returns:
        The full text as a string.

    Raises:
        ValueError: If ``identifier`` is empty or no text could be retrieved.
        ImportError: If ``requests`` is not installed.
        requests.RequestException: If the metadata lookup performed by
            ``get_ia_formats`` fails (download failures are only logged).
    """
    from urllib.parse import quote

    if not identifier:
        raise ValueError("IA identifier must be a non-empty string")

    requests = _require_requests()

    # Percent-encode the identifier so it stays a single URL path segment.
    item = quote(identifier, safe="")

    # Strategy 1: Try the standard DjVu text file
    djvu_url = f"{IA_DOWNLOAD_URL}/{item}/{item}_djvu.txt"
    logger.info("Trying DjVu text: %s", djvu_url)

    try:
        text = _fetch_ia_text(requests, djvu_url, min_chars)
    except requests.RequestException as exc:
        logger.debug("DjVu text failed: %s", exc)
        text = None
    if text is not None:
        logger.info("Got DjVu text: %d chars", len(text))
        return text

    # Strategy 2: Check metadata for any .txt file
    formats = get_ia_formats(identifier)
    for entry in formats:
        name = entry["name"]
        if not name.endswith(".txt") or name == f"{identifier}_djvu.txt":
            continue

        # File names may contain spaces, "#" or "?"; quote them so the URL is
        # not cut short. "/" is left intact for files inside subdirectories.
        txt_url = f"{IA_DOWNLOAD_URL}/{item}/{quote(name)}"
        logger.info("Trying alternate text: %s", txt_url)
        try:
            text = _fetch_ia_text(requests, txt_url, min_chars)
        except requests.RequestException as exc:
            logger.debug("Alternate text failed (%s): %s", name, exc)
            continue
        if text is not None:
            logger.info("Got text from %s: %d chars", name, len(text))
            return text

    raise ValueError(
        f"No plain text available for IA item '{identifier}'. "
        f"Available formats: {[f['name'] for f in formats]}"
    )
214
+
215
+
216
+ # ---------------------------------------------------------------------------
217
+ # Helpers
218
+ # ---------------------------------------------------------------------------
219
+
220
+ def _truncate(text: str | list, max_len: int) -> str:
221
+ """Truncate text (or join list) to max_len characters."""
222
+ if isinstance(text, list):
223
+ text = " ".join(text)
224
+ if not isinstance(text, str):
225
+ text = str(text) if text else ""
226
+ if len(text) > max_len:
227
+ return text[:max_len] + "..."
228
+ return text