LisaMegaWatts committed on
Commit
a9bec47
·
verified ·
1 Parent(s): 2f85e47

Enable Gradio queue for streaming UI updates

Browse files
Files changed (1) hide show
  1. app.py +187 -4
app.py CHANGED
@@ -1,8 +1,9 @@
1
  """
2
  Gradio frontend for the text processing pipeline.
3
 
4
- Provides drag-and-drop file upload, URL fetching, Internet Archive
5
- search/browse, and corpus management with HuggingFace push.
 
6
 
7
  Usage:
8
  python app.py # Launch on http://localhost:7860
@@ -168,7 +169,126 @@ def add_ia_text(identifier: str) -> str:
168
 
169
 
170
  # ---------------------------------------------------------------------------
171
- # Tab 3: Corpus Management
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  # ---------------------------------------------------------------------------
173
 
174
  def get_corpus_stats() -> str:
@@ -282,6 +402,68 @@ def build_ui():
282
  fetch_output = gr.Textbox(label="Result", lines=4)
283
  fetch_btn.click(fetch_url, inputs=[url_input], outputs=[fetch_output])
284
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  with gr.Tab("Search Internet Archive"):
286
  gr.Markdown("### Search the Internet Archive for classical texts")
287
  with gr.Row():
@@ -353,7 +535,8 @@ def main():
353
  args = parser.parse_args()
354
 
355
  app = build_ui()
356
- app.launch(share=args.share, server_port=args.port)
 
357
 
358
 
359
  if __name__ == "__main__":
 
1
  """
2
  Gradio frontend for the text processing pipeline.
3
 
4
+ Provides drag-and-drop file upload, URL fetching, search across
5
+ Project Gutenberg / MIT Classics / Internet Archive, and corpus
6
+ management with HuggingFace push.
7
 
8
  Usage:
9
  python app.py # Launch on http://localhost:7860
 
169
 
170
 
171
  # ---------------------------------------------------------------------------
172
+ # Tab 3: Search Project Gutenberg
173
+ # ---------------------------------------------------------------------------
174
+
175
def search_gutenberg_ui(query: str, topic: str) -> list[list]:
    """Search Gutenberg via Gutendex and return results as table rows.

    Args:
        query: Free-text search string from the UI. A blank, whitespace-only
            or ``None`` value short-circuits to an empty result.
        topic: Topic filter label. ``"All"`` (or an empty/``None`` selection)
            disables the filter; any other value is lower-cased before being
            handed to the search function.

    Returns:
        One row per search result in the column order
        ``[id, title, author, subjects, download_count]``. ``id`` and
        ``download_count`` are converted to ``str``; ``subjects`` is sliced
        to its first 60 elements (characters, assuming it is a string —
        TODO confirm against ``search_gutenberg``).
    """
    # UI components can deliver None (e.g. a cleared field); treat it as blank
    # instead of crashing on ``None.strip()``.
    if not query or not query.strip():
        return []

    # Local import: only needed once a search is actually performed.
    from sources.gutenberg_search import search_gutenberg

    # A missing topic selection means "no filter", same as "All".
    topic_key = topic.lower() if topic and topic != "All" else None
    results = search_gutenberg(query, topic=topic_key, rows=20)

    return [
        [
            str(r["id"]),
            r["title"],
            r["author"],
            r["subjects"][:60],
            str(r["download_count"]),
        ]
        for r in results
    ]
196
+
197
+
198
def add_gutenberg_text(book_id: str) -> str:
    """Download a Gutenberg text and process it through the pipeline.

    The book ID is validated before any import, download or pipeline access,
    so a malformed ID is rejected without side effects. The text is written
    to ``gutenberg_<id>.txt`` in the pipeline inbox, then the inbox is
    processed and the output rebuilt.

    Args:
        book_id: Gutenberg book ID as typed by the user (e.g. ``"1497"``).

    Returns:
        A human-readable status message: a summary of the download and
        resulting corpus sizes on success, or a message starting with
        ``"Error:"`` / a prompt to enter an ID on failure.
    """
    if not book_id or not book_id.strip():
        return "Please enter a Gutenberg book ID."

    # Parse in its own narrow try block: a ValueError raised later by the
    # download or the pipeline must not be reported as an invalid ID.
    try:
        bid = int(book_id.strip())
    except ValueError:
        return f"Error: Invalid book ID '{book_id}' — enter a number (e.g. 1497)"

    # Local import: only needed once there is a valid ID to fetch.
    from sources.gutenberg_search import get_gutenberg_text

    pipeline = get_pipeline()

    try:
        text = get_gutenberg_text(bid)

        fname = f"gutenberg_{bid}.txt"
        dest = pipeline.inbox / fname
        dest.write_text(text, encoding="utf-8")

        new_chunks = pipeline.process_inbox()
        train_n, val_n = pipeline.rebuild_output()

        return (
            f"Downloaded: Gutenberg #{bid} ({len(text):,} chars)\n"
            f"Processed: {new_chunks} new chunks\n"
            f"Total corpus: {train_n} train / {val_n} val"
        )
    except Exception as e:
        # UI boundary: surface any failure as a message instead of raising.
        return f"Error: {e}"
227
+
228
+
229
+ # ---------------------------------------------------------------------------
230
+ # Tab 4: Browse MIT Classics
231
+ # ---------------------------------------------------------------------------
232
+
233
def search_mit_ui(query: str, author: str) -> list[list]:
    """Search MIT Classics catalog and return results as table rows.

    Args:
        query: Free-text search string; surrounding whitespace is removed
            and ``None`` is treated as an empty string.
        author: Author filter label. ``"All"`` (or an empty/``None``
            selection) is passed to the search as ``""``.

    Returns:
        One row per result in the column order
        ``[author, title, work_path]``.
    """
    # Local import: only needed when a search is actually performed.
    from sources.mit_classics_search import search_mit_classics

    # A cleared dropdown must behave like "All", not forward None as a filter.
    author_key = "" if not author or author == "All" else author
    results = search_mit_classics(query=(query or "").strip(), author=author_key)

    return [[r["author"], r["title"], r["work_path"]] for r in results]
249
+
250
+
251
def get_mit_authors_list() -> list[str]:
    """Get author list for the dropdown (lazy-loaded).

    Returns:
        ``"All"`` followed by the authors reported by ``get_authors()``.
        If the catalog module cannot be imported or the lookup fails, the
        failure is logged and only ``["All"]`` is returned so the caller
        still gets a usable list.
    """
    import logging

    try:
        from sources.mit_classics_search import get_authors

        return ["All", *get_authors()]
    except Exception:
        # Best-effort by design: keep the fallback, but leave a trace so an
        # empty author dropdown can be diagnosed.
        logging.getLogger(__name__).warning(
            "Could not load MIT Classics author list; falling back to 'All' only",
            exc_info=True,
        )
        return ["All"]
258
+
259
+
260
def add_mit_text(work_path: str) -> str:
    """Download an MIT Classics text and process it through the pipeline.

    The work path is stripped of surrounding whitespace once, and that same
    cleaned value is used for the download, the inbox filename and the
    status message. The text is written to the pipeline inbox, then the
    inbox is processed and the output rebuilt.

    Args:
        work_path: Catalog path of the work, e.g. ``/Plato/republic.html``.

    Returns:
        A human-readable status message: a summary of the download and
        resulting corpus sizes on success, a prompt when no path was given,
        or ``"Error: ..."`` if the download or processing raised.
    """
    path = work_path.strip() if work_path else ""
    if not path:
        return "Please enter a work path (e.g. /Plato/republic.html)."

    # Local import: only needed once there is a path to fetch.
    from sources.mit_classics_search import get_mit_text

    pipeline = get_pipeline()

    try:
        text = get_mit_text(path)

        # Build filename from path: /Aristotle/rhetoric.html -> mit_aristotle_rhetoric.txt
        # Uses the whitespace-stripped path so padded input cannot leak
        # spaces into the filename.
        parts = path.strip("/").replace(".html", "").split("/")
        fname = "mit_" + "_".join(parts).lower() + ".txt"
        dest = pipeline.inbox / fname
        dest.write_text(text, encoding="utf-8")

        new_chunks = pipeline.process_inbox()
        train_n, val_n = pipeline.rebuild_output()

        return (
            f"Downloaded: {path} ({len(text):,} chars)\n"
            f"Processed: {new_chunks} new chunks\n"
            f"Total corpus: {train_n} train / {val_n} val"
        )
    except Exception as e:
        # UI boundary: surface any failure as a message instead of raising.
        return f"Error: {e}"
288
+
289
+
290
+ # ---------------------------------------------------------------------------
291
+ # Tab 5: Corpus Management
292
  # ---------------------------------------------------------------------------
293
 
294
  def get_corpus_stats() -> str:
 
402
  fetch_output = gr.Textbox(label="Result", lines=4)
403
  fetch_btn.click(fetch_url, inputs=[url_input], outputs=[fetch_output])
404
 
405
+ with gr.Tab("Search Gutenberg"):
406
+ gr.Markdown("### Search Project Gutenberg for public domain texts")
407
+ with gr.Row():
408
+ gut_query = gr.Textbox(label="Search Query", placeholder="aristotle philosophy")
409
+ gut_topic = gr.Dropdown(
410
+ choices=["All", "Philosophy", "Ethics", "Politics",
411
+ "Metaphysics", "Science", "Mathematics",
412
+ "Classical", "Religion", "History"],
413
+ value="Philosophy",
414
+ label="Topic Filter",
415
+ )
416
+ gut_search_btn = gr.Button("Search", variant="primary")
417
+ gut_results = gr.Dataframe(
418
+ headers=["ID", "Title", "Author", "Subjects", "Downloads"],
419
+ label="Search Results",
420
+ interactive=False,
421
+ )
422
+ gut_search_btn.click(
423
+ search_gutenberg_ui,
424
+ inputs=[gut_query, gut_topic],
425
+ outputs=[gut_results],
426
+ )
427
+
428
+ gr.Markdown("### Add a text to the corpus")
429
+ gut_id_input = gr.Textbox(
430
+ label="Gutenberg Book ID",
431
+ placeholder="Paste a book ID from the search results above (e.g. 1497)",
432
+ )
433
+ gut_add_btn = gr.Button("Download and Process")
434
+ gut_add_output = gr.Textbox(label="Result", lines=4)
435
+ gut_add_btn.click(add_gutenberg_text, inputs=[gut_id_input], outputs=[gut_add_output])
436
+
437
+ with gr.Tab("Browse MIT Classics"):
438
+ gr.Markdown("### Search the MIT Internet Classics Archive (441 works by 59 authors)")
439
+ with gr.Row():
440
+ mit_query = gr.Textbox(label="Search Query", placeholder="republic")
441
+ mit_author = gr.Dropdown(
442
+ choices=get_mit_authors_list(),
443
+ value="All",
444
+ label="Author Filter",
445
+ )
446
+ mit_search_btn = gr.Button("Search", variant="primary")
447
+ mit_results = gr.Dataframe(
448
+ headers=["Author", "Title", "Work Path"],
449
+ label="Search Results",
450
+ interactive=False,
451
+ )
452
+ mit_search_btn.click(
453
+ search_mit_ui,
454
+ inputs=[mit_query, mit_author],
455
+ outputs=[mit_results],
456
+ )
457
+
458
+ gr.Markdown("### Add a text to the corpus")
459
+ mit_path_input = gr.Textbox(
460
+ label="Work Path",
461
+ placeholder="Paste a work path from the results above (e.g. /Plato/republic.html)",
462
+ )
463
+ mit_add_btn = gr.Button("Download and Process")
464
+ mit_add_output = gr.Textbox(label="Result", lines=4)
465
+ mit_add_btn.click(add_mit_text, inputs=[mit_path_input], outputs=[mit_add_output])
466
+
467
  with gr.Tab("Search Internet Archive"):
468
  gr.Markdown("### Search the Internet Archive for classical texts")
469
  with gr.Row():
 
535
  args = parser.parse_args()
536
 
537
  app = build_ui()
538
+ app.queue()
539
+ app.launch(share=args.share, server_name="0.0.0.0", server_port=args.port)
540
 
541
 
542
  if __name__ == "__main__":