Spaces:

LisaMegaWatts
/

pre-punctuation-processor

Sleeping

App Files Files Community

LisaMegaWatts committed on Feb 23

Commit

a9bec47

verified ·

1 Parent(s): 2f85e47

Enable Gradio queue for streaming UI updates

Browse files

Files changed (1) hide show

app.py +187 -4

app.py CHANGED Viewed

@@ -1,8 +1,9 @@
 """
 Gradio frontend for the text processing pipeline.
-Provides drag-and-drop file upload, URL fetching, Internet Archive
-search/browse, and corpus management with HuggingFace push.
 Usage:
     python app.py                  # Launch on http://localhost:7860
@@ -168,7 +169,126 @@ def add_ia_text(identifier: str) -> str:
 # ---------------------------------------------------------------------------
-# Tab 3: Corpus Management
 # ---------------------------------------------------------------------------
 def get_corpus_stats() -> str:
@@ -282,6 +402,68 @@ def build_ui():
             fetch_output = gr.Textbox(label="Result", lines=4)
             fetch_btn.click(fetch_url, inputs=[url_input], outputs=[fetch_output])
         with gr.Tab("Search Internet Archive"):
             gr.Markdown("### Search the Internet Archive for classical texts")
             with gr.Row():
@@ -353,7 +535,8 @@ def main():
     args = parser.parse_args()
     app = build_ui()
-    app.launch(share=args.share, server_port=args.port)
 if __name__ == "__main__":

 """
 Gradio frontend for the text processing pipeline.
+Provides drag-and-drop file upload, URL fetching, search across
+Project Gutenberg / MIT Classics / Internet Archive, and corpus
+management with HuggingFace push.
 Usage:
     python app.py                  # Launch on http://localhost:7860
 # ---------------------------------------------------------------------------
+# Tab 3: Search Project Gutenberg
+# ---------------------------------------------------------------------------
+def search_gutenberg_ui(query: str, topic: str) -> list[list]:
+    """Run a Gutenberg search (via Gutendex) and shape the hits as table rows.
+
+    A blank query short-circuits to an empty list before the search module
+    is imported. Otherwise each result becomes one row of:
+    ID, title, author, subjects (sliced to the first 60 elements),
+    download count. The topic "All" disables the topic filter; any other
+    topic is passed on lower-cased.
+    """
+    if not query.strip():
+        return []
+    from sources.gutenberg_search import search_gutenberg
+    topic_key = None if topic == "All" else topic.lower()
+    return [
+        [
+            str(hit["id"]),
+            hit["title"],
+            hit["author"],
+            hit["subjects"][:60],
+            str(hit["download_count"]),
+        ]
+        for hit in search_gutenberg(query, topic=topic_key, rows=20)
+    ]
+def add_gutenberg_text(book_id: str) -> str:
+    """Download a Gutenberg text and process it through the pipeline.
+
+    The text is written to the pipeline inbox as ``gutenberg_<id>.txt``,
+    then the inbox is processed and the output rebuilt.
+
+    Args:
+        book_id: Gutenberg book ID as typed by the user (must parse as int).
+
+    Returns:
+        A status string: a download/processing summary on success, a prompt
+        when the input is blank, or a message starting with "Error:".
+    """
+    if not book_id.strip():
+        return "Please enter a Gutenberg book ID."
+    # Validate the ID on its own. Keeping int() inside the big try block
+    # would report any ValueError raised by the download or the pipeline
+    # as an "invalid book ID", hiding the real failure.
+    try:
+        bid = int(book_id.strip())
+    except ValueError:
+        return f"Error: Invalid book ID '{book_id}' — enter a number (e.g. 1497)"
+    from sources.gutenberg_search import get_gutenberg_text
+    pipeline = get_pipeline()
+    try:
+        text = get_gutenberg_text(bid)
+        fname = f"gutenberg_{bid}.txt"
+        dest = pipeline.inbox / fname
+        dest.write_text(text, encoding="utf-8")
+        new_chunks = pipeline.process_inbox()
+        train_n, val_n = pipeline.rebuild_output()
+        return (
+            f"Downloaded: Gutenberg #{bid} ({len(text):,} chars)\n"
+            f"Processed: {new_chunks} new chunks\n"
+            f"Total corpus: {train_n} train / {val_n} val"
+        )
+    except Exception as e:
+        # UI boundary: surface the failure in the result box instead of raising.
+        return f"Error: {e}"
+# ---------------------------------------------------------------------------
+# Tab 4: Browse MIT Classics
+# ---------------------------------------------------------------------------
+def search_mit_ui(query: str, author: str) -> list[list]:
+    """Look up works in the MIT Classics catalog and shape them as table rows.
+
+    The query is stripped before searching; the author "All" is passed on
+    as an empty string. Each result becomes one row of:
+    author, title, work path.
+    """
+    from sources.mit_classics_search import search_mit_classics
+    author_key = "" if author == "All" else author
+    matches = search_mit_classics(query=query.strip(), author=author_key)
+    return [[m["author"], m["title"], m["work_path"]] for m in matches]
+def get_mit_authors_list() -> list[str]:
+    """Build the author dropdown choices: "All" followed by the catalog authors.
+
+    The catalog module is imported lazily; if the import or the lookup
+    fails for any reason, only ["All"] is returned.
+    """
+    try:
+        from sources.mit_classics_search import get_authors
+        names = get_authors()
+        choices = ["All"] + names
+    except Exception:
+        choices = ["All"]
+    return choices
+def add_mit_text(work_path: str) -> str:
+    """Download an MIT Classics text and process it through the pipeline.
+
+    The text is written to the pipeline inbox under a name derived from the
+    work path, then the inbox is processed and the output rebuilt.
+
+    Args:
+        work_path: Catalog path of the work, e.g. ``/Plato/republic.html``.
+            Surrounding whitespace is ignored.
+
+    Returns:
+        A status string: a download/processing summary on success, a prompt
+        when the input is blank, or a message starting with "Error:".
+    """
+    # Normalize once and use the same value for the download, the filename
+    # and the message. Previously the filename was built from the unstripped
+    # input, so pasted whitespace defeated strip("/") and leaked into the name.
+    path = work_path.strip()
+    if not path:
+        return "Please enter a work path (e.g. /Plato/republic.html)."
+    from sources.mit_classics_search import get_mit_text
+    pipeline = get_pipeline()
+    try:
+        text = get_mit_text(path)
+        # Build filename from path: /Aristotle/rhetoric.html -> mit_aristotle_rhetoric.txt
+        parts = path.strip("/").replace(".html", "").split("/")
+        fname = "mit_" + "_".join(parts).lower() + ".txt"
+        dest = pipeline.inbox / fname
+        dest.write_text(text, encoding="utf-8")
+        new_chunks = pipeline.process_inbox()
+        train_n, val_n = pipeline.rebuild_output()
+        return (
+            f"Downloaded: {path} ({len(text):,} chars)\n"
+            f"Processed: {new_chunks} new chunks\n"
+            f"Total corpus: {train_n} train / {val_n} val"
+        )
+    except Exception as e:
+        # UI boundary: surface the failure in the result box instead of raising.
+        return f"Error: {e}"
+# ---------------------------------------------------------------------------
+# Tab 5: Corpus Management
 # ---------------------------------------------------------------------------
 def get_corpus_stats() -> str:
             fetch_output = gr.Textbox(label="Result", lines=4)
             fetch_btn.click(fetch_url, inputs=[url_input], outputs=[fetch_output])
+        with gr.Tab("Search Gutenberg"):
+            gr.Markdown("### Search Project Gutenberg for public domain texts")
+            with gr.Row():
+                gut_query = gr.Textbox(label="Search Query", placeholder="aristotle philosophy")
+                gut_topic = gr.Dropdown(
+                    choices=["All", "Philosophy", "Ethics", "Politics",
+                             "Metaphysics", "Science", "Mathematics",
+                             "Classical", "Religion", "History"],
+                    value="Philosophy",
+                    label="Topic Filter",
+                )
+            gut_search_btn = gr.Button("Search", variant="primary")
+            gut_results = gr.Dataframe(
+                headers=["ID", "Title", "Author", "Subjects", "Downloads"],
+                label="Search Results",
+                interactive=False,
+            )
+            gut_search_btn.click(
+                search_gutenberg_ui,
+                inputs=[gut_query, gut_topic],
+                outputs=[gut_results],
+            )
+            gr.Markdown("### Add a text to the corpus")
+            gut_id_input = gr.Textbox(
+                label="Gutenberg Book ID",
+                placeholder="Paste a book ID from the search results above (e.g. 1497)",
+            )
+            gut_add_btn = gr.Button("Download and Process")
+            gut_add_output = gr.Textbox(label="Result", lines=4)
+            gut_add_btn.click(add_gutenberg_text, inputs=[gut_id_input], outputs=[gut_add_output])
+        with gr.Tab("Browse MIT Classics"):
+            gr.Markdown("### Search the MIT Internet Classics Archive (441 works by 59 authors)")
+            with gr.Row():
+                mit_query = gr.Textbox(label="Search Query", placeholder="republic")
+                mit_author = gr.Dropdown(
+                    choices=get_mit_authors_list(),
+                    value="All",
+                    label="Author Filter",
+                )
+            mit_search_btn = gr.Button("Search", variant="primary")
+            mit_results = gr.Dataframe(
+                headers=["Author", "Title", "Work Path"],
+                label="Search Results",
+                interactive=False,
+            )
+            mit_search_btn.click(
+                search_mit_ui,
+                inputs=[mit_query, mit_author],
+                outputs=[mit_results],
+            )
+            gr.Markdown("### Add a text to the corpus")
+            mit_path_input = gr.Textbox(
+                label="Work Path",
+                placeholder="Paste a work path from the results above (e.g. /Plato/republic.html)",
+            )
+            mit_add_btn = gr.Button("Download and Process")
+            mit_add_output = gr.Textbox(label="Result", lines=4)
+            mit_add_btn.click(add_mit_text, inputs=[mit_path_input], outputs=[mit_add_output])
         with gr.Tab("Search Internet Archive"):
             gr.Markdown("### Search the Internet Archive for classical texts")
             with gr.Row():
     args = parser.parse_args()
     app = build_ui()
+    app.queue()
+    app.launch(share=args.share, server_name="0.0.0.0", server_port=args.port)
 if __name__ == "__main__":