""" Gradio UI for the Polymer Datasheet Crawler Agent. Deployable as a HuggingFace Space. """ from __future__ import annotations import json import logging import os import tempfile from pathlib import Path import gradio as gr import pandas as pd from graph import ( build_graph, db, run_search, run_upload, search_database, get_database_summary, ) from pdf_extractor import extract_text_from_pdf from models import DatasheetRecord logging.basicConfig( level=logging.INFO, format="%(asctime)s | %(name)s | %(levelname)s | %(message)s", ) logger = logging.getLogger(__name__) # ══════════════════════════════════════════════════════════════════════════════ # Handler Functions # ══════════════════════════════════════════════════════════════════════════════ def handle_search( manufacturer: str, polymer_family: str, grade: str, progress=gr.Progress(), ) -> tuple[str, pd.DataFrame, str]: """ Handle the 'Search & Add' tab: run the full LangGraph workflow to search, parse, and store a datasheet. """ if not manufacturer.strip() and not polymer_family.strip(): return ( "⚠️ Please provide at least a manufacturer or polymer family.", pd.DataFrame(), "", ) progress(0.1, desc="Initializing search...") try: progress(0.3, desc="Searching the web with Tavily...") result = run_search( manufacturer=manufacturer.strip(), polymer_family=polymer_family.strip(), grade=grade.strip(), ) progress(0.9, desc="Done!") status = result.get("status", "unknown") message = result.get("message", "") # Build display dataframe from parsed record parsed = result.get("parsed_datasheet") display_df = pd.DataFrame() json_output = "" if parsed: record = DatasheetRecord(**parsed) if isinstance(parsed, dict) else parsed flat = record.to_flat_dict() # Filter out empty values and metadata for display display_data = { k: v for k, v in flat.items() if v and k not in ("id", "created_at") } display_df = pd.DataFrame( list(display_data.items()), columns=["Property", "Value"], ) json_output = json.dumps(flat, indent=2) status_icon = "✅" if status == "success" else "❌" return f"{status_icon} {message}", display_df, json_output except Exception as exc: logger.exception("Search handler error") return f"❌ Error: {exc}", pd.DataFrame(), "" def handle_upload( file_obj, progress=gr.Progress(), ) -> tuple[str, pd.DataFrame, str]: """ Handle the 'Upload Datasheet' tab: extract text from PDF, then run the LangGraph workflow in upload mode. """ if file_obj is None: return "⚠️ Please upload a PDF file.", pd.DataFrame(), "" progress(0.1, desc="Reading PDF...") try: # Gradio gives us a file path file_path = file_obj.name if hasattr(file_obj, "name") else str(file_obj) extracted_text = extract_text_from_pdf(file_path) if not extracted_text.strip(): return ( "⚠️ Could not extract text from the PDF. " "It may be image-based (scanned). Try a text-based PDF.", pd.DataFrame(), "", ) progress(0.4, desc="Parsing with LLM...") result = run_upload(uploaded_text=extracted_text) progress(0.9, desc="Done!") status = result.get("status", "unknown") message = result.get("message", "") parsed = result.get("parsed_datasheet") display_df = pd.DataFrame() json_output = "" if parsed: record = DatasheetRecord(**parsed) if isinstance(parsed, dict) else parsed flat = record.to_flat_dict() display_data = { k: v for k, v in flat.items() if v and k not in ("id", "created_at") } display_df = pd.DataFrame( list(display_data.items()), columns=["Property", "Value"], ) json_output = json.dumps(flat, indent=2) status_icon = "✅" if status == "success" else "❌" return f"{status_icon} {message}", display_df, json_output except Exception as exc: logger.exception("Upload handler error") return f"❌ Error: {exc}", pd.DataFrame(), "" def handle_db_search( query: str, manufacturer: str, polymer_family: str, ) -> pd.DataFrame: """Search the database and return results.""" try: df = search_database( query=query.strip(), manufacturer=manufacturer.strip(), polymer_family=polymer_family.strip(), ) if df.empty: return pd.DataFrame({"Info": ["No matching records found."]}) return df except Exception as exc: logger.exception("DB search error") return pd.DataFrame({"Error": [str(exc)]}) def handle_db_summary() -> tuple[pd.DataFrame, str]: """Get the full database summary.""" try: df = get_database_summary() count = db.count() info = f"📊 Database contains {count} datasheet(s)." if df.empty: return pd.DataFrame({"Info": ["Database is empty."]}), info return df, info except Exception as exc: logger.exception("DB summary error") return pd.DataFrame({"Error": [str(exc)]}), f"❌ Error: {exc}" def handle_export_csv() -> str | None: """Export the entire database to a CSV file for download.""" try: df = db.get_all_dataframe() if df.empty: return None tmp = tempfile.NamedTemporaryFile( suffix=".csv", delete=False, mode="w", encoding="utf-8", ) df.to_csv(tmp.name, index=False) tmp.close() return tmp.name except Exception as exc: logger.exception("Export error") return None # ══════════════════════════════════════════════════════════════════════════════ # Gradio App # ══════════════════════════════════════════════════════════════════════════════ def create_app() -> gr.Blocks: """Build the Gradio Blocks application.""" with gr.Blocks( title="🧪 Polymer Datasheet Agent", theme=gr.themes.Soft(), css=""" .header { text-align: center; margin-bottom: 1em; } .status-box { font-size: 1.1em; font-weight: 600; padding: 0.5em; } """, ) as app: # ── Header ─────────────────────────────────────────────────────── gr.Markdown( """ # 🧪 Polymer Datasheet Crawler Agent **Build a searchable database of commercial polymer datasheets.** This agent uses **Tavily** to search the web for technical datasheets, **LLaMA 3.1** to extract structured properties, and stores results in a local **SQLite** database. --- """, elem_classes=["header"], ) # ── Tab 1: Search & Add ────────────────────────────────────────── with gr.Tab("🔍 Search & Add Datasheet"): gr.Markdown( "Enter a manufacturer and/or polymer family to search for " "datasheets online and add them to the database." ) with gr.Row(): manufacturer_input = gr.Textbox( label="Manufacturer", placeholder="e.g., SABIC, BASF, DuPont", scale=2, ) polymer_input = gr.Textbox( label="Polymer Family", placeholder="e.g., Polycarbonate, Nylon 6,6, PEEK", scale=2, ) grade_input = gr.Textbox( label="Grade (optional)", placeholder="e.g., Lexan 141R, Ultramid A3K", scale=2, ) search_btn = gr.Button("🔍 Search & Add", variant="primary", size="lg") search_status = gr.Textbox( label="Status", interactive=False, elem_classes=["status-box"], ) with gr.Accordion("Extracted Properties", open=True): search_table = gr.Dataframe( label="Parsed Datasheet", interactive=False, wrap=True, ) with gr.Accordion("Raw JSON Output", open=False): search_json = gr.Code( label="JSON", language="json", interactive=False, ) search_btn.click( fn=handle_search, inputs=[manufacturer_input, polymer_input, grade_input], outputs=[search_status, search_table, search_json], ) # ── Tab 2: Upload Datasheet ────────────────────────────────────── with gr.Tab("📄 Upload Datasheet"): gr.Markdown( "Upload a PDF datasheet to extract properties and add to the database." ) file_input = gr.File( label="Upload PDF Datasheet", file_types=[".pdf"], type="filepath", ) upload_btn = gr.Button("📄 Parse & Add", variant="primary", size="lg") upload_status = gr.Textbox( label="Status", interactive=False, elem_classes=["status-box"], ) with gr.Accordion("Extracted Properties", open=True): upload_table = gr.Dataframe( label="Parsed Datasheet", interactive=False, wrap=True, ) with gr.Accordion("Raw JSON Output", open=False): upload_json = gr.Code( label="JSON", language="json", interactive=False, ) upload_btn.click( fn=handle_upload, inputs=[file_input], outputs=[upload_status, upload_table, upload_json], ) # ── Tab 3: Database Browser ────────────────────────────────────── with gr.Tab("🗄️ Database Browser"): gr.Markdown("Search and browse the existing datasheet database.") with gr.Row(): db_query = gr.Textbox( label="Search query", placeholder="Free text search across all fields...", scale=3, ) db_manufacturer = gr.Textbox( label="Filter: Manufacturer", placeholder="e.g., BASF", scale=2, ) db_polymer = gr.Textbox( label="Filter: Polymer Family", placeholder="e.g., Polyamide", scale=2, ) with gr.Row(): db_search_btn = gr.Button("🔍 Search Database", variant="primary") db_refresh_btn = gr.Button("🔄 Show All Records") db_export_btn = gr.Button("📥 Export to CSV") db_info = gr.Textbox(label="Info", interactive=False) db_results = gr.Dataframe( label="Database Records", interactive=False, wrap=True, ) export_file = gr.File(label="Download CSV", visible=True) db_search_btn.click( fn=handle_db_search, inputs=[db_query, db_manufacturer, db_polymer], outputs=[db_results], ) db_refresh_btn.click( fn=handle_db_summary, inputs=[], outputs=[db_results, db_info], ) db_export_btn.click( fn=handle_export_csv, inputs=[], outputs=[export_file], ) # ── Tab 4: About / Help ────────────────────────────────────────── with gr.Tab("ℹ️ About"): gr.Markdown( """ ## Architecture This application is built with: - **[LangGraph](https://github.com/langchain-ai/langgraph)** — Orchestrates the agent workflow as a directed state graph. - **[Tavily](https://tavily.com)** — AI-optimized web search API for finding datasheets. - **[LLaMA 3.1](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)** — Open-source LLM via HuggingFace Inference API for structured extraction. - **SQLite + SQLAlchemy** — Local relational database. - **[Gradio](https://gradio.app)** — Web UI, deployable on HuggingFace Spaces. ## Workflow ``` User Input ──► Router ──► Web Search (Tavily) ──► LLM Parse (LLaMA 3.1) ──► Store DB ──► Output │ ▲ └──► Process Upload (PDF) ─────────────────┘ ``` ## Property Categories The agent extracts properties across these categories: - **General**: Material name, trade name, manufacturer, grade, applications - **Mechanical**: Tensile/flexural strength, modulus, impact, hardness - **Thermal**: Tm, Tg, HDT, Vicat, CTE, thermal conductivity - **Physical**: Density, MFI, water absorption, specific gravity - **Electrical**: Dielectric strength/constant, resistivity - **Chemical Resistance**: Acid, alkali, solvent, UV resistance - **Regulatory**: FDA, RoHS, REACH, UL94 ## Data Sources The crawler prioritizes trusted sources including: MatWeb, Omnexus, UL Prospector, Campus Plastics, and official manufacturer portals (SABIC, BASF, DuPont, Dow, etc.) --- *Built for Plinity — Infinite Recyclable Polymers Project* """ ) return app # ══════════════════════════════════════════════════════════════════════════════ # Main # ══════════════════════════════════════════════════════════════════════════════ if __name__ == "__main__": app = create_app() app.launch( server_name="0.0.0.0", server_port=7860, share=False, )