Spaces:
Sleeping
Sleeping
| """ | |
| Gradio UI for the Polymer Datasheet Crawler Agent. | |
| Deployable as a HuggingFace Space. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import logging | |
| import os | |
| import tempfile | |
| from pathlib import Path | |
| import gradio as gr | |
| import pandas as pd | |
| from graph import ( | |
| build_graph, | |
| db, | |
| run_search, | |
| run_upload, | |
| search_database, | |
| get_database_summary, | |
| ) | |
| from pdf_extractor import extract_text_from_pdf | |
| from models import DatasheetRecord | |
# Root logging is configured once at import time; every record is rendered as
# "timestamp | logger name | level | message".
_LOG_FORMAT = "%(asctime)s | %(name)s | %(levelname)s | %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-level logger used by all handlers in this file.
logger = logging.getLogger(__name__)
# ──────────────────────────────────────────────────────────────────────────────
# Handler Functions
# ──────────────────────────────────────────────────────────────────────────────
def _render_result(result) -> tuple[str, pd.DataFrame, str]:
    """Turn a workflow result into the (status, table, json) triple shown in the UI.

    Args:
        result: Mapping-like object returned by ``run_search`` / ``run_upload``.
            Only the keys ``status``, ``message`` and ``parsed_datasheet`` are
            read.

    Returns:
        A status line prefixed with a success/failure icon, a two-column
        ``Property``/``Value`` dataframe of the non-empty parsed fields, and
        the complete flattened record as pretty-printed JSON. The dataframe
        and the JSON string are empty when no parsed datasheet is present.
    """
    status = result.get("status", "unknown")
    message = result.get("message", "")
    parsed = result.get("parsed_datasheet")

    display_df = pd.DataFrame()
    json_output = ""
    if parsed:
        # The workflow may hand back either a plain dict or a ready record.
        record = DatasheetRecord(**parsed) if isinstance(parsed, dict) else parsed
        flat = record.to_flat_dict()
        # The table hides falsy values and bookkeeping fields; the JSON pane
        # below still shows the full flattened record.
        display_data = {
            key: value
            for key, value in flat.items()
            if value and key not in ("id", "created_at")
        }
        display_df = pd.DataFrame(
            list(display_data.items()),
            columns=["Property", "Value"],
        )
        # ensure_ascii=False keeps non-ASCII characters readable instead of
        # emitting \uXXXX escapes in the human-facing JSON pane.
        json_output = json.dumps(flat, indent=2, ensure_ascii=False)

    status_icon = "✅" if status == "success" else "❌"
    return f"{status_icon} {message}", display_df, json_output


def handle_search(
    manufacturer: str,
    polymer_family: str,
    grade: str,
    progress=gr.Progress(),
) -> tuple[str, pd.DataFrame, str]:
    """
    Handle the 'Search & Add' tab: run the full LangGraph workflow
    to search, parse, and store a datasheet.

    Args:
        manufacturer: Manufacturer name; may be blank if ``polymer_family`` is set.
        polymer_family: Polymer family; may be blank if ``manufacturer`` is set.
        grade: Optional grade name.
        progress: Gradio progress tracker used to report coarse progress.

    Returns:
        ``(status message, properties dataframe, JSON text)``. On validation
        failure or on any exception the dataframe is empty and the JSON text
        is ``""``; exceptions are logged and reported in the status message
        rather than propagated.
    """
    # Normalise once, and tolerate None so a missing value cannot raise
    # before the error handling below is in effect.
    manufacturer = (manufacturer or "").strip()
    polymer_family = (polymer_family or "").strip()
    grade = (grade or "").strip()

    if not manufacturer and not polymer_family:
        return (
            "⚠️ Please provide at least a manufacturer or polymer family.",
            pd.DataFrame(),
            "",
        )

    progress(0.1, desc="Initializing search...")
    try:
        progress(0.3, desc="Searching the web with Tavily...")
        result = run_search(
            manufacturer=manufacturer,
            polymer_family=polymer_family,
            grade=grade,
        )
        progress(0.9, desc="Done!")
        return _render_result(result)
    except Exception as exc:
        # UI boundary: report the failure to the user instead of crashing.
        logger.exception("Search handler error")
        return f"❌ Error: {exc}", pd.DataFrame(), ""


def handle_upload(
    file_obj,
    progress=gr.Progress(),
) -> tuple[str, pd.DataFrame, str]:
    """
    Handle the 'Upload Datasheet' tab: extract text from PDF,
    then run the LangGraph workflow in upload mode.

    Args:
        file_obj: The uploaded file as delivered by Gradio: either an object
            exposing a ``name`` attribute or something convertible to a path
            string. ``None`` means nothing was uploaded.
        progress: Gradio progress tracker used to report coarse progress.

    Returns:
        ``(status message, properties dataframe, JSON text)``. When no file is
        given, no text can be extracted, or an exception occurs, the dataframe
        is empty and the JSON text is ``""``; exceptions are logged and
        reported in the status message rather than propagated.
    """
    if file_obj is None:
        return "⚠️ Please upload a PDF file.", pd.DataFrame(), ""

    progress(0.1, desc="Reading PDF...")
    try:
        # Gradio gives us a file path (or a wrapper exposing it as .name).
        file_path = file_obj.name if hasattr(file_obj, "name") else str(file_obj)
        extracted_text = extract_text_from_pdf(file_path)
        # Treat a None result like an empty one so the user gets the
        # explanatory warning rather than a raw AttributeError message.
        if not (extracted_text or "").strip():
            return (
                "⚠️ Could not extract text from the PDF. "
                "It may be image-based (scanned). Try a text-based PDF.",
                pd.DataFrame(),
                "",
            )
        progress(0.4, desc="Parsing with LLM...")
        result = run_upload(uploaded_text=extracted_text)
        progress(0.9, desc="Done!")
        return _render_result(result)
    except Exception as exc:
        # UI boundary: report the failure to the user instead of crashing.
        logger.exception("Upload handler error")
        return f"❌ Error: {exc}", pd.DataFrame(), ""
def handle_db_search(
    query: str,
    manufacturer: str,
    polymer_family: str,
) -> pd.DataFrame:
    """Look up stored datasheets for the Database Browser tab.

    All three arguments are stripped of surrounding whitespace and forwarded
    to ``search_database``.

    Returns:
        The matching rows; a one-cell ``Info`` frame when nothing matches; or
        a one-cell ``Error`` frame carrying the exception text when the lookup
        fails (the exception is logged, not raised).
    """
    try:
        criteria = {
            "query": query.strip(),
            "manufacturer": manufacturer.strip(),
            "polymer_family": polymer_family.strip(),
        }
        matches = search_database(**criteria)
        # Show an explicit placeholder rather than an empty grid.
        return (
            pd.DataFrame({"Info": ["No matching records found."]})
            if matches.empty
            else matches
        )
    except Exception as failure:
        logger.exception("DB search error")
        return pd.DataFrame({"Error": [str(failure)]})
def handle_db_summary() -> tuple[pd.DataFrame, str]:
    """Fetch every stored record plus a one-line record count.

    Returns:
        ``(table, info)`` where ``table`` is the frame from
        ``get_database_summary()`` (or a one-cell ``Info`` placeholder when it
        is empty) and ``info`` reports ``db.count()``. On failure the
        exception is logged and both elements carry the error text.
    """
    try:
        summary = get_database_summary()
        info = f"π Database contains {db.count()} datasheet(s)."
        table = (
            pd.DataFrame({"Info": ["Database is empty."]})
            if summary.empty
            else summary
        )
        return table, info
    except Exception as failure:
        logger.exception("DB summary error")
        return pd.DataFrame({"Error": [str(failure)]}), f"β Error: {failure}"
def handle_export_csv() -> str | None:
    """Export the entire database to a CSV file for download.

    Returns:
        Path of a newly created temporary ``.csv`` file containing every row
        of ``db.get_all_dataframe()`` (without the index column), or ``None``
        when the database is empty or the export fails. Failures are logged,
        never raised.
    """
    csv_path = None
    try:
        df = db.get_all_dataframe()
        if df.empty:
            return None
        # delete=False: the file must outlive this call so it can be served.
        # newline="": pandas asks for text handles to be opened this way so
        # the line terminator it writes is not translated a second time.
        with tempfile.NamedTemporaryFile(
            suffix=".csv", delete=False, mode="w", encoding="utf-8", newline="",
        ) as tmp:
            csv_path = tmp.name
            # Write through the handle we already hold instead of reopening
            # the file by name; the with-block closes it even on error.
            df.to_csv(tmp, index=False)
        return csv_path
    except Exception:
        logger.exception("Export error")
        # Do not leave a partial export behind in the temp directory.
        if csv_path is not None:
            try:
                os.remove(csv_path)
            except OSError:
                logger.warning("Could not remove partial export %s", csv_path)
        return None
# ──────────────────────────────────────────────────────────────────────────────
# Gradio App
# ──────────────────────────────────────────────────────────────────────────────
def create_app() -> gr.Blocks:
    """Build the Gradio Blocks application.

    The layout consists of a Markdown header followed by four tabs:

    1. Search & Add: three text inputs and a button wired to ``handle_search``.
    2. Upload Datasheet: a PDF file input and a button wired to
       ``handle_upload``.
    3. Database Browser: search/filter inputs and buttons wired to
       ``handle_db_search``, ``handle_db_summary`` and ``handle_export_csv``.
    4. About: static Markdown describing the project.

    Components are created in display order, so statement order inside the
    ``with`` blocks is significant.

    Returns:
        The constructed ``gr.Blocks`` instance. It is not launched here.

    NOTE(review): several string literals below (titles, tab and button
    labels, the About text and its diagram) contain glyphs that look like
    UTF-8 emoji/box-drawing characters decoded with the wrong codec. They are
    left exactly as found; confirm them against the repository copy.
    """
    with gr.Blocks(
        title="π§ͺ Polymer Datasheet Agent",
        theme=gr.themes.Soft(),
        css="""
.header { text-align: center; margin-bottom: 1em; }
.status-box { font-size: 1.1em; font-weight: 600; padding: 0.5em; }
""",
    ) as app:
        # ── Header ───────────────────────────────────────────────────────
        gr.Markdown(
            """
# π§ͺ Polymer Datasheet Crawler Agent
**Build a searchable database of commercial polymer datasheets.**
This agent uses **Tavily** to search the web for technical datasheets,
**LLaMA 3.1** to extract structured properties, and stores results in
a local **SQLite** database.
---
""",
            elem_classes=["header"],
        )

        # ── Tab 1: Search & Add ──────────────────────────────────────────
        with gr.Tab("π Search & Add Datasheet"):
            gr.Markdown(
                "Enter a manufacturer and/or polymer family to search for "
                "datasheets online and add them to the database."
            )
            # The three search fields share one row with equal scale.
            with gr.Row():
                manufacturer_input = gr.Textbox(
                    label="Manufacturer",
                    placeholder="e.g., SABIC, BASF, DuPont",
                    scale=2,
                )
                polymer_input = gr.Textbox(
                    label="Polymer Family",
                    placeholder="e.g., Polycarbonate, Nylon 6,6, PEEK",
                    scale=2,
                )
                grade_input = gr.Textbox(
                    label="Grade (optional)",
                    placeholder="e.g., Lexan 141R, Ultramid A3K",
                    scale=2,
                )
            search_btn = gr.Button("π Search & Add", variant="primary", size="lg")
            search_status = gr.Textbox(
                label="Status",
                interactive=False,
                elem_classes=["status-box"],
            )
            with gr.Accordion("Extracted Properties", open=True):
                search_table = gr.Dataframe(
                    label="Parsed Datasheet",
                    interactive=False,
                    wrap=True,
                )
            with gr.Accordion("Raw JSON Output", open=False):
                search_json = gr.Code(
                    label="JSON",
                    language="json",
                    interactive=False,
                )
            # Output order matches the (status, dataframe, json) tuple that
            # handle_search returns.
            search_btn.click(
                fn=handle_search,
                inputs=[manufacturer_input, polymer_input, grade_input],
                outputs=[search_status, search_table, search_json],
            )

        # ── Tab 2: Upload Datasheet ──────────────────────────────────────
        with gr.Tab("π Upload Datasheet"):
            gr.Markdown(
                "Upload a PDF datasheet to extract properties and add to the database."
            )
            file_input = gr.File(
                label="Upload PDF Datasheet",
                file_types=[".pdf"],
                type="filepath",
            )
            upload_btn = gr.Button("π Parse & Add", variant="primary", size="lg")
            upload_status = gr.Textbox(
                label="Status",
                interactive=False,
                elem_classes=["status-box"],
            )
            with gr.Accordion("Extracted Properties", open=True):
                upload_table = gr.Dataframe(
                    label="Parsed Datasheet",
                    interactive=False,
                    wrap=True,
                )
            with gr.Accordion("Raw JSON Output", open=False):
                upload_json = gr.Code(
                    label="JSON",
                    language="json",
                    interactive=False,
                )
            # Output order matches the (status, dataframe, json) tuple that
            # handle_upload returns.
            upload_btn.click(
                fn=handle_upload,
                inputs=[file_input],
                outputs=[upload_status, upload_table, upload_json],
            )

        # ── Tab 3: Database Browser ──────────────────────────────────────
        with gr.Tab("ποΈ Database Browser"):
            gr.Markdown("Search and browse the existing datasheet database.")
            with gr.Row():
                db_query = gr.Textbox(
                    label="Search query",
                    placeholder="Free text search across all fields...",
                    scale=3,
                )
                db_manufacturer = gr.Textbox(
                    label="Filter: Manufacturer",
                    placeholder="e.g., BASF",
                    scale=2,
                )
                db_polymer = gr.Textbox(
                    label="Filter: Polymer Family",
                    placeholder="e.g., Polyamide",
                    scale=2,
                )
            with gr.Row():
                db_search_btn = gr.Button("π Search Database", variant="primary")
                db_refresh_btn = gr.Button("π Show All Records")
                db_export_btn = gr.Button("π₯ Export to CSV")
            db_info = gr.Textbox(label="Info", interactive=False)
            db_results = gr.Dataframe(
                label="Database Records",
                interactive=False,
                wrap=True,
            )
            export_file = gr.File(label="Download CSV", visible=True)
            # Search only updates the results grid; db_info is left unchanged.
            db_search_btn.click(
                fn=handle_db_search,
                inputs=[db_query, db_manufacturer, db_polymer],
                outputs=[db_results],
            )
            # handle_db_summary returns (dataframe, info text), in that order.
            db_refresh_btn.click(
                fn=handle_db_summary,
                inputs=[],
                outputs=[db_results, db_info],
            )
            # handle_export_csv returns a file path, or None when there is
            # nothing to export or the export failed.
            db_export_btn.click(
                fn=handle_export_csv,
                inputs=[],
                outputs=[export_file],
            )

        # ── Tab 4: About / Help ──────────────────────────────────────────
        with gr.Tab("βΉοΈ About"):
            gr.Markdown(
                """
## Architecture
This application is built with:
- **[LangGraph](https://github.com/langchain-ai/langgraph)** β
Orchestrates the agent workflow as a directed state graph.
- **[Tavily](https://tavily.com)** β
AI-optimized web search API for finding datasheets.
- **[LLaMA 3.1](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)** β
Open-source LLM via HuggingFace Inference API for structured extraction.
- **SQLite + SQLAlchemy** β Local relational database.
- **[Gradio](https://gradio.app)** β Web UI, deployable on HuggingFace Spaces.
## Workflow
```
User Input βββΊ Router βββΊ Web Search (Tavily) βββΊ LLM Parse (LLaMA 3.1) βββΊ Store DB βββΊ Output
β β²
ββββΊ Process Upload (PDF) ββββββββββββββββββ
```
## Property Categories
The agent extracts properties across these categories:
- **General**: Material name, trade name, manufacturer, grade, applications
- **Mechanical**: Tensile/flexural strength, modulus, impact, hardness
- **Thermal**: Tm, Tg, HDT, Vicat, CTE, thermal conductivity
- **Physical**: Density, MFI, water absorption, specific gravity
- **Electrical**: Dielectric strength/constant, resistivity
- **Chemical Resistance**: Acid, alkali, solvent, UV resistance
- **Regulatory**: FDA, RoHS, REACH, UL94
## Data Sources
The crawler prioritizes trusted sources including:
MatWeb, Omnexus, UL Prospector, Campus Plastics,
and official manufacturer portals (SABIC, BASF, DuPont, Dow, etc.)
---
*Built for Plinity β Infinite Recyclable Polymers Project*
"""
            )
    return app
# ──────────────────────────────────────────────────────────────────────────────
# Main
# ──────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Script entry point: build the UI and serve it on all interfaces at
    # port 7860 with sharing disabled.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    app = create_app()
    app.launch(**launch_options)