# Source: Hugging Face Space "ravimohan19/polymer-datasheet-agent"
# (Space status when captured: Sleeping; file size: 16,822 bytes; revision 4e03699).

"""

Gradio UI for the Polymer Datasheet Crawler Agent.

Deployable as a HuggingFace Space.

"""

from __future__ import annotations

import json
import logging
import os
import tempfile
from pathlib import Path

import gradio as gr
import pandas as pd

from graph import (
    build_graph,
    db,
    run_search,
    run_upload,
    search_database,
    get_database_summary,
)
from pdf_extractor import extract_text_from_pdf
from models import DatasheetRecord

# Root logging setup: INFO level, one pipe-separated line per record
# (timestamp | logger name | level | message).
_LOG_FORMAT = "%(asctime)s | %(name)s | %(levelname)s | %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-level logger used by every handler in this file.
logger = logging.getLogger(__name__)


# ══════════════════════════════════════════════════════════════════════════════
#  Handler Functions
# ══════════════════════════════════════════════════════════════════════════════

# Bookkeeping keys of the flattened record that are hidden from the
# "Extracted Properties" table (they still appear in the raw JSON output).
_HIDDEN_FIELDS = ("id", "created_at")


def _has_value(value: object) -> bool:
    """Return True unless *value* is None or an empty string/container.

    A plain truthiness test would also discard legitimate readings such as
    ``0``, ``0.0`` or ``False``, so only genuinely empty values are rejected.
    """
    if value is None:
        return False
    if isinstance(value, (str, bytes, list, tuple, dict, set, frozenset)):
        return len(value) > 0
    return True


def _render_result(result) -> tuple[str, pd.DataFrame, str]:
    """Turn a workflow result into the three outputs shown by the UI.

    Args:
        result: Mapping returned by ``run_search`` / ``run_upload``. The keys
            read here are ``"status"``, ``"message"`` and
            ``"parsed_datasheet"``; all of them are optional.

    Returns:
        A ``(status_line, properties_table, json_text)`` tuple. The table and
        the JSON text are empty when the result carries no parsed datasheet.
    """
    status = result.get("status", "unknown")
    message = result.get("message", "")
    parsed = result.get("parsed_datasheet")

    table = pd.DataFrame()
    json_text = ""

    if parsed:
        # The workflow may hand back either a plain dict or a ready-made record.
        record = DatasheetRecord(**parsed) if isinstance(parsed, dict) else parsed
        flat = record.to_flat_dict()
        rows = [
            (key, value)
            for key, value in flat.items()
            if key not in _HIDDEN_FIELDS and _has_value(value)
        ]
        table = pd.DataFrame(rows, columns=["Property", "Value"])
        # ensure_ascii=False keeps unit symbols (°, µ, ...) readable instead of
        # turning them into \uXXXX escapes in the JSON viewer.
        json_text = json.dumps(flat, indent=2, ensure_ascii=False)

    status_icon = "✅" if status == "success" else "❌"
    return f"{status_icon} {message}", table, json_text


def handle_search(
    manufacturer: str,
    polymer_family: str,
    grade: str,
    progress=gr.Progress(),
) -> tuple[str, pd.DataFrame, str]:
    """Handle the 'Search & Add' tab.

    Runs the search workflow via ``run_search`` and formats its result for
    display.

    Args:
        manufacturer: Manufacturer name typed by the user (may be empty).
        polymer_family: Polymer family typed by the user (may be empty).
        grade: Optional grade; a grade alone is not enough to start a search.
        progress: Gradio progress tracker used to report coarse checkpoints.

    Returns:
        A ``(status_line, properties_table, json_text)`` tuple. Any exception
        raised by the workflow is logged and reported in the status line
        rather than propagated.
    """
    # Treat a missing value (None) the same as an empty textbox so that the
    # validation below cannot fail with AttributeError.
    manufacturer = (manufacturer or "").strip()
    polymer_family = (polymer_family or "").strip()
    grade = (grade or "").strip()

    if not manufacturer and not polymer_family:
        return (
            "⚠️ Please provide at least a manufacturer or polymer family.",
            pd.DataFrame(),
            "",
        )

    progress(0.1, desc="Initializing search...")
    try:
        # run_search is a single blocking call here, so progress can only be
        # reported at fixed checkpoints before and after it.
        progress(0.3, desc="Searching the web with Tavily...")
        result = run_search(
            manufacturer=manufacturer,
            polymer_family=polymer_family,
            grade=grade,
        )
        progress(0.9, desc="Done!")
        return _render_result(result)

    except Exception as exc:
        # UI boundary: log the traceback and show the error to the user.
        logger.exception("Search handler error")
        return f"❌ Error: {exc}", pd.DataFrame(), ""


def handle_upload(
    file_obj,
    progress=gr.Progress(),
) -> tuple[str, pd.DataFrame, str]:
    """Handle the 'Upload Datasheet' tab.

    Extracts text from the uploaded PDF with ``extract_text_from_pdf`` and
    passes it to ``run_upload``.

    Args:
        file_obj: The uploaded file: either a path string or an object that
            exposes the path through a ``name`` attribute. ``None`` means
            nothing was uploaded.
        progress: Gradio progress tracker used to report coarse checkpoints.

    Returns:
        A ``(status_line, properties_table, json_text)`` tuple. Any exception
        raised while reading or parsing is logged and reported in the status
        line rather than propagated.
    """
    if file_obj is None:
        return "⚠️ Please upload a PDF file.", pd.DataFrame(), ""

    progress(0.1, desc="Reading PDF...")
    try:
        # Accept both a plain path and a file-like wrapper carrying `.name`.
        file_path = file_obj.name if hasattr(file_obj, "name") else str(file_obj)
        extracted_text = extract_text_from_pdf(file_path)

        # Guard against None as well as empty/whitespace-only text so the
        # user gets the explanatory warning instead of a generic error.
        if not (extracted_text or "").strip():
            return (
                "⚠️ Could not extract text from the PDF. "
                "It may be image-based (scanned). Try a text-based PDF.",
                pd.DataFrame(),
                "",
            )

        progress(0.4, desc="Parsing with LLM...")
        result = run_upload(uploaded_text=extracted_text)
        progress(0.9, desc="Done!")
        return _render_result(result)

    except Exception as exc:
        # UI boundary: log the traceback and show the error to the user.
        logger.exception("Upload handler error")
        return f"❌ Error: {exc}", pd.DataFrame(), ""


def handle_db_search(
    query: str,
    manufacturer: str,
    polymer_family: str,
) -> pd.DataFrame:
    """Query the datasheet database for the 'Database Browser' tab.

    All three inputs are stripped of surrounding whitespace and forwarded to
    ``search_database``. An empty result is replaced by a one-row "Info"
    frame; any exception is logged and returned as a one-row "Error" frame.
    """
    try:
        filters = {
            "query": query.strip(),
            "manufacturer": manufacturer.strip(),
            "polymer_family": polymer_family.strip(),
        }
        matches = search_database(**filters)
        return (
            pd.DataFrame({"Info": ["No matching records found."]})
            if matches.empty
            else matches
        )
    except Exception as err:
        logger.exception("DB search error")
        return pd.DataFrame({"Error": [str(err)]})


def handle_db_summary() -> tuple[pd.DataFrame, str]:
    """Return every stored record plus a one-line record count.

    The first element is the frame from ``get_database_summary`` (or a
    one-row "Info" placeholder when it is empty); the second is the info
    text built from ``db.count()``. On failure the exception is logged and
    both elements carry the error message instead.
    """
    try:
        summary = get_database_summary()
        info = f"📊 Database contains {db.count()} datasheet(s)."
        table = (
            pd.DataFrame({"Info": ["Database is empty."]})
            if summary.empty
            else summary
        )
        return table, info
    except Exception as err:
        logger.exception("DB summary error")
        return pd.DataFrame({"Error": [str(err)]}), f"❌ Error: {err}"


def handle_export_csv() -> str | None:
    """Export the entire database to a temporary CSV file for download.

    Returns:
        The path of the written ``.csv`` file, or ``None`` when the database
        is empty or the export failed (the failure is logged).
    """
    csv_path = None
    try:
        df = db.get_all_dataframe()
        if df.empty:
            return None

        # delete=False: the file must outlive this function so it can be
        # served as a download. newline="" lets the CSV writer control line
        # endings itself. The context manager guarantees the handle is closed
        # even if writing fails.
        with tempfile.NamedTemporaryFile(
            suffix=".csv",
            delete=False,
            mode="w",
            encoding="utf-8",
            newline="",
        ) as tmp:
            csv_path = tmp.name
            df.to_csv(tmp, index=False)
        return csv_path
    except Exception:
        logger.exception("Export error")
        # Do not leave a partially written file behind.
        if csv_path is not None:
            try:
                os.unlink(csv_path)
            except OSError:
                logger.warning("Could not remove temporary file %s", csv_path)
        return None


# ══════════════════════════════════════════════════════════════════════════════
#  Gradio App
# ══════════════════════════════════════════════════════════════════════════════

def create_app() -> gr.Blocks:
    """Build the Gradio Blocks application.

    The layout consists of a Markdown header followed by four tabs:

    1. "Search & Add Datasheet" - three text inputs wired to ``handle_search``.
    2. "Upload Datasheet" - a PDF file input wired to ``handle_upload``.
    3. "Database Browser" - search, show-all and CSV-export buttons wired to
       ``handle_db_search``, ``handle_db_summary`` and ``handle_export_csv``.
    4. "About" - static Markdown describing the project.

    Returns:
        The assembled ``gr.Blocks`` object. It is not launched here; the
        caller is responsible for calling ``launch()``.
    """

    # The CSS below defines the two classes referenced via `elem_classes`
    # further down: `.header` and `.status-box`.
    with gr.Blocks(
        title="🧪 Polymer Datasheet Agent",
        theme=gr.themes.Soft(),
        css="""

        .header { text-align: center; margin-bottom: 1em; }

        .status-box { font-size: 1.1em; font-weight: 600; padding: 0.5em; }

        """,
    ) as app:

        # ── Header ───────────────────────────────────────────────────────
        gr.Markdown(
            """

            # 🧪 Polymer Datasheet Crawler Agent

            **Build a searchable database of commercial polymer datasheets.**



            This agent uses **Tavily** to search the web for technical datasheets,

            **LLaMA 3.1** to extract structured properties, and stores results in

            a local **SQLite** database.



            ---

            """,
            elem_classes=["header"],
        )

        # ── Tab 1: Search & Add ──────────────────────────────────────────
        with gr.Tab("🔍 Search & Add Datasheet"):
            gr.Markdown(
                "Enter a manufacturer and/or polymer family to search for "
                "datasheets online and add them to the database."
            )

            # The three inputs share one row with equal `scale` values.
            with gr.Row():
                manufacturer_input = gr.Textbox(
                    label="Manufacturer",
                    placeholder="e.g., SABIC, BASF, DuPont",
                    scale=2,
                )
                polymer_input = gr.Textbox(
                    label="Polymer Family",
                    placeholder="e.g., Polycarbonate, Nylon 6,6, PEEK",
                    scale=2,
                )
                grade_input = gr.Textbox(
                    label="Grade (optional)",
                    placeholder="e.g., Lexan 141R, Ultramid A3K",
                    scale=2,
                )

            search_btn = gr.Button("🔍 Search & Add", variant="primary", size="lg")

            # Read-only outputs: status line, properties table, raw JSON.
            search_status = gr.Textbox(
                label="Status",
                interactive=False,
                elem_classes=["status-box"],
            )

            with gr.Accordion("Extracted Properties", open=True):
                search_table = gr.Dataframe(
                    label="Parsed Datasheet",
                    interactive=False,
                    wrap=True,
                )

            with gr.Accordion("Raw JSON Output", open=False):
                search_json = gr.Code(
                    label="JSON",
                    language="json",
                    interactive=False,
                )

            # The order of `outputs` must match the tuple returned by
            # handle_search: (status, table, json).
            search_btn.click(
                fn=handle_search,
                inputs=[manufacturer_input, polymer_input, grade_input],
                outputs=[search_status, search_table, search_json],
            )

        # ── Tab 2: Upload Datasheet ──────────────────────────────────────
        with gr.Tab("📄 Upload Datasheet"):
            gr.Markdown(
                "Upload a PDF datasheet to extract properties and add to the database."
            )

            # Only `.pdf` is offered in the file picker; the component is
            # configured with type="filepath".
            file_input = gr.File(
                label="Upload PDF Datasheet",
                file_types=[".pdf"],
                type="filepath",
            )
            upload_btn = gr.Button("📄 Parse & Add", variant="primary", size="lg")

            # Same trio of read-only outputs as on the search tab.
            upload_status = gr.Textbox(
                label="Status",
                interactive=False,
                elem_classes=["status-box"],
            )

            with gr.Accordion("Extracted Properties", open=True):
                upload_table = gr.Dataframe(
                    label="Parsed Datasheet",
                    interactive=False,
                    wrap=True,
                )

            with gr.Accordion("Raw JSON Output", open=False):
                upload_json = gr.Code(
                    label="JSON",
                    language="json",
                    interactive=False,
                )

            # The order of `outputs` must match the tuple returned by
            # handle_upload: (status, table, json).
            upload_btn.click(
                fn=handle_upload,
                inputs=[file_input],
                outputs=[upload_status, upload_table, upload_json],
            )

        # ── Tab 3: Database Browser ──────────────────────────────────────
        with gr.Tab("🗄️ Database Browser"):
            gr.Markdown("Search and browse the existing datasheet database.")

            # Free-text query plus two optional filters.
            with gr.Row():
                db_query = gr.Textbox(
                    label="Search query",
                    placeholder="Free text search across all fields...",
                    scale=3,
                )
                db_manufacturer = gr.Textbox(
                    label="Filter: Manufacturer",
                    placeholder="e.g., BASF",
                    scale=2,
                )
                db_polymer = gr.Textbox(
                    label="Filter: Polymer Family",
                    placeholder="e.g., Polyamide",
                    scale=2,
                )

            with gr.Row():
                db_search_btn = gr.Button("🔍 Search Database", variant="primary")
                db_refresh_btn = gr.Button("🔄 Show All Records")
                db_export_btn = gr.Button("📥 Export to CSV")

            db_info = gr.Textbox(label="Info", interactive=False)

            db_results = gr.Dataframe(
                label="Database Records",
                interactive=False,
                wrap=True,
            )

            # Receives the path returned by handle_export_csv.
            export_file = gr.File(label="Download CSV", visible=True)

            # Searching only refreshes the results table; `db_info` is left
            # untouched by this button.
            db_search_btn.click(
                fn=handle_db_search,
                inputs=[db_query, db_manufacturer, db_polymer],
                outputs=[db_results],
            )

            # "Show All Records" updates both the table and the info line.
            db_refresh_btn.click(
                fn=handle_db_summary,
                inputs=[],
                outputs=[db_results, db_info],
            )

            db_export_btn.click(
                fn=handle_export_csv,
                inputs=[],
                outputs=[export_file],
            )

        # ── Tab 4: About / Help ──────────────────────────────────────────
        # Static documentation only; no components are wired to handlers.
        with gr.Tab("ℹ️ About"):
            gr.Markdown(
                """

                ## Architecture



                This application is built with:



                - **[LangGraph](https://github.com/langchain-ai/langgraph)** —

                  Orchestrates the agent workflow as a directed state graph.

                - **[Tavily](https://tavily.com)** —

                  AI-optimized web search API for finding datasheets.

                - **[LLaMA 3.1](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)** —

                  Open-source LLM via HuggingFace Inference API for structured extraction.

                - **SQLite + SQLAlchemy** — Local relational database.

                - **[Gradio](https://gradio.app)** — Web UI, deployable on HuggingFace Spaces.



                ## Workflow



                ```

                User Input ──► Router ──► Web Search (Tavily) ──► LLM Parse (LLaMA 3.1) ──► Store DB ──► Output

                                  │                                          ▲

                                  └──► Process Upload (PDF) ─────────────────┘

                ```



                ## Property Categories



                The agent extracts properties across these categories:

                - **General**: Material name, trade name, manufacturer, grade, applications

                - **Mechanical**: Tensile/flexural strength, modulus, impact, hardness

                - **Thermal**: Tm, Tg, HDT, Vicat, CTE, thermal conductivity

                - **Physical**: Density, MFI, water absorption, specific gravity

                - **Electrical**: Dielectric strength/constant, resistivity

                - **Chemical Resistance**: Acid, alkali, solvent, UV resistance

                - **Regulatory**: FDA, RoHS, REACH, UL94



                ## Data Sources



                The crawler prioritizes trusted sources including:

                MatWeb, Omnexus, UL Prospector, Campus Plastics,

                and official manufacturer portals (SABIC, BASF, DuPont, Dow, etc.)



                ---

                *Built for Plinity — Infinite Recyclable Polymers Project*

                """
            )

    return app


# ══════════════════════════════════════════════════════════════════════════════
#  Main
# ══════════════════════════════════════════════════════════════════════════════

if __name__ == "__main__":
    # Script entry point: build the UI and serve it on every network
    # interface at port 7860, without requesting a public share link.
    app = create_app()
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    app.launch(**launch_options)