ravimohan19's picture
Upload app.py with huggingface_hub
4e03699 verified
"""
Gradio UI for the Polymer Datasheet Crawler Agent.
Deployable as a HuggingFace Space.
"""
from __future__ import annotations
import json
import logging
import os
import tempfile
from pathlib import Path
import gradio as gr
import pandas as pd
from graph import (
build_graph,
db,
run_search,
run_upload,
search_database,
get_database_summary,
)
from pdf_extractor import extract_text_from_pdf
from models import DatasheetRecord
# Root logging setup: INFO and above, formatted as
# "timestamp | logger name | level | message".
_LOG_FORMAT = "%(asctime)s | %(name)s | %(levelname)s | %(message)s"
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)

# Module-level logger shared by every handler in this file.
logger = logging.getLogger(__name__)
# ══════════════════════════════════════════════════════════════════════════════
# Handler Functions
# ══════════════════════════════════════════════════════════════════════════════
# Bookkeeping columns that are stored with a record but never shown in the
# "Extracted Properties" table.
_HIDDEN_FIELDS = ("id", "created_at")


def _has_display_value(value) -> bool:
    """Return True when *value* carries information worth showing in the UI.

    Only ``None`` and empty strings/containers count as blank. Numeric zero
    and ``False`` are kept: a plain truthiness test would silently hide them.
    """
    if value is None:
        return False
    if isinstance(value, (str, bytes, list, tuple, dict, set, frozenset)):
        return len(value) > 0
    return True


def _build_result_outputs(result: dict) -> tuple[str, pd.DataFrame, str]:
    """Turn a workflow result dict into the (status, table, JSON) UI outputs.

    Reads the ``status``, ``message`` and ``parsed_datasheet`` keys of
    *result*. When a parsed datasheet is present it is flattened via
    ``DatasheetRecord.to_flat_dict()``; the table shows the non-blank,
    non-bookkeeping entries, and the JSON string contains the full flat dict.
    """
    status = result.get("status", "unknown")
    message = result.get("message", "")
    parsed = result.get("parsed_datasheet")

    display_df = pd.DataFrame()
    json_output = ""
    if parsed:
        # The workflow may hand back either a plain dict or a record object.
        record = DatasheetRecord(**parsed) if isinstance(parsed, dict) else parsed
        flat = record.to_flat_dict()
        rows = [
            (key, value)
            for key, value in flat.items()
            if key not in _HIDDEN_FIELDS and _has_display_value(value)
        ]
        display_df = pd.DataFrame(rows, columns=["Property", "Value"])
        json_output = json.dumps(flat, indent=2)

    status_icon = "✅" if status == "success" else "❌"
    return f"{status_icon} {message}", display_df, json_output


def handle_search(
    manufacturer: str,
    polymer_family: str,
    grade: str,
    progress=gr.Progress(),
) -> tuple[str, pd.DataFrame, str]:
    """
    Handle the 'Search & Add' tab: run the full LangGraph workflow
    to search, parse, and store a datasheet.

    Args:
        manufacturer: Manufacturer name typed by the user (may be blank).
        polymer_family: Polymer family typed by the user (may be blank).
        grade: Optional grade name.
        progress: Gradio progress tracker used to report coarse progress.

    Returns:
        A ``(status message, properties table, JSON text)`` tuple. On any
        failure the table is empty, the JSON text is ``""`` and the status
        message describes the problem.
    """
    # Treat a missing value like an empty textbox so that validation below
    # answers with the friendly warning instead of an AttributeError.
    manufacturer = (manufacturer or "").strip()
    polymer_family = (polymer_family or "").strip()
    grade = (grade or "").strip()

    if not (manufacturer or polymer_family):
        return (
            "⚠️ Please provide at least a manufacturer or polymer family.",
            pd.DataFrame(),
            "",
        )

    progress(0.1, desc="Initializing search...")
    try:
        progress(0.3, desc="Searching the web with Tavily...")
        result = run_search(
            manufacturer=manufacturer,
            polymer_family=polymer_family,
            grade=grade,
        )
        progress(0.9, desc="Done!")
        return _build_result_outputs(result)
    except Exception as exc:
        # UI boundary: log the traceback and show the error text to the user
        # rather than letting the exception escape into Gradio.
        logger.exception("Search handler error")
        return f"❌ Error: {exc}", pd.DataFrame(), ""


def handle_upload(
    file_obj,
    progress=gr.Progress(),
) -> tuple[str, pd.DataFrame, str]:
    """
    Handle the 'Upload Datasheet' tab: extract text from PDF,
    then run the LangGraph workflow in upload mode.

    Args:
        file_obj: The uploaded file as delivered by Gradio; either an object
            exposing a ``name`` attribute or a value whose ``str()`` is the
            file path. ``None`` when nothing was uploaded.
        progress: Gradio progress tracker used to report coarse progress.

    Returns:
        A ``(status message, properties table, JSON text)`` tuple, with the
        same failure conventions as ``handle_search``.
    """
    if file_obj is None:
        return "⚠️ Please upload a PDF file.", pd.DataFrame(), ""

    progress(0.1, desc="Reading PDF...")
    try:
        # Accept both file-like objects (with .name) and plain path values.
        file_path = file_obj.name if hasattr(file_obj, "name") else str(file_obj)
        extracted_text = extract_text_from_pdf(file_path)
        # ``or ""`` keeps a None result on the "no text" path below.
        if not (extracted_text or "").strip():
            return (
                "⚠️ Could not extract text from the PDF. "
                "It may be image-based (scanned). Try a text-based PDF.",
                pd.DataFrame(),
                "",
            )

        progress(0.4, desc="Parsing with LLM...")
        result = run_upload(uploaded_text=extracted_text)
        progress(0.9, desc="Done!")
        return _build_result_outputs(result)
    except Exception as exc:
        # UI boundary: log the traceback and show the error text to the user.
        logger.exception("Upload handler error")
        return f"❌ Error: {exc}", pd.DataFrame(), ""
def handle_db_search(
    query: str,
    manufacturer: str,
    polymer_family: str,
) -> pd.DataFrame:
    """Query the datasheet database for the 'Database Browser' tab.

    The three inputs are stripped of surrounding whitespace and forwarded to
    ``search_database``. An empty result is replaced by a one-row "Info"
    frame; any exception is logged and reported as a one-row "Error" frame.
    """
    try:
        criteria = {
            "query": query.strip(),
            "manufacturer": manufacturer.strip(),
            "polymer_family": polymer_family.strip(),
        }
        matches = search_database(**criteria)
        if not matches.empty:
            return matches
        return pd.DataFrame({"Info": ["No matching records found."]})
    except Exception as exc:
        logger.exception("DB search error")
        return pd.DataFrame({"Error": [str(exc)]})
def handle_db_summary() -> tuple[pd.DataFrame, str]:
    """Return the database summary table together with a record-count line.

    Any exception is logged and turned into a one-row "Error" frame plus an
    error message, so the UI always receives two values.
    """
    try:
        summary = get_database_summary()
        total = db.count()
        info = f"πŸ“Š Database contains {total} datasheet(s)."
        # Swap an empty summary for a placeholder row; the count line is
        # returned unchanged either way.
        table = pd.DataFrame({"Info": ["Database is empty."]}) if summary.empty else summary
        return table, info
    except Exception as exc:
        logger.exception("DB summary error")
        return pd.DataFrame({"Error": [str(exc)]}), f"❌ Error: {exc}"
def handle_export_csv() -> str | None:
    """Export the entire database to a CSV file for download.

    Returns:
        Path of the temporary CSV file, or ``None`` when the database is
        empty or the export fails (the failure is logged).
    """
    try:
        df = db.get_all_dataframe()
        if df.empty:
            return None

        # Reserve a unique path and close our handle immediately: pandas
        # opens the path itself, so holding a second open handle would only
        # leak it if ``to_csv`` raised. ``delete=False`` keeps the file on
        # disk after this function returns so its path can be handed back.
        with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmp:
            csv_path = tmp.name

        try:
            df.to_csv(csv_path, index=False)
        except Exception:
            # Do not leave a half-written file behind in the temp directory.
            Path(csv_path).unlink(missing_ok=True)
            raise
        return csv_path
    except Exception:
        logger.exception("Export error")
        return None
# ══════════════════════════════════════════════════════════════════════════════
# Gradio App
# ══════════════════════════════════════════════════════════════════════════════
def create_app() -> gr.Blocks:
    """Build the Gradio Blocks application.

    The UI consists of a Markdown header followed by four tabs:

    1. Search & Add Datasheet - three textboxes and a button wired to
       ``handle_search``.
    2. Upload Datasheet - a PDF file input and a button wired to
       ``handle_upload``.
    3. Database Browser - search/filter textboxes and three buttons wired to
       ``handle_db_search``, ``handle_db_summary`` and ``handle_export_csv``.
    4. About - static Markdown describing the project.

    Returns:
        The assembled ``gr.Blocks`` object; the caller is responsible for
        launching it.

    Note:
        The bodies of the triple-quoted strings below are kept flush-left on
        purpose: any leading whitespace would become part of the CSS or
        Markdown text passed to Gradio.
    """
    with gr.Blocks(
        title="πŸ§ͺ Polymer Datasheet Agent",
        theme=gr.themes.Soft(),
        css="""
.header { text-align: center; margin-bottom: 1em; }
.status-box { font-size: 1.1em; font-weight: 600; padding: 0.5em; }
""",
    ) as app:
        # ── Header ───────────────────────────────────────────────────────
        # Styled through the "header" class defined in the css string above.
        gr.Markdown(
            """
# πŸ§ͺ Polymer Datasheet Crawler Agent
**Build a searchable database of commercial polymer datasheets.**
This agent uses **Tavily** to search the web for technical datasheets,
**LLaMA 3.1** to extract structured properties, and stores results in
a local **SQLite** database.
---
""",
            elem_classes=["header"],
        )
        # ── Tab 1: Search & Add ──────────────────────────────────────────
        with gr.Tab("πŸ” Search & Add Datasheet"):
            gr.Markdown(
                "Enter a manufacturer and/or polymer family to search for "
                "datasheets online and add them to the database."
            )
            # The three search fields share one row with equal scale.
            with gr.Row():
                manufacturer_input = gr.Textbox(
                    label="Manufacturer",
                    placeholder="e.g., SABIC, BASF, DuPont",
                    scale=2,
                )
                polymer_input = gr.Textbox(
                    label="Polymer Family",
                    placeholder="e.g., Polycarbonate, Nylon 6,6, PEEK",
                    scale=2,
                )
                grade_input = gr.Textbox(
                    label="Grade (optional)",
                    placeholder="e.g., Lexan 141R, Ultramid A3K",
                    scale=2,
                )
            search_btn = gr.Button("πŸ” Search & Add", variant="primary", size="lg")
            # Read-only status line, styled through the "status-box" class.
            search_status = gr.Textbox(
                label="Status",
                interactive=False,
                elem_classes=["status-box"],
            )
            # Property table is expanded by default; raw JSON starts collapsed.
            with gr.Accordion("Extracted Properties", open=True):
                search_table = gr.Dataframe(
                    label="Parsed Datasheet",
                    interactive=False,
                    wrap=True,
                )
            with gr.Accordion("Raw JSON Output", open=False):
                search_json = gr.Code(
                    label="JSON",
                    language="json",
                    interactive=False,
                )
            # The three outputs match handle_search's (status, table, JSON)
            # return tuple, in that order.
            search_btn.click(
                fn=handle_search,
                inputs=[manufacturer_input, polymer_input, grade_input],
                outputs=[search_status, search_table, search_json],
            )
        # ── Tab 2: Upload Datasheet ──────────────────────────────────────
        with gr.Tab("πŸ“„ Upload Datasheet"):
            gr.Markdown(
                "Upload a PDF datasheet to extract properties and add to the database."
            )
            # Restricted to .pdf; type="filepath" asks Gradio to pass a path.
            file_input = gr.File(
                label="Upload PDF Datasheet",
                file_types=[".pdf"],
                type="filepath",
            )
            upload_btn = gr.Button("πŸ“„ Parse & Add", variant="primary", size="lg")
            upload_status = gr.Textbox(
                label="Status",
                interactive=False,
                elem_classes=["status-box"],
            )
            with gr.Accordion("Extracted Properties", open=True):
                upload_table = gr.Dataframe(
                    label="Parsed Datasheet",
                    interactive=False,
                    wrap=True,
                )
            with gr.Accordion("Raw JSON Output", open=False):
                upload_json = gr.Code(
                    label="JSON",
                    language="json",
                    interactive=False,
                )
            # The three outputs match handle_upload's (status, table, JSON)
            # return tuple, in that order.
            upload_btn.click(
                fn=handle_upload,
                inputs=[file_input],
                outputs=[upload_status, upload_table, upload_json],
            )
        # ── Tab 3: Database Browser ──────────────────────────────────────
        with gr.Tab("πŸ—„οΈ Database Browser"):
            gr.Markdown("Search and browse the existing datasheet database.")
            # Row 1: free-text query plus two filter fields.
            with gr.Row():
                db_query = gr.Textbox(
                    label="Search query",
                    placeholder="Free text search across all fields...",
                    scale=3,
                )
                db_manufacturer = gr.Textbox(
                    label="Filter: Manufacturer",
                    placeholder="e.g., BASF",
                    scale=2,
                )
                db_polymer = gr.Textbox(
                    label="Filter: Polymer Family",
                    placeholder="e.g., Polyamide",
                    scale=2,
                )
            # Row 2: the three database actions.
            with gr.Row():
                db_search_btn = gr.Button("πŸ” Search Database", variant="primary")
                db_refresh_btn = gr.Button("πŸ”„ Show All Records")
                db_export_btn = gr.Button("πŸ“₯ Export to CSV")
            db_info = gr.Textbox(label="Info", interactive=False)
            db_results = gr.Dataframe(
                label="Database Records",
                interactive=False,
                wrap=True,
            )
            export_file = gr.File(label="Download CSV", visible=True)
            # Search only updates the results table; db_info is not an output
            # here, so it keeps whatever it last showed.
            db_search_btn.click(
                fn=handle_db_search,
                inputs=[db_query, db_manufacturer, db_polymer],
                outputs=[db_results],
            )
            # "Show All Records" fills both the table and the info line.
            db_refresh_btn.click(
                fn=handle_db_summary,
                inputs=[],
                outputs=[db_results, db_info],
            )
            # handle_export_csv returns a file path or None; the result is
            # routed to the download component.
            db_export_btn.click(
                fn=handle_export_csv,
                inputs=[],
                outputs=[export_file],
            )
        # ── Tab 4: About / Help ──────────────────────────────────────────
        # Static content only; no event handlers are attached in this tab.
        with gr.Tab("ℹ️ About"):
            gr.Markdown(
                """
## Architecture
This application is built with:
- **[LangGraph](https://github.com/langchain-ai/langgraph)** β€”
Orchestrates the agent workflow as a directed state graph.
- **[Tavily](https://tavily.com)** β€”
AI-optimized web search API for finding datasheets.
- **[LLaMA 3.1](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)** β€”
Open-source LLM via HuggingFace Inference API for structured extraction.
- **SQLite + SQLAlchemy** β€” Local relational database.
- **[Gradio](https://gradio.app)** β€” Web UI, deployable on HuggingFace Spaces.
## Workflow
```
User Input ──► Router ──► Web Search (Tavily) ──► LLM Parse (LLaMA 3.1) ──► Store DB ──► Output
β”‚ β–²
└──► Process Upload (PDF) β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
```
## Property Categories
The agent extracts properties across these categories:
- **General**: Material name, trade name, manufacturer, grade, applications
- **Mechanical**: Tensile/flexural strength, modulus, impact, hardness
- **Thermal**: Tm, Tg, HDT, Vicat, CTE, thermal conductivity
- **Physical**: Density, MFI, water absorption, specific gravity
- **Electrical**: Dielectric strength/constant, resistivity
- **Chemical Resistance**: Acid, alkali, solvent, UV resistance
- **Regulatory**: FDA, RoHS, REACH, UL94
## Data Sources
The crawler prioritizes trusted sources including:
MatWeb, Omnexus, UL Prospector, Campus Plastics,
and official manufacturer portals (SABIC, BASF, DuPont, Dow, etc.)
---
*Built for Plinity β€” Infinite Recyclable Polymers Project*
"""
            )
    return app
# ══════════════════════════════════════════════════════════════════════════════
# Main
# ══════════════════════════════════════════════════════════════════════════════
if __name__ == "__main__":
    # Script entry point: build the UI and serve it on all interfaces at
    # port 7860 without creating a public share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    app = create_app()
    app.launch(**launch_options)