Spaces:

ravimohan19
/

polymer-datasheet-agent

Sleeping

App Files Files Community

polymer-datasheet-agent / app.py

ravimohan19

Upload app.py with huggingface_hub

4e03699 verified about 1 month ago

raw

history blame contribute delete

16.8 kB

	"""
	Gradio UI for the Polymer Datasheet Crawler Agent.
	Deployable as a HuggingFace Space.
	"""

	from __future__ import annotations

	import json
	import logging
	import os
	import tempfile
	from pathlib import Path

	import gradio as gr
	import pandas as pd

	from graph import (
	build_graph,
	db,
	run_search,
	run_upload,
	search_database,
	get_database_summary,
	)
	from pdf_extractor import extract_text_from_pdf
	from models import DatasheetRecord

	# Pipe-separated log line: timestamp | logger name | level | message.
	# Plain "|" is used on purpose: "\|" is not a valid Python escape sequence
	# and would put literal backslashes into every log record.
	LOG_FORMAT = "%(asctime)s | %(name)s | %(levelname)s | %(message)s"

	# Configure the root logger at import time and create this module's logger,
	# which the handler functions below use to report exceptions.
	logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
	logger = logging.getLogger(__name__)


	# ══════════════════════════════════════════════════════════════════════════════
	# Handler Functions
	# ══════════════════════════════════════════════════════════════════════════════

	# Flat-record keys that are bookkeeping metadata and are hidden from the
	# "Property / Value" table shown to the user.
	_HIDDEN_FIELDS = ("id", "created_at")


	def _is_blank(value) -> bool:
	    """Return True for None and empty strings/containers, but not for 0 or False."""
	    if value is None:
	        return True
	    return isinstance(value, (str, list, tuple, dict, set)) and not value


	def _render_result(result) -> tuple[str, pd.DataFrame, str]:
	    """
	    Turn a workflow result mapping into the three UI outputs.

	    Reads the ``status``, ``message`` and ``parsed_datasheet`` keys of
	    ``result`` and returns ``(status_text, properties_table, json_text)``.
	    The table and JSON are empty when no parsed datasheet is present.
	    """
	    status = result.get("status", "unknown")
	    message = result.get("message", "")

	    display_df = pd.DataFrame()
	    json_output = ""

	    parsed = result.get("parsed_datasheet")
	    if parsed:
	        # The workflow may hand back either a plain dict or a record object.
	        record = DatasheetRecord(**parsed) if isinstance(parsed, dict) else parsed
	        flat = record.to_flat_dict()
	        # Hide metadata and genuinely empty fields. A plain truthiness test
	        # would also hide legitimate values such as 0 or False.
	        display_data = {
	            key: value
	            for key, value in flat.items()
	            if key not in _HIDDEN_FIELDS and not _is_blank(value)
	        }
	        display_df = pd.DataFrame(
	            list(display_data.items()),
	            columns=["Property", "Value"],
	        )
	        # The JSON view intentionally shows the complete flat record.
	        json_output = json.dumps(flat, indent=2)

	    status_icon = "✅" if status == "success" else "❌"
	    return f"{status_icon} {message}", display_df, json_output


	def handle_search(
	    manufacturer: str,
	    polymer_family: str,
	    grade: str,
	    progress=gr.Progress(),
	) -> tuple[str, pd.DataFrame, str]:
	    """
	    Handle the 'Search & Add' tab.

	    Validates the inputs, calls ``run_search`` with the stripped values and
	    converts its result into ``(status_text, properties_table, json_text)``.
	    At least one of ``manufacturer`` or ``polymer_family`` must be non-blank;
	    otherwise a warning is returned and no search is run. Any exception
	    raised during the search is logged and reported in the status text.
	    """
	    # Normalise once, up front. This guard runs outside the try block, so a
	    # None value (in case a textbox arrives unset) must not reach .strip().
	    manufacturer = (manufacturer or "").strip()
	    polymer_family = (polymer_family or "").strip()
	    grade = (grade or "").strip()

	    if not manufacturer and not polymer_family:
	        return (
	            "⚠️ Please provide at least a manufacturer or polymer family.",
	            pd.DataFrame(),
	            "",
	        )

	    progress(0.1, desc="Initializing search...")
	    try:
	        progress(0.3, desc="Searching the web with Tavily...")
	        result = run_search(
	            manufacturer=manufacturer,
	            polymer_family=polymer_family,
	            grade=grade,
	        )
	        progress(0.9, desc="Done!")
	        return _render_result(result)

	    except Exception as exc:
	        # UI boundary: log the traceback and show the error instead of crashing.
	        logger.exception("Search handler error")
	        return f"❌ Error: {exc}", pd.DataFrame(), ""


	def handle_upload(
	    file_obj,
	    progress=gr.Progress(),
	) -> tuple[str, pd.DataFrame, str]:
	    """
	    Handle the 'Upload Datasheet' tab.

	    Extracts text from the uploaded PDF with ``extract_text_from_pdf``,
	    passes it to ``run_upload`` and converts the result into
	    ``(status_text, properties_table, json_text)``. Returns a warning when
	    no file was supplied or when no text could be extracted. Any exception
	    is logged and reported in the status text.
	    """
	    if file_obj is None:
	        return "⚠️ Please upload a PDF file.", pd.DataFrame(), ""

	    progress(0.1, desc="Reading PDF...")
	    try:
	        # Accept either an object exposing a .name path or a plain path value.
	        file_path = file_obj.name if hasattr(file_obj, "name") else str(file_obj)
	        extracted_text = extract_text_from_pdf(file_path)

	        # Treat a None result the same as empty text rather than raising.
	        if not (extracted_text or "").strip():
	            return (
	                "⚠️ Could not extract text from the PDF. "
	                "It may be image-based (scanned). Try a text-based PDF.",
	                pd.DataFrame(),
	                "",
	            )

	        progress(0.4, desc="Parsing with LLM...")
	        result = run_upload(uploaded_text=extracted_text)
	        progress(0.9, desc="Done!")
	        return _render_result(result)

	    except Exception as exc:
	        # UI boundary: log the traceback and show the error instead of crashing.
	        logger.exception("Upload handler error")
	        return f"❌ Error: {exc}", pd.DataFrame(), ""


	def handle_db_search(
	    query: str,
	    manufacturer: str,
	    polymer_family: str,
	) -> pd.DataFrame:
	    """
	    Look up stored datasheets and return them as a table.

	    The three inputs are stripped of surrounding whitespace and forwarded
	    to ``search_database``. When nothing matches, a one-row "Info" table is
	    returned; when anything fails, the error is logged and a one-row
	    "Error" table carrying the exception text is returned.
	    """
	    try:
	        criteria = {
	            "query": query.strip(),
	            "manufacturer": manufacturer.strip(),
	            "polymer_family": polymer_family.strip(),
	        }
	        matches = search_database(**criteria)
	        if not matches.empty:
	            return matches
	        return pd.DataFrame({"Info": ["No matching records found."]})
	    except Exception as exc:
	        logger.exception("DB search error")
	        return pd.DataFrame({"Error": [str(exc)]})


	def handle_db_summary() -> tuple[pd.DataFrame, str]:
	    """
	    Return every stored record together with a record-count message.

	    Produces ``(table, info_text)``. The table is the output of
	    ``get_database_summary`` or, when that is empty, a one-row "Info"
	    placeholder. On failure the error is logged and both outputs carry
	    the exception text.
	    """
	    try:
	        records = get_database_summary()
	        total = db.count()
	        info = f"📊 Database contains {total} datasheet(s)."
	        if not records.empty:
	            return records, info
	        return pd.DataFrame({"Info": ["Database is empty."]}), info
	    except Exception as exc:
	        logger.exception("DB summary error")
	        error_table = pd.DataFrame({"Error": [str(exc)]})
	        return error_table, f"❌ Error: {exc}"


	def handle_export_csv() -> str | None:
	    """
	    Export the entire database to a CSV file for download.

	    Returns the path of a newly created temporary ``.csv`` file, or
	    ``None`` when the database is empty or the export fails (the failure
	    is logged). The file is created with ``delete=False`` so it still
	    exists after this function returns and can be served to the user.
	    """
	    try:
	        df = db.get_all_dataframe()
	        if df.empty:
	            return None
	        # Write through the open handle rather than reopening the file by
	        # name: this closes the handle even if to_csv raises, and avoids
	        # opening the same file twice. newline="" is what pandas expects
	        # for text-mode handles, so row endings are not translated twice.
	        with tempfile.NamedTemporaryFile(
	            suffix=".csv",
	            delete=False,
	            mode="w",
	            encoding="utf-8",
	            newline="",
	        ) as tmp:
	            df.to_csv(tmp, index=False)
	        return tmp.name
	    except Exception:
	        # UI boundary: log the traceback and signal "no file" to the caller.
	        logger.exception("Export error")
	        return None


	# ══════════════════════════════════════════════════════════════════════════════
	# Gradio App
	# ══════════════════════════════════════════════════════════════════════════════

	def create_app() -> gr.Blocks:
	    """
	    Assemble the Gradio interface and wire it to the handler functions.

	    The layout has a header followed by four tabs: web search, PDF upload,
	    database browser and an "About" page. Returns the ``gr.Blocks``
	    instance without launching it.
	    """
	    # Static text is kept in locals so the layout code below stays readable.
	    custom_css = """
	.header { text-align: center; margin-bottom: 1em; }
	.status-box { font-size: 1.1em; font-weight: 600; padding: 0.5em; }
	"""

	    header_md = """
	# 🧪 Polymer Datasheet Crawler Agent
	Build a searchable database of commercial polymer datasheets.

	This agent uses Tavily to search the web for technical datasheets,
	LLaMA 3.1 to extract structured properties, and stores results in
	a local SQLite database.

	---
	"""

	    about_md = """
	## Architecture

	This application is built with:

	- [LangGraph](https://github.com/langchain-ai/langgraph) —
	Orchestrates the agent workflow as a directed state graph.
	- [Tavily](https://tavily.com) —
	AI-optimized web search API for finding datasheets.
	- [LLaMA 3.1](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) —
	Open-source LLM via HuggingFace Inference API for structured extraction.
	- SQLite + SQLAlchemy — Local relational database.
	- [Gradio](https://gradio.app) — Web UI, deployable on HuggingFace Spaces.

	## Workflow

	```
	User Input ──► Router ──► Web Search (Tavily) ──► LLM Parse (LLaMA 3.1) ──► Store DB ──► Output
	│ ▲
	└──► Process Upload (PDF) ─────────────────┘
	```

	## Property Categories

	The agent extracts properties across these categories:
	- General: Material name, trade name, manufacturer, grade, applications
	- Mechanical: Tensile/flexural strength, modulus, impact, hardness
	- Thermal: Tm, Tg, HDT, Vicat, CTE, thermal conductivity
	- Physical: Density, MFI, water absorption, specific gravity
	- Electrical: Dielectric strength/constant, resistivity
	- Chemical Resistance: Acid, alkali, solvent, UV resistance
	- Regulatory: FDA, RoHS, REACH, UL94

	## Data Sources

	The crawler prioritizes trusted sources including:
	MatWeb, Omnexus, UL Prospector, Campus Plastics,
	and official manufacturer portals (SABIC, BASF, DuPont, Dow, etc.)

	---
	Built for Plinity — Infinite Recyclable Polymers Project
	"""

	    def result_panel() -> list:
	        """Create the status box, properties table and JSON viewer, in that order."""
	        status_box = gr.Textbox(
	            label="Status",
	            interactive=False,
	            elem_classes=["status-box"],
	        )
	        with gr.Accordion("Extracted Properties", open=True):
	            properties_table = gr.Dataframe(
	                label="Parsed Datasheet",
	                interactive=False,
	                wrap=True,
	            )
	        with gr.Accordion("Raw JSON Output", open=False):
	            json_view = gr.Code(
	                label="JSON",
	                language="json",
	                interactive=False,
	            )
	        return [status_box, properties_table, json_view]

	    with gr.Blocks(
	        title="🧪 Polymer Datasheet Agent",
	        theme=gr.themes.Soft(),
	        css=custom_css,
	    ) as demo:

	        # ── Header ───────────────────────────────────────────────────────
	        gr.Markdown(header_md, elem_classes=["header"])

	        # ── Tab 1: Search & Add ──────────────────────────────────────────
	        with gr.Tab("🔍 Search & Add Datasheet"):
	            gr.Markdown(
	                "Enter a manufacturer and/or polymer family to search for "
	                "datasheets online and add them to the database."
	            )

	            with gr.Row():
	                maker_box = gr.Textbox(
	                    label="Manufacturer",
	                    placeholder="e.g., SABIC, BASF, DuPont",
	                    scale=2,
	                )
	                family_box = gr.Textbox(
	                    label="Polymer Family",
	                    placeholder="e.g., Polycarbonate, Nylon 6,6, PEEK",
	                    scale=2,
	                )
	                grade_box = gr.Textbox(
	                    label="Grade (optional)",
	                    placeholder="e.g., Lexan 141R, Ultramid A3K",
	                    scale=2,
	                )

	            run_search_btn = gr.Button("🔍 Search & Add", variant="primary", size="lg")
	            search_outputs = result_panel()

	            run_search_btn.click(
	                fn=handle_search,
	                inputs=[maker_box, family_box, grade_box],
	                outputs=search_outputs,
	            )

	        # ── Tab 2: Upload Datasheet ──────────────────────────────────────
	        with gr.Tab("📄 Upload Datasheet"):
	            gr.Markdown(
	                "Upload a PDF datasheet to extract properties and add to the database."
	            )

	            pdf_picker = gr.File(
	                label="Upload PDF Datasheet",
	                file_types=[".pdf"],
	                type="filepath",
	            )
	            run_upload_btn = gr.Button("📄 Parse & Add", variant="primary", size="lg")
	            upload_outputs = result_panel()

	            run_upload_btn.click(
	                fn=handle_upload,
	                inputs=[pdf_picker],
	                outputs=upload_outputs,
	            )

	        # ── Tab 3: Database Browser ──────────────────────────────────────
	        with gr.Tab("🗄️ Database Browser"):
	            gr.Markdown("Search and browse the existing datasheet database.")

	            with gr.Row():
	                query_box = gr.Textbox(
	                    label="Search query",
	                    placeholder="Free text search across all fields...",
	                    scale=3,
	                )
	                maker_filter = gr.Textbox(
	                    label="Filter: Manufacturer",
	                    placeholder="e.g., BASF",
	                    scale=2,
	                )
	                family_filter = gr.Textbox(
	                    label="Filter: Polymer Family",
	                    placeholder="e.g., Polyamide",
	                    scale=2,
	                )

	            with gr.Row():
	                find_btn = gr.Button("🔍 Search Database", variant="primary")
	                show_all_btn = gr.Button("🔄 Show All Records")
	                export_btn = gr.Button("📥 Export to CSV")

	            info_box = gr.Textbox(label="Info", interactive=False)

	            records_table = gr.Dataframe(
	                label="Database Records",
	                interactive=False,
	                wrap=True,
	            )

	            csv_download = gr.File(label="Download CSV", visible=True)

	            find_btn.click(
	                fn=handle_db_search,
	                inputs=[query_box, maker_filter, family_filter],
	                outputs=[records_table],
	            )

	            show_all_btn.click(
	                fn=handle_db_summary,
	                inputs=[],
	                outputs=[records_table, info_box],
	            )

	            export_btn.click(
	                fn=handle_export_csv,
	                inputs=[],
	                outputs=[csv_download],
	            )

	        # ── Tab 4: About / Help ──────────────────────────────────────────
	        with gr.Tab("ℹ️ About"):
	            gr.Markdown(about_md)

	    return demo


	# ══════════════════════════════════════════════════════════════════════════════
	# Main
	# ══════════════════════════════════════════════════════════════════════════════

	if __name__ == "__main__":
	    # Script entry point: build the UI and serve it on all interfaces at
	    # port 7860, without creating a public share link.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": False,
	    }
	    app = create_app()
	    app.launch(**launch_options)