polymer-datasheet-agent / web_crawler.py
ravimohan19's picture
Upload web_crawler.py with huggingface_hub
0056bc7 verified
"""
Tavily-powered web crawler for retrieving polymer datasheets.
Strategy:
Phase 1 — Open web search (no domain filter) for broad discovery.
Phase 2 — Targeted aggregator search (MatWeb, Omnexus, UL Prospector).
Phase 3 — Manufacturer-specific search on their own site.
Results are de-duplicated, PDF-only URLs are deprioritised (Tavily can't
read them), and content is scored by relevance before being sent to the LLM.
"""
from __future__ import annotations
import logging
import re
from typing import Any
from tavily import TavilyClient
import config
logger = logging.getLogger(__name__)
# ── Keywords that signal real datasheet content ──────────────────────────────
_QUALITY_KEYWORDS = [
"tensile", "flexural", "density", "melt flow", "elongation",
"modulus", "impact", "hardness", "HDT", "heat deflection",
"glass transition", "melting point", "dielectric", "flammability",
"ISO", "ASTM", "g/cm", "MPa", "kJ/m", "J/m", "Β°C", "shore",
]
# Domains that are database aggregators (best sources for structured data)
_AGGREGATOR_DOMAINS = [
"matweb.com",
"omnexus.specialchem.com",
"prospector.ides.com",
"campusplastics.com",
"plastics.ulprospector.com",
"polymerdatabase.com",
"matmatch.com",
"materialstoday.com",
]
# ══════════════════════════════════════════════════════════════════════════════
# Query builders
# ══════════════════════════════════════════════════════════════════════════════
def _build_open_queries(
manufacturer: str, polymer_family: str, grade: str,
) -> list[str]:
"""Phase 1: broad web queries with NO domain restriction."""
parts = [p for p in (manufacturer, polymer_family, grade) if p]
base = " ".join(parts)
queries = []
if grade:
# If a specific grade is given, lead with it
queries.append(f"{grade} technical data sheet material properties")
queries.append(f"{grade} {polymer_family} datasheet density tensile")
else:
queries.append(f"{base} technical data sheet material properties")
queries.append(
f"{base} datasheet density tensile modulus thermal"
)
# A query phrased as a question often surfaces different results
queries.append(
f"What are the mechanical and thermal properties of {base}?"
)
return queries
def _build_aggregator_queries(
manufacturer: str, polymer_family: str, grade: str,
) -> list[str]:
"""Phase 2: search restricted to well-known aggregator databases."""
parts = [p for p in (manufacturer, polymer_family, grade) if p]
base = " ".join(parts)
return [
f"{base} properties datasheet",
]
def _build_manufacturer_queries(
manufacturer: str, polymer_family: str, grade: str,
) -> list[str]:
"""Phase 3: search the manufacturer's own website."""
if not manufacturer:
return []
domain = _guess_domain(manufacturer)
parts = [p for p in (polymer_family, grade) if p]
material = " ".join(parts) if parts else "polymer"
return [
f"site:{domain} {material} datasheet properties",
]
def _guess_domain(manufacturer: str) -> str:
"""Best-effort manufacturer β†’ domain mapping."""
name = manufacturer.lower().replace(" ", "").replace("-", "")
for domain in config.TRUSTED_DOMAINS:
if name in domain.replace(".", ""):
return domain
return f"{name}.com"
# ══════════════════════════════════════════════════════════════════════════════
# Content quality helpers
# ══════════════════════════════════════════════════════════════════════════════
def _is_pdf_url(url: str) -> bool:
"""Return True if the URL likely points directly to a PDF file."""
return bool(re.search(r"\.pdf(\?|#|$)", url, re.IGNORECASE))
def _content_quality_score(text: str) -> int:
    """Count how many distinct quality keywords occur in *text*.

    Matching is a case-insensitive substring test against each entry of
    ``_QUALITY_KEYWORDS``. Every keyword adds at most one point, however
    often it appears, so a higher score means more kinds of property data
    are mentioned.
    """
    haystack = text.lower()
    hits = 0
    for keyword in _QUALITY_KEYWORDS:
        if keyword.lower() in haystack:
            hits += 1
    return hits
def _pick_best_source_url(results: list[dict[str, Any]]) -> str:
"""Return the URL of the highest-quality non-PDF result."""
best_url, best_score = "", -1
for r in results:
url = r.get("url", "")
text = r.get("raw_content") or r.get("content", "")
if _is_pdf_url(url):
continue # Tavily rarely extracts useful text from PDFs
score = _content_quality_score(text)
if score > best_score:
best_score = score
best_url = url
return best_url or (results[0].get("url", "") if results else "")
# ══════════════════════════════════════════════════════════════════════════════
# Main search function
# ══════════════════════════════════════════════════════════════════════════════
def search_datasheets(
    manufacturer: str,
    polymer_family: str,
    grade: str = "",
) -> tuple[list[dict[str, Any]], str]:
    """
    Execute a multi-phase Tavily search and return
    (results_list, aggregated_raw_content).

    Phases, in order: open web queries, queries restricted to
    ``_AGGREGATOR_DOMAINS``, then a ``site:``-scoped manufacturer query.
    Results are de-duplicated by URL across all phases. A failing query is
    logged and skipped; it does not abort the search.

    Args:
        manufacturer: Manufacturer name; may be empty (skips Phase 3).
        polymer_family: Polymer family name; may be empty.
        grade: Optional specific grade name.

    Returns:
        A tuple of (kept result dicts, text blob). The blob concatenates
        each kept result's content (capped at 8000 chars per source) under
        a "--- Source: <url> ---" header, ordered by descending keyword
        score and truncated to roughly 30k characters overall.
    """
    client = TavilyClient(api_key=config.TAVILY_API_KEY)
    all_results: list[dict[str, Any]] = []
    seen_urls: set[str] = set()
    raw_texts: list[str] = []

    def _run_queries(
        queries: list[str],
        include_domains: list[str] | None = None,
        max_results: int = 5,
    ) -> None:
        """Run a batch of queries and collect unique results."""
        for query in queries:
            try:
                logger.info("Searching: %s (domains=%s)", query, include_domains or "any")
                kwargs: dict[str, Any] = dict(
                    query=query,
                    search_depth=config.TAVILY_SEARCH_DEPTH,
                    max_results=max_results,
                    include_raw_content=config.TAVILY_INCLUDE_RAW_CONTENT,
                )
                if include_domains:
                    kwargs["include_domains"] = include_domains
                response = client.search(**kwargs)
                for result in response.get("results", []):
                    url = result.get("url") or ""
                    if url in seen_urls:
                        continue
                    seen_urls.add(url)
                    # Trailing `or ""` covers both fields being None, which
                    # would otherwise raise on .strip() and make the broad
                    # except below drop the rest of this query's results.
                    text = result.get("raw_content") or result.get("content") or ""
                    # Drop direct PDF links only when they carry next to no
                    # extracted text; PDFs with real content are kept.
                    if _is_pdf_url(url) and len(text.strip()) < 200:
                        logger.info("Skipping PDF URL with no text: %s", url)
                        continue
                    all_results.append(result)
                    if text.strip():
                        raw_texts.append(
                            f"--- Source: {url} ---\n{text[:8000]}\n"
                        )
            except Exception as exc:
                # Best-effort: one failed query must not abort the search.
                logger.warning("Search failed for query '%s': %s", query, exc)

    # Phase 1 — Open web (no domain filter) for broad discovery
    open_queries = _build_open_queries(manufacturer, polymer_family, grade)
    _run_queries(open_queries, include_domains=None, max_results=5)

    # Phase 2 — Aggregator databases (MatWeb, Omnexus, etc.)
    agg_queries = _build_aggregator_queries(manufacturer, polymer_family, grade)
    _run_queries(agg_queries, include_domains=_AGGREGATOR_DOMAINS, max_results=5)

    # Phase 3 — Manufacturer's own website (scoped via "site:" in the query)
    mfr_queries = _build_manufacturer_queries(manufacturer, polymer_family, grade)
    if mfr_queries:
        _run_queries(mfr_queries, include_domains=None, max_results=3)

    # Sort raw_texts so highest-quality content comes first for the LLM.
    # The sort is stable, so equally-scored sources keep discovery order.
    raw_texts.sort(key=_content_quality_score, reverse=True)
    aggregated = "\n".join(raw_texts)

    # Truncate to ~30k chars to stay within LLM context window
    if len(aggregated) > 30_000:
        aggregated = aggregated[:30_000] + "\n\n[Content truncated]"

    logger.info(
        "Collected %d unique results, %d chars of raw content",
        len(all_results),
        len(aggregated),
    )
    return all_results, aggregated
# ══════════════════════════════════════════════════════════════════════════════
# Single-URL extraction
# ══════════════════════════════════════════════════════════════════════════════
def extract_from_url(url: str) -> tuple[list[dict[str, Any]], str]:
    """
    Fetch the content of one specific URL through Tavily's extract call.

    Intended for the case where the user supplies a direct datasheet link.
    Returns (results, text): the result entries from the response, and
    their non-empty ``raw_content`` values (each capped at 15000 chars)
    joined by newlines. If extraction raises, the error is logged and
    ([], "") is returned.
    """
    tavily = TavilyClient(api_key=config.TAVILY_API_KEY)
    try:
        payload = tavily.extract(urls=[url])
        pages = payload.get("results", [])
        bodies = (page.get("raw_content", "") for page in pages)
        snippets = [body[:15000] for body in bodies if body]
        return pages, "\n".join(snippets)
    except Exception as exc:
        logger.error("URL extraction failed for %s: %s", url, exc)
        return [], ""