""" Tavily-powered web crawler for retrieving polymer datasheets. Strategy: Phase 1 — Open web search (no domain filter) for broad discovery. Phase 2 — Targeted aggregator search (MatWeb, Omnexus, UL Prospector). Phase 3 — Manufacturer-specific search on their own site. Results are de-duplicated, PDF-only URLs are deprioritised (Tavily can't read them), and content is scored by relevance before being sent to the LLM. """ from __future__ import annotations import logging import re from typing import Any from tavily import TavilyClient import config logger = logging.getLogger(__name__) # ── Keywords that signal real datasheet content ────────────────────────────── _QUALITY_KEYWORDS = [ "tensile", "flexural", "density", "melt flow", "elongation", "modulus", "impact", "hardness", "HDT", "heat deflection", "glass transition", "melting point", "dielectric", "flammability", "ISO", "ASTM", "g/cm", "MPa", "kJ/m", "J/m", "°C", "shore", ] # Domains that are database aggregators (best sources for structured data) _AGGREGATOR_DOMAINS = [ "matweb.com", "omnexus.specialchem.com", "prospector.ides.com", "campusplastics.com", "plastics.ulprospector.com", "polymerdatabase.com", "matmatch.com", "materialstoday.com", ] # ══════════════════════════════════════════════════════════════════════════════ # Query builders # ══════════════════════════════════════════════════════════════════════════════ def _build_open_queries( manufacturer: str, polymer_family: str, grade: str, ) -> list[str]: """Phase 1: broad web queries with NO domain restriction.""" parts = [p for p in (manufacturer, polymer_family, grade) if p] base = " ".join(parts) queries = [] if grade: # If a specific grade is given, lead with it queries.append(f"{grade} technical data sheet material properties") queries.append(f"{grade} {polymer_family} datasheet density tensile") else: queries.append(f"{base} technical data sheet material properties") queries.append( f"{base} datasheet density tensile modulus thermal" ) # A query phrased as a question often surfaces different results queries.append( f"What are the mechanical and thermal properties of {base}?" ) return queries def _build_aggregator_queries( manufacturer: str, polymer_family: str, grade: str, ) -> list[str]: """Phase 2: search restricted to well-known aggregator databases.""" parts = [p for p in (manufacturer, polymer_family, grade) if p] base = " ".join(parts) return [ f"{base} properties datasheet", ] def _build_manufacturer_queries( manufacturer: str, polymer_family: str, grade: str, ) -> list[str]: """Phase 3: search the manufacturer's own website.""" if not manufacturer: return [] domain = _guess_domain(manufacturer) parts = [p for p in (polymer_family, grade) if p] material = " ".join(parts) if parts else "polymer" return [ f"site:{domain} {material} datasheet properties", ] def _guess_domain(manufacturer: str) -> str: """Best-effort manufacturer → domain mapping.""" name = manufacturer.lower().replace(" ", "").replace("-", "") for domain in config.TRUSTED_DOMAINS: if name in domain.replace(".", ""): return domain return f"{name}.com" # ══════════════════════════════════════════════════════════════════════════════ # Content quality helpers # ══════════════════════════════════════════════════════════════════════════════ def _is_pdf_url(url: str) -> bool: """Return True if the URL likely points directly to a PDF file.""" return bool(re.search(r"\.pdf(\?|#|$)", url, re.IGNORECASE)) def _content_quality_score(text: str) -> int: """ Score how many datasheet-relevant keywords appear in the text. Higher = more likely to contain useful property data. """ lower = text.lower() return sum(1 for kw in _QUALITY_KEYWORDS if kw.lower() in lower) def _pick_best_source_url(results: list[dict[str, Any]]) -> str: """Return the URL of the highest-quality non-PDF result.""" best_url, best_score = "", -1 for r in results: url = r.get("url", "") text = r.get("raw_content") or r.get("content", "") if _is_pdf_url(url): continue # Tavily rarely extracts useful text from PDFs score = _content_quality_score(text) if score > best_score: best_score = score best_url = url return best_url or (results[0].get("url", "") if results else "") # ══════════════════════════════════════════════════════════════════════════════ # Main search function # ══════════════════════════════════════════════════════════════════════════════ def search_datasheets( manufacturer: str, polymer_family: str, grade: str = "", ) -> tuple[list[dict[str, Any]], str]: """ Execute a multi-phase Tavily search and return (results_list, aggregated_raw_content). """ client = TavilyClient(api_key=config.TAVILY_API_KEY) all_results: list[dict[str, Any]] = [] seen_urls: set[str] = set() raw_texts: list[str] = [] def _run_queries( queries: list[str], include_domains: list[str] | None = None, max_results: int = 5, ) -> None: """Run a batch of queries and collect unique results.""" for query in queries: try: logger.info("Searching: %s (domains=%s)", query, include_domains or "any") kwargs: dict[str, Any] = dict( query=query, search_depth=config.TAVILY_SEARCH_DEPTH, max_results=max_results, include_raw_content=config.TAVILY_INCLUDE_RAW_CONTENT, ) if include_domains: kwargs["include_domains"] = include_domains response = client.search(**kwargs) for result in response.get("results", []): url = result.get("url", "") if url in seen_urls: continue seen_urls.add(url) # Skip direct PDF links — Tavily returns no useful text if _is_pdf_url(url): content = result.get("raw_content") or result.get("content", "") if len(content.strip()) < 200: logger.info("Skipping PDF URL with no text: %s", url) continue all_results.append(result) raw = result.get("raw_content") or result.get("content", "") if raw and raw.strip(): raw_texts.append( f"--- Source: {url} ---\n{raw[:8000]}\n" ) except Exception as exc: logger.warning("Search failed for query '%s': %s", query, exc) # Phase 1 — Open web (no domain filter) for broad discovery open_queries = _build_open_queries(manufacturer, polymer_family, grade) _run_queries(open_queries, include_domains=None, max_results=5) # Phase 2 — Aggregator databases (MatWeb, Omnexus, etc.) agg_queries = _build_aggregator_queries(manufacturer, polymer_family, grade) _run_queries(agg_queries, include_domains=_AGGREGATOR_DOMAINS, max_results=5) # Phase 3 — Manufacturer's own website mfr_queries = _build_manufacturer_queries(manufacturer, polymer_family, grade) if mfr_queries: _run_queries(mfr_queries, include_domains=None, max_results=3) # Sort raw_texts so highest-quality content comes first for the LLM raw_texts.sort( key=lambda t: _content_quality_score(t), reverse=True, ) aggregated = "\n".join(raw_texts) # Truncate to ~30k chars to stay within LLM context window if len(aggregated) > 30_000: aggregated = aggregated[:30_000] + "\n\n[Content truncated]" logger.info( "Collected %d unique results, %d chars of raw content", len(all_results), len(aggregated), ) return all_results, aggregated # ══════════════════════════════════════════════════════════════════════════════ # Single-URL extraction # ══════════════════════════════════════════════════════════════════════════════ def extract_from_url(url: str) -> tuple[list[dict[str, Any]], str]: """ Use Tavily extract to get content from a specific URL. Useful when the user provides a direct datasheet link. """ client = TavilyClient(api_key=config.TAVILY_API_KEY) try: response = client.extract(urls=[url]) results = response.get("results", []) raw_texts = [] for r in results: raw = r.get("raw_content", "") if raw: raw_texts.append(raw[:15000]) return results, "\n".join(raw_texts) except Exception as exc: logger.error("URL extraction failed for %s: %s", url, exc) return [], ""