Spaces:
Sleeping
Sleeping
| """ | |
| Tavily-powered web crawler for retrieving polymer datasheets. | |
| Strategy: | |
| Phase 1 — Open web search (no domain filter) for broad discovery. | |
| Phase 2 — Targeted aggregator search (MatWeb, Omnexus, UL Prospector). | |
| Phase 3 — Manufacturer-specific search on their own site. | |
| Results are de-duplicated, PDF-only URLs are deprioritised (Tavily can't | |
| read them), and content is scored by relevance before being sent to the LLM. | |
| """ | |
| from __future__ import annotations | |
| import logging | |
| import re | |
| from typing import Any | |
| from tavily import TavilyClient | |
| import config | |
| logger = logging.getLogger(__name__) | |
| # ββ Keywords that signal real datasheet content ββββββββββββββββββββββββββββββ | |
| _QUALITY_KEYWORDS = [ | |
| "tensile", "flexural", "density", "melt flow", "elongation", | |
| "modulus", "impact", "hardness", "HDT", "heat deflection", | |
| "glass transition", "melting point", "dielectric", "flammability", | |
| "ISO", "ASTM", "g/cm", "MPa", "kJ/m", "J/m", "Β°C", "shore", | |
| ] | |
| # Domains that are database aggregators (best sources for structured data) | |
| _AGGREGATOR_DOMAINS = [ | |
| "matweb.com", | |
| "omnexus.specialchem.com", | |
| "prospector.ides.com", | |
| "campusplastics.com", | |
| "plastics.ulprospector.com", | |
| "polymerdatabase.com", | |
| "matmatch.com", | |
| "materialstoday.com", | |
| ] | |
| # ────────────────────────────────────────────────────────────────────────────── | |
| # Query builders | |
| # ────────────────────────────────────────────────────────────────────────────── | |
| def _build_open_queries( | |
| manufacturer: str, polymer_family: str, grade: str, | |
| ) -> list[str]: | |
| """Phase 1: broad web queries with NO domain restriction.""" | |
# Builds keyword-style queries plus one natural-language question and returns
# them as a list of strings. Any of the three inputs may be empty: only the
# non-empty ones are joined (space-separated) into `base`.
| parts = [p for p in (manufacturer, polymer_family, grade) if p] | |
| base = " ".join(parts) | |
| queries = [] | |
| if grade: | |
| # If a specific grade is given, lead with it | |
| queries.append(f"{grade} technical data sheet material properties") | |
# NOTE(review): polymer_family is interpolated here without the emptiness
# filter applied to `parts`, so an empty family leaves a double space.
| queries.append(f"{grade} {polymer_family} datasheet density tensile") | |
| else: | |
| queries.append(f"{base} technical data sheet material properties") | |
# NOTE(review): indentation was lost in this copy of the file. Confirm against
# the original whether the append below belongs to the else branch only or
# runs for both branches (which changes how many queries a grade produces).
| queries.append( | |
| f"{base} datasheet density tensile modulus thermal" | |
| ) | |
| # A query phrased as a question often surfaces different results | |
| queries.append( | |
| f"What are the mechanical and thermal properties of {base}?" | |
| ) | |
| return queries | |
| def _build_aggregator_queries( | |
| manufacturer: str, polymer_family: str, grade: str, | |
| ) -> list[str]: | |
| """Phase 2: search restricted to well-known aggregator databases.""" | |
| parts = [p for p in (manufacturer, polymer_family, grade) if p] | |
| base = " ".join(parts) | |
| return [ | |
| f"{base} properties datasheet", | |
| ] | |
| def _build_manufacturer_queries( | |
| manufacturer: str, polymer_family: str, grade: str, | |
| ) -> list[str]: | |
| """Phase 3: search the manufacturer's own website.""" | |
| if not manufacturer: | |
| return [] | |
| domain = _guess_domain(manufacturer) | |
| parts = [p for p in (polymer_family, grade) if p] | |
| material = " ".join(parts) if parts else "polymer" | |
| return [ | |
| f"site:{domain} {material} datasheet properties", | |
| ] | |
def _guess_domain(manufacturer: str) -> str:
    """Best-effort manufacturer → domain mapping.

    The name is lower-cased and stripped of spaces and hyphens, then looked
    for as a substring of each entry in ``config.TRUSTED_DOMAINS`` (with the
    dots removed from the entry). The first matching entry wins; when none
    matches, ``<normalised name>.com`` is returned.
    """
    key = manufacturer.lower()
    for separator in (" ", "-"):
        key = key.replace(separator, "")

    known = next(
        (
            candidate
            for candidate in config.TRUSTED_DOMAINS
            if key in candidate.replace(".", "")
        ),
        None,
    )
    if known is not None:
        return known
    return f"{key}.com"
| # ────────────────────────────────────────────────────────────────────────────── | |
| # Content quality helpers | |
| # ────────────────────────────────────────────────────────────────────────────── | |
| def _is_pdf_url(url: str) -> bool: | |
| """Return True if the URL likely points directly to a PDF file.""" | |
| return bool(re.search(r"\.pdf(\?|#|$)", url, re.IGNORECASE)) | |
def _content_quality_score(text: str) -> int:
    """Count the datasheet keywords that occur in *text*.

    Each entry of ``_QUALITY_KEYWORDS`` contributes at most one point, and
    the comparison is a case-insensitive substring test. A higher score
    means the text is more likely to hold useful property data.
    """
    haystack = text.lower()
    hits = 0
    for keyword in _QUALITY_KEYWORDS:
        if keyword.lower() in haystack:
            hits += 1
    return hits
| def _pick_best_source_url(results: list[dict[str, Any]]) -> str: | |
| """Return the URL of the highest-quality non-PDF result.""" | |
| best_url, best_score = "", -1 | |
| for r in results: | |
| url = r.get("url", "") | |
| text = r.get("raw_content") or r.get("content", "") | |
| if _is_pdf_url(url): | |
| continue # Tavily rarely extracts useful text from PDFs | |
| score = _content_quality_score(text) | |
| if score > best_score: | |
| best_score = score | |
| best_url = url | |
| return best_url or (results[0].get("url", "") if results else "") | |
| # ────────────────────────────────────────────────────────────────────────────── | |
| # Main search function | |
| # ────────────────────────────────────────────────────────────────────────────── | |
def search_datasheets(
    manufacturer: str,
    polymer_family: str,
    grade: str = "",
) -> tuple[list[dict[str, Any]], str]:
    """Run the three search phases through Tavily and gather the results.

    Args:
        manufacturer: Manufacturer name, forwarded to the query builders.
        polymer_family: Polymer family, forwarded to the query builders.
        grade: Optional grade designation, forwarded to the query builders.

    Returns:
        A ``(results_list, aggregated_raw_content)`` tuple. The list holds
        one entry per distinct URL, leaving out PDF URLs that carry fewer
        than 200 characters of text. The string joins up to 8000 characters
        per source, best keyword score first, and is cut to 30,000
        characters followed by a truncation marker.

    A query that raises is logged as a warning and skipped; the remaining
    queries still run.
    """
    client = TavilyClient(api_key=config.TAVILY_API_KEY)
    collected: list[dict[str, Any]] = []
    visited: set[str] = set()
    snippets: list[str] = []

    def _search_batch(
        queries: list[str],
        include_domains: list[str] | None = None,
        max_results: int = 5,
    ) -> None:
        """Issue each query and keep results whose URL was not seen before."""
        for query in queries:
            try:
                logger.info("Searching: %s (domains=%s)", query, include_domains or "any")
                params: dict[str, Any] = {
                    "query": query,
                    "search_depth": config.TAVILY_SEARCH_DEPTH,
                    "max_results": max_results,
                    "include_raw_content": config.TAVILY_INCLUDE_RAW_CONTENT,
                }
                if include_domains:
                    params["include_domains"] = include_domains

                response = client.search(**params)
                for hit in response.get("results", []):
                    link = hit.get("url", "")
                    if link in visited:
                        continue
                    visited.add(link)

                    text = hit.get("raw_content") or hit.get("content", "")
                    # Drop direct PDF links that came back without usable
                    # text — Tavily returns nothing useful for them
                    if _is_pdf_url(link) and len(text.strip()) < 200:
                        logger.info("Skipping PDF URL with no text: %s", link)
                        continue

                    collected.append(hit)
                    if text and text.strip():
                        snippets.append(
                            f"--- Source: {link} ---\n{text[:8000]}\n"
                        )
            except Exception as exc:
                logger.warning("Search failed for query '%s': %s", query, exc)

    # Each phase: (query builder, domain allow-list, results per query).
    # Builders are called only when their phase starts.
    phases = (
        # Phase 1 — Open web (no domain filter) for broad discovery
        (_build_open_queries, None, 5),
        # Phase 2 — Aggregator databases (MatWeb, Omnexus, etc.)
        (_build_aggregator_queries, _AGGREGATOR_DOMAINS, 5),
        # Phase 3 — Manufacturer's own website (no queries without a name)
        (_build_manufacturer_queries, None, 3),
    )
    for build_queries, domains, per_query in phases:
        _search_batch(
            build_queries(manufacturer, polymer_family, grade),
            include_domains=domains,
            max_results=per_query,
        )

    # Put the highest-scoring content first for the LLM
    snippets.sort(key=_content_quality_score, reverse=True)
    aggregated = "\n".join(snippets)

    # Truncate to ~30k chars to stay within LLM context window
    char_budget = 30_000
    if len(aggregated) > char_budget:
        aggregated = aggregated[:char_budget] + "\n\n[Content truncated]"

    logger.info(
        "Collected %d unique results, %d chars of raw content",
        len(collected),
        len(aggregated),
    )
    return collected, aggregated
| # ────────────────────────────────────────────────────────────────────────────── | |
| # Single-URL extraction | |
| # ────────────────────────────────────────────────────────────────────────────── | |
def extract_from_url(url: str) -> tuple[list[dict[str, Any]], str]:
    """Fetch the content of one specific URL through Tavily extract.

    Intended for the case where the user supplies a direct datasheet link.

    Args:
        url: The page to extract.

    Returns:
        A ``(results, text)`` tuple: the "results" list from the extract
        response, and the non-empty "raw_content" of each entry (first
        15000 characters) joined by newlines. If anything raises during
        extraction, the error is logged and ``([], "")`` is returned.
    """
    client = TavilyClient(api_key=config.TAVILY_API_KEY)
    try:
        results = client.extract(urls=[url]).get("results", [])
        bodies = (item.get("raw_content", "") for item in results)
        pieces = [body[:15000] for body in bodies if body]
        return results, "\n".join(pieces)
    except Exception as exc:
        logger.error("URL extraction failed for %s: %s", url, exc)
        return [], ""