ravimohan19 committed on
Commit
d50b7b6
·
verified ·
1 Parent(s): 253b014

Upload web_crawler.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. web_crawler.py +140 -0
web_crawler.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tavily-powered web crawler for retrieving polymer datasheets.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import logging
8
+ from typing import Any
9
+
10
+ from tavily import TavilyClient
11
+
12
+ import config
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
def build_search_queries(
    manufacturer: str,
    polymer_family: str,
    grade: str = "",
) -> list[str]:
    """
    Build the list of search queries used to locate a technical datasheet.

    The non-empty arguments are joined with single spaces into a common
    prefix, which is combined with three fixed keyword suffixes. When a
    manufacturer is given, a ``site:``-scoped query against the domain
    returned by ``_guess_domain`` is added. A final query naming the
    aggregator sites is always appended last.
    """
    # Only the arguments that were actually supplied contribute to the prefix.
    subject = " ".join(
        term for term in (manufacturer, polymer_family, grade) if term
    )

    keyword_suffixes = (
        "technical data sheet properties",
        "TDS material properties datasheet",
        "mechanical thermal physical properties",
    )
    search_queries = [f"{subject} {suffix}" for suffix in keyword_suffixes]

    # Manufacturer-specific datasheet portal query.
    if manufacturer:
        site = _guess_domain(manufacturer)
        search_queries.append(
            f"site:{site} {polymer_family} {grade} datasheet"
        )

    # Aggregator query.
    search_queries.append(
        f"{subject} datasheet matweb OR omnexus OR UL Prospector"
    )

    return search_queries
52
+
53
+
54
def _guess_domain(manufacturer: str) -> str:
    """
    Guess a domain for a site-scoped search on the given manufacturer.

    The manufacturer name is lower-cased and its spaces removed. The first
    entry of ``config.TRUSTED_DOMAINS`` containing that string as a
    substring is returned; if none matches, ``"<name>.com"`` is returned.
    """
    compact = manufacturer.lower().replace(" ", "")
    # next() stops at the first match, mirroring an early return from a loop.
    matched = next(
        (candidate for candidate in config.TRUSTED_DOMAINS if compact in candidate),
        None,
    )
    if matched is not None:
        return matched
    return f"{compact}.com"
61
+
62
+
63
def search_datasheets(
    manufacturer: str,
    polymer_family: str,
    grade: str = "",
) -> tuple[list[dict[str, Any]], str]:
    """
    Run every generated query through Tavily and collect the results.

    Returns a ``(results, aggregated_text)`` tuple. ``results`` holds each
    result dict once per distinct ``url`` value, in the order first seen.
    ``aggregated_text`` concatenates a source header plus up to 8000
    characters of each result's ``raw_content`` (or ``content`` when raw
    content is missing or empty), and is cut at 30,000 characters with a
    trailing truncation marker.

    A failure while running or processing one query is logged as a warning
    and the remaining queries still run.
    """
    tavily = TavilyClient(api_key=config.TAVILY_API_KEY)

    unique_hits: list[dict[str, Any]] = []
    visited: set[str] = set()
    snippets: list[str] = []

    for search_query in build_search_queries(manufacturer, polymer_family, grade):
        try:
            logger.info("Searching: %s", search_query)
            payload = tavily.search(
                query=search_query,
                search_depth=config.TAVILY_SEARCH_DEPTH,
                max_results=config.TAVILY_MAX_RESULTS,
                include_raw_content=config.TAVILY_INCLUDE_RAW_CONTENT,
                include_domains=config.TRUSTED_DOMAINS,
            )

            for hit in payload.get("results", []):
                link = hit.get("url", "")
                # The same page can come back from several queries; keep it once.
                if link in visited:
                    continue
                visited.add(link)
                unique_hits.append(hit)

                # Prefer the full page text, falling back to the short snippet.
                text = hit.get("raw_content") or hit.get("content", "")
                if not text:
                    continue
                snippets.append(f"--- Source: {link} ---\n{text[:8000]}\n")

        except Exception as exc:
            # Best effort: one failing query must not abort the others.
            logger.warning("Search failed for query '%s': %s", search_query, exc)

    combined = "\n".join(snippets)

    # Cap at ~30k chars to stay within the LLM context window.
    if len(combined) > 30_000:
        combined = combined[:30_000] + "\n\n[Content truncated]"

    logger.info(
        "Collected %d unique results, %d chars of raw content",
        len(unique_hits),
        len(combined),
    )

    return unique_hits, combined
120
+
121
+
122
def extract_from_url(url: str) -> tuple[list[dict[str, Any]], str]:
    """
    Fetch the content of a single URL through Tavily's extract call.

    Intended for the case where the caller already has a direct datasheet
    link. Returns ``(results, text)``: ``results`` is the ``"results"`` list
    from the response, and ``text`` joins up to 15000 characters of each
    result's non-empty ``raw_content`` with newlines. Any exception raised
    during extraction is logged as an error and ``([], "")`` is returned.
    """
    tavily = TavilyClient(api_key=config.TAVILY_API_KEY)

    try:
        extracted = tavily.extract(urls=[url]).get("results", [])
        page_texts = (entry.get("raw_content", "") for entry in extracted)
        # Skip entries without content and cap each page at 15000 characters.
        joined = "\n".join(text[:15000] for text in page_texts if text)
        return extracted, joined
    except Exception as exc:
        logger.error("URL extraction failed for %s: %s", url, exc)
        return [], ""