Charles Azam committed on
Commit 4a7db54 · 1 Parent(s): f1368c4

feat: remove unnecessary file

Files changed (1)
  1. src/deepengineer/webcrawler/utils.py +0 -374
src/deepengineer/webcrawler/utils.py DELETED
@@ -1,374 +0,0 @@
-
-
-
-def get_config_value(value):
-    """
-    Helper function to handle string, dict, and enum cases of configuration values
-    """
-    if isinstance(value, str):
-        return value
-    elif isinstance(value, dict):
-        return value
-    else:
-        return value.value
-
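
# Usage sketch (illustrative; the SearchAPI enum below is hypothetical and
# only demonstrates the three accepted shapes):
from enum import Enum

class SearchAPI(Enum):
    TAVILY = "tavily"

assert get_config_value("tavily") == "tavily"            # str passes through
assert get_config_value({"depth": 2}) == {"depth": 2}    # dict passes through
assert get_config_value(SearchAPI.TAVILY) == "tavily"    # enum unwraps via .value
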
-def get_search_params(search_api: str, search_api_config: Optional[Dict[str, Any]]) -> Dict[str, Any]:
-    """
-    Filters the search_api_config dictionary to include only parameters accepted by the specified search API.
-
-    Args:
-        search_api (str): The search API identifier (e.g., "exa", "tavily").
-        search_api_config (Optional[Dict[str, Any]]): The configuration dictionary for the search API.
-
-    Returns:
-        Dict[str, Any]: A dictionary of parameters to pass to the search function.
-    """
-    # Define accepted parameters for each search API
-    SEARCH_API_PARAMS = {
-        "exa": ["max_characters", "num_results", "include_domains", "exclude_domains", "subpages"],
-        "tavily": ["max_results", "topic"],
-        "perplexity": [],  # Perplexity accepts no additional parameters
-        "arxiv": ["load_max_docs", "get_full_documents", "load_all_available_meta"],
-        "pubmed": ["top_k_results", "email", "api_key", "doc_content_chars_max"],
-        "linkup": ["depth"],
-        "googlesearch": ["max_results"],
-    }
-
-    # Get the list of accepted parameters for the given search API
-    accepted_params = SEARCH_API_PARAMS.get(search_api, [])
-
-    # If no config provided, return an empty dict
-    if not search_api_config:
-        return {}
-
-    # Filter the config to only include accepted parameters
-    return {k: v for k, v in search_api_config.items() if k in accepted_params}
-
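
# Usage sketch: keys not in the accepted list for the chosen API are dropped,
# so "subpages" (an Exa-only parameter) does not survive for Tavily.
params = get_search_params("tavily", {"max_results": 3, "topic": "news", "subpages": 2})
assert params == {"max_results": 3, "topic": "news"}
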
-def deduplicate_and_format_sources(
-    search_responses: SearchResponses,
-    config: Optional[DeduplicationConfig] = None
-) -> str:
-    """
-    Takes a list of search responses and formats them into a readable string.
-    Limits the raw_content to approximately max_tokens_per_source tokens.
-
-    Args:
-        search_responses: List of search responses
-        config: Configuration for deduplication and formatting
-
-    Returns:
-        str: Formatted string with deduplicated sources
-    """
-    if config is None:
-        config = DeduplicationConfig()
-
-    # Collect all results
-    sources_list: List[SearchResult] = []
-    for response in search_responses:
-        sources_list.extend(response.results)
-
-    # Deduplicate by URL
-    if config.deduplication_strategy == "keep_first":
-        unique_sources: Dict[str, SearchResult] = {}
-        for source in sources_list:
-            if source.url not in unique_sources:
-                unique_sources[source.url] = source
-    elif config.deduplication_strategy == "keep_last":
-        unique_sources = {source.url: source for source in sources_list}
-    else:
-        raise ValueError(f"Invalid deduplication strategy: {config.deduplication_strategy}")
-
-    # Format output
-    formatted_text = "Content from sources:\n"
-    for i, source in enumerate(unique_sources.values(), 1):
-        formatted_text += f"{'='*80}\n"  # Clear section separator
-        formatted_text += f"Source: {source.title}\n"
-        formatted_text += f"{'-'*80}\n"  # Subsection separator
-        formatted_text += f"URL: {source.url}\n===\n"
-        formatted_text += f"Most relevant content from source: {source.content}\n===\n"
-        if config.include_raw_content:
-            # Using rough estimate of 4 characters per token
-            char_limit = config.max_tokens_per_source * 4
-            # Handle None raw_content
-            raw_content = source.raw_content or ''
-            if len(raw_content) > char_limit:
-                raw_content = raw_content[:char_limit] + "... [truncated]"
-            formatted_text += f"Full source content limited to {config.max_tokens_per_source} tokens: {raw_content}\n\n"
-        formatted_text += f"{'='*80}\n\n"  # End section separator
-
-    return formatted_text.strip()
-
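
# Sketch of the two strategies on a duplicated URL; the SearchResult fields
# come from this module, but the SearchResponse constructor is assumed.
hit_v1 = SearchResult(url="https://example.com", title="v1", content="first", raw_content=None)
hit_v2 = SearchResult(url="https://example.com", title="v2", content="second", raw_content=None)
responses = [SearchResponse(query="q", results=[hit_v1, hit_v2])]  # SearchResponse shape assumed

deduplicate_and_format_sources(responses, DeduplicationConfig(deduplication_strategy="keep_first"))  # keeps v1
deduplicate_and_format_sources(responses, DeduplicationConfig(deduplication_strategy="keep_last"))   # keeps v2
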
-def format_sections(sections: list[Section]) -> str:
-    """ Format a list of sections into a string """
-    formatted_str = ""
-    for idx, section in enumerate(sections, 1):
-        formatted_str += f"""
-{'='*60}
-Section {idx}: {section.name}
-{'='*60}
-Description:
-{section.description}
-Requires Research:
-{section.research}
-
-Content:
-{section.content if section.content else '[Not yet written]'}
-
-"""
-    return formatted_str
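
# Usage sketch, assuming Section exposes name/description/research/content
# fields (constructor shape assumed):
outline = [Section(name="Background", description="Prior art survey", research=True, content="")]
print(format_sections(outline))  # empty content renders as "[Not yet written]"
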
-    search_queries: SearchQueries,
-    params: Optional[PubMedSearchParams] = None
-
-
-TAVILY_SEARCH_DESCRIPTION = (
-    "A search engine optimized for comprehensive, accurate, and trusted results. "
-    "Useful for when you need to answer questions about current events."
-)
-
-@tool(description=TAVILY_SEARCH_DESCRIPTION)
-async def tavily_search(
-    queries: SearchQueries,
-    max_results: Annotated[int, InjectedToolArg] = 5,
-    topic: Annotated[Literal["general", "news", "finance"], InjectedToolArg] = "general",
-    config: RunnableConfig = None
-) -> str:
-    """
-    Fetches results from Tavily search API.
-
-    Args:
-        queries: List of search queries
-        max_results: Maximum number of results to return
-        topic: Topic to filter results by
-
-    Returns:
-        str: A formatted string of search results
-    """
-    # Use tavily_search_async with include_raw_content=True to get content directly
-    params = TavilySearchParams(max_results=max_results, topic=topic, include_raw_content=True)
-    search_results = await tavily_search_async(queries, params)
-
-    # Format the search results directly using the raw_content already provided
-    formatted_output = f"Search results: \n\n"
-
-    # Deduplicate results by URL
-    unique_results: Dict[str, Dict[str, Any]] = {}
-    for response in search_results:
-        for result in response.results:
-            url = result.url
-            if url not in unique_results:
-                unique_results[url] = {
-                    "title": result.title,
-                    "url": result.url,
-                    "content": result.content,
-                    "raw_content": result.raw_content,
-                    "query": response.query
-                }
-
-    async def noop():
-        return None
-
-    configurable = Configuration.from_runnable_config(config)
-    max_char_to_include = 30_000
-    # TODO: share this behavior across all search implementations / tools
-    if configurable.process_search_results == "summarize":
-        if configurable.summarization_model_provider == "anthropic":
-            extra_kwargs = {"betas": ["extended-cache-ttl-2025-04-11"]}
-        else:
-            extra_kwargs = {}
-
-        summarization_model = init_chat_model(
-            model=configurable.summarization_model,
-            model_provider=configurable.summarization_model_provider,
-            max_retries=configurable.max_structured_output_retries,
-            **extra_kwargs
-        )
-        summarization_tasks = [
-            noop() if not result.get("raw_content") else summarize_webpage(summarization_model, result['raw_content'][:max_char_to_include])
-            for result in unique_results.values()
-        ]
-        summaries = await asyncio.gather(*summarization_tasks)
-        unique_results = {
-            url: {'title': result['title'], 'content': result['content'] if summary is None else summary}
-            for url, result, summary in zip(unique_results.keys(), unique_results.values(), summaries)
-        }
-    elif configurable.process_search_results == "split_and_rerank":
-        embeddings = init_embeddings("openai:text-embedding-3-small")
-        results_by_query = itertools.groupby(unique_results.values(), key=lambda x: x['query'])
-        all_retrieved_docs = []
-        for query, query_results in results_by_query:
-            retrieved_docs = split_and_rerank_search_results(embeddings, query, query_results)
-            all_retrieved_docs.extend(retrieved_docs)
-
-        stitched_docs = stitch_documents_by_url(all_retrieved_docs)
-        unique_results = {
-            doc.metadata['url']: {'title': doc.metadata['title'], 'content': doc.page_content}
-            for doc in stitched_docs
-        }
-
-    # Format the unique results
-    for i, (url, result) in enumerate(unique_results.items()):
-        formatted_output += f"\n\n--- SOURCE {i+1}: {result['title']} ---\n"
-        formatted_output += f"URL: {url}\n\n"
-        formatted_output += f"SUMMARY:\n{result['content']}\n\n"
-        if result.get('raw_content'):
-            formatted_output += f"FULL CONTENT:\n{result['raw_content'][:max_char_to_include]}"  # Limit content size
-        formatted_output += "\n\n" + "-" * 80 + "\n"
-
-    if unique_results:
-        return formatted_output
-    else:
-        return "No valid search results found. Please try different search queries or use a different search API."
-
-
-
-async def select_and_execute_search(search_api: str, query_list: SearchQueries, params_to_pass: dict) -> str:
-    """Select and execute the appropriate search API.
-
-    Args:
-        search_api: Name of the search API to use
-        query_list: List of search queries to execute
-        params_to_pass: Parameters to pass to the search API
-
-    Returns:
-        Formatted string containing search results
-
-    Raises:
-        ValueError: If an unsupported search API is specified
-    """
-    if search_api == "tavily":
-        # Tavily search tool used with both workflow and agent
-        # and returns a formatted source string
-        return await tavily_search.ainvoke({'queries': query_list, **params_to_pass})
-    elif search_api == "duckduckgo":
-        # DuckDuckGo search tool used with both workflow and agent
-        return await duckduckgo_search.ainvoke({'search_queries': query_list})
-    elif search_api == "perplexity":
-        search_results = perplexity_search(query_list)
-    elif search_api == "exa":
-        params = ExaSearchParams(**params_to_pass) if params_to_pass else None
-        search_results = await exa_search(query_list, params)
-    elif search_api == "arxiv":
-        params = ArxivSearchParams(**params_to_pass) if params_to_pass else None
-        search_results = await arxiv_search_async(query_list, params)
-    elif search_api == "pubmed":
-        params = PubMedSearchParams(**params_to_pass) if params_to_pass else None
-        search_results = await pubmed_search_async(query_list, params)
-    elif search_api == "linkup":
-        params = LinkupSearchParams(**params_to_pass) if params_to_pass else None
-        search_results = await linkup_search(query_list, params)
-    elif search_api == "googlesearch":
-        params = GoogleSearchParams(**params_to_pass) if params_to_pass else None
-        search_results = await google_search_async(query_list, params)
-    elif search_api == "azureaisearch":
-        params = AzureAISearchParams(**params_to_pass) if params_to_pass else None
-        search_results = await azureaisearch_search_async(query_list, params)
-    else:
-        raise ValueError(f"Unsupported search API: {search_api}")
-
-    config = DeduplicationConfig(max_tokens_per_source=4000, deduplication_strategy="keep_first")
-    return deduplicate_and_format_sources(search_results, config)
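
# Dispatch sketch (inside an async context): pairing with get_search_params
# ensures each backend only receives parameters it accepts. Queries are
# illustrative; "max_results" is dropped because arxiv does not accept it.
queries = ["fatigue behaviour of additively manufactured Ti-6Al-4V"]
params = get_search_params("arxiv", {"load_max_docs": 5, "max_results": 3})
sources = await select_and_execute_search("arxiv", queries, params)
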
-
-
-
-
-
-async def summarize_webpage(model: BaseChatModel, webpage_content: str) -> str:
-    """Summarize webpage content."""
-    try:
-        user_input_content = "Please summarize the article"
-        if isinstance(model, ChatAnthropic):
-            user_input_content = [{
-                "type": "text",
-                "text": user_input_content,
-                "cache_control": {"type": "ephemeral", "ttl": "1h"}
-            }]
-
-        summary = await model.with_structured_output(Summary).with_retry(stop_after_attempt=2).ainvoke([
-            {"role": "system", "content": SUMMARIZATION_PROMPT.format(webpage_content=webpage_content)},
-            {"role": "user", "content": user_input_content},
-        ])
-    except Exception:
-        # fall back on the raw content
-        return webpage_content
-
-    def format_summary(summary: Summary):
-        excerpts_str = "\n".join(f'- {e}' for e in summary.key_excerpts)
-        return f"""<summary>\n{summary.summary}\n</summary>\n\n<key_excerpts>\n{excerpts_str}\n</key_excerpts>"""
-
-    return format_summary(summary)
-
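
# Usage sketch (inside an async context): any chat model with structured-output
# support works; the model name is illustrative. On any failure the raw
# webpage_content comes back unchanged.
from langchain.chat_models import init_chat_model

model = init_chat_model(model="claude-3-5-haiku-latest", model_provider="anthropic")
summary_or_raw = await summarize_webpage(model, page_text)  # page_text assumed in scope
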
-
-def split_and_rerank_search_results(
-    embeddings: Embeddings,
-    query: str,
-    search_results: List[SearchResult],
-    config: Optional[SplitAndRerankConfig] = None
-):
-    """Split and rerank search results using embeddings."""
-    if config is None:
-        config = SplitAndRerankConfig()
-
-    # split webpage content into chunks
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=config.chunk_size,
-        chunk_overlap=config.chunk_overlap,
-        add_start_index=True
-    )
-    documents = [
-        Document(
-            page_content=result.raw_content or result.content,
-            metadata={"url": result.url, "title": result.title}
-        )
-        for result in search_results
-    ]
-    all_splits = text_splitter.split_documents(documents)
-
-    # index chunks
-    vector_store = InMemoryVectorStore(embeddings)
-    vector_store.add_documents(documents=all_splits)
-
-    # retrieve relevant chunks
-    retrieved_docs = vector_store.similarity_search(query, k=config.max_chunks)
-    return retrieved_docs
-
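
# Usage sketch: chunk all result pages, embed them into an in-memory store,
# and keep the top-k chunks for the query. The embedding spec is the one this
# module already uses; query and results are illustrative.
from langchain.embeddings import init_embeddings

embeddings = init_embeddings("openai:text-embedding-3-small")
top_chunks = split_and_rerank_search_results(embeddings, "regenerative nozzle cooling", results)
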
-
-def stitch_documents_by_url(documents: list[Document]) -> list[Document]:
-    url_to_docs: defaultdict[str, list[Document]] = defaultdict(list)
-    url_to_snippet_hashes: defaultdict[str, set[str]] = defaultdict(set)
-    for doc in documents:
-        snippet_hash = hashlib.sha256(doc.page_content.encode()).hexdigest()
-        url = doc.metadata['url']
-        # deduplicate snippets by the content
-        if snippet_hash in url_to_snippet_hashes[url]:
-            continue
-
-        url_to_docs[url].append(doc)
-        url_to_snippet_hashes[url].add(snippet_hash)
-
-    # stitch retrieved chunks into a single doc per URL
-    stitched_docs = []
-    for docs in url_to_docs.values():
-        stitched_doc = Document(
-            page_content="\n\n".join([f"...{doc.page_content}..." for doc in docs]),
-            metadata=cast(Document, docs[0]).metadata
-        )
-        stitched_docs.append(stitched_doc)
-
-    return stitched_docs
-
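
# Sketch: byte-identical chunks for a URL are dropped, the rest are stitched
# into one Document per URL (contents illustrative).
from langchain_core.documents import Document

chunks = [
    Document(page_content="regenerative cooling", metadata={"url": "https://example.com", "title": "t"}),
    Document(page_content="regenerative cooling", metadata={"url": "https://example.com", "title": "t"}),  # duplicate, dropped
    Document(page_content="film cooling", metadata={"url": "https://example.com", "title": "t"}),
]
assert len(stitch_documents_by_url(chunks)) == 1  # one stitched doc for the URL
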
-
-def get_today_str() -> str:
-    """Get current date in a human-readable format."""
-    return datetime.datetime.now().strftime("%a %b %-d, %Y")
-
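
# Portability sketch: "%-d" (unpadded day) is a glibc/BSD strftime extension
# and fails on Windows; this hypothetical variant strips the padding manually.
import datetime

def get_today_str_portable() -> str:
    now = datetime.datetime.now()
    return now.strftime("%a %b {day}, %Y").format(day=now.day)
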
-
-async def load_mcp_server_config(path: str) -> dict:
-    """Load MCP server configuration from a file."""
-
-    def _load():
-        with open(path, "r") as f:
-            config = json.load(f)
-        return config
-
-    config = await asyncio.to_thread(_load)
-    return config
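
# Usage sketch (inside an async context): asyncio.to_thread keeps the event
# loop unblocked during file I/O; the path is illustrative.
servers = await load_mcp_server_config("mcp_servers.json")
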