Aakash jammula committed on
Commit
5e6bfdb
·
1 Parent(s): ed250d7

:deep research

Browse files
Files changed (1) hide show
  1. deep_research.py +22 -52
deep_research.py CHANGED
@@ -10,7 +10,11 @@ from langchain_google_genai import ChatGoogleGenerativeAI
10
  from langchain_core.messages import HumanMessage, SystemMessage
11
  from typing import Any, List ,Dict, Any, List, Optional
12
  import os
 
13
 
 
 
 
14
  max_web_research_loops=3
15
  fetch_full_page: bool =False
16
 
@@ -169,8 +173,9 @@ def format_sources(search_results):
169
  for source in search_results['results']
170
  )
171
 
172
- def duckduckgo_search(query: str, max_results: int = 3, fetch_full_page: bool = False) -> Dict[str, List[Dict[str, str]]]:
173
- """Search the web using DuckDuckGo.
 
174
 
175
  Args:
176
  query (str): The search query to execute
@@ -182,57 +187,24 @@ def duckduckgo_search(query: str, max_results: int = 3, fetch_full_page: bool =
182
  - title (str): Title of the search result
183
  - url (str): URL of the search result
184
  - content (str): Snippet/summary of the content
185
- - raw_content (str): Same as content since DDG doesn't provide full page content
186
  """
187
  try:
188
- with DDGS() as ddgs:
189
- results = []
190
- search_results = list(ddgs.text(query, max_results=max_results))
191
-
192
- for r in search_results:
193
- url = r.get('href')
194
- title = r.get('title')
195
- content = r.get('body')
196
-
197
- if not all([url, title, content]):
198
- print(f"Warning: Incomplete result from DuckDuckGo: {r}")
199
- continue
200
-
201
- raw_content = content
202
- if fetch_full_page:
203
- try:
204
- # Add headers to mimic a browser request
205
- headers = {
206
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
207
- }
208
-
209
- # Use requests instead of urllib for better header support
210
- import requests
211
- from bs4 import BeautifulSoup
212
-
213
- response = requests.get(url, headers=headers)
214
- response.raise_for_status() # Raise an error for bad status codes
215
- soup = BeautifulSoup(response.text, 'html.parser')
216
- raw_content = soup.get_text()
217
-
218
- except Exception as e:
219
- print(f"Warning: Failed to fetch full page content for {url}: {str(e)}")
220
- raw_content = content # Fallback to the snippet if full page fetch fails
221
-
222
- # Add result to list
223
- result = {
224
- "title": title,
225
- "url": url,
226
- "content": content,
227
- "raw_content": raw_content
228
- }
229
- results.append(result)
230
-
231
- return {"results": results}
232
  except Exception as e:
233
- print(f"Error in DuckDuckGo search: {str(e)}")
234
- print(f"Full error details: {type(e).__name__}")
235
  return {"results": []}
 
236
 
237
 
238
  @dataclass(kw_only=True)
@@ -309,10 +281,8 @@ def reflect_on_summary(state: SummaryState):
309
  return {"search_query": f"Tell me more about {state.research_topic}"} # Fallback query
310
 
311
  def web_research(state: SummaryState):
312
- search_results = duckduckgo_search(state.search_query, max_results=3, fetch_full_page=fetch_full_page)
313
  search_str = deduplicate_and_format_sources(search_results, max_tokens_per_source=1000, include_raw_content=True)
314
-
315
-
316
  return {"sources_gathered": [format_sources(search_results)], "research_loop_count": state.research_loop_count + 1, "web_research_results": [search_str]}
317
 
318
  def summarize_sources(state: SummaryState):
 
10
  from langchain_core.messages import HumanMessage, SystemMessage
11
  from typing import Any, List ,Dict, Any, List, Optional
12
  import os
13
+ from tavily import TavilyClient
14
 
15
+ # Initialize Tavily client
16
+ tavily_api_key = os.getenv("TAVILY_API_KEY")
17
+ tavily_client = TavilyClient(api_key=tavily_api_key)
18
  max_web_research_loops=3
19
  fetch_full_page: bool =False
20
 
 
173
  for source in search_results['results']
174
  )
175
 
176
+
177
+ def tavily_search(query: str, max_results: int = 3, fetch_full_page: bool = False) -> Dict[str, List[Dict[str, str]]]:
178
+ """Search the web using Tavily.
179
 
180
  Args:
181
  query (str): The search query to execute
 
187
  - title (str): Title of the search result
188
  - url (str): URL of the search result
189
  - content (str): Snippet/summary of the content
190
+ - raw_content (str): Full content if available, else same as content
191
  """
192
  try:
193
+ response = tavily_client.search(query=query, max_results=max_results, include_raw_content=fetch_full_page)
194
+ results = []
195
+ for r in response["results"]:
196
+ result = {
197
+ "title": r.get("title"),
198
+ "url": r.get("url"),
199
+ "content": r.get("content"),
200
+ "raw_content": r.get("raw_content", r.get("content"))
201
+ }
202
+ results.append(result)
203
+ return {"results": results}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  except Exception as e:
205
+ print(f"Error in Tavily search: {str(e)}")
 
206
  return {"results": []}
207
+
208
 
209
 
210
  @dataclass(kw_only=True)
 
281
  return {"search_query": f"Tell me more about {state.research_topic}"} # Fallback query
282
 
283
  def web_research(state: SummaryState):
284
+ search_results = tavily_search(state.search_query, max_results=3, fetch_full_page=fetch_full_page)
285
  search_str = deduplicate_and_format_sources(search_results, max_tokens_per_source=1000, include_raw_content=True)
 
 
286
  return {"sources_gathered": [format_sources(search_results)], "research_loop_count": state.research_loop_count + 1, "web_research_results": [search_str]}
287
 
288
  def summarize_sources(state: SummaryState):