Spaces:
Sleeping
Sleeping
Aakash Jammula committed on
Commit ·
5e6bfdb
1
Parent(s): ed250d7
:deep research
Browse files- deep_research.py +22 -52
deep_research.py
CHANGED
|
@@ -10,7 +10,11 @@ from langchain_google_genai import ChatGoogleGenerativeAI
|
|
| 10 |
from langchain_core.messages import HumanMessage, SystemMessage
|
| 11 |
from typing import Any, List ,Dict, Any, List, Optional
|
| 12 |
import os
|
|
|
|
| 13 |
|
|
|
|
|
|
|
|
|
|
| 14 |
max_web_research_loops=3
|
| 15 |
fetch_full_page: bool =False
|
| 16 |
|
|
@@ -169,8 +173,9 @@ def format_sources(search_results):
|
|
| 169 |
for source in search_results['results']
|
| 170 |
)
|
| 171 |
|
| 172 |
-
|
| 173 |
-
|
|
|
|
| 174 |
|
| 175 |
Args:
|
| 176 |
query (str): The search query to execute
|
|
@@ -182,57 +187,24 @@ def duckduckgo_search(query: str, max_results: int = 3, fetch_full_page: bool =
|
|
| 182 |
- title (str): Title of the search result
|
| 183 |
- url (str): URL of the search result
|
| 184 |
- content (str): Snippet/summary of the content
|
| 185 |
-
- raw_content (str):
|
| 186 |
"""
|
| 187 |
try:
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
url
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
continue
|
| 200 |
-
|
| 201 |
-
raw_content = content
|
| 202 |
-
if fetch_full_page:
|
| 203 |
-
try:
|
| 204 |
-
# Add headers to mimic a browser request
|
| 205 |
-
headers = {
|
| 206 |
-
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
| 207 |
-
}
|
| 208 |
-
|
| 209 |
-
# Use requests instead of urllib for better header support
|
| 210 |
-
import requests
|
| 211 |
-
from bs4 import BeautifulSoup
|
| 212 |
-
|
| 213 |
-
response = requests.get(url, headers=headers)
|
| 214 |
-
response.raise_for_status() # Raise an error for bad status codes
|
| 215 |
-
soup = BeautifulSoup(response.text, 'html.parser')
|
| 216 |
-
raw_content = soup.get_text()
|
| 217 |
-
|
| 218 |
-
except Exception as e:
|
| 219 |
-
print(f"Warning: Failed to fetch full page content for {url}: {str(e)}")
|
| 220 |
-
raw_content = content # Fallback to the snippet if full page fetch fails
|
| 221 |
-
|
| 222 |
-
# Add result to list
|
| 223 |
-
result = {
|
| 224 |
-
"title": title,
|
| 225 |
-
"url": url,
|
| 226 |
-
"content": content,
|
| 227 |
-
"raw_content": raw_content
|
| 228 |
-
}
|
| 229 |
-
results.append(result)
|
| 230 |
-
|
| 231 |
-
return {"results": results}
|
| 232 |
except Exception as e:
|
| 233 |
-
print(f"Error in
|
| 234 |
-
print(f"Full error details: {type(e).__name__}")
|
| 235 |
return {"results": []}
|
|
|
|
| 236 |
|
| 237 |
|
| 238 |
@dataclass(kw_only=True)
|
|
@@ -309,10 +281,8 @@ def reflect_on_summary(state: SummaryState):
|
|
| 309 |
return {"search_query": f"Tell me more about {state.research_topic}"} # Fallback query
|
| 310 |
|
| 311 |
def web_research(state: SummaryState):
|
| 312 |
-
search_results =
|
| 313 |
search_str = deduplicate_and_format_sources(search_results, max_tokens_per_source=1000, include_raw_content=True)
|
| 314 |
-
|
| 315 |
-
|
| 316 |
return {"sources_gathered": [format_sources(search_results)], "research_loop_count": state.research_loop_count + 1, "web_research_results": [search_str]}
|
| 317 |
|
| 318 |
def summarize_sources(state: SummaryState):
|
|
|
|
| 10 |
from langchain_core.messages import HumanMessage, SystemMessage
|
| 11 |
from typing import Any, List ,Dict, Any, List, Optional
|
| 12 |
import os
|
| 13 |
+
from tavily import TavilyClient
|
| 14 |
|
| 15 |
+
# Initialize Tavily client
|
| 16 |
+
tavily_api_key = os.getenv("TAVILY_API_KEY")
|
| 17 |
+
tavily_client = TavilyClient(api_key=tavily_api_key)
|
| 18 |
max_web_research_loops=3
|
| 19 |
fetch_full_page: bool =False
|
| 20 |
|
|
|
|
| 173 |
for source in search_results['results']
|
| 174 |
)
|
| 175 |
|
| 176 |
+
|
| 177 |
+
def tavily_search(query: str, max_results: int = 3, fetch_full_page: bool = False) -> Dict[str, List[Dict[str, str]]]:
|
| 178 |
+
"""Search the web using Tavily.
|
| 179 |
|
| 180 |
Args:
|
| 181 |
query (str): The search query to execute
|
|
|
|
| 187 |
- title (str): Title of the search result
|
| 188 |
- url (str): URL of the search result
|
| 189 |
- content (str): Snippet/summary of the content
|
| 190 |
+
- raw_content (str): Full content if available, else same as content
|
| 191 |
"""
|
| 192 |
try:
|
| 193 |
+
response = tavily_client.search(query=query, max_results=max_results, include_raw_content=fetch_full_page)
|
| 194 |
+
results = []
|
| 195 |
+
for r in response["results"]:
|
| 196 |
+
result = {
|
| 197 |
+
"title": r.get("title"),
|
| 198 |
+
"url": r.get("url"),
|
| 199 |
+
"content": r.get("content"),
|
| 200 |
+
"raw_content": r.get("raw_content", r.get("content"))
|
| 201 |
+
}
|
| 202 |
+
results.append(result)
|
| 203 |
+
return {"results": results}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
except Exception as e:
|
| 205 |
+
print(f"Error in Tavily search: {str(e)}")
|
|
|
|
| 206 |
return {"results": []}
|
| 207 |
+
|
| 208 |
|
| 209 |
|
| 210 |
@dataclass(kw_only=True)
|
|
|
|
| 281 |
return {"search_query": f"Tell me more about {state.research_topic}"} # Fallback query
|
| 282 |
|
| 283 |
def web_research(state: SummaryState):
|
| 284 |
+
search_results = tavily_search(state.search_query, max_results=3, fetch_full_page=fetch_full_page)
|
| 285 |
search_str = deduplicate_and_format_sources(search_results, max_tokens_per_source=1000, include_raw_content=True)
|
|
|
|
|
|
|
| 286 |
return {"sources_gathered": [format_sources(search_results)], "research_loop_count": state.research_loop_count + 1, "web_research_results": [search_str]}
|
| 287 |
|
| 288 |
def summarize_sources(state: SummaryState):
|