Soham Waghmare committed
Commit 81dc032 · Parent: 2d96b3b

refactor: Improve error handling and update web scraping to use DuckDuckGo

Files changed (3):
  1. backend/app.py +3 -0
  2. backend/knet.py +195 -93
  3. backend/scraper.py +34 -27
backend/app.py CHANGED
@@ -57,6 +57,7 @@ def handle_research(data):
                 )
             except Exception as e:
                 logger.error(f"Error in progress callback: {str(e)}")
+                raise e
 
         try:
             research_results = knet.conduct_research(topic, progress_callback)
@@ -65,10 +66,12 @@ def handle_research(data):
         except Exception as e:
             logger.error(f"Research error: {str(e)}")
             socketio.emit("error", {"message": str(e)}, room=session_id)
+            raise e
 
     except Exception as e:
         logger.error(f"Error handling research request: {str(e)}")
         socketio.emit("error", {"message": str(e)}, room=request.sid)
+        raise e
 
 
 if __name__ == "__main__":
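The three `raise e` additions share one pattern: log the error where it happens, then re-raise so the failure still propagates (and, for the outer handler, reaches Flask-SocketIO's own error handling) instead of being silently swallowed after the emit. A minimal sketch of the pattern in isolation (function names here are illustrative, not from this repo):

    import logging

    logger = logging.getLogger(__name__)

    def run_step(step, *args):
        try:
            return step(*args)
        except Exception as e:
            # Record the failure for diagnostics first...
            logger.error(f"Step failed: {e}")
            # ...then re-raise so callers still see the original exception.
            raise

Inside an `except` block a bare `raise` is the idiomatic spelling; `raise e`, as the diff uses, re-raises the same exception object.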
backend/knet.py CHANGED
@@ -1,6 +1,8 @@
 from typing import Dict, List, Optional, Any
 import google.generativeai as genai
+from google.ai.generativelanguage_v1beta.types import content
 import logging
+import json
 import os
 from datetime import datetime
 from dotenv import load_dotenv
@@ -46,7 +48,7 @@ class KNet:
         # Initialize scraper
         self.scraper = WebScraper()
         self.logger = logging.getLogger(__name__)
-        self.max_depth = 5
+        self.max_depth = 3
         self.min_importance_score = 0.6
 
         self.search_prompt = """Generate 3-5 specific search queries to research the following topic: {topic}
@@ -58,145 +60,246 @@ class KNet:
         4. Format each query on a new line
         5. Return only the queries, no explanations"""
 
+        self.token_count = 0
+        self.branch_decision_prompt = """Given the current research context and findings, should we explore this branch deeper?
+
+        Current Topic: {query}
+        Current Depth: {depth}
+        Path from Root: {path}
+        Key Findings: {findings}
+
+        Consider:
+        1. Relevance to main topic
+        2. Potential for new insights
+        3. Depth vs breadth tradeoff
+        4. Information saturation
+
+        Return only: {"decision": true/false}"""
+
+        # Simplified decision schema for branching
+        self.branch_schema = {
+            "response_schema": content.Schema(
+                type=content.Type.OBJECT,
+                required=["decision"],
+                properties={
+                    "decision": content.Schema(type=content.Type.BOOLEAN),
+                },
+            ),
+            "response_mime_type": "application/json",
+        }
+
+        # Analysis schema without reason
+        self.analysis_schema = {
+            "response_schema": content.Schema(
+                type=content.Type.OBJECT,
+                required=["branches"],
+                properties={
+                    "branches": content.Schema(
+                        type=content.Type.ARRAY,
+                        items=content.Schema(
+                            type=content.Type.OBJECT,
+                            required=["importance", "query"],
+                            properties={
+                                "importance": content.Schema(type=content.Type.NUMBER),
+                                "query": content.Schema(type=content.Type.STRING),
+                            },
+                        ),
+                    )
+                },
+            ),
+            "response_mime_type": "application/json",
+        }
+
     def __del__(self):
         # Cleanup scraper when KNet instance is destroyed
         if hasattr(self, "scraper"):
             self.scraper.cleanup()
 
+    def _track_tokens(self, tokens: int) -> None:
+        self.token_count += tokens
+
+    def _should_branch_deeper(self, node: ResearchNode) -> bool:
+        findings = ""
+        if node.data:
+            findings = "\n".join(
+                [
+                    f"- {d.get('title', 'Untitled')}: {d.get('summary', '')}"
+                    for d in node.data[:3]
+                    if d
+                ]
+            )
+
+        prompt = self.branch_decision_prompt.format(
+            query=node.query,
+            depth=node.depth,
+            path=" -> ".join(node.get_path_to_root()),
+            findings=findings,
+        )
+
+        response = self.research_manager.generate_content(
+            prompt, generation_config={**self.branch_schema}
+        )
+        self._track_tokens(response.usage_metadata.total_token_count)
+
+        result = json.loads(response.text)
+        self.logger.info(f"Branch decision for '{node.query}': {result['decision']}")
+
+        return result["decision"]
+
     def conduct_research(self, topic: str, progress_callback=None) -> Dict[str, Any]:
+        self.token_count = 0
         progress = ResearchProgress(progress_callback)
         self.logger.info(f"Starting research on topic: {topic}")
+
         try:
-            # Setup aiohttp session at start of research
             self.scraper.setup()
             root_node = ResearchNode(topic)
-            research_stack = deque([root_node])
+            to_explore = deque([(root_node, 0)])  # (node, depth) pairs
            explored_queries = set()
+            max_branches = self.max_depth * 3
+
+            progress.update(10, "Starting research...")
+
+            while to_explore and len(explored_queries) < max_branches:
+                current_node, current_depth = to_explore.popleft()
 
-            # Generate initial search queries
-            self.logger.info("Generating search queries...")
-            response = self.llm.generate_content(self.search_prompt.format(topic=topic))
-            search_queries = response.text.strip().split("\n")
-            self.logger.info(f"Generated queries: {search_queries}")
-
-            progress.update(10, "Starting deep research exploration...")
-            self.logger.info("Research exploration initiated")
-
-            # Process each generated query
-            for query in search_queries:
-                if query.strip():
-                    data = self.scraper.search_and_scrape(query.strip())
-                    if data:
-                        root_node.data.extend(data)
-
-            while research_stack:
-                current_node = research_stack.pop()
-
-                if (
-                    current_node.query in explored_queries
-                    or current_node.depth > self.max_depth
-                ):
+                if current_node.query in explored_queries or current_depth >= self.max_depth:
                     continue
 
-                self.logger.info(
-                    f"Exploring branch: {current_node.query} (Depth: {current_node.depth})"
-                )
+                self.logger.info(f"Exploring: {current_node.query} (Depth: {current_depth})")
                 progress.update(
-                    30 + (len(explored_queries) * 50 / (self.max_depth * 3)),
+                    30 + (len(explored_queries) * 50 / max_branches),
                     f"Exploring: {current_node.query}",
                 )
 
-                # Conduct research for current node
+                # Search and scrape
                 current_node.data = self.scraper.search_and_scrape(current_node.query)
                 explored_queries.add(current_node.query)
 
-                # Generate and evaluate new branches
-                if current_node.depth < self.max_depth:
-                    new_branches = self._analyze_and_branch(current_node)
-                    for branch in reversed(
-                        new_branches
-                    ):  # Reverse to maintain DFS order
-                        research_stack.append(branch)
-
-            self.logger.info("Generating final research report")
+                # Only branch if we have data and haven't reached max depth
+                if current_node.data and current_depth < self.max_depth:
+                    if self._should_branch_deeper(current_node):
+                        new_branches = self._analyze_and_branch(current_node)
+                        for branch in new_branches:
+                            to_explore.append((branch, current_depth + 1))
+                        self.logger.info(
+                            f"Added {len(new_branches)} new branches at depth {current_depth + 1}"
+                        )
+
+            # Generate final report
             progress.update(80, "Generating comprehensive report...")
             final_report = self._generate_final_report(root_node)
+            final_report["metadata"]["total_tokens"] = self.token_count
 
-            self.logger.info("Research completed successfully")
+            self.logger.info(
+                f"Research completed. Explored {len(explored_queries)} queries across {root_node.depth + 1} levels"
+            )
             progress.update(100, "Research complete!")
 
             return final_report
 
         except Exception as e:
             self.logger.error(f"Research failed: {str(e)}")
-            self.scraper.cleanup()
             raise e
         finally:
             self.scraper.cleanup()
 
     def _analyze_and_branch(self, node: ResearchNode) -> List[ResearchNode]:
-        analysis_prompt = f"""Analyze the research data and suggest new branches for deeper exploration.
-        Current topic: {node.query}
-        Current depth: {node.depth}
-        Path from root: {' -> '.join(node.get_path_to_root())}
-
-        Suggest new research directions that:
-        1. Are specific and focused
-        2. Explore unexplored aspects
-        3. Follow promising leads from the current data
-
-        For each suggestion, rate its importance (0-1) and explain why.
-        Format: Importance Score | Query | Reason"""
-
-        response = self.research_manager.generate_content(analysis_prompt)
-        result = response.text
-
-        new_nodes = []
-        for line in result.split("\n"):
-            if "|" not in line:
-                continue
-
-            parts = line.split("|")
-            if len(parts) < 2:
-                continue
-
-            try:
-                importance = float(parts[0].strip())
-                query = parts[1].strip()
-
-                if importance >= self.min_importance_score:
-                    child_node = node.add_child(query)
-                    child_node.importance_score = importance
+        if not node.data:
+            return []
+
+        findings = "\n".join([
+            f"- {d.get('title', 'Untitled')}: {d.get('summary', d.get('text', '')[:200])}"
+            for d in node.data[:3] if d
+        ])
+
+        analysis_prompt = f"""Based on the following findings about "{node.query}", suggest new research directions.
+
+        Findings:
+        {findings}
+
+        Suggest up to 3 specific research queries that:
+        1. Build upon these findings
+        2. Explore different aspects
+        3. Go deeper into important details
+
+        Return as JSON array of objects with only:
+        - importance (0.0-1.0)
+        - query (string)"""
+
+        try:
+            response = self.research_manager.generate_content(
+                analysis_prompt,
+                generation_config={**self.analysis_schema},
+            )
+            self._track_tokens(response.usage_metadata.total_token_count)
+
+            result = json.loads(response.text)
+            self.logger.info(f"New branches for '{node.query}': {result['branches']}")
+
+            new_nodes = []
+            for branch in result.get("branches", []):
+                if branch["importance"] >= self.min_importance_score:
+                    child_node = node.add_child(branch["query"])
+                    child_node.importance_score = branch["importance"]
                     new_nodes.append(child_node)
-            except ValueError:
-                continue
 
-        return new_nodes
+            return new_nodes
+
+        except Exception as e:
+            self.logger.error(f"Branch analysis failed: {str(e)}")
+            return []
 
     def _generate_final_report(self, root_node: ResearchNode) -> Dict[str, Any]:
         def collect_data(node: ResearchNode) -> List[Dict]:
-            all_data = node.data.copy()
+            all_data = []
+            if node.data:
+                all_data.extend(node.data)
             for child in node.children:
                 all_data.extend(collect_data(child))
             return all_data
 
         all_research_data = collect_data(root_node)
 
-        # Generate structured report using LLM
-        report_prompt = f"""Generate a comprehensive research report using the collected data.
+        # Generate part 1 of the report
+        part1_prompt = f"""Generate part 1 of a research report focusing on overview and key findings.
         Main Topic: {root_node.query}
 
-        Structure the report with:
-        1. Executive Summary
-        2. Key Findings
-        3. Detailed Analysis
-        4. Related Topics and Branches
-        5. Sources and References
+        Structure for Part 1:
+        1. Executive Summary (brief overview)
+        2. Key Findings (main discoveries and insights)
+
+        Keep it concise and focused. Part 2 will cover detailed analysis and references."""
+
+        response1 = self.research_manager.generate_content(part1_prompt)
+        self._track_tokens(response1.usage_metadata.total_token_count)
+        part1_content = response1.text
+
+        # Generate part 2 with awareness of part 1
+        part2_prompt = f"""Generate part 2 of the research report. Here's part 1 for context:
+
+        {part1_content}
+
+        Now continue with:
+        1. Detailed Analysis (expand on the key findings)
+        2. Related Topics and Branches (explore connections)
+        3. Sources and References (cite sources)
+
+        Focus on details that complement part 1 without repeating the same information."""
+
+        response2 = self.research_manager.generate_content(part2_prompt)
+        self._track_tokens(response2.usage_metadata.total_token_count)
+
+        # Combine reports with clear section separation
+        report_content = f"""# Research Report: {root_node.query}
 
-        Include relevant quotes and citations."""
+        Part 1: Overview and Key Findings
+        --------------------------------
+        {part1_content}
 
-        response = self.research_manager.generate_content(report_prompt)
-        report_content = response.text
+        Part 2: Detailed Analysis and References
+        --------------------------------------
+        {response2.text}"""
 
         # Organize multimedia content
         media_content = {"images": [], "videos": [], "links": [], "references": []}
@@ -232,9 +335,8 @@ class KNet:
             "research_tree": build_tree_structure(root_node),
             "metadata": {
                 "total_sources": len(all_research_data),
-                "max_depth_reached": max(
-                    data.depth for data in collect_data(root_node)
-                ),
-                "total_branches": len(list(collect_data(root_node))),
+                "max_depth_reached": root_node.depth,
+                "total_branches": len(root_node.children),
+                "total_tokens": self.token_count,
             },
         }
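The two schema dicts above are what make the new parsing robust: instead of asking the model for `Importance | Query | Reason` lines and splitting on `|`, the code passes a `response_schema` plus `response_mime_type: application/json` in `generation_config`, so Gemini is constrained to emit JSON in a known shape. A standalone sketch of the same technique (the API key and model name are placeholders):

    import json
    import google.generativeai as genai
    from google.ai.generativelanguage_v1beta.types import content

    genai.configure(api_key="YOUR_API_KEY")  # placeholder
    model = genai.GenerativeModel("gemini-1.5-flash")  # illustrative model name

    # Constrain output to {"decision": <bool>}, as branch_schema does above.
    schema = content.Schema(
        type=content.Type.OBJECT,
        required=["decision"],
        properties={"decision": content.Schema(type=content.Type.BOOLEAN)},
    )

    response = model.generate_content(
        "Should we explore this research branch deeper? Decide.",
        generation_config={
            "response_schema": schema,
            "response_mime_type": "application/json",
        },
    )
    print(json.loads(response.text)["decision"])  # parses reliably

One caveat worth noting: `branch_decision_prompt` contains a literal `{"decision": true/false}` but is later run through `str.format()`, which treats `{...}` as a replacement field; doubling the braces (`{{"decision": true/false}}`) would be the safe spelling.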
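The rewritten `conduct_research` also changes traversal strategy: the old version pushed branches onto a stack and popped depth-first, while the new one walks a FIFO deque of `(node, depth)` pairs breadth-first, with `max_branches = max_depth * 3` as an explicit cap on total queries (which also makes the `30 + explored * 50 / max_branches` progress estimate meaningful). The control flow in isolation, with a stub class standing in for `ResearchNode`:

    from collections import deque

    class Node:  # minimal stand-in for ResearchNode
        def __init__(self, query, children=()):
            self.query = query
            self.children = list(children)

    def explore(root, max_depth=3):
        max_branches = max_depth * 3      # hard cap on total work
        to_explore = deque([(root, 0)])   # (node, depth) pairs
        explored = set()

        while to_explore and len(explored) < max_branches:
            node, depth = to_explore.popleft()  # FIFO -> breadth-first
            if node.query in explored or depth >= max_depth:
                continue
            explored.add(node.query)
            for child in node.children:         # enqueue one level deeper
                to_explore.append((child, depth + 1))
        return explored

    print(explore(Node("root", [Node("a"), Node("b", [Node("c")])])))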
backend/scraper.py CHANGED
@@ -1,34 +1,34 @@
 from bs4 import BeautifulSoup
-from selenium import webdriver
 import logging
 from typing import List, Dict, Any
 import newspaper
 from newspaper import Article
 import re
 import requests
+from urllib.parse import quote_plus
 
 
 class WebScraper:
     def __init__(self):
-        self.chrome_options = webdriver.ChromeOptions()
-        # self.chrome_options.add_argument("--headless")
-        self.driver = webdriver.Chrome(options=self.chrome_options)
         self.logger = logging.getLogger(__name__)
         self.newspaper_config = newspaper.Config()
         self.newspaper_config.browser_user_agent = "Mozilla/5.0"
         self.newspaper_config.request_timeout = 10
         self.session = requests.Session()
-        self.timeout = 30
+        self.timeout = 10
+        # Set up headers for requests
+        self.headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+        }
 
     def setup(self):
-        pass  # No setup needed for synchronous operation
+        pass
 
     def cleanup(self):
-        if self.driver:
-            self.driver.quit()
+        pass
 
     def search_and_scrape(
-        self, query: str, num_sites: int = 10
+        self, query: str, num_sites: int = 3
     ) -> List[Dict[str, Any]]:
         self.logger.info(f"Starting search for: {query}")
         search_results = self._google_search(query, num_sites)
@@ -50,27 +50,34 @@ class WebScraper:
         return scraped_data
 
     def _google_search(self, query: str, num_results: int) -> List[str]:
-        self.logger.info("Performing Google search...")
+        self.logger.info("Performing DuckDuckGo search...")
         try:
-            self.driver.get(
-                f"https://www.google.com/search?q={query.replace(' ', '+')}&num={num_results}"
-            )
-            self.driver.implicitly_wait(5)
+            encoded_query = quote_plus(query)
+            url = f"https://html.duckduckgo.com/html/?q={encoded_query}"
+
+            response = self.session.get(url, headers=self.headers, timeout=self.timeout)
+            response.raise_for_status()
 
-            elements = self.driver.find_elements("css selector", "div.g div.yuRUbf > a")
+            soup = BeautifulSoup(response.text, "html.parser")
             search_results = []
-            for element in elements:
-                url = element.get_attribute("href")
-                if url and url.startswith("http"):
-                    search_results.append(url)
-                    if len(search_results) >= num_results:
-                        break
+
+            # DuckDuckGo search results are in elements with class 'result__url'
+            for result in soup.select(".result__url"):
+                url = result.get("href").replace(" ", "").replace("\n", "")
+                if not url.startswith(("http://", "https://")):
+                    url = "https://" + url
+                search_results.append(url)
+                if len(search_results) >= num_results:
+                    break
 
             self.logger.info(f"Found {len(search_results)} URLs")
             return search_results
 
-        except Exception as e:
-            self.logger.error(f"Google search error: {str(e)}")
+        except requests.exceptions.RequestException as e:  # Catch network errors specifically
+            self.logger.error(f"DuckDuckGo search error: {str(e)}")
+            return []
+        except Exception as e:  # Catch any other errors
+            self.logger.error(f"DuckDuckGo search error: {str(e)}")
             return []
 
     def _scrape_url(self, url: str) -> Dict[str, Any]:
@@ -79,16 +86,16 @@
             article.download()
             article.parse()
             article.nlp()
+            soup = BeautifulSoup(article.html, "html.parser")
+            links = self._extract_links(soup)
 
             data = {
                 "url": url,
                 "title": article.title,
                 "text": article.text,
-                "summary": article.summary,
-                "keywords": article.keywords,
                 "images": article.images,
-                "videos": [],
-                "links": article.links,
+                "videos": article.movies,
+                "links": links,
                 "authors": article.authors,
                 "publish_date": article.publish_date,
                 "metadata": {"language": article.meta_lang, "tags": article.tags},