Soham Waghmare committed
Commit 0a3d9b7 · 1 Parent(s): 4e3ab6e

feat: migration from google-generativeai to google-genai
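This commit replaces the legacy `google-generativeai` SDK with the unified `google-genai` client throughout the backend. As a minimal sketch of the call-pattern change (illustration only, not part of the committed files; assumes `GOOGLE_API_KEY` is set in the environment):

    import os

    # Old SDK (google-generativeai), removed by this commit:
    #   import google.generativeai as genai
    #   genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
    #   model = genai.GenerativeModel("gemini-2.0-flash")
    #   text = model.generate_content("Hello").text

    # New SDK (google-genai), adopted by this commit: one client object,
    # with the model name passed per request instead of baked into a model instance.
    from google import genai

    client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])
    response = client.models.generate_content(model="gemini-2.0-flash", contents="Hello")
    print(response.text)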
backend/app.py CHANGED
@@ -101,9 +101,7 @@ async def start_research(sid, data):
         room=session_id,
     )
 
-    research_results = await knet.conduct_research(
-        topic, progress_callback, max_depth, max_breadth, num_sites_per_query
-    )
+    research_results = await knet.conduct_research(topic, progress_callback, max_depth, max_breadth, num_sites_per_query)
     logger.info(f"Research completed for topic: {topic}")
     await sio.emit("research_complete", research_results, room=session_id)
backend/knet.py CHANGED
@@ -6,9 +6,9 @@ from datetime import datetime
 from textwrap import dedent
 from typing import Any, Dict, List
 
-import google.generativeai as genai
 from dotenv import load_dotenv
-from google.ai.generativelanguage_v1beta.types import content
+from google import genai
+from google.genai import types
 
 from research_node import ResearchNode
 from scraper import CrawlForAIScraper
@@ -39,7 +39,7 @@ class Prompt:
     Findings:
     {ctx_manager}
 
-    Suggest up to {max_breadth} specific google search queries that would help data which:
+    Suggest up to {n} specific google search queries that would help gather data which:
     - Builds upon these findings
     - Explores different aspects
     - Goes deeper into important details
@@ -50,37 +50,30 @@ class Prompt:
 
 class Schema:
     def __init__(self) -> None:
-        self.continue_branch = {
-            "response_schema": content.Schema(
-                type=content.Type.OBJECT,
-                required=["decision"],
-                properties={
-                    "decision": content.Schema(type=content.Type.BOOLEAN),
-                },
-            ),
-            "response_mime_type": "application/json",
-        }
-
-        self.search_query = {
-            "response_schema": content.Schema(
-                type=content.Type.OBJECT,
-                required=["branches"],
-                properties={
-                    "branches": content.Schema(
-                        type=content.Type.ARRAY,
-                        items=content.Schema(
-                            type=content.Type.OBJECT,
-                            required=["importance", "query"],
-                            properties={
-                                "importance": content.Schema(type=content.Type.NUMBER),
-                                "query": content.Schema(type=content.Type.STRING),
-                            },
-                        ),
-                    )
-                },
-            ),
-            "response_mime_type": "application/json",
-        }
+        self.continue_branch = genai.types.Schema(
+            type=genai.types.Type.OBJECT,
+            required=["decision"],
+            properties={
+                "decision": genai.types.Schema(type=genai.types.Type.BOOLEAN),
+            },
+        )
+
+        self.search_query = genai.types.Schema(
+            type=genai.types.Type.OBJECT,
+            required=["branches"],
+            properties={
+                "branches": genai.types.Schema(
+                    type=genai.types.Type.ARRAY,
+                    items=genai.types.Schema(
+                        type=genai.types.Type.OBJECT,
+                        required=["query"],
+                        properties={
+                            "query": genai.types.Schema(type=genai.types.Type.STRING),
+                        },
+                    ),
+                )
+            },
+        )
 
 
 class ResearchProgress:
@@ -97,13 +90,7 @@ class ResearchProgress:
 
 
 class KNet:
-    def __init__(
-        self,
-        scraper_instance: CrawlForAIScraper,
-        max_depth: int = 1,
-        max_breadth: int = 1,
-        num_sites_per_query: int = 5,
-    ):
+    def __init__(self, scraper_instance: CrawlForAIScraper, max_depth: int = 1, max_breadth: int = 1, num_sites_per_query: int = 5):
         self.api_key = os.getenv("GOOGLE_API_KEY")
         assert self.api_key, "Google API key is required"
         self.scraper = scraper_instance
@@ -111,26 +98,8 @@ class KNet:
         self.prompt = Prompt()
         self.schema = Schema()
 
-        # Init Agents' Base Model
-        genai.configure(api_key=self.api_key)
-        generation_config = {"temperature": 0.9}
-        safe = [
-            {"category": "HARM_CATEGORY_DANGEROUS", "threshold": "BLOCK_NONE"},
-            {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
-            {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
-            {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
-            {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
-        ]
-        self.researcher = genai.GenerativeModel(
-            "gemini-2.0-flash",
-            generation_config=generation_config,
-            safety_settings=safe,
-        )
-        self.research_manager = genai.GenerativeModel(
-            "gemini-2.0-flash",
-            generation_config=generation_config,
-            safety_settings=safe,
-        )
+        # Init Google GenAI client
+        self.genai_client = genai.Client(api_key=self.api_key)
 
         # Parameters
         self.max_depth = max_depth
@@ -142,14 +111,7 @@ class KNet:
         self.ctx_manager: list[str] = []
         self.token_count: int = 0
 
-    async def conduct_research(
-        self,
-        topic: str,
-        progress_callback,
-        max_depth: int,
-        max_breadth: int,
-        num_sites_per_query: int,
-    ) -> dict:
+    async def conduct_research(self, topic: str, progress_callback, max_depth: int, max_breadth: int, num_sites_per_query: int) -> dict:
         # Local Runtime State
         progress = ResearchProgress(progress_callback)
         self.max_depth = max_depth
@@ -162,7 +124,12 @@ class KNet:
         self.token_count = 0
 
         try:
-            root_node = ResearchNode(topic)
+            # Generate initial search query
+            query = self.generate_content(
+                self.prompt.search_query.format(topic=topic, ctx_manager=json.dumps(self.ctx_manager, indent=2), n=1),
+                schema=self.schema.search_query,
+            )
+            root_node = ResearchNode(query.get("branches")[0]["query"])
             to_explore = deque([(root_node, 0)])  # (node, depth) pairs
             explored_queries = set()  # {string, string, ...}
 
@@ -171,15 +138,10 @@ class KNet:
             while to_explore:
                 current_node, current_depth = to_explore.popleft()
 
-                if (
-                    current_node.query in explored_queries
-                    or current_depth > self.max_depth
-                ):
+                if current_node.query in explored_queries or current_depth > self.max_depth:
                     continue
 
-                self.logger.info(
-                    f"Exploring: {current_node.query} (Depth: {current_depth})"
-                )
+                self.logger.info(f"Exploring: {current_node.query} (Depth: {current_depth})")
                 await progress.update(5, f"Exploring: {current_node.query}")
 
                 # Search and scrape
@@ -190,8 +152,8 @@ class KNet:
                 explored_queries.add(current_node.query)
 
                 # Only branch if we have data and haven't reached max depth
-                if current_node.data and current_depth < self.max_depth:
-                    if self._should_continue_branch(current_node, topic):
+                if self._should_continue_branch(current_node, topic):
+                    if current_node.data and current_depth < self.max_depth:
                         new_branches = self._gen_queries(current_node, topic)
                         for branch in new_branches:
                             to_explore.append((branch, current_depth + 1))
@@ -200,9 +162,7 @@ class KNet:
             await progress.update(30, "Generating comprehensive report...")
             final_report = self._generate_final_report(root_node)
 
-            self.logger.info(
-                f"Research completed. Explored {len(explored_queries)} queries across {root_node.max_depth()} levels"
-            )
+            self.logger.info(f"Research completed. Explored {len(explored_queries)} queries across {root_node.max_depth()} levels")
             await progress.update(100, "Research complete!")
 
             with open("output.json", "a", encoding="utf-8") as f:
@@ -213,12 +173,10 @@ class KNet:
             self.logger.error("Research failed", exc_info=True)
             raise
 
-    def _generate_final_report(
-        self, root_node: ResearchNode, retry_count: int = 1
-    ) -> Dict[str, Any]:
+    def _generate_final_report(self, root_node: ResearchNode, retry_count: int = 1) -> Dict[str, Any]:
         try:
             findings = "\n".join(self.ctx_manager)
-            with open("output.json", "w") as f:
+            with open("output.json", "w", encoding="utf-8") as f:
                 f.write(findings)
             prompt = f"""Generate a comprehensive report on the topic "{root_node.query}" based on the following research findings:
             {findings}
@@ -234,18 +192,11 @@ class KNet:
                 if data.get("videos"):
                     media_content["videos"].extend(data["videos"])
                 if data.get("links"):
-                    media_content["links"].extend(
-                        [
-                            {"url": link["href"], "text": link["text"]}
-                            for link in data["links"]
-                        ]
-                    )
+                    media_content["links"].extend([{"url": link["href"], "text": link["text"]} for link in data["links"]])
             # Dedupe
             media_content["images"] = list(set(media_content["images"]))
             media_content["videos"] = list(set(media_content["videos"]))
-            media_content["links"] = list(
-                {json.dumps(d, sort_keys=True) for d in media_content["links"]}
-            )
+            media_content["links"] = list({json.dumps(d, sort_keys=True) for d in media_content["links"]})
             media_content["links"] = [json.loads(d) for d in media_content["links"]]
 
             # Build research tree structure
@@ -258,9 +209,7 @@ class KNet:
                     "query": node.query,
                     "depth": node.depth,
                     "sources": sources,
-                    "children": [
-                        build_tree_structure(child) for child in node.children
-                    ],
+                    "children": [build_tree_structure(child) for child in node.children],
                 }
 
             return {
@@ -278,17 +227,13 @@ class KNet:
             }
 
         except Exception as e:
-            if e == "GEMINI_RECITATION" and retry_count < 3:
-                self.logger.error(
-                    f"Retrying final report:C:{retry_count / 3}", exc_info=True
-                )
+            if str(e) in ["GEMINI_RECITATION", "NO_RESPONSE"] and retry_count < 3:
+                self.logger.error(f"Retrying final report: {retry_count}/3", exc_info=True)
                 self._generate_final_report(root_node, retry_count + 1)
             self.logger.error("Error generating final report", exc_info=True)
             raise
 
-    def _gen_queries(
-        self, node: ResearchNode, topic: str, retry_count: int = 1
-    ) -> List[ResearchNode]:
+    def _gen_queries(self, node: ResearchNode, topic: str, retry_count: int = 1) -> List[ResearchNode]:
         try:
             if not node.data or node.depth > self.max_depth:
                 return []
@@ -296,14 +241,10 @@ class KNet:
             prompt = self.prompt.search_query.format(
                 topic=topic,
                 ctx_manager=json.dumps(self.ctx_manager, indent=2),
-                max_breadth=self.max_breadth,
-            )
-            response = self.generate_content(
-                prompt, generation_config=self.schema.search_query
-            )
-            self.logger.info(
-                f"Spawn branches '{node.query}':\n{json.dumps(response['branches'], indent=2)}"
+                n=self.max_breadth,
             )
+            response = self.generate_content(prompt, schema=self.schema.search_query)
+            self.logger.info(f"Spawn branches '{node.query}':\n{json.dumps(response['branches'], indent=2)}")
 
             # Add children to current node
             # |-> child
@@ -318,29 +259,21 @@ class KNet:
             return new_nodes
 
         except Exception as e:
-            if e == "GEMINI_RECITATION" and retry_count < 3:
-                self.logger.error(
-                    f"Retrying _gen_queries | C:{retry_count / 3}", exc_info=True
-                )
+            if str(e) in ["GEMINI_RECITATION", "NO_RESPONSE"] and retry_count < 3:
+                self.logger.error(f"Retrying _gen_queries | {retry_count}/3", exc_info=True)
                 self._gen_queries(node, topic, retry_count + 1)
             self.logger.error("_gen_queries failed", exc_info=True)
             raise
 
-    def _should_continue_branch(
-        self, node: ResearchNode, topic: str, retry_count: int = 1
-    ) -> bool:
+    def _should_continue_branch(self, node: ResearchNode, topic: str, retry_count: int = 1) -> bool:
         try:
             if node.depth > self.max_depth:
                 return False
 
             # Generate summary of key findings into the manager's context
             if node.data:
-                findings = ("\n" + "-" * 10 + "Next data" + "-" * 10 + "\n").join(
-                    [json.dumps(d, indent=2) for d in node.data]
-                )
-                response = self.generate_content(
-                    f"Extract key findings from the following data related to the topic '{topic}':\n{findings}"
-                )
+                findings = ("\n" + "-" * 10 + "Next data" + "-" * 10 + "\n").join([json.dumps(d, indent=2) for d in node.data])
+                response = self.generate_content(f"Extract key findings from the following data related to the topic '{topic}':\n{findings}")
                 self.ctx_manager.append(response)
 
             # Research manager takes decision to proceed or not
@@ -350,35 +283,42 @@ class KNet:
                 path=" -> ".join(node.get_path_to_root()),
                 findings="\n".join(self.ctx_manager),
             )
-            response = self.generate_content(
-                prompt, generation_config=self.schema.continue_branch
-            )
+            response = self.generate_content(prompt, schema=self.schema.continue_branch)
            self.logger.info(f"Branch decision '{node.query}': {response['decision']}")
 
             return response["decision"]
 
         except Exception as e:
-            if e == "GEMINI_RECITATION" and retry_count < 3:
-                self.logger.error(
-                    f"Retrying branch decision:C:{retry_count / 3}", exc_info=True
-                )
+            if str(e) in ["GEMINI_RECITATION", "NO_RESPONSE"] and retry_count < 3:
+                self.logger.error(f"Retrying branch decision: {retry_count}/3", exc_info=True)
                 self._should_continue_branch(node, topic, retry_count + 1)
             self.logger.error("Branch decision failed:", exc_info=True)
             raise
 
-    def generate_content(
-        self, prompt: str, generation_config: Dict[str, Any] = {}
-    ) -> Dict[str, Any] | str:
-        try:
-            response = self.researcher.generate_content(
-                prompt, generation_config=generation_config
-            )
-            self.token_count += response.usage_metadata.total_token_count
-            if generation_config:
-                return json.loads(response.text)
-            return response.text
+    def generate_content(self, prompt: str, schema: Dict[str, Any] = {}) -> Dict[str, Any] | str:
+        safe = [
+            types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, threshold=types.HarmBlockThreshold.BLOCK_NONE),
+            types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_HARASSMENT, threshold=types.HarmBlockThreshold.BLOCK_NONE),
+            types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_HATE_SPEECH, threshold=types.HarmBlockThreshold.BLOCK_NONE),
+            types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, threshold=types.HarmBlockThreshold.BLOCK_NONE),
+            types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_CIVIC_INTEGRITY, threshold=types.HarmBlockThreshold.BLOCK_NONE),
+        ]
+        if schema:
+            generate_content_config = types.GenerateContentConfig(
+                temperature=0.9, response_mime_type="application/json", safety_settings=safe, response_schema=schema
+            )
+        else:
+            generate_content_config = types.GenerateContentConfig(temperature=0.9, response_mime_type="text/plain", safety_settings=safe)
+
+        try:
+            response = self.genai_client.models.generate_content(model="gemini-2.0-flash", contents=prompt, config=generate_content_config)
+            if not response:
+                raise Exception("NO_RESPONSE")
+
+            self.token_count += response.usage_metadata.total_token_count
+            return json.loads(response.text) if schema else response.text
 
         except Exception:
-            if response["candidates"][0]["finishReason"] == "RECITATION":
+            if response.candidates[0].finish_reason == types.FinishReason.RECITATION:
                 raise Exception("GEMINI_RECITATION")
             raise
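With the new SDK, safety settings and response schemas travel with each request in a `types.GenerateContentConfig` instead of being fixed on a `GenerativeModel` at construction time, which is why `generate_content` above builds its config per call. A self-contained sketch of that pattern (illustration under the same assumptions as the commit: `google-genai` installed and `GOOGLE_API_KEY` set; `decision_schema` is a hypothetical name):

    import json
    import os

    from google import genai
    from google.genai import types

    client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])

    # Constrain output to a JSON object with one boolean field,
    # mirroring Schema.continue_branch in the diff above.
    decision_schema = types.Schema(
        type=types.Type.OBJECT,
        required=["decision"],
        properties={"decision": types.Schema(type=types.Type.BOOLEAN)},
    )

    config = types.GenerateContentConfig(
        temperature=0.9,
        response_mime_type="application/json",
        response_schema=decision_schema,
    )

    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents="Should this research branch be explored further?",
        config=config,
    )
    decision = json.loads(response.text)["decision"]  # parses to a Python bool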
backend/pyproject.toml CHANGED
@@ -43,8 +43,7 @@ dependencies = [
     "google-api-python-client==2.160.0",
     "google-auth==2.38.0",
     "google-auth-httplib2==0.2.0",
-    "google-genai==1.0.0",
-    "google-generativeai==0.8.4",
+    "google-genai==1.2.0",
     "googleapis-common-protos==1.66.0",
     "greenlet==3.1.1",
     "grpcio==1.70.0",
@@ -52,7 +51,6 @@ dependencies = [
     "h11==0.14.0",
     "httpcore==1.0.7",
     "httplib2==0.22.0",
-    "httpx==0.27.2",
     "httpx-sse==0.4.0",
     "huggingface-hub==0.28.1",
     "idna==3.10",
backend/research_node.py CHANGED
@@ -15,7 +15,7 @@ class ResearchNode:
     def add_child(self, query: str) -> "ResearchNode":
         child = ResearchNode(query, parent=self, depth=self.depth + 1)
         self.children.append(child)
-        return copy.deepcopy(child)
+        return child
 
     def get_path_to_root(self) -> List[str]:
         path = [self.query]
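The `research_node.py` change is a behavioural fix rather than cleanup: returning `copy.deepcopy(child)` handed callers a detached clone, so anything later attached to the returned node never appeared in the tree that the final report walks. A minimal sketch of the difference, using a hypothetical `Node` class rather than the project's `ResearchNode`:

    import copy

    class Node:
        def __init__(self, query, parent=None):
            self.query, self.parent, self.children = query, parent, []

    root = Node("root")

    # Old behaviour: append the real child, but return a deep copy of it.
    child = Node("a", parent=root)
    root.children.append(child)
    clone = copy.deepcopy(child)
    clone.children.append(Node("lost", parent=clone))
    print(len(root.children[0].children))  # 0 -- the grandchild went to the clone

    # New behaviour: return the same object that lives in the tree.
    child2 = Node("b", parent=root)
    root.children.append(child2)
    child2.children.append(Node("kept", parent=child2))
    print(len(root.children[1].children))  # 1 -- mutation visible from the root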
backend/scraper.py CHANGED
@@ -68,9 +68,7 @@ class WebScraper:
             self.logger.info(f"Found {len(search_results)} URLs")
             return search_results
 
-        except (
-            requests.exceptions.RequestException
-        ) as e:  # Catch network errors specifically
+        except requests.exceptions.RequestException as e:  # Catch network errors specifically
             self.logger.error(f"DuckDuckGo search error: {str(e)}")
             return []
         except Exception as e:  # Catch any other errors
@@ -136,9 +134,7 @@ class WebScraper:
     def _extract_links(self, soup: BeautifulSoup) -> List[str]:
         return [a.get("href") for a in soup.find_all("a") if a.get("href")]
 
-    def _merge_extraction_results(
-        self, news_data: Dict, selenium_data: Dict
-    ) -> Dict[str, Any]:
+    def _merge_extraction_results(self, news_data: Dict, selenium_data: Dict) -> Dict[str, Any]:
         merged = selenium_data.copy()
 
         if news_data:
@@ -184,9 +180,7 @@ class CrawlForAIScraper:
         await self.crawler.close()
         self._is_started = False
 
-    async def search_and_scrape(
-        self, query: str, num_sites: int = 10
-    ) -> List[Dict[str, Any]]:
+    async def search_and_scrape(self, query: str, num_sites: int = 10) -> List[Dict[str, Any]]:
         await self.start()
         self.logger.info(f"Querying: {query}")
 
@@ -279,21 +273,12 @@ class CrawlForAIScraper:
             if "width" in img.attrs and img.get("width").lower() == "auto":
                 images.append((src, 999, 0))
             # Remove units from width and height: get start of the entity till the first non-digit character
-            width = "".join(
-                [i for i in img.get("width", "0") if i.isdigit() or i == "."]
-            )
-            height = "".join(
-                [i for i in img.get("height", "0") if i.isdigit() or i == "."]
-            )
+            width = "".join([i for i in img.get("width", "0") if i.isdigit() or i == "."])
+            height = "".join([i for i in img.get("height", "0") if i.isdigit() or i == "."])
             if width == "" or height == "":
                 continue
             width, height = float(width), float(height)
-            if (
-                width > 300
-                and height > 300
-                and "pixel" not in src
-                and "icon" not in src
-            ):
+            if width > 300 and height > 300 and "pixel" not in src and "icon" not in src:
                 images.append((src, width, height))
         images = sorted(images, key=lambda img: -1 * (img[1] * img[2]))
         images = [img[0] for img in images]
@@ -306,11 +291,7 @@ class CrawlForAIScraper:
     def _extract_videos(self, soup: BeautifulSoup) -> List[str]:
         # Extract videos from iframes and video tags
         videos = []
-        nodes = (
-            list(soup.find_all("iframe"))
-            + list(soup.find_all("video"))
-            + list(soup.find_all("a"))
-        )
+        nodes = list(soup.find_all("iframe")) + list(soup.find_all("video")) + list(soup.find_all("a"))
         for node in nodes:
             if node.name == "iframe":
                 src = node.get("src", "")
backend/uv.lock CHANGED
@@ -200,7 +200,6 @@ dependencies = [
     { name = "google-auth" },
     { name = "google-auth-httplib2" },
     { name = "google-genai" },
-    { name = "google-generativeai" },
     { name = "googleapis-common-protos" },
     { name = "greenlet" },
     { name = "grpcio" },
@@ -208,7 +207,6 @@ dependencies = [
     { name = "h11" },
     { name = "httpcore" },
     { name = "httplib2" },
-    { name = "httpx" },
     { name = "httpx-sse" },
     { name = "huggingface-hub" },
     { name = "idna" },
@@ -347,8 +345,7 @@ requires-dist = [
     { name = "google-api-python-client", specifier = "==2.160.0" },
     { name = "google-auth", specifier = "==2.38.0" },
     { name = "google-auth-httplib2", specifier = "==0.2.0" },
-    { name = "google-genai", specifier = "==1.0.0" },
-    { name = "google-generativeai", specifier = "==0.8.4" },
+    { name = "google-genai", specifier = "==1.2.0" },
     { name = "googleapis-common-protos", specifier = "==1.66.0" },
     { name = "greenlet", specifier = "==3.1.1" },
     { name = "grpcio", specifier = "==1.70.0" },
@@ -356,7 +353,6 @@ requires-dist = [
     { name = "h11", specifier = "==0.14.0" },
     { name = "httpcore", specifier = "==1.0.7" },
     { name = "httplib2", specifier = "==0.22.0" },
-    { name = "httpx", specifier = "==0.27.2" },
     { name = "httpx-sse", specifier = "==0.4.0" },
     { name = "huggingface-hub", specifier = "==0.28.1" },
     { name = "idna", specifier = "==3.10" },
@@ -956,35 +952,17 @@ wheels = [
 
 [[package]]
 name = "google-genai"
-version = "1.0.0"
+version = "1.2.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "google-auth" },
     { name = "pydantic" },
     { name = "requests" },
-    { name = "websockets" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/2f/c3/fba38ba11a9b97b0a6ca6d46ec0dcd3c7bdf3ecf83eec6e6117ac25106c7/google_genai-1.0.0.tar.gz", hash = "sha256:15712abb808f891a14eafc9edf21b8cf92ea952f627dd0e2e939657efd234acd", size = 122958 }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/24/9d/63dbd2b6c630f44cbbf09c4e04b4c9012da01f6e585d34ae53d07931bb67/google_genai-1.0.0-py3-none-any.whl", hash = "sha256:e9c3abd48f46ecb2b0a51efa7f65c6830b50f9784df603a91019b43918a7531f", size = 129418 },
-]
-
-[[package]]
-name = "google-generativeai"
-version = "0.8.4"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "google-ai-generativelanguage" },
-    { name = "google-api-core" },
-    { name = "google-api-python-client" },
-    { name = "google-auth" },
-    { name = "protobuf" },
-    { name = "pydantic" },
-    { name = "tqdm" },
     { name = "typing-extensions" },
+    { name = "websockets" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/9b/b0/6c6af327a8a6ef3be6fe79be1d6f1e2914d6c363aa6b081b93396f4460a7/google_generativeai-0.8.4-py3-none-any.whl", hash = "sha256:e987b33ea6decde1e69191ddcaec6ef974458864d243de7191db50c21a7c5b82", size = 175409 },
+    { url = "https://files.pythonhosted.org/packages/0d/ed/985f2d2e2b5fbd912ab0fdb11d6dc48c22553a6c4edffabb8146d53b974a/google_genai-1.2.0-py3-none-any.whl", hash = "sha256:609d61bee73f1a6ae5b47e9c7dd4b469d50318f050c5ceacf835b0f80f79d2d9", size = 130744 },
 ]
 
 [[package]]