Soham Waghmare committed
Commit 4e3ab6e · Parent(s): 63a0765

feat: improves logging

Files changed (4):
  1. backend/app.py +11 -14
  2. backend/knet.py +96 -28
  3. backend/research_node.py +1 -1
  4. backend/scraper.py +30 -10
backend/app.py CHANGED
@@ -84,7 +84,7 @@ async def health_check(sid, data):
 async def start_research(sid, data):
     try:
         data = json.loads(data) if type(data) is not dict else data
-        topic = data.get("topic")
+        topic = data.get("topic").strip()
         max_depth: int = data.get("max_depth")
         max_breadth: int = data.get("max_breadth")
         num_sites_per_query: int = data.get("num_sites_per_query")
@@ -92,21 +92,18 @@ async def start_research(sid, data):
         knet, _ = await session_manager.get_or_create_session(sid)
 
         session_id = sid
-        logger.info(f"Starting research for client {session_id} on topic: {topic}")
+        logger.info(f"Starting research for client {session_id}.\nTopic '{topic}'")
 
         async def progress_callback(status):
-            try:
-                logger.debug(f"Progress update: {status['progress']}% - {status['message']}")
-                await sio.emit(
-                    "status",
-                    {"message": status["message"], "progress": status["progress"]},
-                    room=session_id,
-                )
-            except Exception as e:
-                logger.error(f"Error in progress callback: {str(e)}")
-                raise e
-
-        research_results = await knet.conduct_research(topic, progress_callback, max_depth, max_breadth, num_sites_per_query)
+            await sio.emit(
+                "status",
+                {"message": status["message"], "progress": status["progress"]},
+                room=session_id,
+            )
+
+        research_results = await knet.conduct_research(
+            topic, progress_callback, max_depth, max_breadth, num_sites_per_query
+        )
         logger.info(f"Research completed for topic: {topic}")
        await sio.emit("research_complete", research_results, room=session_id)
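For context on how the slimmed-down handler is driven end to end, here is a minimal client sketch. It assumes the standard `python-socketio` AsyncClient, that the handler above is registered for the `start_research` event, and a server at `http://localhost:8000`; the URL and payload values are illustrative, not from the repository.

```python
# Hypothetical client sketch exercising the reworked "start_research" handler.
# The server emits "status" updates (via progress_callback) and a final
# "research_complete" event back to the caller's room.
import asyncio
import socketio

async def main():
    sio = socketio.AsyncClient()

    @sio.on("status")
    async def on_status(data):
        # Mirrors the progress_callback payload: {"message": ..., "progress": ...}
        print(f"[{data['progress']}%] {data['message']}")

    @sio.on("research_complete")
    async def on_complete(report):
        print("done:", list(report.keys()))

    await sio.connect("http://localhost:8000")
    await sio.emit("start_research", {
        "topic": "solid-state batteries",
        "max_depth": 1,
        "max_breadth": 2,
        "num_sites_per_query": 3,
    })
    await sio.wait()

asyncio.run(main())
```

Note that with the try/except removed from `progress_callback`, any emit failure now propagates into `conduct_research` and surfaces through its own error handling rather than being logged twice.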
backend/knet.py CHANGED
@@ -11,6 +11,7 @@ from dotenv import load_dotenv
 from google.ai.generativelanguage_v1beta.types import content
 
 from research_node import ResearchNode
+from scraper import CrawlForAIScraper
 
 # Load environment variables
 load_dotenv()
@@ -96,7 +97,13 @@ class ResearchProgress:
 
 
 class KNet:
-    def __init__(self, scraper_instance, max_depth: int = 1, max_breadth: int = 1, num_sites_per_query: int = 5):
+    def __init__(
+        self,
+        scraper_instance: CrawlForAIScraper,
+        max_depth: int = 1,
+        max_breadth: int = 1,
+        num_sites_per_query: int = 5,
+    ):
         self.api_key = os.getenv("GOOGLE_API_KEY")
         assert self.api_key, "Google API key is required"
         self.scraper = scraper_instance
@@ -114,8 +121,16 @@ class KNet:
             {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
             {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
         ]
-        self.researcher = genai.GenerativeModel("gemini-2.0-flash", generation_config=generation_config, safety_settings=safe)
-        self.research_manager = genai.GenerativeModel("gemini-2.0-flash", generation_config=generation_config, safety_settings=safe)
+        self.researcher = genai.GenerativeModel(
+            "gemini-2.0-flash",
+            generation_config=generation_config,
+            safety_settings=safe,
+        )
+        self.research_manager = genai.GenerativeModel(
+            "gemini-2.0-flash",
+            generation_config=generation_config,
+            safety_settings=safe,
+        )
 
         # Parameters
         self.max_depth = max_depth
@@ -127,7 +142,14 @@ class KNet:
         self.ctx_manager: list[str] = []
         self.token_count: int = 0
 
-    async def conduct_research(self, topic: str, progress_callback, max_depth: int, max_breadth: int, num_sites_per_query: int) -> dict:
+    async def conduct_research(
+        self,
+        topic: str,
+        progress_callback,
+        max_depth: int,
+        max_breadth: int,
+        num_sites_per_query: int,
+    ) -> dict:
         # Local Runtime State
         progress = ResearchProgress(progress_callback)
         self.max_depth = max_depth
@@ -138,7 +160,6 @@ class KNet:
         self.ctx_researcher = []
         self.ctx_manager = []
         self.token_count = 0
-        self.logger.info(f"Starting research on topic: {topic}")
 
         try:
             root_node = ResearchNode(topic)
@@ -150,10 +171,15 @@ class KNet:
             while to_explore:
                 current_node, current_depth = to_explore.popleft()
 
-                if current_node.query in explored_queries or current_depth > self.max_depth:
+                if (
+                    current_node.query in explored_queries
+                    or current_depth > self.max_depth
+                ):
                     continue
 
-                self.logger.info(f"Exploring: {current_node.query} (Depth: {current_depth})")
+                self.logger.info(
+                    f"Exploring: {current_node.query} (Depth: {current_depth})"
+                )
                 await progress.update(5, f"Exploring: {current_node.query}")
 
                 # Search and scrape
@@ -169,13 +195,14 @@ class KNet:
                 new_branches = self._gen_queries(current_node, topic)
                 for branch in new_branches:
                     to_explore.append((branch, current_depth + 1))
-                self.logger.info(f"Added {len(new_branches)} new branch(es) at depth {current_depth + 1}")
 
             # Generate final report
             await progress.update(30, "Generating comprehensive report...")
             final_report = self._generate_final_report(root_node)
 
-            self.logger.info(f"Research completed. Explored {len(explored_queries)} queries across {root_node.max_depth()} levels")
+            self.logger.info(
+                f"Research completed. Explored {len(explored_queries)} queries across {root_node.max_depth()} levels"
+            )
             await progress.update(100, "Research complete!")
 
             with open("output.json", "a", encoding="utf-8") as f:
@@ -186,7 +213,9 @@ class KNet:
             self.logger.error("Research failed", exc_info=True)
             raise
 
-    def _generate_final_report(self, root_node: ResearchNode, retry_count: int = 1) -> Dict[str, Any]:
+    def _generate_final_report(
+        self, root_node: ResearchNode, retry_count: int = 1
+    ) -> Dict[str, Any]:
         try:
             findings = "\n".join(self.ctx_manager)
             with open("output.json", "w") as f:
@@ -205,11 +234,18 @@ class KNet:
                 if data.get("videos"):
                     media_content["videos"].extend(data["videos"])
                 if data.get("links"):
-                    media_content["links"].extend([{"url": link["href"], "text": link["text"]} for link in data["links"]])
+                    media_content["links"].extend(
+                        [
+                            {"url": link["href"], "text": link["text"]}
+                            for link in data["links"]
+                        ]
+                    )
             # Dedupe
             media_content["images"] = list(set(media_content["images"]))
             media_content["videos"] = list(set(media_content["videos"]))
-            media_content["links"] = list({json.dumps(d, sort_keys=True) for d in media_content["links"]})
+            media_content["links"] = list(
+                {json.dumps(d, sort_keys=True) for d in media_content["links"]}
+            )
             media_content["links"] = [json.loads(d) for d in media_content["links"]]
 
             # Build research tree structure
@@ -222,7 +258,9 @@ class KNet:
                     "query": node.query,
                     "depth": node.depth,
                     "sources": sources,
-                    "children": [build_tree_structure(child) for child in node.children],
+                    "children": [
+                        build_tree_structure(child) for child in node.children
+                    ],
                 }
 
             return {
@@ -241,19 +279,31 @@ class KNet:
 
         except Exception as e:
             if e == "GEMINI_RECITATION" and retry_count < 3:
-                self.logger.error(f"Retrying final report:C:{retry_count / 3}", exc_info=True)
+                self.logger.error(
+                    f"Retrying final report:C:{retry_count / 3}", exc_info=True
+                )
                 self._generate_final_report(root_node, retry_count + 1)
             self.logger.error("Error generating final report", exc_info=True)
             raise
 
-    def _gen_queries(self, node: ResearchNode, topic: str, retry_count: int = 1) -> List[ResearchNode]:
+    def _gen_queries(
+        self, node: ResearchNode, topic: str, retry_count: int = 1
+    ) -> List[ResearchNode]:
         try:
             if not node.data or node.depth > self.max_depth:
                 return []
 
-            prompt = self.prompt.search_query.format(topic=topic, ctx_manager=json.dumps(self.ctx_manager, indent=2), max_breadth=self.max_breadth)
-            response = self.generate_content(prompt, generation_config=self.schema.search_query)
-            self.logger.info(f"New branches for '{node.query}': {response['branches']}")
+            prompt = self.prompt.search_query.format(
+                topic=topic,
+                ctx_manager=json.dumps(self.ctx_manager, indent=2),
+                max_breadth=self.max_breadth,
+            )
+            response = self.generate_content(
+                prompt, generation_config=self.schema.search_query
+            )
+            self.logger.info(
+                f"Spawn branches '{node.query}':\n{json.dumps(response['branches'], indent=2)}"
+            )
 
             # Add children to current node
             # |-> child
@@ -263,24 +313,34 @@ class KNet:
             for branch in response.get("branches", []):
                 child_node = node.add_child(branch["query"])
                 new_nodes.append(child_node)
+
+            self.logger.info(f"Spawned {len(new_nodes)} new branch(es)")
             return new_nodes
 
         except Exception as e:
             if e == "GEMINI_RECITATION" and retry_count < 3:
-                self.logger.error(f"Retrying analysis:C:{retry_count / 3}", exc_info=True)
+                self.logger.error(
+                    f"Retrying _gen_queries | C:{retry_count / 3}", exc_info=True
+                )
                 self._gen_queries(node, topic, retry_count + 1)
-            self.logger.error("Branch analysis failed:", exc_info=True)
+            self.logger.error("_gen_queries failed", exc_info=True)
             raise
 
-    def _should_continue_branch(self, node: ResearchNode, topic: str, retry_count: int = 1) -> bool:
+    def _should_continue_branch(
+        self, node: ResearchNode, topic: str, retry_count: int = 1
+    ) -> bool:
         try:
             if node.depth > self.max_depth:
                 return False
 
             # Generate summary of key findings into the manager's context
             if node.data:
-                findings = ("\n" + "-" * 10 + "Next data" + "-" * 10 + "\n").join([json.dumps(d, indent=2) for d in node.data])
-                response = self.generate_content(f"Extract key findings from the following data related to the topic '{topic}':\n{findings}")
+                findings = ("\n" + "-" * 10 + "Next data" + "-" * 10 + "\n").join(
+                    [json.dumps(d, indent=2) for d in node.data]
+                )
+                response = self.generate_content(
+                    f"Extract key findings from the following data related to the topic '{topic}':\n{findings}"
+                )
                 self.ctx_manager.append(response)
 
             # Research manager takes decision to proceed or not
@@ -290,21 +350,29 @@ class KNet:
                 path=" -> ".join(node.get_path_to_root()),
                 findings="\n".join(self.ctx_manager),
             )
-            response = self.generate_content(prompt, generation_config=self.schema.continue_branch)
-            self.logger.info(f"Branch decision for '{node.query}': {response['decision']}")
+            response = self.generate_content(
+                prompt, generation_config=self.schema.continue_branch
+            )
+            self.logger.info(f"Branch decision '{node.query}': {response['decision']}")
 
             return response["decision"]
 
         except Exception as e:
             if e == "GEMINI_RECITATION" and retry_count < 3:
-                self.logger.error(f"Retrying branch decision:C:{retry_count / 3}", exc_info=True)
+                self.logger.error(
+                    f"Retrying branch decision:C:{retry_count / 3}", exc_info=True
+                )
                 self._should_continue_branch(node, topic, retry_count + 1)
             self.logger.error("Branch decision failed:", exc_info=True)
             raise
 
-    def generate_content(self, prompt: str, generation_config: Dict[str, Any] = {}) -> Dict[str, Any] | str:
+    def generate_content(
+        self, prompt: str, generation_config: Dict[str, Any] = {}
+    ) -> Dict[str, Any] | str:
         try:
-            response = self.researcher.generate_content(prompt, generation_config=generation_config)
+            response = self.researcher.generate_content(
+                prompt, generation_config=generation_config
+            )
             self.token_count += response.usage_metadata.total_token_count
             if generation_config:
                 return json.loads(response.text)
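Two patterns in this file are worth unpacking. The link dedupe in `_generate_final_report` works around dicts being unhashable: each link dict is serialized with `json.dumps(..., sort_keys=True)` so equal links collapse in a set, then parsed back. A standalone sketch of the pattern:

```python
import json

links = [
    {"url": "https://example.com", "text": "Example"},
    {"text": "Example", "url": "https://example.com"},  # duplicate, different key order
    {"url": "https://example.org", "text": "Other"},
]

# sort_keys=True normalizes key order, so equal dicts serialize identically
unique = {json.dumps(d, sort_keys=True) for d in links}
deduped = [json.loads(d) for d in unique]
print(len(deduped))  # 2
```

The recitation-retry blocks, by contrast, compare the caught exception object to a string (`e == "GEMINI_RECITATION"`), which is always False in Python, and they discard the recursive call's return value. A generic sketch (not the repository's code) of a guard that would actually match and propagate the retried result, assuming the recitation failure surfaces as an exception whose message contains that marker:

```python
import logging

logger = logging.getLogger(__name__)

def generate_with_retry(build, retry_count: int = 1, max_retries: int = 3):
    """Retry build() when the failure message carries the recitation marker."""
    try:
        return build()
    except Exception as e:
        # Compare against the message, not the exception object, and
        # return the retried result instead of discarding it.
        if "GEMINI_RECITATION" in str(e) and retry_count < max_retries:
            logger.error("Retrying (%d/%d)", retry_count, max_retries, exc_info=True)
            return generate_with_retry(build, retry_count + 1, max_retries)
        raise
```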
backend/research_node.py CHANGED
@@ -15,7 +15,7 @@ class ResearchNode:
     def add_child(self, query: str) -> "ResearchNode":
         child = ResearchNode(query, parent=self, depth=self.depth + 1)
         self.children.append(child)
-        return child
+        return copy.deepcopy(child)
 
     def get_path_to_root(self) -> List[str]:
         path = [self.query]
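The changed return value is subtle: `copy.deepcopy(child)` hands the caller a detached snapshot, so mutating the returned node no longer touches the tree kept in `self.children` (the hunk also assumes `import copy` is present elsewhere in the module). A minimal illustration with a stand-in class:

```python
import copy

class Node:
    def __init__(self, query: str, parent=None, depth: int = 0):
        self.query = query
        self.parent = parent
        self.depth = depth
        self.children = []

root = Node("topic")
child = Node("sub-query", parent=root, depth=1)
root.children.append(child)

snapshot = copy.deepcopy(child)  # what the new add_child returns
snapshot.query = "mutated"
print(root.children[0].query)  # "sub-query" -- the tree is unchanged
```

Since `KNet._gen_queries` queues exactly these returned nodes for further exploration, downstream code now operates on snapshots rather than the nodes stored in the tree.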
backend/scraper.py CHANGED
@@ -68,7 +68,9 @@ class WebScraper:
             self.logger.info(f"Found {len(search_results)} URLs")
             return search_results
 
-        except requests.exceptions.RequestException as e:  # Catch network errors specifically
+        except (
+            requests.exceptions.RequestException
+        ) as e:  # Catch network errors specifically
             self.logger.error(f"DuckDuckGo search error: {str(e)}")
             return []
         except Exception as e:  # Catch any other errors
@@ -134,7 +136,9 @@ class WebScraper:
     def _extract_links(self, soup: BeautifulSoup) -> List[str]:
         return [a.get("href") for a in soup.find_all("a") if a.get("href")]
 
-    def _merge_extraction_results(self, news_data: Dict, selenium_data: Dict) -> Dict[str, Any]:
+    def _merge_extraction_results(
+        self, news_data: Dict, selenium_data: Dict
+    ) -> Dict[str, Any]:
         merged = selenium_data.copy()
 
         if news_data:
@@ -164,6 +168,7 @@ class CrawlForAIScraper:
             viewport_width=1920,
             viewport_height=1080,
             accept_downloads=True,
+            verbose=False,
         )
         self.crawler = AsyncWebCrawler(config=self.base_browser)
         self._is_started = False
@@ -179,9 +184,11 @@ class CrawlForAIScraper:
         await self.crawler.close()
         self._is_started = False
 
-    async def search_and_scrape(self, query: str, num_sites: int = 10) -> List[Dict[str, Any]]:
+    async def search_and_scrape(
+        self, query: str, num_sites: int = 10
+    ) -> List[Dict[str, Any]]:
         await self.start()
-        self.logger.info(f"Starting search for: {query}")
+        self.logger.info(f"Querying: {query}")
 
         # Perform a search to get a list of webpages
         search_results = await self._search(query, num_sites)
@@ -197,7 +204,6 @@ class CrawlForAIScraper:
         return scraped_data
 
     async def _search(self, query: str, num_results: int) -> List[str]:
-        self.logger.info("Performing Google search...")
         try:
             encoded_query = quote_plus(query)
             search_uri = f"https://www.google.com/search?q={encoded_query}"
@@ -219,7 +225,7 @@ class CrawlForAIScraper:
                     url = "https://" + url
                 search_results.append(url)
 
-            self.logger.info(f"Found {len(search_results)} results.")
+            self.logger.info(f"Found {len(search_results)} results")
             return search_results[:num_results]
 
         except requests.exceptions.RequestException as e:
@@ -255,6 +261,7 @@ class CrawlForAIScraper:
                     "links": result.links["external"],
                 }
                 scraped_sites.append(data)
+                self.logger.info(f" - {result.url[:80]}...")
             return scraped_sites
 
         except Exception as e:
@@ -272,12 +279,21 @@ class CrawlForAIScraper:
             if "width" in img.attrs and img.get("width").lower() == "auto":
                 images.append((src, 999, 0))
             # Remove units from width and height: get start of the entity till the first non-digit character
-            width = "".join([i for i in img.get("width", "0") if i.isdigit() or i == "."])
-            height = "".join([i for i in img.get("height", "0") if i.isdigit() or i == "."])
+            width = "".join(
+                [i for i in img.get("width", "0") if i.isdigit() or i == "."]
+            )
+            height = "".join(
+                [i for i in img.get("height", "0") if i.isdigit() or i == "."]
+            )
             if width == "" or height == "":
                 continue
             width, height = float(width), float(height)
-            if width > 300 and height > 300 and "pixel" not in src and "icon" not in src:
+            if (
+                width > 300
+                and height > 300
+                and "pixel" not in src
+                and "icon" not in src
+            ):
                 images.append((src, width, height))
         images = sorted(images, key=lambda img: -1 * (img[1] * img[2]))
         images = [img[0] for img in images]
@@ -290,7 +306,11 @@ class CrawlForAIScraper:
     def _extract_videos(self, soup: BeautifulSoup) -> List[str]:
         # Extract videos from iframes and video tags
         videos = []
-        nodes = list(soup.find_all("iframe")) + list(soup.find_all("video")) + list(soup.find_all("a"))
+        nodes = (
+            list(soup.find_all("iframe"))
+            + list(soup.find_all("video"))
+            + list(soup.find_all("a"))
+        )
         for node in nodes:
             if node.name == "iframe":
                 src = node.get("src", "")
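The reflowed image filter in `_extract_images` strips CSS units from the `width`/`height` attributes before the size comparison and drops tracking-pixel and icon URLs. A condensed, self-contained sketch of that heuristic (the function names here are illustrative, not from the codebase):

```python
def numeric_part(value: str) -> str:
    # "300px" -> "300", "50%" -> "50", "auto" -> ""
    return "".join(c for c in value if c.isdigit() or c == ".")

def keep_image(src: str, width_attr: str, height_attr: str) -> bool:
    width, height = numeric_part(width_attr), numeric_part(height_attr)
    if not width or not height:
        return False  # non-numeric size attributes are skipped
    return (
        float(width) > 300
        and float(height) > 300
        and "pixel" not in src  # tracking pixels
        and "icon" not in src   # favicons / UI sprites
    )

print(keep_image("https://example.com/photo.jpg", "640px", "480px"))  # True
print(keep_image("https://example.com/icon.png", "640", "480"))       # False
```

Surviving images are then sorted by area (width × height, descending) so the largest, most content-relevant images come first.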