Soham Waghmare committed
Commit 0a3d9b7 · 1 Parent(s): 4e3ab6e

feat: migration from google-generativeai to google-genai
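This commit replaces the legacy `google-generativeai` SDK with the unified `google-genai` client throughout the backend. As a minimal sketch of the call-pattern change (illustration only, not part of the committed files; assumes `GOOGLE_API_KEY` is set in the environment):

    import os

    # Old SDK (google-generativeai), removed by this commit:
    #   import google.generativeai as genai
    #   genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
    #   model = genai.GenerativeModel("gemini-2.0-flash")
    #   text = model.generate_content("Hello").text

    # New SDK (google-genai), adopted by this commit: one client object,
    # with the model name passed per request instead of baked into a model instance.
    from google import genai

    client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])
    response = client.models.generate_content(model="gemini-2.0-flash", contents="Hello")
    print(response.text)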
backend/app.py CHANGED
@@ -101,9 +101,7 @@ async def start_research(sid, data):
         room=session_id,
     )
 
-    research_results = await knet.conduct_research(
-        topic, progress_callback, max_depth, max_breadth, num_sites_per_query
-    )
+    research_results = await knet.conduct_research(topic, progress_callback, max_depth, max_breadth, num_sites_per_query)
     logger.info(f"Research completed for topic: {topic}")
     await sio.emit("research_complete", research_results, room=session_id)
backend/knet.py CHANGED
@@ -6,9 +6,9 @@ from datetime import datetime
 from textwrap import dedent
 from typing import Any, Dict, List
 
-import google.generativeai as genai
 from dotenv import load_dotenv
-from google.ai.generativelanguage_v1beta.types import content
+from google import genai
+from google.genai import types
 
 from research_node import ResearchNode
 from scraper import CrawlForAIScraper
@@ -39,7 +39,7 @@ class Prompt:
     Findings:
     {ctx_manager}
 
-    Suggest up to {max_breadth} specific google search queries that would help data which:
+    Suggest up to {n} specific google search queries that would help gather data which:
     - Builds upon these findings
     - Explores different aspects
     - Goes deeper into important details
@@ -50,37 +50,30 @@ class Prompt:
 
 class Schema:
     def __init__(self) -> None:
-        self.continue_branch = {
-            "response_schema": content.Schema(
-                type=content.Type.OBJECT,
-                required=["decision"],
-                properties={
-                    "decision": content.Schema(type=content.Type.BOOLEAN),
-                },
-            ),
-            "response_mime_type": "application/json",
-        }
-
-        self.search_query = {
-            "response_schema": content.Schema(
-                type=content.Type.OBJECT,
-                required=["branches"],
-                properties={
-                    "branches": content.Schema(
-                        type=content.Type.ARRAY,
-                        items=content.Schema(
-                            type=content.Type.OBJECT,
-                            required=["importance", "query"],
-                            properties={
-                                "importance": content.Schema(type=content.Type.NUMBER),
-                                "query": content.Schema(type=content.Type.STRING),
-                            },
-                        ),
-                    )
-                },
-            ),
-            "response_mime_type": "application/json",
-        }
+        self.continue_branch = genai.types.Schema(
+            type=genai.types.Type.OBJECT,
+            required=["decision"],
+            properties={
+                "decision": genai.types.Schema(type=genai.types.Type.BOOLEAN),
+            },
+        )
+
+        self.search_query = genai.types.Schema(
+            type=genai.types.Type.OBJECT,
+            required=["branches"],
+            properties={
+                "branches": genai.types.Schema(
+                    type=genai.types.Type.ARRAY,
+                    items=genai.types.Schema(
+                        type=genai.types.Type.OBJECT,
+                        required=["query"],
+                        properties={
+                            "query": genai.types.Schema(type=genai.types.Type.STRING),
+                        },
+                    ),
+                )
+            },
+        )
 
 
 class ResearchProgress:
@@ -97,13 +90,7 @@ class ResearchProgress:
 
 
 class KNet:
-    def __init__(
-        self,
-        scraper_instance: CrawlForAIScraper,
-        max_depth: int = 1,
-        max_breadth: int = 1,
-        num_sites_per_query: int = 5,
-    ):
+    def __init__(self, scraper_instance: CrawlForAIScraper, max_depth: int = 1, max_breadth: int = 1, num_sites_per_query: int = 5):
         self.api_key = os.getenv("GOOGLE_API_KEY")
         assert self.api_key, "Google API key is required"
         self.scraper = scraper_instance
@@ -111,26 +98,8 @@ class KNet:
         self.prompt = Prompt()
         self.schema = Schema()
 
-        # Init Agents' Base Model
-        genai.configure(api_key=self.api_key)
-        generation_config = {"temperature": 0.9}
-        safe = [
-            {"category": "HARM_CATEGORY_DANGEROUS", "threshold": "BLOCK_NONE"},
-            {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
-            {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
-            {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
-            {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
-        ]
-        self.researcher = genai.GenerativeModel(
-            "gemini-2.0-flash",
-            generation_config=generation_config,
-            safety_settings=safe,
-        )
-        self.research_manager = genai.GenerativeModel(
-            "gemini-2.0-flash",
-            generation_config=generation_config,
-            safety_settings=safe,
-        )
+        # Init Google GenAI client
+        self.genai_client = genai.Client(api_key=self.api_key)
 
         # Parameters
         self.max_depth = max_depth
@@ -142,14 +111,7 @@ class KNet:
         self.ctx_manager: list[str] = []
         self.token_count: int = 0
 
-    async def conduct_research(
-        self,
-        topic: str,
-        progress_callback,
-        max_depth: int,
-        max_breadth: int,
-        num_sites_per_query: int,
-    ) -> dict:
+    async def conduct_research(self, topic: str, progress_callback, max_depth: int, max_breadth: int, num_sites_per_query: int) -> dict:
         # Local Runtime State
         progress = ResearchProgress(progress_callback)
         self.max_depth = max_depth
@@ -162,7 +124,12 @@ class KNet:
         self.token_count = 0
 
         try:
-            root_node = ResearchNode(topic)
+            # Generate initial search query
+            query = self.generate_content(
+                self.prompt.search_query.format(topic=topic, ctx_manager=json.dumps(self.ctx_manager, indent=2), n=1),
+                schema=self.schema.search_query,
+            )
+            root_node = ResearchNode(query.get("branches")[0]["query"])
             to_explore = deque([(root_node, 0)])  # (node, depth) pairs
             explored_queries = set()  # {string, string, ...}
 
@@ -171,15 +138,10 @@ class KNet:
             while to_explore:
                 current_node, current_depth = to_explore.popleft()
 
-                if (
-                    current_node.query in explored_queries
-                    or current_depth > self.max_depth
-                ):
+                if current_node.query in explored_queries or current_depth > self.max_depth:
                     continue
 
-                self.logger.info(
-                    f"Exploring: {current_node.query} (Depth: {current_depth})"
-                )
+                self.logger.info(f"Exploring: {current_node.query} (Depth: {current_depth})")
                 await progress.update(5, f"Exploring: {current_node.query}")
 
                 # Search and scrape
@@ -190,8 +152,8 @@ class KNet:
                 explored_queries.add(current_node.query)
 
                 # Only branch if we have data and haven't reached max depth
-                if current_node.data and current_depth < self.max_depth:
-                    if self._should_continue_branch(current_node, topic):
+                if self._should_continue_branch(current_node, topic):
+                    if current_node.data and current_depth < self.max_depth:
                         new_branches = self._gen_queries(current_node, topic)
                         for branch in new_branches:
                             to_explore.append((branch, current_depth + 1))
@@ -200,9 +162,7 @@ class KNet:
             await progress.update(30, "Generating comprehensive report...")
             final_report = self._generate_final_report(root_node)
 
-            self.logger.info(
-                f"Research completed. Explored {len(explored_queries)} queries across {root_node.max_depth()} levels"
-            )
+            self.logger.info(f"Research completed. Explored {len(explored_queries)} queries across {root_node.max_depth()} levels")
             await progress.update(100, "Research complete!")
 
             with open("output.json", "a", encoding="utf-8") as f:
@@ -213,12 +173,10 @@ class KNet:
             self.logger.error("Research failed", exc_info=True)
             raise
 
-    def _generate_final_report(
-        self, root_node: ResearchNode, retry_count: int = 1
-    ) -> Dict[str, Any]:
+    def _generate_final_report(self, root_node: ResearchNode, retry_count: int = 1) -> Dict[str, Any]:
         try:
             findings = "\n".join(self.ctx_manager)
-            with open("output.json", "w") as f:
+            with open("output.json", "w", encoding="utf-8") as f:
                 f.write(findings)
             prompt = f"""Generate a comprehensive report on the topic "{root_node.query}" based on the following research findings:
             {findings}
@@ -234,18 +192,11 @@ class KNet:
                 if data.get("videos"):
                     media_content["videos"].extend(data["videos"])
                 if data.get("links"):
-                    media_content["links"].extend(
-                        [
-                            {"url": link["href"], "text": link["text"]}
-                            for link in data["links"]
-                        ]
-                    )
+                    media_content["links"].extend([{"url": link["href"], "text": link["text"]} for link in data["links"]])
             # Dedupe
             media_content["images"] = list(set(media_content["images"]))
             media_content["videos"] = list(set(media_content["videos"]))
-            media_content["links"] = list(
-                {json.dumps(d, sort_keys=True) for d in media_content["links"]}
-            )
+            media_content["links"] = list({json.dumps(d, sort_keys=True) for d in media_content["links"]})
             media_content["links"] = [json.loads(d) for d in media_content["links"]]
 
             # Build research tree structure
@@ -258,9 +209,7 @@ class KNet:
                     "query": node.query,
                     "depth": node.depth,
                     "sources": sources,
-                    "children": [
-                        build_tree_structure(child) for child in node.children
-                    ],
+                    "children": [build_tree_structure(child) for child in node.children],
                 }
 
             return {
@@ -278,17 +227,13 @@ class KNet:
             }
 
         except Exception as e:
-            if e == "GEMINI_RECITATION" and retry_count < 3:
-                self.logger.error(
-                    f"Retrying final report:C:{retry_count / 3}", exc_info=True
-                )
+            if str(e) in ["GEMINI_RECITATION", "NO_RESPONSE"] and retry_count < 3:
+                self.logger.error(f"Retrying final report: {retry_count}/3", exc_info=True)
                 self._generate_final_report(root_node, retry_count + 1)
             self.logger.error("Error generating final report", exc_info=True)
             raise
 
-    def _gen_queries(
-        self, node: ResearchNode, topic: str, retry_count: int = 1
-    ) -> List[ResearchNode]:
+    def _gen_queries(self, node: ResearchNode, topic: str, retry_count: int = 1) -> List[ResearchNode]:
         try:
             if not node.data or node.depth > self.max_depth:
                 return []
@@ -296,14 +241,10 @@ class KNet:
             prompt = self.prompt.search_query.format(
                 topic=topic,
                 ctx_manager=json.dumps(self.ctx_manager, indent=2),
-                max_breadth=self.max_breadth,
-            )
-            response = self.generate_content(
-                prompt, generation_config=self.schema.search_query
-            )
-            self.logger.info(
-                f"Spawn branches '{node.query}':\n{json.dumps(response['branches'], indent=2)}"
+                n=self.max_breadth,
             )
+            response = self.generate_content(prompt, schema=self.schema.search_query)
+            self.logger.info(f"Spawn branches '{node.query}':\n{json.dumps(response['branches'], indent=2)}")
 
             # Add children to current node
             # |-> child
@@ -318,29 +259,21 @@ class KNet:
             return new_nodes
 
         except Exception as e:
-            if e == "GEMINI_RECITATION" and retry_count < 3:
-                self.logger.error(
-                    f"Retrying _gen_queries | C:{retry_count / 3}", exc_info=True
-                )
+            if str(e) in ["GEMINI_RECITATION", "NO_RESPONSE"] and retry_count < 3:
+                self.logger.error(f"Retrying _gen_queries | {retry_count}/3", exc_info=True)
                 self._gen_queries(node, topic, retry_count + 1)
             self.logger.error("_gen_queries failed", exc_info=True)
             raise
 
-    def _should_continue_branch(
-        self, node: ResearchNode, topic: str, retry_count: int = 1
-    ) -> bool:
+    def _should_continue_branch(self, node: ResearchNode, topic: str, retry_count: int = 1) -> bool:
         try:
             if node.depth > self.max_depth:
                 return False
 
             # Generate summary of key findings into the manager's context
             if node.data:
-                findings = ("\n" + "-" * 10 + "Next data" + "-" * 10 + "\n").join(
-                    [json.dumps(d, indent=2) for d in node.data]
-                )
-                response = self.generate_content(
-                    f"Extract key findings from the following data related to the topic '{topic}':\n{findings}"
-                )
+                findings = ("\n" + "-" * 10 + "Next data" + "-" * 10 + "\n").join([json.dumps(d, indent=2) for d in node.data])
+                response = self.generate_content(f"Extract key findings from the following data related to the topic '{topic}':\n{findings}")
                 self.ctx_manager.append(response)
 
             # Research manager takes decision to proceed or not
@@ -350,35 +283,42 @@ class KNet:
                 path=" -> ".join(node.get_path_to_root()),
                 findings="\n".join(self.ctx_manager),
             )
-            response = self.generate_content(
-                prompt, generation_config=self.schema.continue_branch
-            )
+            response = self.generate_content(prompt, schema=self.schema.continue_branch)
            self.logger.info(f"Branch decision '{node.query}': {response['decision']}")
 
             return response["decision"]
 
         except Exception as e:
-            if e == "GEMINI_RECITATION" and retry_count < 3:
-                self.logger.error(
-                    f"Retrying branch decision:C:{retry_count / 3}", exc_info=True
-                )
+            if str(e) in ["GEMINI_RECITATION", "NO_RESPONSE"] and retry_count < 3:
+                self.logger.error(f"Retrying branch decision: {retry_count}/3", exc_info=True)
                 self._should_continue_branch(node, topic, retry_count + 1)
             self.logger.error("Branch decision failed:", exc_info=True)
             raise
 
-    def generate_content(
-        self, prompt: str, generation_config: Dict[str, Any] = {}
-    ) -> Dict[str, Any] | str:
-        try:
-            response = self.researcher.generate_content(
-                prompt, generation_config=generation_config
-            )
-            self.token_count += response.usage_metadata.total_token_count
-            if generation_config:
-                return json.loads(response.text)
-            return response.text
+    def generate_content(self, prompt: str, schema: Dict[str, Any] = {}) -> Dict[str, Any] | str:
+        safe = [
+            types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, threshold=types.HarmBlockThreshold.BLOCK_NONE),
+            types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_HARASSMENT, threshold=types.HarmBlockThreshold.BLOCK_NONE),
+            types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_HATE_SPEECH, threshold=types.HarmBlockThreshold.BLOCK_NONE),
+            types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, threshold=types.HarmBlockThreshold.BLOCK_NONE),
+            types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_CIVIC_INTEGRITY, threshold=types.HarmBlockThreshold.BLOCK_NONE),
+        ]
+        if schema:
+            generate_content_config = types.GenerateContentConfig(
+                temperature=0.9, response_mime_type="application/json", safety_settings=safe, response_schema=schema
+            )
+        else:
+            generate_content_config = types.GenerateContentConfig(temperature=0.9, response_mime_type="text/plain", safety_settings=safe)
+
+        try:
+            response = self.genai_client.models.generate_content(model="gemini-2.0-flash", contents=prompt, config=generate_content_config)
+            if not response:
+                raise Exception("NO_RESPONSE")
+
+            self.token_count += response.usage_metadata.total_token_count
+            return json.loads(response.text) if schema else response.text
 
         except Exception:
-            if response["candidates"][0]["finishReason"] == "RECITATION":
+            if response.candidates[0].finish_reason == types.FinishReason.RECITATION:
                 raise Exception("GEMINI_RECITATION")
             raise
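With the new SDK, safety settings and response schemas travel with each request in a `types.GenerateContentConfig` instead of being fixed on a `GenerativeModel` at construction time, which is why `generate_content` above builds its config per call. A self-contained sketch of that pattern (illustration under the same assumptions as the commit: `google-genai` installed and `GOOGLE_API_KEY` set; `decision_schema` is a hypothetical name):

    import json
    import os

    from google import genai
    from google.genai import types

    client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])

    # Constrain output to a JSON object with one boolean field,
    # mirroring Schema.continue_branch in the diff above.
    decision_schema = types.Schema(
        type=types.Type.OBJECT,
        required=["decision"],
        properties={"decision": types.Schema(type=types.Type.BOOLEAN)},
    )

    config = types.GenerateContentConfig(
        temperature=0.9,
        response_mime_type="application/json",
        response_schema=decision_schema,
    )

    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents="Should this research branch be explored further?",
        config=config,
    )
    decision = json.loads(response.text)["decision"]  # parses to a Python bool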
backend/pyproject.toml CHANGED
@@ -43,8 +43,7 @@ dependencies = [
     "google-api-python-client==2.160.0",
     "google-auth==2.38.0",
     "google-auth-httplib2==0.2.0",
-    "google-genai==1.0.0",
-    "google-generativeai==0.8.4",
+    "google-genai==1.2.0",
     "googleapis-common-protos==1.66.0",
     "greenlet==3.1.1",
     "grpcio==1.70.0",
@@ -52,7 +51,6 @@ dependencies = [
     "h11==0.14.0",
     "httpcore==1.0.7",
     "httplib2==0.22.0",
-    "httpx==0.27.2",
     "httpx-sse==0.4.0",
     "huggingface-hub==0.28.1",
     "idna==3.10",
backend/research_node.py CHANGED
@@ -15,7 +15,7 @@ class ResearchNode:
     def add_child(self, query: str) -> "ResearchNode":
         child = ResearchNode(query, parent=self, depth=self.depth + 1)
         self.children.append(child)
-        return copy.deepcopy(child)
+        return child
 
     def get_path_to_root(self) -> List[str]:
         path = [self.query]
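The `research_node.py` change is a behavioural fix rather than cleanup: returning `copy.deepcopy(child)` handed callers a detached clone, so anything later attached to the returned node never appeared in the tree that the final report walks. A minimal sketch of the difference, using a hypothetical `Node` class rather than the project's `ResearchNode`:

    import copy

    class Node:
        def __init__(self, query, parent=None):
            self.query, self.parent, self.children = query, parent, []

    root = Node("root")

    # Old behaviour: append the real child, but return a deep copy of it.
    child = Node("a", parent=root)
    root.children.append(child)
    clone = copy.deepcopy(child)
    clone.children.append(Node("lost", parent=clone))
    print(len(root.children[0].children))  # 0 -- the grandchild went to the clone

    # New behaviour: return the same object that lives in the tree.
    child2 = Node("b", parent=root)
    root.children.append(child2)
    child2.children.append(Node("kept", parent=child2))
    print(len(root.children[1].children))  # 1 -- mutation visible from the root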
backend/scraper.py CHANGED
@@ -68,9 +68,7 @@ class WebScraper:
             self.logger.info(f"Found {len(search_results)} URLs")
             return search_results
 
-        except (
-            requests.exceptions.RequestException
-        ) as e:  # Catch network errors specifically
+        except requests.exceptions.RequestException as e:  # Catch network errors specifically
             self.logger.error(f"DuckDuckGo search error: {str(e)}")
             return []
         except Exception as e:  # Catch any other errors
@@ -136,9 +134,7 @@ class WebScraper:
     def _extract_links(self, soup: BeautifulSoup) -> List[str]:
         return [a.get("href") for a in soup.find_all("a") if a.get("href")]
 
-    def _merge_extraction_results(
-        self, news_data: Dict, selenium_data: Dict
-    ) -> Dict[str, Any]:
+    def _merge_extraction_results(self, news_data: Dict, selenium_data: Dict) -> Dict[str, Any]:
         merged = selenium_data.copy()
 
         if news_data:
@@ -184,9 +180,7 @@ class CrawlForAIScraper:
         await self.crawler.close()
         self._is_started = False
 
-    async def search_and_scrape(
-        self, query: str, num_sites: int = 10
-    ) -> List[Dict[str, Any]]:
+    async def search_and_scrape(self, query: str, num_sites: int = 10) -> List[Dict[str, Any]]:
         await self.start()
         self.logger.info(f"Querying: {query}")
 
@@ -279,21 +273,12 @@ class CrawlForAIScraper:
             if "width" in img.attrs and img.get("width").lower() == "auto":
                 images.append((src, 999, 0))
             # Remove units from width and height: get start of the entity till the first non-digit character
-            width = "".join(
-                [i for i in img.get("width", "0") if i.isdigit() or i == "."]
-            )
-            height = "".join(
-                [i for i in img.get("height", "0") if i.isdigit() or i == "."]
-            )
+            width = "".join([i for i in img.get("width", "0") if i.isdigit() or i == "."])
+            height = "".join([i for i in img.get("height", "0") if i.isdigit() or i == "."])
             if width == "" or height == "":
                 continue
             width, height = float(width), float(height)
-            if (
-                width > 300
-                and height > 300
-                and "pixel" not in src
-                and "icon" not in src
-            ):
+            if width > 300 and height > 300 and "pixel" not in src and "icon" not in src:
                 images.append((src, width, height))
         images = sorted(images, key=lambda img: -1 * (img[1] * img[2]))
         images = [img[0] for img in images]
@@ -306,11 +291,7 @@ class CrawlForAIScraper:
     def _extract_videos(self, soup: BeautifulSoup) -> List[str]:
         # Extract videos from iframes and video tags
         videos = []
-        nodes = (
-            list(soup.find_all("iframe"))
-            + list(soup.find_all("video"))
-            + list(soup.find_all("a"))
-        )
+        nodes = list(soup.find_all("iframe")) + list(soup.find_all("video")) + list(soup.find_all("a"))
         for node in nodes:
             if node.name == "iframe":
                 src = node.get("src", "")
backend/uv.lock CHANGED
@@ -200,7 +200,6 @@ dependencies = [
     { name = "google-auth" },
     { name = "google-auth-httplib2" },
     { name = "google-genai" },
-    { name = "google-generativeai" },
     { name = "googleapis-common-protos" },
     { name = "greenlet" },
     { name = "grpcio" },
@@ -208,7 +207,6 @@ dependencies = [
     { name = "h11" },
     { name = "httpcore" },
     { name = "httplib2" },
-    { name = "httpx" },
     { name = "httpx-sse" },
     { name = "huggingface-hub" },
     { name = "idna" },
@@ -347,8 +345,7 @@ requires-dist = [
     { name = "google-api-python-client", specifier = "==2.160.0" },
     { name = "google-auth", specifier = "==2.38.0" },
     { name = "google-auth-httplib2", specifier = "==0.2.0" },
-    { name = "google-genai", specifier = "==1.0.0" },
-    { name = "google-generativeai", specifier = "==0.8.4" },
+    { name = "google-genai", specifier = "==1.2.0" },
     { name = "googleapis-common-protos", specifier = "==1.66.0" },
     { name = "greenlet", specifier = "==3.1.1" },
     { name = "grpcio", specifier = "==1.70.0" },
@@ -356,7 +353,6 @@ requires-dist = [
     { name = "h11", specifier = "==0.14.0" },
     { name = "httpcore", specifier = "==1.0.7" },
     { name = "httplib2", specifier = "==0.22.0" },
-    { name = "httpx", specifier = "==0.27.2" },
     { name = "httpx-sse", specifier = "==0.4.0" },
     { name = "huggingface-hub", specifier = "==0.28.1" },
     { name = "idna", specifier = "==3.10" },
@@ -956,35 +952,17 @@ wheels = [
 
 [[package]]
 name = "google-genai"
-version = "1.0.0"
+version = "1.2.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "google-auth" },
     { name = "pydantic" },
     { name = "requests" },
-    { name = "websockets" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/2f/c3/fba38ba11a9b97b0a6ca6d46ec0dcd3c7bdf3ecf83eec6e6117ac25106c7/google_genai-1.0.0.tar.gz", hash = "sha256:15712abb808f891a14eafc9edf21b8cf92ea952f627dd0e2e939657efd234acd", size = 122958 }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/24/9d/63dbd2b6c630f44cbbf09c4e04b4c9012da01f6e585d34ae53d07931bb67/google_genai-1.0.0-py3-none-any.whl", hash = "sha256:e9c3abd48f46ecb2b0a51efa7f65c6830b50f9784df603a91019b43918a7531f", size = 129418 },
-]
-
-[[package]]
-name = "google-generativeai"
-version = "0.8.4"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "google-ai-generativelanguage" },
-    { name = "google-api-core" },
-    { name = "google-api-python-client" },
-    { name = "google-auth" },
-    { name = "protobuf" },
-    { name = "pydantic" },
-    { name = "tqdm" },
     { name = "typing-extensions" },
+    { name = "websockets" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/9b/b0/6c6af327a8a6ef3be6fe79be1d6f1e2914d6c363aa6b081b93396f4460a7/google_generativeai-0.8.4-py3-none-any.whl", hash = "sha256:e987b33ea6decde1e69191ddcaec6ef974458864d243de7191db50c21a7c5b82", size = 175409 },
+    { url = "https://files.pythonhosted.org/packages/0d/ed/985f2d2e2b5fbd912ab0fdb11d6dc48c22553a6c4edffabb8146d53b974a/google_genai-1.2.0-py3-none-any.whl", hash = "sha256:609d61bee73f1a6ae5b47e9c7dd4b469d50318f050c5ceacf835b0f80f79d2d9", size = 130744 },
 ]
 
 [[package]]