Soham Waghmare committed
Commit · 9155a62
Parent: d1e806c

Commit message: format
Files changed:
- backend/app.py  +19 -8
- backend/crawl_ai.py  +8 -2
- backend/knet.py  +30 -19
- backend/research_node.py  +5 -4
- backend/scraper.py  +47 -30
backend/app.py (CHANGED)

@@ -1,12 +1,17 @@
 # pip install asyncio eventlet
 # pip install google-genai beautifulsoup4 selenium newspaper3k lxml_html_clean
+import json
+import logging
+
+import socketio
+from dotenv import load_dotenv
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
-import json, logging
+
 from knet import KNet
 from scraper import CrawlForAIScraper, WebScraper
+
+
 load_dotenv()

 # Configure logging
@@ -14,10 +19,12 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

 app = FastAPI()
+app.add_middleware(
+    CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"]
+)

 sio = socketio.AsyncServer(cors_allowed_origins="*", ping_timeout=60, ping_interval=10, async_mode="asgi")
+app.mount("/", socketio.ASGIApp(sio))

 # Initialize the scraper and KNet
 scraper_instance = CrawlForAIScraper()
@@ -52,7 +59,9 @@ async def start_research(sid, data):
     async def progress_callback(status):
         try:
             logger.debug(f"Progress update: {status['progress']}% - {status['message']}")
+            await sio.emit(
+                "status", {"message": status["message"], "progress": status["progress"]}, room=session_id
+            )
         except Exception as e:
             logger.error(f"Error in progress callback: {str(e)}")
             raise e
@@ -75,7 +84,9 @@ async def test(sid, data):
     await scraper_instance.close()
     await sio.emit("test", res, room=sid)

+
 if __name__ == "__main__":
     logger.info("Starting KnowledgeNet server...")
     import uvicorn
+
+    uvicorn.run(app, host="127.0.0.1", port=5000)
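
For readers following the app.py hunks: they wire a python-socketio AsyncServer into the FastAPI app and stream progress back to the caller. The sketch below shows that wiring in one self-contained file; it reuses the middleware and mount calls from the diff, but the start_research body here is a placeholder, not the repository's implementation.

# Minimal sketch: FastAPI + python-socketio (ASGI), mirroring the wiring in this commit.
# The research work is stubbed out; only the socket plumbing is the point here.
import socketio
import uvicorn
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

app = FastAPI()
app.add_middleware(
    CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"]
)

sio = socketio.AsyncServer(cors_allowed_origins="*", async_mode="asgi")
app.mount("/", socketio.ASGIApp(sio))


@sio.event
async def start_research(sid, data):
    # Emit progress updates back to the requesting client only (room=sid).
    await sio.emit("status", {"message": "starting", "progress": 0}, room=sid)
    # ... run the long-running job here, emitting "status" events as it progresses ...
    await sio.emit("status", {"message": "done", "progress": 100}, room=sid)


if __name__ == "__main__":
    uvicorn.run(app, host="127.0.0.1", port=5000)

Passing room=sid (or a saved session id, as in progress_callback above) limits each status emit to the single client that started the job.
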
backend/crawl_ai.py (CHANGED)

@@ -1,8 +1,13 @@
 import asyncio
+import json
+import sys
+
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode
+
+
 # from base64 import b64decode

+
 async def main():
     base_browser = BrowserConfig(
         browser_type="chromium",
@@ -43,5 +48,6 @@ async def main():
     # else:
    #     print("[ERROR]", result.error_message)

+
 if __name__ == "__main__":
     asyncio.run(main())
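
The new imports here pull in crawl4ai's AsyncWebCrawler, BrowserConfig, and CacheMode. As a reference point, a minimal single-page crawl using only the arun options that appear elsewhere in this commit might look like the sketch below; the target URL is a placeholder, and the context-manager usage follows crawl4ai's documented pattern rather than code from this repo.

# Sketch: one crawl4ai fetch with the same arun options used in scraper.py.
import asyncio

from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode


async def fetch_html(url: str) -> str:
    browser = BrowserConfig(browser_type="chromium", headless=True)
    async with AsyncWebCrawler(config=browser) as crawler:
        result = await crawler.arun(
            url=url,
            screenshot=False,
            cache_mode=CacheMode.BYPASS,   # always refetch instead of using the local cache
            delay_before_return_html=2,    # give JS-heavy pages a moment to finish rendering
            page_timeout=25000,
            scan_full_page=True,
        )
        return result.html


if __name__ == "__main__":
    html = asyncio.run(fetch_html("https://example.com"))
    print(len(html))
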
backend/knet.py (CHANGED)

@@ -1,14 +1,17 @@
-from typing import Dict, List, Any
-from textwrap import dedent
-import google.generativeai as genai
-from google.ai.generativelanguage_v1beta.types import content
-import logging
 import json
+import logging
 import os
+from collections import deque
 from datetime import datetime
+from textwrap import dedent
+from typing import Any, Dict, List
+
+import google.generativeai as genai
 from dotenv import load_dotenv
+from google.ai.generativelanguage_v1beta.types import content
+
 from research_node import ResearchNode
+

 # Load environment variables
 load_dotenv()
@@ -147,8 +150,12 @@ class KNet:
         try:
             # Generate summary of key findings into research_manager's context
             if node.data:
-                findings = ("\n" + "-"*10 + "Next data" + "-"*10 + "\n").join(
+                findings = ("\n" + "-" * 10 + "Next data" + "-" * 10 + "\n").join(
+                    [json.dumps(d, indent=2) for d in node.data]
+                )
+                response = self.llm.generate_content(
+                    f"Extract key findings from the following data related to the topic '{topic}':\n{findings}"
+                )
                 self._track_tokens(response.usage_metadata.total_token_count)
                 findings = response.text
                 self.ctx_manager.append(findings)
@@ -160,9 +167,7 @@ class KNet:
                 path=" -> ".join(node.get_path_to_root()),
                 findings="\n".join(self.ctx_manager),
             )
-            response = self.research_manager.generate_content(
-                prompt, generation_config={**self.branch_schema}
-            )
+            response = self.research_manager.generate_content(prompt, generation_config={**self.branch_schema})
             self._track_tokens(response.usage_metadata.total_token_count)
             result = json.loads(response.text)
             self.logger.info(f"Branch decision for '{node.query}': {result['decision']}")
@@ -171,7 +176,7 @@ class KNet:
         except Exception as e:
             if result["candidates"][0]["finishReason"] == "RECITATION":
                 self.logger.error(f"Retrying branch decision: {str(e)}\nC:{retry_count/3}")
-                self._should_branch_deeper(node, topic, retry_count+1)
+                self._should_branch_deeper(node, topic, retry_count + 1)
             self.logger.error(f"Branch decision failed: {str(e)}")
             raise e

@@ -190,14 +195,16 @@ class KNet:
         while to_explore:
             current_node, current_depth = to_explore.popleft()

+            if current_node.query in explored_queries or current_depth >= self.max_depth:
                 continue

             self.logger.info(f"Exploring: {current_node.query} (Depth: {current_depth})")
             await progress.update(5, f"Exploring: {current_node.query}")

             # Search and scrape
+            current_node.data = await self.scraper.search_and_scrape(
+                current_node.query, 3
+            )  # node -> data = [{url:...}, {url:...}, ...]
             self.ctx_researcher.append(json.dumps(current_node.data, indent=2))
             explored_queries.add(current_node.query)

@@ -213,7 +220,9 @@ class KNet:
         await progress.update(30, "Generating comprehensive report...")
         final_report = self._generate_final_report(root_node)

+        self.logger.info(
+            f"Research completed. Explored {len(explored_queries)} queries across {root_node.max_depth()} levels"
+        )
         await progress.update(100, "Research complete!")

         with open("output.json", "a") as f:
@@ -229,7 +238,8 @@ class KNet:
         if not node.data:
             return []

+        analysis_prompt = dedent(
+            f"""Based on the following findings about "{topic}", suggest new research directions.
            Findings:
            {json.dumps(self.ctx_manager, indent=2)}

@@ -239,7 +249,8 @@ class KNet:
            - Goes deeper into important details

            Return as JSON array of objects with properties:
+            - query (string)"""
+        )

         response = self.research_manager.generate_content(
             analysis_prompt, generation_config={**self.analysis_schema}
@@ -261,7 +272,7 @@ class KNet:
         except Exception as e:
             if result["candidates"][0]["finishReason"] == "RECITATION" and retry_count <= 3:
                 self.logger.error(f"Retrying analysis: {str(e)}\nC:{retry_count/3}")
-                self._analyze_and_branch(node, topic, retry_count+1)
+                self._analyze_and_branch(node, topic, retry_count + 1)
             self.logger.error(f"Branch analysis failed: {str(e)}")
             raise e

@@ -318,6 +329,6 @@ class KNet:
         except Exception as e:
             if response["candidates"][0]["finishReason"] == "RECITATION":
                 self.logger.error(f"Retrying final report: {str(e)}\nC:{retry_count/3}")
-                self._generate_final_report(root_node, retry_count+1)
+                self._generate_final_report(root_node, retry_count + 1)
             self.logger.error(f"Error generating final report: {str(e)}")
             raise e
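
The loop reformatted in the -190,14 hunk is a breadth-first traversal over a deque of (node, depth) pairs, which is also why `from collections import deque` joins the import block. A stripped-down, runnable sketch of just that traversal shape follows; the Node class and the search/branch stand-ins are illustrative, not the repository's ResearchNode or LLM calls.

# Breadth-first exploration skeleton matching the loop shape in KNet.
# Node is a minimal stand-in for ResearchNode; the search/branch logic is stubbed out.
from collections import deque


class Node:
    def __init__(self, query, depth=0):
        self.query, self.depth, self.children, self.data = query, depth, [], []

    def add_child(self, query):
        child = Node(query, self.depth + 1)
        self.children.append(child)
        return child


def explore(root, max_depth=2):
    explored, to_explore = set(), deque([(root, 0)])
    while to_explore:
        node, depth = to_explore.popleft()
        if node.query in explored or depth >= max_depth:
            continue
        node.data = [{"url": f"https://example.com/?q={node.query}"}]  # stand-in for search_and_scrape
        explored.add(node.query)
        for q in [f"{node.query} details"]:  # stand-in for the LLM's suggested follow-up queries
            to_explore.append((node.add_child(q), depth + 1))
    return explored


print(explore(Node("deep research"), max_depth=2))

The explored-queries set plus the depth cap are what keep the traversal from revisiting a query or growing without bound.
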
backend/research_node.py (CHANGED)

@@ -1,15 +1,16 @@
-from typing import List, Dict, Any, Optional
 from datetime import datetime
+from typing import Any, Dict, List, Optional
+

 class ResearchNode:
+    def __init__(self, query: str, parent: Optional["ResearchNode"] = None, depth: int = 0):
         self.query = query
         self.parent = parent
         self.depth = depth
         self.children: List[ResearchNode] = []
         self.data: List[Dict[str, Any]] = []

+    def add_child(self, query: str) -> "ResearchNode":
         child = ResearchNode(query, parent=self, depth=self.depth + 1)
         self.children.append(child)
         return child
@@ -36,4 +37,4 @@ class ResearchNode:
         data = self.data
         for child in self.children:
             data.extend(child.get_all_data())
-        return data
+        return data
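
ResearchNode is a plain tree node: every child stores its parent and a depth one greater than the parent's. A short usage sketch, assuming the backend/ directory is on the import path; the data payload is made up, and get_all_data's behaviour is read off the hunk above.

# Hypothetical usage of ResearchNode (illustrative, not code from the commit).
from research_node import ResearchNode

root = ResearchNode("large language models")
child = root.add_child("context window scaling")

assert child.parent is root and child.depth == 1

child.data.append({"url": "https://example.com", "summary": "placeholder finding"})

# get_all_data() returns this node's data extended with every descendant's data.
print(len(root.get_all_data()))  # 1, since only the child holds data
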
backend/scraper.py (CHANGED)

@@ -1,14 +1,15 @@
 import asyncio
 import json
 import logging
+import time
 from typing import Any, Dict, List
 from urllib.parse import quote_plus
+
+import newspaper
+import requests
 from bs4 import BeautifulSoup
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode
-import newspaper
 from newspaper import Article
-import requests
-import time


 class WebScraper:
@@ -154,17 +155,16 @@ class WebScraper:
         return merged


 class CrawlForAIScraper:
     def __init__(self) -> None:
         self.logger = logging.getLogger(__name__)
         self.base_browser = BrowserConfig(
+            browser_type="chromium",
+            headless=True,
+            viewport_width=1920,
+            viewport_height=1080,
+            accept_downloads=True,
+        )
         self.crawler = AsyncWebCrawler(config=self.base_browser)
         self._is_started = False

@@ -209,7 +209,14 @@ class CrawlForAIScraper:
         encoded_query = quote_plus(query)
         search_uri = f"https://www.google.com/search?q={encoded_query}"

+        result = await self.crawler.arun(
+            url=search_uri,
+            screenshot=False,
+            cache_mode=CacheMode.BYPASS,
+            delay_before_return_html=2,
+            page_timeout=25000,
+            scan_full_page=True,
+        )

         soup = BeautifulSoup(result.html, "html.parser")
         search_results = []
@@ -237,7 +244,14 @@ class CrawlForAIScraper:

         try:
             # Run the crawler on a URL
+            result = await self.crawler.arun(
+                url=url,
+                screenshot=False,
+                cache_mode=CacheMode.BYPASS,
+                delay_before_return_html=2,
+                page_timeout=25000,
+                scan_full_page=True,
+            )
             soup = BeautifulSoup(result.html, "html.parser")
             data = {
                 "url": url,
@@ -257,47 +271,49 @@ class CrawlForAIScraper:
     def _extract_images(self, soup: BeautifulSoup, url: str) -> List[str]:
         # Extract images with width and height greater than 300 pixels
         images = []
+        for img in soup.find_all("img"):
+            if "src" in img.attrs:
+                src = img["src"]
                 # remove px or any characters from width and height
+                width = int("".join(filter(str.isdigit, img.get("width", "0"))))
+                height = int("".join(filter(str.isdigit, img.get("height", "0"))))
+                if width > 300 and height > 300 and "pixel" not in src and "icon" not in src:
                     images.append((src, width, height))
         images = sorted(images, key=lambda img: -1 * (img[1] * img[2]))
         images = [img[0] for img in images]

         # Add base URL to relative URLs
+        base_url = "/".join(url.split("/")[:3])
+        images = [img if img.startswith("http") else base_url + img for img in images]
         return images

     def _extract_videos(self, soup: BeautifulSoup) -> List[str]:
         # Extract videos from iframes and video tags
         videos = []
+        nodes = list(soup.find_all("iframe")) + list(soup.find_all("video")) + list(soup.find_all("a"))
         for node in nodes:
+            if node.name == "iframe":
+                src = node.get("src", "")
+                if "youtube.com" in src or "youtu.be" in src:
                     videos.append(src)
+            elif node.name == "video":
+                src = node.get("src", "")
+                if "youtube.com" in src or "youtu.be" in src:
                     videos.append(src)
+            elif node.name == "a":
+                href = node.get("href", "")
+                if "youtube.com" in href or "youtu.be" in href:
                     videos.append(href)
         return videos


 if __name__ == "__main__":
     import sys
+
     url = "https://docs.anthropic.com/en/docs/agents-and-tools/claude-code/overview"
     if len(sys.argv) > 1:
         url = sys.argv[1]
+
     async def main():
         scraper = CrawlForAIScraper()
         await scraper.start()
@@ -306,4 +322,5 @@ if __name__ == "__main__":
         with open("output.json", "w") as f:
             f.write(json.dumps(data, indent=2))
         print(json.dumps(data, indent=2))
+
     asyncio.run(main())
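
One design note on _extract_images: gluing base_url onto anything that does not start with "http" only resolves root-relative paths. The standard-library urljoin handles relative, root-relative, and protocol-relative sources uniformly; the snippet below is a suggested alternative, not what this commit does.

# Sketch: resolving scraped image sources with urllib.parse.urljoin.
from urllib.parse import urljoin

page_url = "https://example.com/blog/post/"
sources = ["/static/hero.png", "figure.png", "//cdn.example.com/x.png", "https://example.com/a.png"]

absolute = [urljoin(page_url, src) for src in sources]
print(absolute)
# ['https://example.com/static/hero.png',
#  'https://example.com/blog/post/figure.png',
#  'https://cdn.example.com/x.png',
#  'https://example.com/a.png']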