Soham Waghmare committed
Commit 9635653 · 1 Parent(s): 9155a62

feat: concurrent ws conns, parallel scraping, better image extraction

Files changed (4)
  1. backend/app.py +38 -10
  2. backend/crawl_ai.py +31 -18
  3. backend/knet.py +5 -2
  4. backend/scraper.py +46 -38
backend/app.py CHANGED
@@ -1,7 +1,8 @@
-# pip install asyncio eventlet
+# pip install asyncio eventlet
 # pip install google-genai beautifulsoup4 selenium newspaper3k lxml_html_clean
 import json
 import logging
+from typing import Dict
 
 import socketio
 from dotenv import load_dotenv
@@ -20,26 +21,49 @@ logger = logging.getLogger(__name__)
 
 app = FastAPI()
 app.add_middleware(
-    CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"]
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
 )
 
 sio = socketio.AsyncServer(cors_allowed_origins="*", ping_timeout=60, ping_interval=10, async_mode="asgi")
 app.mount("/", socketio.ASGIApp(sio))
 
-# Initialize the scraper and KNet
-scraper_instance = CrawlForAIScraper()
-# scraper_instance = WebScraper()
-knet = KNet(scraper_instance)
+
+class SessionManager:
+    def __init__(self):
+        self.sessions: Dict[str, tuple[KNet, CrawlForAIScraper]] = {}
+
+    async def get_or_create_session(self, sid: str) -> tuple[KNet, CrawlForAIScraper]:
+        if sid not in self.sessions:
+            scraper = CrawlForAIScraper()
+            await scraper.start()
+            knet = KNet(scraper)
+            self.sessions[sid] = (knet, scraper)
+        return self.sessions[sid]
+
+    async def cleanup_session(self, sid: str):
+        if sid in self.sessions:
+            _, scraper = self.sessions[sid]
+            await scraper.close()
+            del self.sessions[sid]
+
+
+session_manager = SessionManager()
 
 
 @sio.event
-def connect(sid, environ, auth):
+async def connect(sid, environ, auth):
     logger.info(f"Client connected: {sid}")
+    await session_manager.get_or_create_session(sid)
 
 
 @sio.event
-def disconnect(sid, reason):
+async def disconnect(sid, reason):
     logger.info(f"Client disconnected: {sid}")
+    await session_manager.cleanup_session(sid)
 
 
 @sio.event
@@ -50,6 +74,8 @@ async def health_check(sid, data):
 
 @sio.event
 async def start_research(sid, data):
+    knet, scraper = await session_manager.get_or_create_session(sid)
+
     try:
         data = json.loads(data) if type(data) != dict else data
         topic = data.get("topic")
@@ -60,7 +86,9 @@ async def start_research(sid, data):
         try:
             logger.debug(f"Progress update: {status['progress']}% - {status['message']}")
             await sio.emit(
-                "status", {"message": status["message"], "progress": status["progress"]}, room=session_id
+                "status",
+                {"message": status["message"], "progress": status["progress"]},
+                room=session_id,
             )
         except Exception as e:
             logger.error(f"Error in progress callback: {str(e)}")
@@ -77,11 +105,11 @@ async def start_research(sid, data):
 
 @sio.event
 async def test(sid, data):
+    knet, scraper = await session_manager.get_or_create_session(sid)
     print("Testing...")
    data = json.loads(data) if type(data) != dict else data
    res = await knet.scraper._scrape_page(data["url"])
    print(json.dumps(res, indent=2))
-    await scraper_instance.close()
    await sio.emit("test", res, room=sid)
 
 
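For reference, a minimal client sketch against the events app.py now exposes. The start_research, status, and test event names and the topic/message/progress keys come from the diff above; the server address and the crude wait are assumptions.

# Hypothetical python-socketio client; host/port and wait strategy are assumptions.
import asyncio

import socketio

sio = socketio.AsyncClient()


@sio.on("status")
async def on_status(data):
    # The server emits {"message": ..., "progress": ...} while research runs.
    print(f"[{data['progress']}%] {data['message']}")


async def main():
    await sio.connect("http://localhost:8000")  # assumed address of the ASGI app
    await sio.emit("start_research", {"topic": "quantum computing"})
    await asyncio.sleep(120)  # crude wait; a real client would listen for a completion event
    await sio.disconnect()


if __name__ == "__main__":
    asyncio.run(main())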
backend/crawl_ai.py CHANGED
@@ -1,14 +1,10 @@
 import asyncio
 import json
-import sys
 
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode
 
 
-# from base64 import b64decode
-
-
-async def main():
+async def main(urls):
     base_browser = BrowserConfig(
         browser_type="chromium",
         headless=True,
@@ -20,18 +16,28 @@ async def main():
     # Create an instance of AsyncWebCrawler
     async with AsyncWebCrawler(config=base_browser) as crawler:
         # Run the crawler on a URL
-        result = await crawler.arun(url=sys.argv[1], screenshot=False, cache_mode=CacheMode.BYPASS)
-        # Print the extracted content
-        hr = lambda: print(("-" * 80) * 2)
-        hr()
-        print(result.markdown)
-        hr()
-        print(json.dumps(result.media, indent=2))
-        hr()
-        print(json.dumps(result.links, indent=2))
-        hr()
-        print(json.dumps(result.downloaded_files, indent=2))
-        hr()
+        results = await crawler.arun_many(
+            urls=urls,
+            screenshot=False,
+            cache_mode=CacheMode.BYPASS,
+            scan_full_page=True,
+            semaphore_count=3,
+            wait_for_images=True,
+        )
+        with open("output.json", "w") as f:
+            f.write("")
+        for result in results:
+            if result.success:
+                dump_result = {
+                    "url": result.url,
+                    "markdown": result.markdown,
+                }
+                with open("output.json", "a") as f:
+                    json.dump(dump_result, f)
+                # Print the extracted content
+                hr = lambda n=1: print(("-" * 80) * 2 * n)
+                print("[OK] URL:", result.url)
+                hr()
 
         # if result.success:
         #     # Save screenshot
@@ -50,4 +56,11 @@ async def main():
 
 
 if __name__ == "__main__":
-    asyncio.run(main())
+    urls = [
+        "https://www.google.com",
+        "https://www.amazon.com",
+        "https://www.facebook.com",
+        "https://www.twitter.com",
+        "https://www.instagram.com",
+    ]
+    asyncio.run(main(urls))
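One note on the loop above: output.json is truncated once and each dump_result is then appended with its own json.dump call, so the file ends up as several JSON objects concatenated back to back rather than one parseable document. A small sketch of an alternative that collects the results and writes them once; the write_results helper name is hypothetical, and it assumes result.markdown is a plain string, as the script treats it.

import json
from typing import Any, List


def write_results(results: List[Any], path: str = "output.json") -> None:
    # Collect successful crawl results and write them as a single JSON array.
    dump_results = [
        {"url": r.url, "markdown": r.markdown}
        for r in results
        if r.success
    ]
    with open(path, "w") as f:
        json.dump(dump_results, f, indent=2)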
backend/knet.py CHANGED
@@ -82,6 +82,7 @@ class KNet:
         self.logger = logging.getLogger(__name__)
         self.max_depth = 2
         self.max_breadth = 3
+        self.num_sites_per_query = 5
 
         self.search_prompt = """Generate 3-5 specific search queries to research the following topic: {topic}
 
@@ -152,7 +153,7 @@ class KNet:
             if node.data:
                 findings = ("\n" + "-" * 10 + "Next data" + "-" * 10 + "\n").join(
                     [json.dumps(d, indent=2) for d in node.data]
-                )
+                )
                 response = self.llm.generate_content(
                     f"Extract key findings from the following data related to the topic '{topic}':\n{findings}"
                 )
@@ -181,6 +182,8 @@ class KNet:
             raise e
 
     async def conduct_research(self, topic: str, progress_callback=None) -> Dict[str, Any]:
+        self.ctx_researcher = []
+        self.ctx_manager = []
         self.token_count = 0
         progress = ResearchProgress(progress_callback)
         self.logger.info(f"Starting research on topic: {topic}")
@@ -203,7 +206,7 @@
 
             # Search and scrape
             current_node.data = await self.scraper.search_and_scrape(
-                current_node.query, 3
+                current_node.query, self.num_sites_per_query
             )  # node -> data = [{url:...}, {url:...}, ...]
             self.ctx_researcher.append(json.dumps(current_node.data, indent=2))
             explored_queries.add(current_node.query)
backend/scraper.py CHANGED
@@ -179,31 +179,25 @@ class CrawlForAIScraper:
         await self.crawler.close()
         self._is_started = False
 
-    async def search_and_scrape(self, query: str, num_sites: int = 3) -> List[Dict[str, Any]]:
+    async def search_and_scrape(self, query: str, num_sites: int = 10) -> List[Dict[str, Any]]:
         await self.start()
         self.logger.info(f"Starting search for: {query}")
 
-        # Perform a Google search to get a list of webpages
-        search_results = await self._google_search(query, num_sites)
+        # Perform a search to get a list of webpages
+        search_results = await self._search(query, num_sites)
         self.logger.info(f"Found {len(search_results)} search results")
 
         # Scrape each webpage
         scraped_data = []
-        for idx, url in enumerate(search_results):
-            try:
-                self.logger.info(f"Scraping [{idx + 1}/{len(search_results)}]: {url}")
-                data = await self._scrape_page(url)
-                if data:
-                    scraped_data.append(data)
-                self.logger.info(f"Successfully scraped: {url}")
-            except Exception as e:
-                self.logger.error(f"Error scraping {url}: {str(e)}")
-                continue
+        self.logger.info(f"Scraping {len(search_results)} sites...")
+        data = await self._scrape_pages(search_results)
+        if data:
+            scraped_data.extend(data)
 
         self.logger.info(f"Completed scraping {len(scraped_data)} sites")
         return scraped_data
 
-    async def _google_search(self, query: str, num_results: int) -> List[str]:
+    async def _search(self, query: str, num_results: int) -> List[str]:
         self.logger.info("Performing Google search...")
         try:
             encoded_query = quote_plus(query)
@@ -214,7 +208,6 @@ class CrawlForAIScraper:
                 screenshot=False,
                 cache_mode=CacheMode.BYPASS,
                 delay_before_return_html=2,
-                page_timeout=25000,
                 scan_full_page=True,
             )
 
@@ -239,33 +232,36 @@
             self.logger.error(f"Google search error: {str(e)}")
             return []
 
-    async def _scrape_page(self, url: str) -> Dict[str, Any]:
+    async def _scrape_pages(self, urls: str) -> Dict[str, Any]:
         await self.start()
 
         try:
             # Run the crawler on a URL
-            result = await self.crawler.arun(
-                url=url,
+            results = await self.crawler.arun_many(
+                urls=urls,
                 screenshot=False,
                 cache_mode=CacheMode.BYPASS,
-                delay_before_return_html=2,
-                page_timeout=25000,
                 scan_full_page=True,
+                semaphore_count=4,
+                wait_for_images=True,
+                page_timeout=25000,
             )
-            soup = BeautifulSoup(result.html, "html.parser")
-            data = {
-                "url": url,
-                "text": result.markdown,
-                "images": self._extract_images(soup, result.url),
-                "videos": self._extract_videos(soup),
-                "links": result.links["external"],
-            }
-
-            return data
+            scraped_sites = []
+            for result in results:
+                if result.success:
+                    soup = BeautifulSoup(result.html, "html.parser")
+                    data = {
+                        "url": result.url,
+                        "text": result.markdown,
+                        "images": self._extract_images(soup, result.url),
+                        "videos": self._extract_videos(soup),
+                        "links": result.links["external"],
+                    }
+                    scraped_sites.append(data)
+            return scraped_sites
 
         except Exception as e:
-            self.logger.error(f"Scraping error for {url}: {str(e)}")
-            # raise e
+            self.logger.error(f"Scraping error while {urls}: {str(e)}")
             return {}
 
     def _extract_images(self, soup: BeautifulSoup, url: str) -> List[str]:
@@ -274,9 +270,14 @@
         for img in soup.find_all("img"):
             if "src" in img.attrs:
                 src = img["src"]
-                # remove px or any characters from width and height
-                width = int("".join(filter(str.isdigit, img.get("width", "0"))))
-                height = int("".join(filter(str.isdigit, img.get("height", "0"))))
+                if not "width" or not "height" in img.attrs:
+                    continue
+                if "width" in img.attrs and img.get("width").lower() == "auto":
+                    images.append((src, 999, 0))
+                # Remove units from width and height: get start of the entity till the first non-digit character
+                width = "".join([i for i in img.get("width", "0") if i.isdigit() or i == "."])
+                height = "".join([i for i in img.get("height", "0") if i.isdigit() or i == "."])
+                width, height = float(width), float(height)
                 if width > 300 and height > 300 and "pixel" not in src and "icon" not in src:
                     images.append((src, width, height))
         images = sorted(images, key=lambda img: -1 * (img[1] * img[2]))
@@ -310,14 +311,21 @@
 
 if __name__ == "__main__":
     import sys
 
-    url = "https://docs.anthropic.com/en/docs/agents-and-tools/claude-code/overview"
+    urls = [
+        "https://docs.anthropic.com/en/docs/agents-and-tools/claude-code/overview",
+        "https://docs.crawl4ai.com/advanced/multi-url-crawling/",
+        "https://github.com/SesameAILabs/csm",
+        "https://docs.anthropic.com/en/docs/agents-and-tools/claude-code/overview",
+        "https://docs.crawl4ai.com/advanced/multi-url-crawling/",
+        "https://github.com/SesameAILabs/csm",
+    ]
     if len(sys.argv) > 1:
-        url = sys.argv[1]
+        urls = sys.argv[1:]
 
     async def main():
         scraper = CrawlForAIScraper()
         await scraper.start()
-        data = await scraper.search_and_scrape("what is ai")
+        data = await scraper.search_and_scrape("quantum computing")
         await scraper.close()
         with open("output.json", "w") as f:
             f.write(json.dumps(data, indent=2))
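On the new size check in _extract_images: the condition not "width" or not "height" in img.attrs parses as (not "width") or (not ("height" in img.attrs)), so only a missing height attribute is skipped, and a width of "auto" is appended as (src, 999, 0) but still falls through to float(""), which raises ValueError. A hedged sketch of an equivalent filter follows; the extract_large_images name and the choice to skip "auto" sizes are assumptions, while the 300 px threshold and the area sort mirror the diff.

import re
from typing import List, Tuple

from bs4 import BeautifulSoup


def extract_large_images(soup: BeautifulSoup, min_px: float = 300) -> List[str]:
    # Sketch: keep only images that declare both dimensions above min_px,
    # stripping unit suffixes such as "px" before converting to a number.
    images: List[Tuple[str, float, float]] = []
    for img in soup.find_all("img"):
        src = img.get("src")
        if not src or "pixel" in src or "icon" in src:
            continue
        width_attr = img.get("width", "")
        height_attr = img.get("height", "")
        if not width_attr or not height_attr:
            continue  # skip images without both width and height attributes
        if "auto" in (width_attr.lower(), height_attr.lower()):
            continue  # "auto" carries no usable size information
        width_str = re.sub(r"[^0-9.]", "", width_attr)
        height_str = re.sub(r"[^0-9.]", "", height_attr)
        try:
            width, height = float(width_str), float(height_str)
        except ValueError:
            continue  # e.g. empty string or malformed value
        if width > min_px and height > min_px:
            images.append((src, width, height))
    images.sort(key=lambda item: item[1] * item[2], reverse=True)  # largest area first
    return [src for src, _, _ in images]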