Soham Waghmare committed
Commit 9155a62 · 1 Parent(s): d1e806c
backend/app.py CHANGED
@@ -1,12 +1,17 @@
-# pip install asyncio eventlet
+# pip install asyncio eventlet
 # pip install google-genai beautifulsoup4 selenium newspaper3k lxml_html_clean
+import json
+import logging
+
+import socketio
+from dotenv import load_dotenv
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
-import socketio
-import json, logging
+
 from knet import KNet
 from scraper import CrawlForAIScraper, WebScraper
-from dotenv import load_dotenv
+
+
 load_dotenv()

 # Configure logging
@@ -14,10 +19,12 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

 app = FastAPI()
-app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"])
+app.add_middleware(
+    CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"]
+)

 sio = socketio.AsyncServer(cors_allowed_origins="*", ping_timeout=60, ping_interval=10, async_mode="asgi")
-app.mount('/', socketio.ASGIApp(sio))
+app.mount("/", socketio.ASGIApp(sio))

 # Initialize the scraper and KNet
 scraper_instance = CrawlForAIScraper()
@@ -52,7 +59,9 @@ async def start_research(sid, data):
     async def progress_callback(status):
         try:
             logger.debug(f"Progress update: {status['progress']}% - {status['message']}")
-            await sio.emit("status", {"message": status["message"], "progress": status["progress"]}, room=session_id)
+            await sio.emit(
+                "status", {"message": status["message"], "progress": status["progress"]}, room=session_id
+            )
         except Exception as e:
             logger.error(f"Error in progress callback: {str(e)}")
             raise e
@@ -75,7 +84,9 @@ async def test(sid, data):
     await scraper_instance.close()
     await sio.emit("test", res, room=sid)

+
 if __name__ == "__main__":
     logger.info("Starting KnowledgeNet server...")
     import uvicorn
-    uvicorn.run(app, host='127.0.0.1', port=5000)
+
+    uvicorn.run(app, host="127.0.0.1", port=5000)
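For reference, a minimal client sketch against this server (not part of the commit): it assumes the handlers referenced in the hunks above (start_research, test) are registered as Socket.IO events under their function names and that start_research accepts a dict payload; neither detail is visible in this diff.

# client_sketch.py - hypothetical python-socketio client for the server above
import asyncio

import socketio


async def main():
    sio = socketio.AsyncClient()

    @sio.on("status")
    async def on_status(data):
        # progress_callback above emits {"message": ..., "progress": ...}
        print(f"{data['progress']}% - {data['message']}")

    await sio.connect("http://127.0.0.1:5000")
    # event name and payload shape are assumptions, not shown in the diff
    await sio.emit("start_research", {"topic": "example topic"})
    await sio.wait()


if __name__ == "__main__":
    asyncio.run(main())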
backend/crawl_ai.py CHANGED
@@ -1,8 +1,13 @@
 import asyncio
-from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig
-import json, sys
+import json
+import sys
+
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode
+
+
 # from base64 import b64decode

+
 async def main():
     base_browser = BrowserConfig(
         browser_type="chromium",
@@ -43,5 +48,6 @@ async def main():
         # else:
         #     print("[ERROR]", result.error_message)

+
 if __name__ == "__main__":
     asyncio.run(main())
backend/knet.py CHANGED
@@ -1,14 +1,17 @@
-from typing import Dict, List, Any
-from textwrap import dedent
-import google.generativeai as genai
-from google.ai.generativelanguage_v1beta.types import content
-import logging
 import json
+import logging
 import os
+from collections import deque
 from datetime import datetime
+from textwrap import dedent
+from typing import Any, Dict, List
+
+import google.generativeai as genai
 from dotenv import load_dotenv
+from google.ai.generativelanguage_v1beta.types import content
+
 from research_node import ResearchNode
-from collections import deque
+

 # Load environment variables
 load_dotenv()
@@ -147,8 +150,12 @@ class KNet:
         try:
             # Generate summary of key findings into research_manager's context
             if node.data:
-                findings = ("\n" + "-"*10 + "Next data" + "-"*10 + "\n").join([json.dumps(d, indent=2) for d in node.data])
-                response = self.llm.generate_content(f"Extract key findings from the following data related to the topic '{topic}':\n{findings}")
+                findings = ("\n" + "-" * 10 + "Next data" + "-" * 10 + "\n").join(
+                    [json.dumps(d, indent=2) for d in node.data]
+                )
+                response = self.llm.generate_content(
+                    f"Extract key findings from the following data related to the topic '{topic}':\n{findings}"
+                )
                 self._track_tokens(response.usage_metadata.total_token_count)
                 findings = response.text
                 self.ctx_manager.append(findings)
@@ -160,9 +167,7 @@ class KNet:
                 path=" -> ".join(node.get_path_to_root()),
                 findings="\n".join(self.ctx_manager),
             )
-            response = self.research_manager.generate_content(
-                prompt, generation_config={**self.branch_schema}
-            )
+            response = self.research_manager.generate_content(prompt, generation_config={**self.branch_schema})
             self._track_tokens(response.usage_metadata.total_token_count)
             result = json.loads(response.text)
             self.logger.info(f"Branch decision for '{node.query}': {result['decision']}")
@@ -171,7 +176,7 @@ class KNet:
         except Exception as e:
             if result["candidates"][0]["finishReason"] == "RECITATION":
                 self.logger.error(f"Retrying branch decision: {str(e)}\nC:{retry_count/3}")
-                self._should_branch_deeper(node, topic, retry_count+1)
+                self._should_branch_deeper(node, topic, retry_count + 1)
             self.logger.error(f"Branch decision failed: {str(e)}")
             raise e

@@ -190,14 +195,16 @@ class KNet:
         while to_explore:
             current_node, current_depth = to_explore.popleft()

-            if (current_node.query in explored_queries or current_depth >= self.max_depth):
+            if current_node.query in explored_queries or current_depth >= self.max_depth:
                 continue

             self.logger.info(f"Exploring: {current_node.query} (Depth: {current_depth})")
             await progress.update(5, f"Exploring: {current_node.query}")

             # Search and scrape
-            current_node.data = await self.scraper.search_and_scrape(current_node.query, 3) # node -> data = [{url:...}, {url:...}, ...]
+            current_node.data = await self.scraper.search_and_scrape(
+                current_node.query, 3
+            )  # node -> data = [{url:...}, {url:...}, ...]
             self.ctx_researcher.append(json.dumps(current_node.data, indent=2))
             explored_queries.add(current_node.query)

@@ -213,7 +220,9 @@ class KNet:
         await progress.update(30, "Generating comprehensive report...")
         final_report = self._generate_final_report(root_node)

-        self.logger.info(f"Research completed. Explored {len(explored_queries)} queries across {root_node.max_depth()} levels")
+        self.logger.info(
+            f"Research completed. Explored {len(explored_queries)} queries across {root_node.max_depth()} levels"
+        )
         await progress.update(100, "Research complete!")

         with open("output.json", "a") as f:
@@ -229,7 +238,8 @@ class KNet:
         if not node.data:
             return []

-        analysis_prompt = dedent(f"""Based on the following findings about "{topic}", suggest new research directions.
+        analysis_prompt = dedent(
+            f"""Based on the following findings about "{topic}", suggest new research directions.
         Findings:
         {json.dumps(self.ctx_manager, indent=2)}

@@ -239,7 +249,8 @@ class KNet:
         - Goes deeper into important details

         Return as JSON array of objects with properties:
-        - query (string)""")
+        - query (string)"""
+        )

         response = self.research_manager.generate_content(
             analysis_prompt, generation_config={**self.analysis_schema}
@@ -261,7 +272,7 @@ class KNet:
         except Exception as e:
             if result["candidates"][0]["finishReason"] == "RECITATION" and retry_count <= 3:
                 self.logger.error(f"Retrying analysis: {str(e)}\nC:{retry_count/3}")
-                self._analyze_and_branch(node, topic, retry_count+1)
+                self._analyze_and_branch(node, topic, retry_count + 1)
             self.logger.error(f"Branch analysis failed: {str(e)}")
             raise e

@@ -318,6 +329,6 @@ class KNet:
         except Exception as e:
             if response["candidates"][0]["finishReason"] == "RECITATION":
                 self.logger.error(f"Retrying final report: {str(e)}\nC:{retry_count/3}")
-                self._generate_final_report(root_node, retry_count+1)
+                self._generate_final_report(root_node, retry_count + 1)
             self.logger.error(f"Error generating final report: {str(e)}")
             raise e
backend/research_node.py CHANGED
@@ -1,15 +1,16 @@
-from typing import List, Dict, Any, Optional
 from datetime import datetime
+from typing import Any, Dict, List, Optional
+

 class ResearchNode:
-    def __init__(self, query: str, parent: Optional['ResearchNode'] = None, depth: int = 0):
+    def __init__(self, query: str, parent: Optional["ResearchNode"] = None, depth: int = 0):
         self.query = query
         self.parent = parent
         self.depth = depth
         self.children: List[ResearchNode] = []
         self.data: List[Dict[str, Any]] = []

-    def add_child(self, query: str) -> 'ResearchNode':
+    def add_child(self, query: str) -> "ResearchNode":
         child = ResearchNode(query, parent=self, depth=self.depth + 1)
         self.children.append(child)
         return child
@@ -36,4 +37,4 @@ class ResearchNode:
         data = self.data
         for child in self.children:
             data.extend(child.get_all_data())
-        return data
+        return data
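A short usage sketch of the node API touched here; add_child and get_all_data come from this file and get_path_to_root is called from knet.py, while the queries and the data entry below are made-up examples.

# sketch: building a small ResearchNode tree
from research_node import ResearchNode

root = ResearchNode("quantum error correction")  # depth 0
child = root.add_child("surface codes")          # depth 1, parent=root
grandchild = child.add_child("decoder latency")  # depth 2

root.data.append({"url": "https://example.com", "title": "..."})  # placeholder entry
print(" -> ".join(grandchild.get_path_to_root()))  # path formatting as used in knet.py
print(len(root.get_all_data()))  # this node's data plus all descendants'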
backend/scraper.py CHANGED
@@ -1,14 +1,15 @@
 import asyncio
 import json
 import logging
+import time
 from typing import Any, Dict, List
 from urllib.parse import quote_plus
+
+import newspaper
+import requests
 from bs4 import BeautifulSoup
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode
-import newspaper
 from newspaper import Article
-import requests
-import time


 class WebScraper:
@@ -154,17 +155,16 @@ class WebScraper:
         return merged


-
 class CrawlForAIScraper:
     def __init__(self) -> None:
         self.logger = logging.getLogger(__name__)
         self.base_browser = BrowserConfig(
-            browser_type="chromium",
-            headless=True,
-            viewport_width=1920,
-            viewport_height=1080,
-            accept_downloads=True,
-        )
+            browser_type="chromium",
+            headless=True,
+            viewport_width=1920,
+            viewport_height=1080,
+            accept_downloads=True,
+        )
         self.crawler = AsyncWebCrawler(config=self.base_browser)
         self._is_started = False

@@ -209,7 +209,14 @@ class CrawlForAIScraper:
         encoded_query = quote_plus(query)
         search_uri = f"https://www.google.com/search?q={encoded_query}"

-        result = await self.crawler.arun(url=search_uri, screenshot=False, cache_mode=CacheMode.BYPASS, delay_before_return_html=2, page_timeout=25000, scan_full_page=True)
+        result = await self.crawler.arun(
+            url=search_uri,
+            screenshot=False,
+            cache_mode=CacheMode.BYPASS,
+            delay_before_return_html=2,
+            page_timeout=25000,
+            scan_full_page=True,
+        )

         soup = BeautifulSoup(result.html, "html.parser")
         search_results = []
@@ -237,7 +244,14 @@

         try:
             # Run the crawler on a URL
-            result = await self.crawler.arun(url=url, screenshot=False, cache_mode=CacheMode.BYPASS, delay_before_return_html=2, page_timeout=25000, scan_full_page=True)
+            result = await self.crawler.arun(
+                url=url,
+                screenshot=False,
+                cache_mode=CacheMode.BYPASS,
+                delay_before_return_html=2,
+                page_timeout=25000,
+                scan_full_page=True,
+            )
             soup = BeautifulSoup(result.html, "html.parser")
             data = {
                 "url": url,
@@ -257,47 +271,49 @@
     def _extract_images(self, soup: BeautifulSoup, url: str) -> List[str]:
         # Extract images with width and height greater than 300 pixels
         images = []
-        for img in soup.find_all('img'):
-            if 'src' in img.attrs:
-                src = img['src']
+        for img in soup.find_all("img"):
+            if "src" in img.attrs:
+                src = img["src"]
                 # remove px or any characters from width and height
-                width = int(''.join(filter(str.isdigit, img.get('width', '0'))))
-                height = int(''.join(filter(str.isdigit, img.get('height', '0'))))
-                if width > 300 and height > 300 and 'pixel' not in src and 'icon' not in src:
+                width = int("".join(filter(str.isdigit, img.get("width", "0"))))
+                height = int("".join(filter(str.isdigit, img.get("height", "0"))))
+                if width > 300 and height > 300 and "pixel" not in src and "icon" not in src:
                     images.append((src, width, height))
         images = sorted(images, key=lambda img: -1 * (img[1] * img[2]))
         images = [img[0] for img in images]

         # Add base URL to relative URLs
-        base_url = '/'.join(url.split('/')[:3])
-        images = [img if img.startswith('http') else base_url + img for img in images]
+        base_url = "/".join(url.split("/")[:3])
+        images = [img if img.startswith("http") else base_url + img for img in images]
         return images

     def _extract_videos(self, soup: BeautifulSoup) -> List[str]:
         # Extract videos from iframes and video tags
         videos = []
-        nodes = list(soup.find_all('iframe')) + list(soup.find_all('video')) + list(soup.find_all('a'))
+        nodes = list(soup.find_all("iframe")) + list(soup.find_all("video")) + list(soup.find_all("a"))
         for node in nodes:
-            if node.name == 'iframe':
-                src = node.get('src', '')
-                if 'youtube.com' in src or 'youtu.be' in src:
+            if node.name == "iframe":
+                src = node.get("src", "")
+                if "youtube.com" in src or "youtu.be" in src:
                     videos.append(src)
-            elif node.name == 'video':
-                src = node.get('src', '')
-                if 'youtube.com' in src or 'youtu.be' in src:
+            elif node.name == "video":
+                src = node.get("src", "")
+                if "youtube.com" in src or "youtu.be" in src:
                     videos.append(src)
-            elif node.name == 'a':
-                href = node.get('href', '')
-                if 'youtube.com' in href or 'youtu.be' in href:
+            elif node.name == "a":
+                href = node.get("href", "")
+                if "youtube.com" in href or "youtu.be" in href:
                     videos.append(href)
         return videos


 if __name__ == "__main__":
     import sys
+
     url = "https://docs.anthropic.com/en/docs/agents-and-tools/claude-code/overview"
     if len(sys.argv) > 1:
         url = sys.argv[1]
+
     async def main():
         scraper = CrawlForAIScraper()
         await scraper.start()
@@ -306,4 +322,5 @@ if __name__ == "__main__":
         with open("output.json", "w") as f:
             f.write(json.dumps(data, indent=2))
         print(json.dumps(data, indent=2))
+
     asyncio.run(main())
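A minimal driver sketch for running the async scraper outside the server; start(), close(), and the search_and_scrape(query, n) call mirror how knet.py and app.py use this class, while the query string here is only an example.

# sketch: driving CrawlForAIScraper directly (run from backend/ so the import resolves)
import asyncio
import json

from scraper import CrawlForAIScraper


async def main():
    scraper = CrawlForAIScraper()
    await scraper.start()
    try:
        # example query and result count, mirroring the call in knet.py
        results = await scraper.search_and_scrape("knowledge graphs", 3)
        print(json.dumps(results, indent=2))
    finally:
        await scraper.close()


asyncio.run(main())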