Soham Waghmare committed on
Commit 2d96b3b
0 Parent(s):

init: Add initial backend structure with Flask, WebSocket support, and web scraping functionality

Files changed (5)
  1. .gitignore +10 -0
  2. backend/app.py +76 -0
  3. backend/knet.py +240 -0
  4. backend/research_node.py +26 -0
  5. backend/scraper.py +155 -0
.gitignore ADDED
@@ -0,0 +1,10 @@
+# Flask ignore files
+backend/__pycache__/
+backend/*.pyc
+backend/*.pyo
+backend/*.pyd
+backend/*.pyo
+backend/.venv/
+backend/.env*
+
+# Next.js ignore files
backend/app.py ADDED
@@ -0,0 +1,76 @@
+# pip install flask[async] flask-socketio flask-cors python-dotenv
+# pip install google-generativeai beautifulsoup4 selenium newspaper3k lxml_html_clean eventlet
+from flask import Flask, request, jsonify
+from flask_cors import CORS
+from flask_socketio import SocketIO, emit
+import os, json, logging
+from knet import KNet
+from dotenv import load_dotenv
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+load_dotenv()
+knet = KNet()
+
+app = Flask(__name__)
+CORS(app)
+
+# Increased ping_timeout so long-running research jobs do not drop the socket
+socketio = SocketIO(app, cors_allowed_origins="*", ping_timeout=9999, ping_interval=25)
+
+
+@socketio.on("connect")
+def handle_connect():
+    logger.info(f"Client connected: {request.sid}")
+
+
+@socketio.on("disconnect")
+def handle_disconnect():
+    logger.info(f"Client disconnected: {request.sid}")
+
+
+@socketio.on("health_check")
+def handle_health_check(_):
+    logger.debug("Health check received")
+    emit("health_check", {"status": "ok"})
+
+
+@socketio.on("start_research")
+def handle_research(data):
+    try:
+        data = json.loads(data)
+        topic = data.get("topic")
+        session_id = request.sid
+        logger.info(f"Starting research for client {session_id} on topic: {topic}")
+
+        def progress_callback(status):
+            try:
+                logger.debug(
+                    f"Progress update: {status['progress']}% - {status['message']}"
+                )
+                socketio.emit(
+                    "status",
+                    {"message": status["message"], "progress": status["progress"]},
+                    room=session_id,
+                )
+            except Exception as e:
+                logger.error(f"Error in progress callback: {str(e)}")
+
+        try:
+            research_results = knet.conduct_research(topic, progress_callback)
+            logger.info(f"Research completed for topic: {topic}")
+            socketio.emit("research_complete", research_results, room=session_id)
+        except Exception as e:
+            logger.error(f"Research error: {str(e)}")
+            socketio.emit("error", {"message": str(e)}, room=session_id)
+
+    except Exception as e:
+        logger.error(f"Error handling research request: {str(e)}")
+        socketio.emit("error", {"message": str(e)}, room=request.sid)
+
+
+if __name__ == "__main__":
+    logger.info("Starting KnowledgeNet server...")
+    socketio.run(app, debug=True, port=5000)
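For reference, here is a minimal client-side sketch of the Socket.IO protocol that backend/app.py expects: the client emits "start_research" with a JSON-encoded string payload (app.py calls json.loads on it) and listens for "status", "research_complete", and "error" events. This sketch is not part of the commit; it assumes the python-socketio client package and a server running on http://localhost:5000, and the example topic is hypothetical.

import json
import socketio

sio = socketio.Client()

@sio.on("status")
def on_status(data):
    # Progress updates forwarded from the ResearchProgress callback
    print(f"{data['progress']}% - {data['message']}")

@sio.on("research_complete")
def on_complete(report):
    print("Report topic:", report["topic"])
    sio.disconnect()

@sio.on("error")
def on_error(data):
    print("Server error:", data["message"])
    sio.disconnect()

sio.connect("http://localhost:5000")
# The server json.loads() the payload, so the topic is sent as a JSON string
sio.emit("start_research", json.dumps({"topic": "quantum error correction"}))
sio.wait()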
backend/knet.py ADDED
@@ -0,0 +1,240 @@
+from typing import Dict, List, Optional, Any
+import google.generativeai as genai
+import logging
+import os
+from datetime import datetime
+from dotenv import load_dotenv
+from scraper import WebScraper
+from research_node import ResearchNode
+from collections import deque
+
+# Load environment variables
+load_dotenv()
+
+
+class ResearchProgress:
+    def __init__(self, callback=None):
+        self.progress = 0
+        self.callback = callback
+
+    def update(self, progress: int, message: str):
+        self.progress = progress
+        if self.callback:
+            self.callback({"progress": progress, "message": message})
+
+
+class KNet:
+    def __init__(self, api_key: Optional[str] = None):
+        self.api_key = api_key or os.getenv("GOOGLE_API_KEY")
+        if not self.api_key:
+            raise ValueError("Google API key is required")
+
+        # Initialize Google GenAI
+        genai.configure(api_key=self.api_key)
+
+        # Keep both models with original configurations
+        self.llm = genai.GenerativeModel(
+            "gemini-2.0-flash-lite-preview-02-05",
+            generation_config={"temperature": 0.7},
+        )
+
+        self.research_manager = genai.GenerativeModel(
+            "gemini-2.0-flash-lite-preview-02-05",
+            generation_config={"temperature": 0.3},
+        )
+
+        # Initialize scraper
+        self.scraper = WebScraper()
+        self.logger = logging.getLogger(__name__)
+        self.max_depth = 5
+        self.min_importance_score = 0.6
+
+        self.search_prompt = """Generate 3-5 specific search queries to research the following topic: {topic}
+
+        Requirements:
+        1. Queries should cover different aspects of the topic
+        2. Be specific and technical
+        3. Include key terms and concepts
+        4. Format each query on a new line
+        5. Return only the queries, no explanations"""
+
+    def __del__(self):
+        # Cleanup scraper when KNet instance is destroyed
+        if hasattr(self, "scraper"):
+            self.scraper.cleanup()
+
+    def conduct_research(self, topic: str, progress_callback=None) -> Dict[str, Any]:
+        progress = ResearchProgress(progress_callback)
+        self.logger.info(f"Starting research on topic: {topic}")
+        try:
+            # Prepare the scraper at the start of research
+            self.scraper.setup()
+            root_node = ResearchNode(topic)
+            research_stack = deque([root_node])
+            explored_queries = set()
+
+            # Generate initial search queries
+            self.logger.info("Generating search queries...")
+            response = self.llm.generate_content(self.search_prompt.format(topic=topic))
+            search_queries = response.text.strip().split("\n")
+            self.logger.info(f"Generated queries: {search_queries}")
+
+            progress.update(10, "Starting deep research exploration...")
+            self.logger.info("Research exploration initiated")
+
+            # Process each generated query
+            for query in search_queries:
+                if query.strip():
+                    data = self.scraper.search_and_scrape(query.strip())
+                    if data:
+                        root_node.data.extend(data)
+
+            while research_stack:
+                current_node = research_stack.pop()
+
+                if (
+                    current_node.query in explored_queries
+                    or current_node.depth > self.max_depth
+                ):
+                    continue
+
+                self.logger.info(
+                    f"Exploring branch: {current_node.query} (Depth: {current_node.depth})"
+                )
+                progress.update(
+                    int(30 + (len(explored_queries) * 50 / (self.max_depth * 3))),
+                    f"Exploring: {current_node.query}",
+                )
+
+                # Conduct research for current node
+                current_node.data = self.scraper.search_and_scrape(current_node.query)
+                explored_queries.add(current_node.query)
+
+                # Generate and evaluate new branches
+                if current_node.depth < self.max_depth:
+                    new_branches = self._analyze_and_branch(current_node)
+                    for branch in reversed(new_branches):  # Reverse to maintain DFS order
+                        research_stack.append(branch)
+
+            self.logger.info("Generating final research report")
+            progress.update(80, "Generating comprehensive report...")
+            final_report = self._generate_final_report(root_node)
+
+            self.logger.info("Research completed successfully")
+            progress.update(100, "Research complete!")
+
+            return final_report
+
+        except Exception as e:
+            self.logger.error(f"Research failed: {str(e)}")
+            raise
+        finally:
+            # Single cleanup point; runs on both success and failure
+            self.scraper.cleanup()
+
+    def _analyze_and_branch(self, node: ResearchNode) -> List[ResearchNode]:
+        analysis_prompt = f"""Analyze the research data and suggest new branches for deeper exploration.
+        Current topic: {node.query}
+        Current depth: {node.depth}
+        Path from root: {' -> '.join(node.get_path_to_root())}
+
+        Suggest new research directions that:
+        1. Are specific and focused
+        2. Explore unexplored aspects
+        3. Follow promising leads from the current data
+
+        For each suggestion, rate its importance (0-1) and explain why.
+        Format: Importance Score | Query | Reason"""
+
+        response = self.research_manager.generate_content(analysis_prompt)
+        result = response.text
+
+        new_nodes = []
+        for line in result.split("\n"):
+            if "|" not in line:
+                continue
+
+            parts = line.split("|")
+            if len(parts) < 2:
+                continue
+
+            try:
+                importance = float(parts[0].strip())
+                query = parts[1].strip()
+
+                if importance >= self.min_importance_score:
+                    child_node = node.add_child(query)
+                    child_node.importance_score = importance
+                    new_nodes.append(child_node)
+            except ValueError:
+                continue
+
+        return new_nodes
+
+    def _generate_final_report(self, root_node: ResearchNode) -> Dict[str, Any]:
+        def collect_data(node: ResearchNode) -> List[Dict]:
+            all_data = node.data.copy()
+            for child in node.children:
+                all_data.extend(collect_data(child))
+            return all_data
+
+        def collect_nodes(node: ResearchNode) -> List[ResearchNode]:
+            nodes = [node]
+            for child in node.children:
+                nodes.extend(collect_nodes(child))
+            return nodes
+
+        all_research_data = collect_data(root_node)
+        all_nodes = collect_nodes(root_node)
+
+        # Generate structured report using LLM
+        report_prompt = f"""Generate a comprehensive research report using the collected data.
+        Main Topic: {root_node.query}
+
+        Structure the report with:
+        1. Executive Summary
+        2. Key Findings
+        3. Detailed Analysis
+        4. Related Topics and Branches
+        5. Sources and References
+
+        Include relevant quotes and citations."""
+
+        response = self.research_manager.generate_content(report_prompt)
+        report_content = response.text
+
+        # Organize multimedia content
+        media_content = {"images": [], "videos": [], "links": [], "references": []}
+
+        for data in all_research_data:
+            if data.get("images"):
+                media_content["images"].extend(data["images"])
+            if data.get("videos"):
+                media_content["videos"].extend(data["videos"])
+            if data.get("links"):
+                media_content["links"].append(
+                    {
+                        "url": data["url"],
+                        "title": data.get("title", ""),
+                        "summary": data.get("summary", ""),
+                    }
+                )
+
+        # Build research tree structure
+        def build_tree_structure(node: ResearchNode) -> Dict:
+            return {
+                "query": node.query,
+                "importance": node.importance_score,
+                "depth": node.depth,
+                "children": [build_tree_structure(child) for child in node.children],
+            }
+
+        return {
+            "topic": root_node.query,
+            "timestamp": datetime.now().isoformat(),
+            "content": report_content,
+            "media": media_content,
+            "research_tree": build_tree_structure(root_node),
+            "metadata": {
+                "total_sources": len(all_research_data),
+                # Depth and branch counts come from the tree nodes, not the scraped data dicts
+                "max_depth_reached": max(node.depth for node in all_nodes),
+                "total_branches": len(all_nodes),
+            },
+        }
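The same engine can be driven outside the Socket.IO server. Below is a minimal standalone sketch (not part of this commit), assuming GOOGLE_API_KEY is available in the environment or a .env file; the topic string and print formatting are illustrative only.

import json
from knet import KNet

def print_progress(status):
    # Same dict shape that ResearchProgress passes to its callback
    print(f"[{status['progress']:>3}%] {status['message']}")

knet = KNet()
report = knet.conduct_research("history of RISC-V", progress_callback=print_progress)

print(report["content"][:500])                        # generated report text
print(json.dumps(report["research_tree"], indent=2))  # DFS exploration tree
print(report["metadata"])                             # source and depth counts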
backend/research_node.py ADDED
@@ -0,0 +1,26 @@
+from typing import List, Dict, Any, Optional
+from datetime import datetime
+
+class ResearchNode:
+    def __init__(self, query: str, parent: Optional['ResearchNode'] = None, depth: int = 0):
+        self.query = query
+        self.parent = parent
+        self.depth = depth
+        self.children: List[ResearchNode] = []
+        self.data: List[Dict[str, Any]] = []
+        self.explored = False
+        self.importance_score = 0.0
+        self.timestamp = datetime.now()
+
+    def add_child(self, query: str) -> 'ResearchNode':
+        child = ResearchNode(query, parent=self, depth=self.depth + 1)
+        self.children.append(child)
+        return child
+
+    def get_path_to_root(self) -> List[str]:
+        path = [self.query]
+        current = self
+        while current.parent:
+            current = current.parent
+            path.append(current.query)
+        return list(reversed(path))
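An illustrative snippet (not part of the commit) of how ResearchNode builds the exploration tree that KNet walks depth-first; the query strings are made up.

from research_node import ResearchNode

root = ResearchNode("solid-state batteries")
child = root.add_child("solid electrolyte materials")
grandchild = child.add_child("sulfide vs oxide electrolytes")

print(grandchild.depth)                # 2
print(grandchild.get_path_to_root())
# ['solid-state batteries', 'solid electrolyte materials', 'sulfide vs oxide electrolytes']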
backend/scraper.py ADDED
@@ -0,0 +1,155 @@
+from bs4 import BeautifulSoup
+from selenium import webdriver
+import logging
+from typing import List, Dict, Any
+import newspaper
+from newspaper import Article
+import re
+import requests
+
+
+class WebScraper:
+    def __init__(self):
+        self.chrome_options = webdriver.ChromeOptions()
+        # self.chrome_options.add_argument("--headless")
+        self.driver = webdriver.Chrome(options=self.chrome_options)
+        self.logger = logging.getLogger(__name__)
+        self.newspaper_config = newspaper.Config()
+        self.newspaper_config.browser_user_agent = "Mozilla/5.0"
+        self.newspaper_config.request_timeout = 10
+        self.session = requests.Session()
+        self.timeout = 30
+
+    def setup(self):
+        pass  # No setup needed for synchronous operation
+
+    def cleanup(self):
+        if self.driver:
+            self.driver.quit()
+
+    def search_and_scrape(
+        self, query: str, num_sites: int = 10
+    ) -> List[Dict[str, Any]]:
+        self.logger.info(f"Starting search for: {query}")
+        search_results = self._google_search(query, num_sites)
+        self.logger.info(f"Found {len(search_results)} search results")
+
+        scraped_data = []
+        for idx, url in enumerate(search_results):
+            try:
+                self.logger.info(f"Scraping [{idx + 1}/{len(search_results)}]: {url}")
+                data = self._scrape_url(url)
+                if data:
+                    scraped_data.append(data)
+                    self.logger.info(f"Successfully scraped: {url}")
+            except Exception as e:
+                self.logger.error(f"Error scraping {url}: {str(e)}")
+                continue
+
+        self.logger.info(f"Completed scraping {len(scraped_data)} sites")
+        return scraped_data
+
+    def _google_search(self, query: str, num_results: int) -> List[str]:
+        self.logger.info("Performing Google search...")
+        try:
+            self.driver.get(
+                f"https://www.google.com/search?q={query.replace(' ', '+')}&num={num_results}"
+            )
+            self.driver.implicitly_wait(5)
+
+            elements = self.driver.find_elements("css selector", "div.g div.yuRUbf > a")
+            search_results = []
+            for element in elements:
+                url = element.get_attribute("href")
+                if url and url.startswith("http"):
+                    search_results.append(url)
+                    if len(search_results) >= num_results:
+                        break
+
+            self.logger.info(f"Found {len(search_results)} URLs")
+            return search_results
+
+        except Exception as e:
+            self.logger.error(f"Google search error: {str(e)}")
+            return []
+
+    def _scrape_url(self, url: str) -> Dict[str, Any]:
+        try:
+            article = Article(url, config=self.newspaper_config)
+            article.download()
+            article.parse()
+            article.nlp()
+
+            data = {
+                "url": url,
+                "title": article.title,
+                "text": article.text,
+                "summary": article.summary,
+                "keywords": article.keywords,
+                "images": article.images,
+                "videos": [],
+                "links": article.links,
+                "authors": article.authors,
+                "publish_date": article.publish_date,
+                "metadata": {"language": article.meta_lang, "tags": article.tags},
+            }
+
+            if not data["text"]:
+                response = self.session.get(url, timeout=self.timeout)
+                soup = BeautifulSoup(response.text, "html.parser")
+                selenium_data = {
+                    "url": url,
+                    "title": soup.title.string if soup.title else "",
+                    "text": self._extract_text(soup),
+                    "images": self._extract_images(soup),
+                    "videos": self._extract_videos(soup),
+                    "links": self._extract_links(soup),
+                }
+                return self._merge_extraction_results(data, selenium_data)
+
+            return data
+
+        except Exception as e:
+            self.logger.error(f"Scraping error for {url}: {str(e)}")
+            return None
+
+    def _merge_extraction_results(
+        self, news_data: Dict, selenium_data: Dict
+    ) -> Dict[str, Any]:
+        merged = selenium_data.copy()
+
+        if news_data:
+            for field in ["title", "text", "images", "links"]:
+                if news_data.get(field):
+                    merged[field] = news_data[field]
+
+            merged.update(
+                {
+                    "summary": news_data.get("summary"),
+                    "keywords": news_data.get("keywords"),
+                    "authors": news_data.get("authors"),
+                    "publish_date": news_data.get("publish_date"),
+                    "metadata": news_data.get("metadata"),
+                }
+            )
+
+        return merged
+
+    def _extract_text(self, soup: BeautifulSoup) -> str:
+        for element in soup(["script", "style", "nav", "header", "footer"]):
+            element.decompose()
+        return " ".join(soup.stripped_strings)
+
+    def _extract_images(self, soup: BeautifulSoup) -> List[str]:
+        return [img.get("src") for img in soup.find_all("img") if img.get("src")]
+
+    def _extract_videos(self, soup: BeautifulSoup) -> List[str]:
+        videos = []
+        for iframe in soup.find_all("iframe"):
+            src = iframe.get("src", "")
+            if "youtube.com" in src or "youtu.be" in src:
+                videos.append(src)
+        return videos
+
+    def _extract_links(self, soup: BeautifulSoup) -> List[str]:
+        return [a.get("href") for a in soup.find_all("a") if a.get("href")]
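A minimal standalone usage sketch for WebScraper (not part of the commit). It assumes a local Chrome/chromedriver install that Selenium can launch, since the class drives a non-headless Chrome instance; the query string is illustrative.

import logging
from scraper import WebScraper

logging.basicConfig(level=logging.INFO)

scraper = WebScraper()
try:
    results = scraper.search_and_scrape("flask socketio long running tasks", num_sites=3)
    for page in results:
        print(page["url"], "-", page.get("title", ""))
finally:
    scraper.cleanup()  # quits the Chrome driver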