Soham Waghmare committed · 2d96b3b
init: Add initial backend structure with Flask, WebSocket support, and web scraping functionality
Files changed:
- .gitignore +10 -0
- backend/app.py +76 -0
- backend/knet.py +240 -0
- backend/research_node.py +26 -0
- backend/scraper.py +155 -0
.gitignore
ADDED
@@ -0,0 +1,10 @@
+# Flask ignore files
+backend/__pycache__/
+backend/*.pyc
+backend/*.pyo
+backend/*.pyd
+backend/*.pyo
+backend/.venv/
+backend/.env*
+
+# Next.js ignore files
backend/app.py
ADDED
@@ -0,0 +1,76 @@
+# pip install flask[async] flask-socketio flask-cors
+# pip install google-generativeai python-dotenv beautifulsoup4 selenium newspaper3k lxml_html_clean eventlet
+from flask import Flask, request, jsonify
+from flask_cors import CORS
+from flask_socketio import SocketIO, emit
+import os, json, logging
+from knet import KNet
+from dotenv import load_dotenv
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+load_dotenv()
+knet = KNet()
+
+app = Flask(__name__)
+CORS(app)
+
+# Increased pingTimeout and added logger
+socketio = SocketIO(app, cors_allowed_origins="*", ping_timeout=9999, ping_interval=25)
+
+
+@socketio.on("connect")
+def handle_connect():
+    logger.info(f"Client connected: {request.sid}")
+
+
+@socketio.on("disconnect")
+def handle_disconnect():
+    logger.info(f"Client disconnected: {request.sid}")
+
+
+@socketio.on("health_check")
+def handle_health_check(_):
+    logger.debug("Health check received")
+    emit("health_check", {"status": "ok"})
+
+
+@socketio.on("start_research")
+def handle_research(data):
+    try:
+        data = json.loads(data)
+        topic = data.get("topic")
+        session_id = request.sid
+        logger.info(f"Starting research for client {session_id} on topic: {topic}")
+
+        def progress_callback(status):
+            try:
+                logger.debug(
+                    f"Progress update: {status['progress']}% - {status['message']}"
+                )
+                socketio.emit(
+                    "status",
+                    {"message": status["message"], "progress": status["progress"]},
+                    room=session_id,
+                )
+            except Exception as e:
+                logger.error(f"Error in progress callback: {str(e)}")
+
+        try:
+            research_results = knet.conduct_research(topic, progress_callback)
+            logger.info(f"Research completed for topic: {topic}")
+            socketio.emit("research_complete", research_results, room=session_id)
+        except Exception as e:
+            logger.error(f"Research error: {str(e)}")
+            socketio.emit("error", {"message": str(e)}, room=session_id)
+
+    except Exception as e:
+        logger.error(f"Error handling research request: {str(e)}")
+        socketio.emit("error", {"message": str(e)}, room=request.sid)
+
+
+if __name__ == "__main__":
+    logger.info("Starting KnowledgeNet server...")
+    socketio.run(app, debug=True, port=5000)
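A minimal client-side sketch of the Socket.IO contract exposed above (emit "start_research", listen for "status", "research_complete", and "error"). It assumes the python-socketio client package and a server running locally on port 5000; the topic string is only an example.

# Example client (assumes: pip install "python-socketio[client]"); event names mirror app.py.
import json
import socketio

sio = socketio.Client()

@sio.on("status")
def on_status(data):
    print(f"[{data['progress']}%] {data['message']}")

@sio.on("research_complete")
def on_complete(report):
    print("Report topic:", report["topic"])
    sio.disconnect()

@sio.on("error")
def on_error(err):
    print("Server error:", err["message"])
    sio.disconnect()

sio.connect("http://localhost:5000")  # dev server started by app.py
# The server calls json.loads() on the payload, so send a JSON string.
sio.emit("start_research", json.dumps({"topic": "graph neural networks"}))
sio.wait()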
backend/knet.py
ADDED
@@ -0,0 +1,240 @@
+from typing import Dict, List, Optional, Any
+import google.generativeai as genai
+import logging
+import os
+from datetime import datetime
+from dotenv import load_dotenv
+from scraper import WebScraper
+from research_node import ResearchNode
+from collections import deque
+
+# Load environment variables
+load_dotenv()
+
+
+class ResearchProgress:
+    def __init__(self, callback=None):
+        self.progress = 0
+        self.callback = callback
+
+    def update(self, progress: int, message: str):
+        self.progress = progress
+        if self.callback:
+            self.callback({"progress": progress, "message": message})
+
+
+class KNet:
+    def __init__(self, api_key: Optional[str] = None):
+        self.api_key = api_key or os.getenv("GOOGLE_API_KEY")
+        if not self.api_key:
+            raise ValueError("Google API key is required")
+
+        # Initialize Google GenAI
+        genai.configure(api_key=self.api_key)
+
+        # Keep both models with original configurations
+        self.llm = genai.GenerativeModel(
+            "gemini-2.0-flash-lite-preview-02-05",
+            generation_config={"temperature": 0.7},
+        )
+
+        self.research_manager = genai.GenerativeModel(
+            "gemini-2.0-flash-lite-preview-02-05",
+            generation_config={"temperature": 0.3},
+        )
+
+        # Initialize scraper
+        self.scraper = WebScraper()
+        self.logger = logging.getLogger(__name__)
+        self.max_depth = 5
+        self.min_importance_score = 0.6
+
+        self.search_prompt = """Generate 3-5 specific search queries to research the following topic: {topic}
+
+Requirements:
+1. Queries should cover different aspects of the topic
+2. Be specific and technical
+3. Include key terms and concepts
+4. Format each query on a new line
+5. Return only the queries, no explanations"""
+
+    def __del__(self):
+        # Cleanup scraper when KNet instance is destroyed
+        if hasattr(self, "scraper"):
+            self.scraper.cleanup()
+
+    def conduct_research(self, topic: str, progress_callback=None) -> Dict[str, Any]:
+        progress = ResearchProgress(progress_callback)
+        self.logger.info(f"Starting research on topic: {topic}")
+        try:
+            # Prepare the scraper at the start of research
+            self.scraper.setup()
+            root_node = ResearchNode(topic)
+            research_stack = deque([root_node])
+            explored_queries = set()
+
+            # Generate initial search queries
+            self.logger.info("Generating search queries...")
+            response = self.llm.generate_content(self.search_prompt.format(topic=topic))
+            search_queries = response.text.strip().split("\n")
+            self.logger.info(f"Generated queries: {search_queries}")
+
+            progress.update(10, "Starting deep research exploration...")
+            self.logger.info("Research exploration initiated")
+
+            # Process each generated query
+            for query in search_queries:
+                if query.strip():
+                    data = self.scraper.search_and_scrape(query.strip())
+                    if data:
+                        root_node.data.extend(data)
+
+            while research_stack:
+                current_node = research_stack.pop()
+
+                if (
+                    current_node.query in explored_queries
+                    or current_node.depth > self.max_depth
+                ):
+                    continue
+
+                self.logger.info(
+                    f"Exploring branch: {current_node.query} (Depth: {current_node.depth})"
+                )
+                progress.update(
+                    30 + (len(explored_queries) * 50 / (self.max_depth * 3)),
+                    f"Exploring: {current_node.query}",
+                )
+
+                # Conduct research for current node
+                current_node.data = self.scraper.search_and_scrape(current_node.query)
+                explored_queries.add(current_node.query)
+
+                # Generate and evaluate new branches
+                if current_node.depth < self.max_depth:
+                    new_branches = self._analyze_and_branch(current_node)
+                    for branch in reversed(
+                        new_branches
+                    ):  # Reverse to maintain DFS order
+                        research_stack.append(branch)
+
+            self.logger.info("Generating final research report")
+            progress.update(80, "Generating comprehensive report...")
+            final_report = self._generate_final_report(root_node)
+
+            self.logger.info("Research completed successfully")
+            progress.update(100, "Research complete!")
+
+            return final_report
+
+        except Exception as e:
+            self.logger.error(f"Research failed: {str(e)}")
+            raise
+        finally:
+            self.scraper.cleanup()
+
+    def _analyze_and_branch(self, node: ResearchNode) -> List[ResearchNode]:
+        analysis_prompt = f"""Analyze the research data and suggest new branches for deeper exploration.
+Current topic: {node.query}
+Current depth: {node.depth}
+Path from root: {' -> '.join(node.get_path_to_root())}
+
+Suggest new research directions that:
+1. Are specific and focused
+2. Explore unexplored aspects
+3. Follow promising leads from the current data
+
+For each suggestion, rate its importance (0-1) and explain why.
+Format: Importance Score | Query | Reason"""
+
+        response = self.research_manager.generate_content(analysis_prompt)
+        result = response.text
+
+        new_nodes = []
+        for line in result.split("\n"):
+            if "|" not in line:
+                continue
+
+            parts = line.split("|")
+            if len(parts) < 2:
+                continue
+
+            try:
+                importance = float(parts[0].strip())
+                query = parts[1].strip()
+
+                if importance >= self.min_importance_score:
+                    child_node = node.add_child(query)
+                    child_node.importance_score = importance
+                    new_nodes.append(child_node)
+            except ValueError:
+                continue
+
+        return new_nodes
+
+    def _generate_final_report(self, root_node: ResearchNode) -> Dict[str, Any]:
+        def collect_data(node: ResearchNode) -> List[Dict]:
+            all_data = node.data.copy()
+            for child in node.children:
+                all_data.extend(collect_data(child))
+            return all_data
+
+        def collect_nodes(node: ResearchNode) -> List[ResearchNode]:
+            nodes = [node]
+            for child in node.children:
+                nodes.extend(collect_nodes(child))
+            return nodes
+
+        all_research_data = collect_data(root_node)
+        all_nodes = collect_nodes(root_node)
+
+        # Generate structured report using LLM
+        report_prompt = f"""Generate a comprehensive research report using the collected data.
+Main Topic: {root_node.query}
+
+Structure the report with:
+1. Executive Summary
+2. Key Findings
+3. Detailed Analysis
+4. Related Topics and Branches
+5. Sources and References
+
+Include relevant quotes and citations."""
+
+        response = self.research_manager.generate_content(report_prompt)
+        report_content = response.text
+
+        # Organize multimedia content
+        media_content = {"images": [], "videos": [], "links": [], "references": []}
+
+        for data in all_research_data:
+            if data.get("images"):
+                media_content["images"].extend(data["images"])
+            if data.get("videos"):
+                media_content["videos"].extend(data["videos"])
+            if data.get("links"):
+                media_content["links"].append(
+                    {
+                        "url": data["url"],
+                        "title": data.get("title", ""),
+                        "summary": data.get("summary", ""),
+                    }
+                )
+
+        # Build research tree structure
+        def build_tree_structure(node: ResearchNode) -> Dict:
+            return {
+                "query": node.query,
+                "importance": node.importance_score,
+                "depth": node.depth,
+                "children": [build_tree_structure(child) for child in node.children],
+            }
+
+        return {
+            "topic": root_node.query,
+            "timestamp": datetime.now().isoformat(),
+            "content": report_content,
+            "media": media_content,
+            "research_tree": build_tree_structure(root_node),
+            "metadata": {
+                "total_sources": len(all_research_data),
+                "max_depth_reached": max(node.depth for node in all_nodes),
+                "total_branches": len(all_nodes),
+            },
+        }
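A minimal sketch of driving KNet directly, bypassing the Flask/Socket.IO layer. It assumes GOOGLE_API_KEY is set in the environment (or a .env file) and that Chrome plus a matching chromedriver are available for WebScraper; the topic is only an example.

# Run the research pipeline without the web server (assumes GOOGLE_API_KEY and chromedriver).
from knet import KNet

def print_progress(status):
    # status matches the {"progress": ..., "message": ...} dict emitted by ResearchProgress.
    print(f"{status['progress']}% {status['message']}")

knet = KNet()
report = knet.conduct_research("quantum error correction", progress_callback=print_progress)

print(report["content"][:500])                         # LLM-generated report text
print("Sources:", report["metadata"]["total_sources"])
print("Tree root:", report["research_tree"]["query"])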
backend/research_node.py
ADDED
@@ -0,0 +1,26 @@
+from typing import List, Dict, Any, Optional
+from datetime import datetime
+
+class ResearchNode:
+    def __init__(self, query: str, parent: Optional['ResearchNode'] = None, depth: int = 0):
+        self.query = query
+        self.parent = parent
+        self.depth = depth
+        self.children: List[ResearchNode] = []
+        self.data: List[Dict[str, Any]] = []
+        self.explored = False
+        self.importance_score = 0.0
+        self.timestamp = datetime.now()
+
+    def add_child(self, query: str) -> 'ResearchNode':
+        child = ResearchNode(query, parent=self, depth=self.depth + 1)
+        self.children.append(child)
+        return child
+
+    def get_path_to_root(self) -> List[str]:
+        path = [self.query]
+        current = self
+        while current.parent:
+            current = current.parent
+            path.append(current.query)
+        return list(reversed(path))
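ResearchNode is a plain parent-linked tree. A tiny illustration of add_child and get_path_to_root, using made-up queries:

# Illustration only; the queries are invented.
from research_node import ResearchNode

root = ResearchNode("large language models")
child = root.add_child("LLM inference optimization")
leaf = child.add_child("KV-cache quantization")

print(leaf.depth)                # 2
print(leaf.get_path_to_root())
# ['large language models', 'LLM inference optimization', 'KV-cache quantization']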
backend/scraper.py
ADDED
@@ -0,0 +1,155 @@
+from bs4 import BeautifulSoup
+from selenium import webdriver
+import logging
+from typing import List, Dict, Any
+import newspaper
+from newspaper import Article
+import re
+import requests
+
+
+class WebScraper:
+    def __init__(self):
+        self.chrome_options = webdriver.ChromeOptions()
+        # self.chrome_options.add_argument("--headless")
+        self.driver = webdriver.Chrome(options=self.chrome_options)
+        self.logger = logging.getLogger(__name__)
+        self.newspaper_config = newspaper.Config()
+        self.newspaper_config.browser_user_agent = "Mozilla/5.0"
+        self.newspaper_config.request_timeout = 10
+        self.session = requests.Session()
+        self.timeout = 30
+
+    def setup(self):
+        pass  # No setup needed for synchronous operation
+
+    def cleanup(self):
+        if self.driver:
+            self.driver.quit()
+
+    def search_and_scrape(
+        self, query: str, num_sites: int = 10
+    ) -> List[Dict[str, Any]]:
+        self.logger.info(f"Starting search for: {query}")
+        search_results = self._google_search(query, num_sites)
+        self.logger.info(f"Found {len(search_results)} search results")
+
+        scraped_data = []
+        for idx, url in enumerate(search_results):
+            try:
+                self.logger.info(f"Scraping [{idx + 1}/{len(search_results)}]: {url}")
+                data = self._scrape_url(url)
+                if data:
+                    scraped_data.append(data)
+                    self.logger.info(f"Successfully scraped: {url}")
+            except Exception as e:
+                self.logger.error(f"Error scraping {url}: {str(e)}")
+                continue
+
+        self.logger.info(f"Completed scraping {len(scraped_data)} sites")
+        return scraped_data
+
+    def _google_search(self, query: str, num_results: int) -> List[str]:
+        self.logger.info("Performing Google search...")
+        try:
+            self.driver.get(
+                f"https://www.google.com/search?q={query.replace(' ', '+')}&num={num_results}"
+            )
+            self.driver.implicitly_wait(5)
+
+            elements = self.driver.find_elements("css selector", "div.g div.yuRUbf > a")
+            search_results = []
+            for element in elements:
+                url = element.get_attribute("href")
+                if url and url.startswith("http"):
+                    search_results.append(url)
+                    if len(search_results) >= num_results:
+                        break
+
+            self.logger.info(f"Found {len(search_results)} URLs")
+            return search_results
+
+        except Exception as e:
+            self.logger.error(f"Google search error: {str(e)}")
+            return []
+
+    def _scrape_url(self, url: str) -> Dict[str, Any]:
+        try:
+            article = Article(url, config=self.newspaper_config)
+            article.download()
+            article.parse()
+            article.nlp()
+
+            data = {
+                "url": url,
+                "title": article.title,
+                "text": article.text,
+                "summary": article.summary,
+                "keywords": article.keywords,
+                "images": article.images,
+                "videos": [],
+                "links": article.links,
+                "authors": article.authors,
+                "publish_date": article.publish_date,
+                "metadata": {"language": article.meta_lang, "tags": article.tags},
+            }
+
+            if not data["text"]:
+                response = self.session.get(url, timeout=self.timeout)
+                soup = BeautifulSoup(response.text, "html.parser")
+                selenium_data = {
+                    "url": url,
+                    "title": soup.title.string if soup.title else "",
+                    "text": self._extract_text(soup),
+                    "images": self._extract_images(soup),
+                    "videos": self._extract_videos(soup),
+                    "links": self._extract_links(soup),
+                }
+                return self._merge_extraction_results(data, selenium_data)
+
+            return data
+
+        except Exception as e:
+            self.logger.error(f"Scraping error for {url}: {str(e)}")
+            return None
+
+    def _merge_extraction_results(
+        self, news_data: Dict, selenium_data: Dict
+    ) -> Dict[str, Any]:
+        merged = selenium_data.copy()
+
+        if news_data:
+            for field in ["title", "text", "images", "links"]:
+                if news_data.get(field):
+                    merged[field] = news_data[field]
+
+            merged.update(
+                {
+                    "summary": news_data.get("summary"),
+                    "keywords": news_data.get("keywords"),
+                    "authors": news_data.get("authors"),
+                    "publish_date": news_data.get("publish_date"),
+                    "metadata": news_data.get("metadata"),
+                }
+            )
+
+        return merged
+
+    def _extract_text(self, soup: BeautifulSoup) -> str:
+        for element in soup(["script", "style", "nav", "header", "footer"]):
+            element.decompose()
+        return " ".join(soup.stripped_strings)
+
+    def _extract_images(self, soup: BeautifulSoup) -> List[str]:
+        return [img.get("src") for img in soup.find_all("img") if img.get("src")]
+
+    def _extract_videos(self, soup: BeautifulSoup) -> List[str]:
+        videos = []
+        for iframe in soup.find_all("iframe"):
+            src = iframe.get("src", "")
+            if "youtube.com" in src or "youtu.be" in src:
+                videos.append(src)
+        return videos
+
+    def _extract_links(self, soup: BeautifulSoup) -> List[str]:
+        return [a.get("href") for a in soup.find_all("a") if a.get("href")]
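WebScraper drives a visible Chrome window (the --headless option is commented out) and relies on Google's current result-page markup, so the CSS selector can break without notice. A short usage sketch, assuming chromedriver is on PATH:

# Usage sketch (assumes local Chrome + chromedriver; the example query is arbitrary).
from scraper import WebScraper

scraper = WebScraper()
try:
    results = scraper.search_and_scrape("retrieval augmented generation", num_sites=3)
    for page in results:
        print(page["title"], "-", page["url"])
        print(len(page["text"]), "characters extracted")
finally:
    scraper.cleanup()  # quits the Chrome driver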