Soham Waghmare committed · 2d96b3b
init: Add initial backend structure with Flask, WebSocket support, and web scraping functionality
Files changed:
- .gitignore +10 -0
- backend/app.py +76 -0
- backend/knet.py +240 -0
- backend/research_node.py +26 -0
- backend/scraper.py +155 -0
.gitignore
ADDED
@@ -0,0 +1,10 @@
+# Flask ignore files
+backend/__pycache__/
+backend/*.pyc
+backend/*.pyo
+backend/*.pyd
+backend/*.pyo
+backend/.venv/
+backend/.env*
+
+# Next.js ignore files
backend/app.py
ADDED
@@ -0,0 +1,76 @@
+# pip install flask[async] flask-socketio flask-cors
+# pip install google-generativeai python-dotenv beautifulsoup4 selenium newspaper3k lxml_html_clean eventlet
+from flask import Flask, request, jsonify
+from flask_cors import CORS
+from flask_socketio import SocketIO, emit
+import os, json, logging
+from knet import KNet
+from dotenv import load_dotenv
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+load_dotenv()
+knet = KNet()
+
+app = Flask(__name__)
+CORS(app)
+
+# Increased pingTimeout and added logger
+socketio = SocketIO(app, cors_allowed_origins="*", ping_timeout=9999, ping_interval=25)
+
+
+@socketio.on("connect")
+def handle_connect():
+    logger.info(f"Client connected: {request.sid}")
+
+
+@socketio.on("disconnect")
+def handle_disconnect():
+    logger.info(f"Client disconnected: {request.sid}")
+
+
+@socketio.on("health_check")
+def handle_health_check(_):
+    logger.debug("Health check received")
+    emit("health_check", {"status": "ok"})
+
+
+@socketio.on("start_research")
+def handle_research(data):
+    try:
+        data = json.loads(data)
+        topic = data.get("topic")
+        session_id = request.sid
+        logger.info(f"Starting research for client {session_id} on topic: {topic}")
+
+        def progress_callback(status):
+            try:
+                logger.debug(
+                    f"Progress update: {status['progress']}% - {status['message']}"
+                )
+                socketio.emit(
+                    "status",
+                    {"message": status["message"], "progress": status["progress"]},
+                    room=session_id,
+                )
+            except Exception as e:
+                logger.error(f"Error in progress callback: {str(e)}")
+
+        try:
+            research_results = knet.conduct_research(topic, progress_callback)
+            logger.info(f"Research completed for topic: {topic}")
+            socketio.emit("research_complete", research_results, room=session_id)
+        except Exception as e:
+            logger.error(f"Research error: {str(e)}")
+            socketio.emit("error", {"message": str(e)}, room=session_id)
+
+    except Exception as e:
+        logger.error(f"Error handling research request: {str(e)}")
+        socketio.emit("error", {"message": str(e)}, room=request.sid)
+
+
+if __name__ == "__main__":
+    logger.info("Starting KnowledgeNet server...")
+    socketio.run(app, debug=True, port=5000)
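A minimal client-side sketch of the Socket.IO contract exposed above (emit "start_research", listen for "status", "research_complete", and "error"). It assumes the python-socketio client package and a server running locally on port 5000; the topic string is only an example.

# Example client (assumes: pip install "python-socketio[client]"); event names mirror app.py.
import json
import socketio

sio = socketio.Client()

@sio.on("status")
def on_status(data):
    print(f"[{data['progress']}%] {data['message']}")

@sio.on("research_complete")
def on_complete(report):
    print("Report topic:", report["topic"])
    sio.disconnect()

@sio.on("error")
def on_error(err):
    print("Server error:", err["message"])
    sio.disconnect()

sio.connect("http://localhost:5000")  # dev server started by app.py
# The server calls json.loads() on the payload, so send a JSON string.
sio.emit("start_research", json.dumps({"topic": "graph neural networks"}))
sio.wait()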
backend/knet.py
ADDED
@@ -0,0 +1,240 @@
+from typing import Dict, List, Optional, Any
+import google.generativeai as genai
+import logging
+import os
+from datetime import datetime
+from dotenv import load_dotenv
+from scraper import WebScraper
+from research_node import ResearchNode
+from collections import deque
+
+# Load environment variables
+load_dotenv()
+
+
+class ResearchProgress:
+    def __init__(self, callback=None):
+        self.progress = 0
+        self.callback = callback
+
+    def update(self, progress: int, message: str):
+        self.progress = progress
+        if self.callback:
+            self.callback({"progress": progress, "message": message})
+
+
+class KNet:
+    def __init__(self, api_key: Optional[str] = None):
+        self.api_key = api_key or os.getenv("GOOGLE_API_KEY")
+        if not self.api_key:
+            raise ValueError("Google API key is required")
+
+        # Initialize Google GenAI
+        genai.configure(api_key=self.api_key)
+
+        # Keep both models with original configurations
+        self.llm = genai.GenerativeModel(
+            "gemini-2.0-flash-lite-preview-02-05",
+            generation_config={"temperature": 0.7},
+        )
+
+        self.research_manager = genai.GenerativeModel(
+            "gemini-2.0-flash-lite-preview-02-05",
+            generation_config={"temperature": 0.3},
+        )
+
+        # Initialize scraper
+        self.scraper = WebScraper()
+        self.logger = logging.getLogger(__name__)
+        self.max_depth = 5
+        self.min_importance_score = 0.6
+
+        self.search_prompt = """Generate 3-5 specific search queries to research the following topic: {topic}
+
+Requirements:
+1. Queries should cover different aspects of the topic
+2. Be specific and technical
+3. Include key terms and concepts
+4. Format each query on a new line
+5. Return only the queries, no explanations"""
+
+    def __del__(self):
+        # Cleanup scraper when KNet instance is destroyed
+        if hasattr(self, "scraper"):
+            self.scraper.cleanup()
+
+    def conduct_research(self, topic: str, progress_callback=None) -> Dict[str, Any]:
+        progress = ResearchProgress(progress_callback)
+        self.logger.info(f"Starting research on topic: {topic}")
+        try:
+            # Prepare the scraper at the start of research
+            self.scraper.setup()
+            root_node = ResearchNode(topic)
+            research_stack = deque([root_node])
+            explored_queries = set()
+
+            # Generate initial search queries
+            self.logger.info("Generating search queries...")
+            response = self.llm.generate_content(self.search_prompt.format(topic=topic))
+            search_queries = response.text.strip().split("\n")
+            self.logger.info(f"Generated queries: {search_queries}")
+
+            progress.update(10, "Starting deep research exploration...")
+            self.logger.info("Research exploration initiated")
+
+            # Process each generated query
+            for query in search_queries:
+                if query.strip():
+                    data = self.scraper.search_and_scrape(query.strip())
+                    if data:
+                        root_node.data.extend(data)
+
+            while research_stack:
+                current_node = research_stack.pop()
+
+                if (
+                    current_node.query in explored_queries
+                    or current_node.depth > self.max_depth
+                ):
+                    continue
+
+                self.logger.info(
+                    f"Exploring branch: {current_node.query} (Depth: {current_node.depth})"
+                )
+                progress.update(
+                    30 + (len(explored_queries) * 50 / (self.max_depth * 3)),
+                    f"Exploring: {current_node.query}",
+                )
+
+                # Conduct research for current node
+                current_node.data = self.scraper.search_and_scrape(current_node.query)
+                explored_queries.add(current_node.query)
+
+                # Generate and evaluate new branches
+                if current_node.depth < self.max_depth:
+                    new_branches = self._analyze_and_branch(current_node)
+                    for branch in reversed(
+                        new_branches
+                    ):  # Reverse to maintain DFS order
+                        research_stack.append(branch)
+
+            self.logger.info("Generating final research report")
+            progress.update(80, "Generating comprehensive report...")
+            final_report = self._generate_final_report(root_node)
+
+            self.logger.info("Research completed successfully")
+            progress.update(100, "Research complete!")
+
+            return final_report
+
+        except Exception as e:
+            self.logger.error(f"Research failed: {str(e)}")
+            raise
+        finally:
+            self.scraper.cleanup()
+
+    def _analyze_and_branch(self, node: ResearchNode) -> List[ResearchNode]:
+        analysis_prompt = f"""Analyze the research data and suggest new branches for deeper exploration.
+Current topic: {node.query}
+Current depth: {node.depth}
+Path from root: {' -> '.join(node.get_path_to_root())}
+
+Suggest new research directions that:
+1. Are specific and focused
+2. Explore unexplored aspects
+3. Follow promising leads from the current data
+
+For each suggestion, rate its importance (0-1) and explain why.
+Format: Importance Score | Query | Reason"""
+
+        response = self.research_manager.generate_content(analysis_prompt)
+        result = response.text
+
+        new_nodes = []
+        for line in result.split("\n"):
+            if "|" not in line:
+                continue
+
+            parts = line.split("|")
+            if len(parts) < 2:
+                continue
+
+            try:
+                importance = float(parts[0].strip())
+                query = parts[1].strip()
+
+                if importance >= self.min_importance_score:
+                    child_node = node.add_child(query)
+                    child_node.importance_score = importance
+                    new_nodes.append(child_node)
+            except ValueError:
+                continue
+
+        return new_nodes
+
+    def _generate_final_report(self, root_node: ResearchNode) -> Dict[str, Any]:
+        def collect_data(node: ResearchNode) -> List[Dict]:
+            all_data = node.data.copy()
+            for child in node.children:
+                all_data.extend(collect_data(child))
+            return all_data
+
+        def collect_nodes(node: ResearchNode) -> List[ResearchNode]:
+            nodes = [node]
+            for child in node.children:
+                nodes.extend(collect_nodes(child))
+            return nodes
+
+        all_research_data = collect_data(root_node)
+        all_nodes = collect_nodes(root_node)
+
+        # Generate structured report using LLM
+        report_prompt = f"""Generate a comprehensive research report using the collected data.
+Main Topic: {root_node.query}
+
+Structure the report with:
+1. Executive Summary
+2. Key Findings
+3. Detailed Analysis
+4. Related Topics and Branches
+5. Sources and References
+
+Include relevant quotes and citations."""
+
+        response = self.research_manager.generate_content(report_prompt)
+        report_content = response.text
+
+        # Organize multimedia content
+        media_content = {"images": [], "videos": [], "links": [], "references": []}
+
+        for data in all_research_data:
+            if data.get("images"):
+                media_content["images"].extend(data["images"])
+            if data.get("videos"):
+                media_content["videos"].extend(data["videos"])
+            if data.get("links"):
+                media_content["links"].append(
+                    {
+                        "url": data["url"],
+                        "title": data.get("title", ""),
+                        "summary": data.get("summary", ""),
+                    }
+                )
+
+        # Build research tree structure
+        def build_tree_structure(node: ResearchNode) -> Dict:
+            return {
+                "query": node.query,
+                "importance": node.importance_score,
+                "depth": node.depth,
+                "children": [build_tree_structure(child) for child in node.children],
+            }
+
+        return {
+            "topic": root_node.query,
+            "timestamp": datetime.now().isoformat(),
+            "content": report_content,
+            "media": media_content,
+            "research_tree": build_tree_structure(root_node),
+            "metadata": {
+                "total_sources": len(all_research_data),
+                "max_depth_reached": max(node.depth for node in all_nodes),
+                "total_branches": len(all_nodes),
+            },
+        }
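A minimal sketch of driving KNet directly, bypassing the Flask/Socket.IO layer. It assumes GOOGLE_API_KEY is set in the environment (or a .env file) and that Chrome plus a matching chromedriver are available for WebScraper; the topic is only an example.

# Run the research pipeline without the web server (assumes GOOGLE_API_KEY and chromedriver).
from knet import KNet

def print_progress(status):
    # status matches the {"progress": ..., "message": ...} dict emitted by ResearchProgress.
    print(f"{status['progress']}% {status['message']}")

knet = KNet()
report = knet.conduct_research("quantum error correction", progress_callback=print_progress)

print(report["content"][:500])                         # LLM-generated report text
print("Sources:", report["metadata"]["total_sources"])
print("Tree root:", report["research_tree"]["query"])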
backend/research_node.py
ADDED
@@ -0,0 +1,26 @@
+from typing import List, Dict, Any, Optional
+from datetime import datetime
+
+class ResearchNode:
+    def __init__(self, query: str, parent: Optional['ResearchNode'] = None, depth: int = 0):
+        self.query = query
+        self.parent = parent
+        self.depth = depth
+        self.children: List[ResearchNode] = []
+        self.data: List[Dict[str, Any]] = []
+        self.explored = False
+        self.importance_score = 0.0
+        self.timestamp = datetime.now()
+
+    def add_child(self, query: str) -> 'ResearchNode':
+        child = ResearchNode(query, parent=self, depth=self.depth + 1)
+        self.children.append(child)
+        return child
+
+    def get_path_to_root(self) -> List[str]:
+        path = [self.query]
+        current = self
+        while current.parent:
+            current = current.parent
+            path.append(current.query)
+        return list(reversed(path))
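ResearchNode is a plain parent-linked tree. A tiny illustration of add_child and get_path_to_root, using made-up queries:

# Illustration only; the queries are invented.
from research_node import ResearchNode

root = ResearchNode("large language models")
child = root.add_child("LLM inference optimization")
leaf = child.add_child("KV-cache quantization")

print(leaf.depth)                # 2
print(leaf.get_path_to_root())
# ['large language models', 'LLM inference optimization', 'KV-cache quantization']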
backend/scraper.py
ADDED
@@ -0,0 +1,155 @@
+from bs4 import BeautifulSoup
+from selenium import webdriver
+import logging
+from typing import List, Dict, Any
+import newspaper
+from newspaper import Article
+import re
+import requests
+
+
+class WebScraper:
+    def __init__(self):
+        self.chrome_options = webdriver.ChromeOptions()
+        # self.chrome_options.add_argument("--headless")
+        self.driver = webdriver.Chrome(options=self.chrome_options)
+        self.logger = logging.getLogger(__name__)
+        self.newspaper_config = newspaper.Config()
+        self.newspaper_config.browser_user_agent = "Mozilla/5.0"
+        self.newspaper_config.request_timeout = 10
+        self.session = requests.Session()
+        self.timeout = 30
+
+    def setup(self):
+        pass  # No setup needed for synchronous operation
+
+    def cleanup(self):
+        if self.driver:
+            self.driver.quit()
+
+    def search_and_scrape(
+        self, query: str, num_sites: int = 10
+    ) -> List[Dict[str, Any]]:
+        self.logger.info(f"Starting search for: {query}")
+        search_results = self._google_search(query, num_sites)
+        self.logger.info(f"Found {len(search_results)} search results")
+
+        scraped_data = []
+        for idx, url in enumerate(search_results):
+            try:
+                self.logger.info(f"Scraping [{idx + 1}/{len(search_results)}]: {url}")
+                data = self._scrape_url(url)
+                if data:
+                    scraped_data.append(data)
+                    self.logger.info(f"Successfully scraped: {url}")
+            except Exception as e:
+                self.logger.error(f"Error scraping {url}: {str(e)}")
+                continue
+
+        self.logger.info(f"Completed scraping {len(scraped_data)} sites")
+        return scraped_data
+
+    def _google_search(self, query: str, num_results: int) -> List[str]:
+        self.logger.info("Performing Google search...")
+        try:
+            self.driver.get(
+                f"https://www.google.com/search?q={query.replace(' ', '+')}&num={num_results}"
+            )
+            self.driver.implicitly_wait(5)
+
+            elements = self.driver.find_elements("css selector", "div.g div.yuRUbf > a")
+            search_results = []
+            for element in elements:
+                url = element.get_attribute("href")
+                if url and url.startswith("http"):
+                    search_results.append(url)
+                    if len(search_results) >= num_results:
+                        break
+
+            self.logger.info(f"Found {len(search_results)} URLs")
+            return search_results
+
+        except Exception as e:
+            self.logger.error(f"Google search error: {str(e)}")
+            return []
+
+    def _scrape_url(self, url: str) -> Dict[str, Any]:
+        try:
+            article = Article(url, config=self.newspaper_config)
+            article.download()
+            article.parse()
+            article.nlp()
+
+            data = {
+                "url": url,
+                "title": article.title,
+                "text": article.text,
+                "summary": article.summary,
+                "keywords": article.keywords,
+                "images": article.images,
+                "videos": [],
+                "links": article.links,
+                "authors": article.authors,
+                "publish_date": article.publish_date,
+                "metadata": {"language": article.meta_lang, "tags": article.tags},
+            }
+
+            if not data["text"]:
+                response = self.session.get(url, timeout=self.timeout)
+                soup = BeautifulSoup(response.text, "html.parser")
+                selenium_data = {
+                    "url": url,
+                    "title": soup.title.string if soup.title else "",
+                    "text": self._extract_text(soup),
+                    "images": self._extract_images(soup),
+                    "videos": self._extract_videos(soup),
+                    "links": self._extract_links(soup),
+                }
+                return self._merge_extraction_results(data, selenium_data)
+
+            return data
+
+        except Exception as e:
+            self.logger.error(f"Scraping error for {url}: {str(e)}")
+            return None
+
+    def _merge_extraction_results(
+        self, news_data: Dict, selenium_data: Dict
+    ) -> Dict[str, Any]:
+        merged = selenium_data.copy()
+
+        if news_data:
+            for field in ["title", "text", "images", "links"]:
+                if news_data.get(field):
+                    merged[field] = news_data[field]
+
+            merged.update(
+                {
+                    "summary": news_data.get("summary"),
+                    "keywords": news_data.get("keywords"),
+                    "authors": news_data.get("authors"),
+                    "publish_date": news_data.get("publish_date"),
+                    "metadata": news_data.get("metadata"),
+                }
+            )
+
+        return merged
+
+    def _extract_text(self, soup: BeautifulSoup) -> str:
+        for element in soup(["script", "style", "nav", "header", "footer"]):
+            element.decompose()
+        return " ".join(soup.stripped_strings)
+
+    def _extract_images(self, soup: BeautifulSoup) -> List[str]:
+        return [img.get("src") for img in soup.find_all("img") if img.get("src")]
+
+    def _extract_videos(self, soup: BeautifulSoup) -> List[str]:
+        videos = []
+        for iframe in soup.find_all("iframe"):
+            src = iframe.get("src", "")
+            if "youtube.com" in src or "youtu.be" in src:
+                videos.append(src)
+        return videos
+
+    def _extract_links(self, soup: BeautifulSoup) -> List[str]:
+        return [a.get("href") for a in soup.find_all("a") if a.get("href")]
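WebScraper drives a visible Chrome window (the --headless option is commented out) and relies on Google's current result-page markup, so the CSS selector can break without notice. A short usage sketch, assuming chromedriver is on PATH:

# Usage sketch (assumes local Chrome + chromedriver; the example query is arbitrary).
from scraper import WebScraper

scraper = WebScraper()
try:
    results = scraper.search_and_scrape("retrieval augmented generation", num_sites=3)
    for page in results:
        print(page["title"], "-", page["url"])
        print(len(page["text"]), "characters extracted")
finally:
    scraper.cleanup()  # quits the Chrome driver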