Commit 87d5bfc · Soham Waghmare committed
Parent(s): 51f3191

feat: langgraph implementation for knet with SSE
Files changed:
- .gitignore +11 -10
- langgraph_backend/app.py +301 -0
- langgraph_backend/pyproject.toml +14 -0
- langgraph_backend/schema.py +22 -0
- langgraph_backend/scraper.py +283 -0
- langgraph_backend/uv.lock +0 -0
.gitignore
CHANGED
@@ -1,16 +1,17 @@
 **/.env
 
 # Flask ignore files
-
-
-
-
-
-
-
-
-
-
+**/__pycache__/
+**/*.pyc
+**/*.pyo
+**/*.pyd
+**/*.pyo
+**/*.csv
+**/.venv/
+**/.env*
+**/downloads/*
+**/*.log.*
+**/.ruff_cache/
 
 # Next.js ignore files
 # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
langgraph_backend/app.py
ADDED
@@ -0,0 +1,301 @@
import asyncio
import json
import logging
import os
from datetime import datetime
from textwrap import dedent
from typing import Any, Dict, List, Optional, TypedDict

from dotenv import load_dotenv
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from langchain_google_genai import ChatGoogleGenerativeAI
from langgraph.graph import END, StateGraph
from sse_starlette.sse import EventSourceResponse

from schema import ResearchPlan
from scraper import CrawlForAIScraper

load_dotenv()

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

app = FastAPI()
CORS_ALLOWED_ORIGINS = os.getenv("ALLOWED_ORIGINS", ",").split(",")
app.add_middleware(
    CORSMiddleware,
    allow_origins=CORS_ALLOWED_ORIGINS,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Session management (in-memory for now)
sessions: Dict[str, Dict[str, Any]] = {}


@app.get("/health")
async def health_check():
    return {"status": "ok"}


# --- Prompt templates ---
RESEARCH_PLAN_PROMPT = dedent("""You are an expert Deep Research agent, part of a Multiagent system.

<User query>
{topic}
</User query>

---
Generate few very high level steps on which other agents can do info collection runs. Provide only data collection steps, no data identification, summarization, manipulation, selection, etc.
Do not presume any knowledge about the topic.
Return a string array of steps.""")

REPORT_OUTLINE_PROMPT = dedent("""Generate a outline for a report based on the findings:
<Original user query>
{topic}
</Original user query>

<Findings>
{ctx_manager}
</Findings>

Deduplicate, reorganize and analyze the findings to create the outline.
If there are multiple comparisons, use a table instead of multiple headings.
The outline should include:
- Title
- List of h2 headings
Do not include hashtags""")

REPORT_FILLIN_PROMPT = dedent("""Fill in the content for the current outline heading based on the findings:
<Findings>
{ctx_manager}
</Findings>

<The outline>
{report_outline}
</The outline>

<Current outline heading to fill in>
## {slot}
...
</Current outline heading to fill in>

Assume [done] headings have their respective content.
The content should be comprehensive, detailed and well-structured, providing detailed information on current heading.
If needed use tables, lists. Do not include subheadings.
Do not include the heading in the content.
""")

# --- LangChain LLM setup (Gemini, correct usage) ---
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=os.getenv("GOOGLE_API_KEY"))


# --- State schema for LangGraph ---
class ResearchState(TypedDict, total=False):
    topic: str
    scraper: Any
    max_depth: int
    num_sites_per_query: int
    steps: List[str]
    findings: Any
    outline: str
    progress: int
    message: str
    timestamp: str
    content: str
    media: dict
    research_tree: dict
    metadata: dict


# --- LangGraph node: LLM step for research plan ---
async def research_plan_node(state: dict) -> dict:
    topic = state["topic"]
    prompt = RESEARCH_PLAN_PROMPT.format(topic=topic)
    result = await llm.with_structured_output(ResearchPlan).ainvoke(prompt)
    try:
        # Structured output returns a dict matching the ResearchPlan schema; otherwise parse the raw content
        steps = result["steps"] if isinstance(result, dict) else json.loads(result.content)
        # TODO: split this into another knet module to handle global state
    except Exception:
        steps = [str(result)]
    logger.info(f"Research plan:\n{json.dumps(steps, indent=2)}")
    # Store the steps in the graph state so scrape_node can consume them
    return {"steps": steps, "progress": 10, "message": "Generated research plan"}


# --- LangGraph node: Scrape for each step ---
async def scrape_node(state: dict) -> dict:
    steps = state["steps"]
    scraper = state["scraper"]
    num_sites_per_query = state["num_sites_per_query"]
    findings = []
    for step in steps:
        scraped = await scraper.search_and_scrape(step, num_sites=num_sites_per_query)
        findings.append({"step": step, "data": scraped})
    return {"findings": findings, "progress": 70, "message": "Scraping complete"}


# --- LangGraph node: Generate report outline ---
async def outline_node(state: dict) -> dict:
    topic = state["topic"]
    findings = state["findings"]
    findings_text = json.dumps(findings, indent=2)
    # The template expects the findings under the {ctx_manager} placeholder
    prompt = REPORT_OUTLINE_PROMPT.format(topic=topic, ctx_manager=findings_text)
    result = await llm.ainvoke(prompt)
    outline = result.content if hasattr(result, "content") else str(result)
    return {"outline": outline, "progress": 90, "message": "Generated report outline"}


# --- LangGraph node: Fill in report content for each heading ---
async def fillin_node(state: dict) -> dict:
    findings = state["findings"]
    outline = state["outline"]
    topic = state["topic"]
    # Try to parse outline as JSON, else fall back to text splitting
    try:
        outline_obj = json.loads(outline)
        title = outline_obj["title"]
        headings = outline_obj["headings"]
    except Exception:
        # Fallback: try to extract headings from text
        lines = outline.splitlines()
        title = lines[0].strip("# ") if lines else topic
        headings = [line.strip("# ") for line in lines if line.strip().startswith("## ")]
    findings_text = json.dumps(findings, indent=2)
    report = f"# {title}\n\n"
    for heading in headings:
        # Keyword arguments match the {ctx_manager}, {report_outline} and {slot} placeholders in the template
        prompt = REPORT_FILLIN_PROMPT.format(
            ctx_manager=findings_text,
            report_outline=outline,
            slot=heading,
        )
        result = await llm.ainvoke(prompt)
        content = result.content if hasattr(result, "content") else str(result)
        # Remove heading if LLM included it
        if content.strip().startswith(heading):
            content = content.strip()[len(heading) :].strip()
        report += f"\n\n## {heading}\n\n{content}\n"
    return {"content": report, "progress": 95, "message": "Filled in report content"}


# --- LangGraph node: Finalize report ---
def finalize_node(state: dict) -> dict:
    findings = state.get("findings", [])
    media = {"images": [], "videos": [], "links": []}
    for step in findings:
        for site in step.get("data", []):
            media["images"].extend(site.get("images", []))
            media["videos"].extend(site.get("videos", []))
            media["links"].extend(site.get("links", []))
    # Dedupe
    media["images"] = list(set(media["images"]))
    media["videos"] = list(set(media["videos"]))
    # Links: dedupe by URL
    seen_links = set()
    deduped_links = []
    for link in media["links"]:
        url = link["href"] if isinstance(link, dict) and "href" in link else str(link)
        if url not in seen_links:
            seen_links.add(url)
            deduped_links.append(link)
    media["links"] = deduped_links
    return {
        "topic": state["topic"],
        "timestamp": datetime.now().isoformat(),
        "content": state["content"],
        "media": media,
        "research_tree": {},
        "metadata": {"steps": state.get("steps", [])},
        "progress": 100,
        "message": "Research complete!",
    }


# --- Main research logic using LangGraph ---
async def run_research(topic, scraper, max_depth, num_sites_per_query):
    # Build the research graph
    graph = StateGraph(state_schema=ResearchState)
    graph.add_node("plan", research_plan_node)
    graph.add_node("scrape", scrape_node)
    graph.add_node("outline_node", outline_node)
    graph.add_node("fillin", fillin_node)
    graph.add_node("finalize", finalize_node)

    graph.add_edge("plan", "scrape")
    graph.add_edge("scrape", "outline_node")
    graph.add_edge("outline_node", "fillin")
    graph.add_edge("fillin", "finalize")
    graph.add_edge("finalize", END)
    graph.set_entry_point("plan")
    graph = graph.compile()

    state = {
        "topic": topic,
        "scraper": scraper,
        "max_depth": max_depth,
        "num_sites_per_query": num_sites_per_query,
    }
    # Stream per-node updates ({node_name: state_update}) and merge them into a
    # running state so the final payload can be assembled after "finalize" runs
    final_state = dict(state)
    async for chunk in graph.astream(state, stream_mode="updates"):
        for node_update in chunk.values():
            final_state.update(node_update)
        yield {
            "event": "status",
            "data": json.dumps(
                {"progress": final_state.get("progress", 0), "message": final_state.get("message", "")}
            ),
        }
    yield {
        "event": "research_complete",
        "data": json.dumps(
            {
                "topic": final_state["topic"],
                "timestamp": final_state["timestamp"],
                "content": final_state["content"],
                "media": final_state["media"],
                "research_tree": final_state["research_tree"],
                "metadata": final_state["metadata"],
            }
        ),
    }


@app.post("/start_research")
async def start_research(request: Request):
    data = await request.json()
    topic = data.get("topic", "").strip()
    max_depth = int(data.get("max_depth", 1))
    num_sites_per_query = int(data.get("num_sites_per_query", 5))
    session_id = data.get("session_id") or os.urandom(8).hex()

    if session_id not in sessions:
        scraper = CrawlForAIScraper()
        await scraper.start()
        sessions[session_id] = {"scraper": scraper}
    else:
        scraper = sessions[session_id]["scraper"]

    async def event_generator():
        async for event in run_research(topic, scraper, max_depth, num_sites_per_query):
            yield event

    return EventSourceResponse(event_generator())


@app.post("/abort_research")
async def abort_research(request: Request):
    data = await request.json()
    session_id = data.get("session_id")
    if session_id in sessions:
        scraper = sessions[session_id]["scraper"]
        await scraper.close()
        del sessions[session_id]
    return {"status": "aborted"}


# Add more endpoints as needed for test, etc.

if __name__ == "__main__":
    logger.info("Starting KnowledgeNet server...")
    import uvicorn

    uvicorn.run(app, host="127.0.0.1", port=5000)
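For context, here is a minimal, hypothetical sketch of how a script or frontend might consume the /start_research SSE stream defined above. It is not part of this commit: it assumes the server is running on 127.0.0.1:5000 (as in the uvicorn.run call) and uses the httpx library, which is not in this project's dependencies; the "status" and "research_complete" event names match those emitted by run_research.

# Hypothetical client sketch (not part of this commit): stream SSE events from
# POST /start_research, assuming the backend runs on 127.0.0.1:5000 and httpx is installed.
import asyncio
import json

import httpx


async def consume_research(topic: str) -> None:
    async with httpx.AsyncClient(timeout=None) as client:
        async with client.stream(
            "POST",
            "http://127.0.0.1:5000/start_research",
            json={"topic": topic, "num_sites_per_query": 5},
        ) as response:
            event = None
            async for line in response.aiter_lines():
                # SSE frames are "event: <name>" / "data: <payload>" pairs separated by blank lines
                if line.startswith("event:"):
                    event = line.split(":", 1)[1].strip()
                elif line.startswith("data:"):
                    payload = json.loads(line.split(":", 1)[1].strip())
                    if event == "status":
                        print(f"[{payload['progress']}%] {payload['message']}")
                    elif event == "research_complete":
                        print(payload["content"][:500])


if __name__ == "__main__":
    asyncio.run(consume_research("history of the Blender project"))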
langgraph_backend/pyproject.toml
ADDED
@@ -0,0 +1,14 @@
[project]
name = "langgraph-backend"
version = "0.1.0"
requires-python = ">=3.11"
dependencies = [
    "bs4>=0.0.2",
    "crawl4ai>=0.6.3",
    "fastapi>=0.115.12",
    "langchain[google-genai]>=0.3.25",
    "langgraph>=0.4.3",
    "python-dotenv>=1.1.0",
    "sse-starlette>=2.3.5",
    "uvicorn>=0.34.2",
]
langgraph_backend/schema.py
ADDED
@@ -0,0 +1,22 @@
from typing import List, TypedDict


class ResearchPlan(TypedDict):
    steps: List[str]


class ContinueBranch(TypedDict):
    decision: bool


class SearchQuery(TypedDict):
    branches: List[str]


class ReportOutline(TypedDict):
    title: str
    headings: List[str]


class ReportFillin(TypedDict):
    content: str
langgraph_backend/scraper.py
ADDED
@@ -0,0 +1,283 @@
import asyncio
import json
import logging
from typing import Any, Dict, List
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode


class CrawlForAIScraper:
    def __init__(self) -> None:
        self.logger = logging.getLogger(__name__)
        self.session = requests.Session()
        self.base_browser = BrowserConfig(
            browser_type="chromium",
            headless=True,
            viewport_width=1920,
            viewport_height=1080,
            accept_downloads=False,
            verbose=False,
        )
        self.crawler = AsyncWebCrawler(config=self.base_browser)
        self._is_started = False

    async def start(self):
        if not self._is_started:
            await self.crawler.start()
            # Give the browser a moment to finish launching without blocking the event loop
            await asyncio.sleep(1)
            self._is_started = True

    async def close(self):
        if self._is_started:
            await self.crawler.close()
            self._is_started = False

    async def search_and_scrape(self, query: str, num_sites: int = 10) -> List[Dict[str, Any]]:
        await self.start()
        self.logger.info(f"Querying: {query}")

        # Perform a search to get a list of webpages
        search_results = await self._search(query)

        # Scrape each webpage
        scraped_data = []
        self.logger.info(f"Scraping {num_sites} sites...")
        next_idx = num_sites + 2
        data = await self._scrape_pages(search_results[:next_idx], num_sites)
        scraped_data.extend(data)

        # Scrape further results when some pages failed
        for _ in range(3):
            if len(scraped_data) >= num_sites or next_idx >= len(search_results):
                break
            data = await self._scrape_pages(
                search_results[next_idx : next_idx + num_sites + 2], num_sites - len(scraped_data)
            )
            scraped_data.extend(data)
            next_idx += num_sites + 2

        self.logger.info(f"Completed scraping {len(scraped_data)} sites")
        return scraped_data

    async def _search(self, query: str) -> List[str]:
        try:
            encoded_query = quote_plus(query)
            search_uri = f"https://www.google.com/search?q={encoded_query}"

            result = await self.crawler.arun(
                url=search_uri,
                screenshot=False,
                cache_mode=CacheMode.BYPASS,
                delay_before_return_html=2,
                scan_full_page=True,
            )

            soup = BeautifulSoup(result.html, "html.parser")
            search_results = []

            for link in list(soup.select("div > span > a"))[2:]:
                url = link.get("href", "").replace(" ", "").replace("\n", "").strip()
                if "support.google.com" in url or url.startswith("/search?q="):
                    continue
                if not url.startswith(("http://", "https://")):
                    url = "https://" + url
                search_results.append(url)

            # Fall back to DuckDuckGo (up to 3 attempts) if Google returned nothing
            for _ in range(3):
                if not search_results:
                    self.logger.warning("No search results found.")
                    self.logger.info("Performing DuckDuckGo search as fallback...")
                    search_results = await self._duckduckgo_search(query)

            if not search_results:
                raise Exception("No results found")
            self.logger.info(f"Found {len(search_results)} results")
            return search_results

        except Exception as e:
            self.logger.error(f"Google search error: {str(e)}", exc_info=True)
            raise

    async def _duckduckgo_search(self, query: str) -> List[str]:
        self.logger.info("Performing DuckDuckGo search...")
        try:
            encoded_query = quote_plus(query)
            search_uri = f"https://html.duckduckgo.com/html/?q={encoded_query}"

            # response = self.session.get(
            #     url,
            #     headers={
            #         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            #     },
            #     timeout=10,
            # )
            # response.raise_for_status()

            result = await self.crawler.arun(
                url=search_uri,
                screenshot=False,
                cache_mode=CacheMode.BYPASS,
                delay_before_return_html=2,
                scan_full_page=True,
            )

            soup = BeautifulSoup(result.html, "html.parser")
            search_results = []

            # DuckDuckGo search results are in elements with class 'result__url'
            for node in soup.select(".result__url"):
                url = node.get("href", "").replace(" ", "").replace("\n", "")
                if not url.startswith(("http://", "https://")):
                    url = "https://" + url
                search_results.append(url)

            self.logger.info(f"Found {len(search_results)} URLs")
            return search_results

        except requests.exceptions.RequestException as e:  # Catch network errors specifically
            self.logger.error(f"DuckDuckGo search error: {str(e)}")
            return []
        except Exception as e:  # Catch any other errors
            self.logger.error(f"DuckDuckGo search error: {str(e)}")
            return []

    async def _scrape_pages(self, urls: List[str], max_sites: int) -> List[Dict[str, Any]]:
        await self.start()

        try:
            # Run the crawler on the URLs
            results = await self.crawler.arun_many(
                urls=urls,
                screenshot=False,
                cache_mode=CacheMode.BYPASS,
                scan_full_page=True,
                semaphore_count=4,
                wait_for_images=True,
                scroll_delay=0.1,
                delay_before_return_html=2,
                exclude_external_images=True,
                page_timeout=25000,
            )
            scraped_sites = []
            for result in results:
                if result.success:
                    soup = BeautifulSoup(result.html, "html.parser")

                    # Combine images
                    extracted_images = self._extract_images(soup, result.url)
                    media_images = []
                    for img in result.media["images"]:
                        if img.get("width") is None or (isinstance(img["width"], (int, float)) and img["width"] > 300):
                            # Resolve srcset-style values with multiple URLs in the src attribute
                            src = img.get("src", "")
                            if " " in src and "w," in src:
                                candidates = [c.strip() for c in src.split(",") if c.strip()]
                                if candidates:
                                    media_images.append(candidates[-1].split(" ")[0])
                            else:
                                media_images.append(src)
                    all_images = list(set(extracted_images + media_images))

                    # Combine videos
                    all_videos = self._extract_videos(soup)
                    media_videos = [v["src"] for v in result.media["videos"] if v["src"]]
                    all_videos = list(set(all_videos + media_videos))

                    data = {
                        "url": result.url,
                        "text": result.markdown,
                        "images": all_images,
                        "videos": all_videos,
                        "links": self._extract_links(result.links["external"]),
                    }
                    scraped_sites.append(data)
                    self.logger.info(f" - {result.url[:80]}...")
            return scraped_sites[:max_sites]

        except Exception as e:
            self.logger.error(f"Scraping error while scraping {urls}: {str(e)}")
            return []

    def _extract_images(self, soup: BeautifulSoup, url: str) -> List[str]:
        # Extract images with width and height greater than 300 pixels
        images = []
        for img in soup.find_all("img"):
            if "src" in img.attrs:
                src = img["src"]
                if "width" not in img.attrs or "height" not in img.attrs:
                    continue
                if img.get("width").lower() == "auto":
                    images.append((src, 999, 0))
                    continue
                # Remove units from width and height: keep only digits and decimal points
                width = "".join([i for i in img.get("width", "0") if i.isdigit() or i == "."])
                height = "".join([i for i in img.get("height", "0") if i.isdigit() or i == "."])
                if width == "" or height == "":
                    continue
                width, height = float(width), float(height)
                if width > 300 and height > 300 and "pixel" not in src and "icon" not in src:
                    images.append((src, width, height))
        # Sort by area, largest first
        images = sorted(images, key=lambda img: -1 * (img[1] * img[2]))
        images = [img[0] for img in images]

        # Add base URL to relative URLs
        base_url = "/".join(url.split("/")[:3])
        images = [img if img.startswith("http") else base_url + img for img in images]
        return images

    def _extract_videos(self, soup: BeautifulSoup) -> List[str]:
        # Extract video URLs from iframes, video tags and anchor links
        videos = []
        nodes = list(soup.find_all("iframe")) + list(soup.find_all("video")) + list(soup.find_all("a"))
        for node in nodes:
            src = node.get("src", "") or ""
            href = node.get("href", "") or ""
            # Skip account pages, blob URLs and YouTube redirect links
            if any(keyword in src or keyword in href for keyword in ["accounts.google.com", "blob:", "youtube.com/redirect"]):
                continue
            if "www.youtube.com/watch?v" in src:
                videos.append(src)
            elif "www.youtube.com/watch?v" in href:
                videos.append(href)
        return videos

    def _extract_links(self, links: list) -> List[str]:
        # Filter out unwanted links
        filtered_links = []
        for link in links:
            url = link.get("href")
            if url.startswith(("http://", "https://")) and not any(
                keyword in url
                for keyword in ["support.google.com", "google.com", "accounts.google.com", "youtube.com", "blob:", "mailto:", "javascript:"]
            ):
                filtered_links.append(link)
        return filtered_links


if __name__ == "__main__":
    # Testing the scraper
    import sys

    urls = [
        "https://docs.anthropic.com/en/docs/agents-and-tools/claude-code/overview",
        "https://docs.crawl4ai.com/advanced/multi-url-crawling/",
        "https://github.com/SesameAILabs/csm",
        "https://docs.anthropic.com/en/docs/agents-and-tools/claude-code/overview",
        "https://docs.crawl4ai.com/advanced/multi-url-crawling/",
        "https://github.com/SesameAILabs/csm",
    ]
    if len(sys.argv) > 1:
        urls = sys.argv[1:]

    async def main():
        scraper = CrawlForAIScraper()
        await scraper.start()
        data = await scraper.search_and_scrape("blender.org")
        await scraper.close()
        with open("output.log.json", "w") as f:
            f.write(json.dumps(data, indent=2))
        print(json.dumps(data, indent=2))

    asyncio.run(main())
langgraph_backend/uv.lock
ADDED
The diff for this file is too large to render.