Soham Waghmare committed
Commit fced70d · 1 Parent(s): fbfef4e

feat: enhance research workflow with progress tracking and node management

langgraph_backend/app.py CHANGED
@@ -1,24 +1,36 @@
  import asyncio
+ import copy
  import json
  import logging
  import os
  from datetime import datetime
- from typing import Any, Dict, List, Optional, TypedDict
+ from typing import Annotated, Any, Dict, List, Literal, Optional, TypedDict

  from dotenv import load_dotenv
  from fastapi import FastAPI, Request
  from fastapi.middleware.cors import CORSMiddleware
  from fastapi.responses import StreamingResponse
  from langchain_google_genai import ChatGoogleGenerativeAI
+ from langgraph.config import get_stream_writer
  from langgraph.graph import END, StateGraph
+ from langgraph.types import Command, StreamWriter
  from sse_starlette.sse import EventSourceResponse

- from prompts import RESEARCH_PLAN_PROMPT, SEARCH_QUERY_PROMPT
- from schema import ResearchPlan, SearchQuery
+ from prompts import (
+     CONTINUE_BRANCH_PROMPT,
+     RESEARCH_PLAN_PROMPT,
+     SEARCH_QUERY_PROMPT,
+     SITE_SUMMARY_PROMPT,
+ )
+ from research_node import ResearchNode
+ from schema import ContinueBranch, ResearchPlan, SearchQuery
  from scraper import CrawlForAIScraper

  load_dotenv()

+ # Today's Date
+ DATE = datetime.now().strftime("%d %b, %Y")
+
  logger = logging.getLogger(__name__)
  logging.basicConfig(level=logging.INFO)
 
@@ -45,94 +57,194 @@ async def health_check():
  llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=os.getenv("GOOGLE_API_KEY"))


+ class ResearchProgress:
+     def __init__(self, master_node: ResearchNode):
+         self.progress = 0
+         self.master_node = master_node
+
+     def send(self, writer: StreamWriter, progress: int, message: dict, ptype: str):
+         if ptype == "update":
+             self.progress = int(min(100, progress))  # max 100
+             writer({"event": "progress", "data": {"progress": self.progress, **message, "research_tree": self.master_node.build_tree_structure()}})
+         elif ptype == "setter":
+             self.progress = int(min(100, self.progress + progress))  # max 100
+             writer({"event": "progress", "data": {"progress": self.progress, **message, "research_tree": self.master_node.build_tree_structure()}})
+         elif ptype == "result":
+             self.progress = 100
+             writer({"event": "result", "data": message})
+
+
  # --- State schema for LangGraph ---
  class ResearchState(TypedDict, total=False):
+     scraper: CrawlForAIScraper
+     progress: ResearchProgress
+
+     # Parameters
      topic: str
+     max_depth: int
+     num_sites_per_query: int

+     # Global State
+     master_node: ResearchNode
+     current_node: ResearchNode
      research_plan: list[str]
      idx_research_plan: int
      ctx_researcher: list[str]
      ctx_manager: list[str]
+     raster_report: str
      token_count: int

-     scraper: CrawlForAIScraper
-     max_depth: int
-     num_sites_per_query: int
-

  async def research_plan_node(state: ResearchState) -> ResearchPlan:
-     topic = state["topic"]
-     plan = await llm.with_structured_output(ResearchPlan).ainvoke(RESEARCH_PLAN_PROMPT.format(topic=topic), temperature=1.5)
-     if hasattr(plan, "steps"):
-         steps = plan["steps"]
-     logger.info(f"Research plan:\n{json.dumps(steps, indent=2)}")
-     return steps
+     writer = get_stream_writer()
+
+     if state["idx_research_plan"] == 0:
+         topic = state["topic"]
+         plan = llm.with_structured_output(ResearchPlan).invoke(RESEARCH_PLAN_PROMPT.format(topic=topic), config={"temperature": 1.5})
+         if "steps" in plan:
+             steps = plan["steps"]
+
+         logger.info(f"Research plan:\n{json.dumps(steps, indent=2)}")
+         state["progress"].send(writer, 0, {"message": "Starting research..."}, ptype="setter")
+
+         return {"research_plan": steps}
+     else:
+         # TODO: Update the plan based on current information
+         return dict()


  async def scrape_node(state: ResearchState) -> ResearchState:
-     topic = state["topic"]
-     scraper = state["scraper"]
-     max_depth = state["max_depth"]
-     num_sites_per_query = state["num_sites_per_query"]
-
-     # Generate initial search query
-     query = llm.with_structured_output(SearchQuery).invoke(
-         SEARCH_QUERY_PROMPT.format(
-             vertical=state["research_plan"][state["idx_research_plan"]], topic=topic, research_plan="None", past_queries="None", ctx_manager="None", n=1
-         ),
-         temperature=1.5,
+     writer = get_stream_writer()
+
+     # Generate initial search query if first step
+     # TODO: Add a condition based on 1st iter or successive iters
+     # TODO: Wrap inference in backend.knet.generate_content
+     if state["idx_research_plan"] == 0:
+         query = (
+             llm.with_structured_output(SearchQuery)
+             .invoke(
+                 SEARCH_QUERY_PROMPT.format(
+                     vertical=state["research_plan"][state["idx_research_plan"]],
+                     topic=state["topic"],
+                     research_plan="None",
+                     past_queries="None",
+                     ctx_manager="None",
+                     n=1,
+                 ),
+                 config={"temperature": 1.5},
+             )
+             .get("branches", [""])[0]
+         )
+         new_master = copy.deepcopy(state["master_node"])
+         curr_node = ResearchNode(query)
+         new_master.add_child(curr_node.query, node=curr_node)
+     else:
+         # TODO: Manage the Research Tree like above
+         query = (
+             llm.with_structured_output(SearchQuery)
+             .invoke(
+                 SEARCH_QUERY_PROMPT.format(
+                     vertical=state["research_plan"][state["idx_research_plan"]],
+                     topic=state["topic"],
+                     research_plan="\n".join([f"[done] {step}" for i, step in enumerate(state["research_plan"]) if i < state["idx_research_plan"]]),
+                     past_queries="\n".join([f"[done] {query}" for query in state["current_node"].get_path_to_root()[1:]]),
+                     ctx_manager="\n\n---\n\n".join(state["ctx_manager"]),
+                     n=1,
+                 ),
+                 config={"temperature": 1.5},
+             )
+             .get("branches", [""])[0]
+         )
+
+     # Update progress
+     state["progress"].send(
+         writer, 100 / (len(state["research_plan"]) + 1), {"message": f"{state['research_plan'][state['idx_research_plan']]}"}, ptype="update"
      )

      # Search and scrape
-     data = await state["scraper"].search_and_scrape(
-         query, num_sites_per_query
-     )  # node -> data = [{url:...}, {url:...}, ...]
-     state["ctx_researcher"].append(json.dumps(data, indent=2))
-     pass
-     # TODO: Implement the scraping logic and update the state with the scraped data
+     data = await state["scraper"].search_and_scrape(query, state["num_sites_per_query"])  # node -> data = [{url:...}, {url:...}, ...]
+     # Add data to context
+     # src [1] : https://...
+     # content...
+     upd_ctx_researcher = state["ctx_researcher"] + ["\n\n---\n\n".join([f"src [{i + 1}] : {d['url']}\n{d['text']}" for i, d in enumerate(data)])]
+     return {"ctx_researcher": upd_ctx_researcher, "master_node": new_master, "current_node": curr_node}
+
+
+ async def summarize_node(state: ResearchState) -> ResearchState:
+     # Generate summary of key findings into the manager's context
+     upd_ctx_manager = state["ctx_manager"]
+     if state["current_node"].data:
+         for idx in range(0, len(state["current_node"].data), 3):
+             data = state["current_node"].data[idx : idx + 3]
+             findings = ("\n" + "-" * 10 + "Next data" + "-" * 10 + "\n").join([json.dumps(d, indent=2) for d in data])
+             summary = llm.invoke(SITE_SUMMARY_PROMPT.format(query=state["current_node"].query, findings=findings), config={"temperature": 0.2})
+             upd_ctx_manager.append(summary) if isinstance(summary, str) else None
+     return {"ctx_manager": upd_ctx_manager}
+
+
+ async def should_continue(state: ResearchState) -> Command[Literal["plan", "scrape", "gen_report"]]:
+     # If max depth is reached and we are at the last step of the research plan, generate report
+     if state["current_node"].depth > state["max_depth"] and state["idx_research_plan"] >= len(state["research_plan"]) - 1:
+         logger.info(f"Branch decision '{state['current_node'].query}': False")
+         return Command(goto="gen_report")
+
+     # If max depth is reached and we are not at the last step of the research plan, continue with the next step
+     elif state["current_node"].depth > state["max_depth"] and state["idx_research_plan"] < len(state["research_plan"]) - 1:
+         logger.info(f"Branch decision '{state['current_node'].query}': False")
+         return Command(goto="plan", update={"idx_research_plan": state["idx_research_plan"] + 1})
+
+     # Otherwise, ask the LLM whether this branch should be explored further
+     decision = llm.with_structured_output(ContinueBranch).invoke(
+         CONTINUE_BRANCH_PROMPT.format(
+             research_plan="\n".join([f"[done] {step}" for i, step in enumerate(state["research_plan"]) if i < state["idx_research_plan"]]),
+             query=state["current_node"].query,
+             past_queries="\n".join([f"[done] {query}" for query in state["current_node"].get_path_to_root()[1:]]),
+             ctx_manager="\n\n---\n\n".join(state["ctx_manager"]),
+         )
+     )
+     logger.info(f"Branch decision '{state['current_node'].query}': {decision['decision']}")
+     return Command(goto="scrape") if decision["decision"] else Command(goto="plan", update={"idx_research_plan": state["idx_research_plan"] + 1})
+
+
+ async def gen_report_node(state: ResearchState) -> ResearchState:
+     return


  # --- Main research logic using LangGraph ---
- async def run_research(topic, scraper, max_depth, num_sites_per_query):
+ async def start_research_workflow(topic: str, scraper: CrawlForAIScraper, max_depth: int, num_sites_per_query: int):
      # Build the research graph
      graph = StateGraph(state_schema=ResearchState)
      graph.add_node("plan", research_plan_node)
      graph.add_node("scrape", scrape_node)
+     graph.add_node("summarize_node", summarize_node)
+     graph.add_node("should_continue", should_continue)
      graph.add_node("gen_report", gen_report_node)

      graph.add_edge("plan", "scrape")
-     graph.add_edge("scrape", "conditional", "plan", "gen_report")
+     graph.add_edge("scrape", "summarize_node")
+     graph.add_edge("summarize_node", "should_continue")
      graph.add_edge("gen_report", END)
      graph.set_entry_point("plan")
      graph = graph.compile()
      print(graph.get_graph().draw_mermaid())

-     state = {
-         "topic": topic,
+     master_node = ResearchNode()
+     state: ResearchState = {
          "scraper": scraper,
+         "progress": ResearchProgress(master_node),
+         "topic": topic,
          "max_depth": max_depth,
          "num_sites_per_query": num_sites_per_query,
+         "master_node": master_node,
+         "research_plan": [],
+         "idx_research_plan": 0,
+         "ctx_researcher": [],
+         "ctx_manager": [],
+         "raster_report": "",
+         "token_count": 0,
      }
-     async for step in graph.astream(state):
-         progress = step.get("progress", 0)
-         message = step.get("message", "")
-         yield {
-             "event": "status",
-             "data": json.dumps({"progress": progress, "message": message}),
-         }
-     yield {
-         "event": "research_complete",
-         "data": json.dumps(
-             {
-                 "topic": step["topic"],
-                 "timestamp": step["timestamp"],
-                 "content": step["content"],
-                 "media": step["media"],
-                 "research_tree": step["research_tree"],
-                 "metadata": step["metadata"],
-             }
-         ),
-     }
+     async for update in graph.astream(state, stream_mode="custom"):
+         yield update


  @app.post("/start_research")
@@ -151,7 +263,7 @@ async def start_research(request: Request):
      scraper = sessions[session_id]["scraper"]

      async def event_generator():
-         async for event in run_research(topic, scraper, max_depth, num_sites_per_query):
+         async for event in start_research_workflow(topic, scraper, max_depth, num_sites_per_query):
              yield event

      return EventSourceResponse(event_generator())
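
Note on the streaming pattern above: each node obtains a writer via get_stream_writer(), and whatever that node writes is yielded by graph.astream(..., stream_mode="custom"), which event_generator() then forwards to EventSourceResponse. A minimal, self-contained sketch of that flow (not part of this commit; DemoState, demo_node, and the payload shape are illustrative, and it assumes a langgraph release that provides get_stream_writer and the "custom" stream mode):

import asyncio
from typing import TypedDict

from langgraph.config import get_stream_writer
from langgraph.graph import END, StateGraph


class DemoState(TypedDict, total=False):
    topic: str


async def demo_node(state: DemoState) -> DemoState:
    # Anything passed to the writer surfaces directly in the "custom" stream.
    writer = get_stream_writer()
    writer({"event": "progress", "data": {"progress": 50, "message": state["topic"]}})
    return {}


async def main():
    graph = StateGraph(state_schema=DemoState)
    graph.add_node("demo", demo_node)
    graph.add_edge("demo", END)
    graph.set_entry_point("demo")
    compiled = graph.compile()
    async for update in compiled.astream({"topic": "hello"}, stream_mode="custom"):
        print(update)  # {'event': 'progress', 'data': {...}}


asyncio.run(main())

Run directly, it prints the single progress payload emitted by demo_node, which is exactly what ResearchProgress.send feeds into the SSE response.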
langgraph_backend/research_node.py ADDED
@@ -0,0 +1,77 @@
+ import copy
+ from typing import Any, Dict, List, Optional, Self
+ import uuid
+
+
+ class ResearchNode:
+     def __init__(self, query: str = "_", parent: Optional[Self] = None, depth: int = 0):
+         self.parent = parent
+         self.id = str(uuid.uuid4())
+         self.query = query
+         self.depth = depth
+         self.children: List[ResearchNode] = []
+         self.data: List[Dict[str, Any]] = []
+
+     def find_node(self, node_id: str) -> Optional[Self]:
+         """
+         Returns the node with the given id.
+         If not found, returns None.
+         """
+         if self.id == node_id:
+             return self
+         for child in self.children:
+             found = child.find_node(node_id)
+             if found:
+                 return found
+         return None
+
+     def add_child(self, query: str, node: Optional[Self] = None) -> Self:
+         if node:
+             child = node
+             child.parent = self
+             child.depth = self.depth + 1
+         else:
+             child = ResearchNode(query, parent=self, depth=self.depth + 1)
+         self.children.append(child)
+         return child
+
+     def get_path_to_root(self) -> List[str]:
+         """
+         Returns the path from this node to the root node.
+         List[str]: [root.query, ..., self.query]
+         """
+         path = [self.query]
+         current = self
+         while current.parent:
+             current = current.parent
+             path.append(current.query)
+         return list(reversed(path))
+
+     def max_depth(self) -> int:
+         if not self.children:
+             return self.depth
+         return max([child.max_depth() for child in self.children])
+
+     def total_children(self) -> int:
+         if not self.children:
+             return 0
+         return len(self.children) + sum([child.total_children() for child in self.children])
+
+     def get_all_data(self) -> List[Dict[str, Any]]:
+         data = copy.deepcopy(self.data)
+         for child in self.children:
+             data.extend(child.get_all_data())
+         return data
+
+     # Build research tree structure
+     def build_tree_structure(self) -> Dict:
+         if not self:
+             return {}
+         sources = {d["url"]: d["text"] for d in self.data if d.get("url") and d.get("text")}
+         return {
+             "query": self.query,
+             "depth": self.depth,
+             "sources": sources,
+             "children": [child.build_tree_structure() for child in self.children],
+         }
+
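
Usage sketch for the new ResearchNode helpers (not part of the commit; the query strings and URL are made up). The root is a placeholder node with query "_" at depth 0, and add_child wires parent, depth, and the children list:

from research_node import ResearchNode

root = ResearchNode()                            # placeholder root, query "_", depth 0
first = root.add_child("initial search query")   # depth 1
deeper = first.add_child("follow-up query")      # depth 2
deeper.data.append({"url": "https://example.com", "text": "scraped text"})

print(deeper.get_path_to_root())   # ['_', 'initial search query', 'follow-up query']
print(root.max_depth())            # 2
print(root.total_children())       # 2
print(root.build_tree_structure()) # nested dict with query/depth/sources/children

build_tree_structure() is what ResearchProgress.send attaches to every progress event, so the frontend always receives the current shape of the research tree.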
langgraph_backend/scraper.py CHANGED
@@ -7,7 +7,7 @@ from urllib.parse import quote_plus

  import requests
  from bs4 import BeautifulSoup
- from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode
+ from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig


  class CrawlForAIScraper:
@@ -70,6 +70,7 @@ class CrawlForAIScraper:
              cache_mode=CacheMode.BYPASS,
              delay_before_return_html=2,
              scan_full_page=True,
+             config=CrawlerRunConfig(verbose=False),
          )

          soup = BeautifulSoup(result.html, "html.parser")
@@ -119,6 +120,7 @@ class CrawlForAIScraper:
              cache_mode=CacheMode.BYPASS,
              delay_before_return_html=2,
              scan_full_page=True,
+             config=CrawlerRunConfig(verbose=False),
          )

          soup = BeautifulSoup(result.html, "html.parser")
@@ -157,6 +159,7 @@ class CrawlForAIScraper:
              delay_before_return_html=2,
              exclude_external_images=True,
              page_timeout=25000,
+             config=CrawlerRunConfig(verbose=False),
          )
          scraped_sites = []
          for result in results:
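
Side note on the new config kwarg: in recent crawl4ai releases the loose keyword arguments passed alongside it (cache_mode, delay_before_return_html, scan_full_page, page_timeout) are also fields of CrawlerRunConfig, so the run options could be consolidated into a single config object. A hedged sketch, assuming that API:

import asyncio

from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig

# One config object instead of repeating the same kwargs at every arun call site.
run_cfg = CrawlerRunConfig(
    cache_mode=CacheMode.BYPASS,
    delay_before_return_html=2,
    scan_full_page=True,
    verbose=False,
)


async def fetch_html(url: str) -> str:
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=url, config=run_cfg)
        return result.html


asyncio.run(fetch_html("https://example.com"))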