Soham Waghmare committed on
Commit
70f0982
·
1 Parent(s): fced70d

feat: working langgraph DeepResearch

Browse files
langgraph_backend/app.py CHANGED
@@ -18,12 +18,20 @@ from sse_starlette.sse import EventSourceResponse
18
 
19
  from prompts import (
20
  CONTINUE_BRANCH_PROMPT,
 
 
21
  RESEARCH_PLAN_PROMPT,
22
  SEARCH_QUERY_PROMPT,
23
  SITE_SUMMARY_PROMPT,
24
  )
25
  from research_node import ResearchNode
26
- from schema import ContinueBranch, ResearchPlan, SearchQuery
 
 
 
 
 
 
27
  from scraper import CrawlForAIScraper
28
 
29
  load_dotenv()
@@ -58,17 +66,20 @@ llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=os.getenv(
58
 
59
 
60
  class ResearchProgress:
61
- def __init__(self, master_node: ResearchNode):
62
  self.progress = 0
63
- self.master_node = master_node
64
 
65
- def send(self, writer: StreamWriter, progress: int, message: dict, ptype: str):
66
  if ptype == "update":
67
- self.progress = int(min(100, progress)) # max 100
68
- writer({"event": "progress", "data": {"progress": self.progress, **message, "research_tree": self.master_node.build_tree_structure()}})
69
- elif ptype == "setter":
70
  self.progress = int(min(100, self.progress + progress)) # max 100
71
- writer({"event": "progress", "data": {"progress": self.progress, **message, "research_tree": self.master_node.build_tree_structure()}})
 
 
 
 
 
 
 
72
  elif ptype == "result":
73
  self.progress = 100
74
  writer({"event": "result", "data": message})
@@ -95,74 +106,51 @@ class ResearchState(TypedDict, total=False):
95
  token_count: int
96
 
97
 
98
- async def research_plan_node(state: ResearchState) -> ResearchPlan:
99
  writer = get_stream_writer()
100
 
101
- if state["idx_research_plan"] == 0:
102
  topic = state["topic"]
103
  plan = llm.with_structured_output(ResearchPlan).invoke(RESEARCH_PLAN_PROMPT.format(topic=topic), config={"temperature": 1.5})
104
  if "steps" in plan:
105
  steps = plan["steps"]
106
 
107
  logger.info(f"Research plan:\n{json.dumps(steps, indent=2)}")
108
- state["progress"].send(writer, 0, {"message": "Starting research..."}, ptype="setter")
109
 
110
  return {"research_plan": steps}
111
- else:
112
- # TODO: Update the plan based on current information
113
- return dict()
114
 
115
 
116
  async def scrape_node(state: ResearchState) -> ResearchState:
117
- writer = get_stream_writer()
118
-
119
- # Generate initial search query if first step
120
- # TODO: Add a condition based on 1st iter or successive iters
121
- # TODO: Wrap inference in backend.knet.generate_content
122
- if state["idx_research_plan"] == 0:
123
- query = (
124
- llm.with_structured_output(SearchQuery)
125
- .invoke(
126
- SEARCH_QUERY_PROMPT.format(
127
- vertical=state["research_plan"][state["idx_research_plan"]],
128
- topic=state["topic"],
129
- research_plan="None",
130
- past_queries="None",
131
- ctx_manager="None",
132
- n=1,
133
- ),
134
- config={"temperature": 1.5},
135
- )
136
- .get("branches", [""])[0]
137
  )
138
- new_master = copy.deepcopy(state["master_node"])
139
- curr_node = ResearchNode(query)
 
 
 
 
 
140
  new_master.add_child(curr_node.query, node=curr_node)
 
141
  else:
142
- # TODO: Manage the Research Tree like above
143
- query = (
144
- llm.with_structured_output(SearchQuery)
145
- .invoke(
146
- SEARCH_QUERY_PROMPT.format(
147
- vertical=state["research_plan"][state["idx_research_plan"]],
148
- topic=state["topic"],
149
- research_plan="\n".join([f"[done] {step}" for i, step in enumerate(state["research_plan"]) if i < state["idx_research_plan"]]),
150
- past_queries="\n".join([f"[done] {query}" for query in state["current_node"].get_path_to_root()[1:]]),
151
- ctx_manager="\n\n---\n\n".join(state["ctx_manager"]),
152
- n=1,
153
- ),
154
- config={"temperature": 1.5},
155
- )
156
- .get("branches", [""])[0]
157
- )
158
 
159
- # Update progress
160
- state["progress"].send(
161
- writer, 100 / (len(state["research_plan"]) + 1), {"message": f"{state['research_plan'][state['idx_research_plan']]}"}, ptype="update"
162
- )
163
-
164
- # Search and scrape
165
- data = await state["scraper"].search_and_scrape(query, state["num_sites_per_query"]) # node -> data = [{url:...}, {url:...}, ...]
166
  # Add data to context
167
  # src [1] : https://...
168
  # content...
@@ -175,23 +163,43 @@ async def summarize_node(state: ResearchState) -> ResearchState:
175
  upd_ctx_manager = state["ctx_manager"]
176
  if state["current_node"].data:
177
  for idx in range(0, len(state["current_node"].data), 3):
178
- data = state["current_node"].data[idx : idx + 3]
179
- findings = ("\n" + "-" * 10 + "Next data" + "-" * 10 + "\n").join([json.dumps(d, indent=2) for d in data])
180
- summary = llm.invoke(SITE_SUMMARY_PROMPT.format(query=state["current_node"].query, findings=findings), config={"temperature": 0.2})
181
- upd_ctx_manager.append(summary) if isinstance(summary, str) else None
182
  return {"ctx_manager": upd_ctx_manager}
183
 
184
 
185
- async def should_continue(state: ResearchState) -> Command[Literal["plan", "scrape", "gen_report"]]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  # If max depth is reached and we are at the last step of the research plan, generate report
187
- if state["current_node"].depth > state["max_depth"] and state["idx_research_plan"] >= len(state["research_plan"]) - 1:
188
  logger.info(f"Branch decision '{state['current_node'].query}': False")
189
  return Command(goto="gen_report")
190
 
191
  # If max depth is reached and we are not at the last step of the research plan, continue with the next step
192
- elif state["current_node"].depth > state["max_depth"] and state["idx_research_plan"] < len(state["research_plan"]) - 1:
193
  logger.info(f"Branch decision '{state['current_node'].query}': False")
194
- return Command(goto="plan", update={"idx_research_plan": state["idx_research_plan"] + 1})
195
 
196
  # If we have not reached max depth and not on last step of the research plan, continue with the next step
197
  decision = llm.with_structured_output(ContinueBranch).invoke(
@@ -203,11 +211,84 @@ async def should_continue(state: ResearchState) -> Command[Literal["plan", "scra
203
  )
204
  )
205
  logger.info(f"Branch decision '{state['current_node'].query}': {decision['decision']}")
206
- return Command(goto="scrape") if decision["decision"] else Command(goto="plan", update={"idx_research_plan": state["idx_research_plan"] + 1})
207
 
208
 
209
  async def gen_report_node(state: ResearchState) -> ResearchState:
210
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
 
212
 
213
  # --- Main research logic using LangGraph ---
@@ -216,26 +297,29 @@ async def start_research_workflow(topic: str, scraper: CrawlForAIScraper, max_de
216
  graph = StateGraph(state_schema=ResearchState)
217
  graph.add_node("plan", research_plan_node)
218
  graph.add_node("scrape", scrape_node)
219
- graph.add_node("summarize_node", summarize_node)
220
- graph.add_node("should_continue", should_continue)
221
  graph.add_node("gen_report", gen_report_node)
222
 
223
  graph.add_edge("plan", "scrape")
224
- graph.add_edge("scrape", "summarize_node")
225
- graph.add_edge("summarize_node", "should_continue")
226
  graph.add_edge("gen_report", END)
227
  graph.set_entry_point("plan")
228
  graph = graph.compile()
229
  print(graph.get_graph().draw_mermaid())
230
 
231
  master_node = ResearchNode()
 
 
232
  state: ResearchState = {
233
  "scraper": scraper,
234
- "progress": ResearchProgress(master_node),
235
  "topic": topic,
236
  "max_depth": max_depth,
237
  "num_sites_per_query": num_sites_per_query,
238
  "master_node": master_node,
 
239
  "research_plan": [],
240
  "idx_research_plan": 0,
241
  "ctx_researcher": [],
@@ -243,7 +327,7 @@ async def start_research_workflow(topic: str, scraper: CrawlForAIScraper, max_de
243
  "raster_report": "",
244
  "token_count": 0,
245
  }
246
- async for update in graph.astream(state, stream_mode="custom"):
247
  yield update
248
 
249
 
@@ -280,8 +364,6 @@ async def abort_research(request: Request):
280
  return {"status": "aborted"}
281
 
282
 
283
- # Add more endpoints as needed for test, etc.
284
-
285
  if __name__ == "__main__":
286
  logger.info("Starting KnowledgeNet server...")
287
  import uvicorn
 
18
 
19
  from prompts import (
20
  CONTINUE_BRANCH_PROMPT,
21
+ REPORT_FILLIN_PROMPT,
22
+ REPORT_OUTLINE_PROMPT,
23
  RESEARCH_PLAN_PROMPT,
24
  SEARCH_QUERY_PROMPT,
25
  SITE_SUMMARY_PROMPT,
26
  )
27
  from research_node import ResearchNode
28
+ from schema import (
29
+ ContinueBranch,
30
+ ReportFillin,
31
+ ReportOutline,
32
+ ResearchPlan,
33
+ SearchQuery,
34
+ )
35
  from scraper import CrawlForAIScraper
36
 
37
  load_dotenv()
 
66
 
67
 
68
class ResearchProgress:
    """Tracks overall research progress (0-100) and emits SSE-style events
    through a LangGraph stream writer callable."""

    def __init__(self):
        # The master node is passed per send() call rather than stored here,
        # so this object carries no reference to the research tree.
        self.progress = 0

    def send(self, writer: "StreamWriter", progress: int, message: dict, ptype: str, master_node_for_send: "ResearchNode | None" = None):
        """Emit one progress or result event via *writer*.

        Args:
            writer: callable receiving a single event dict.
            progress: increment when ptype=="update", absolute value when
                ptype=="setter"; ignored for "result".
            message: extra payload merged into the event data.
            ptype: one of "update", "setter", "result"; anything else is a no-op.
            master_node_for_send: research tree root whose structure is
                serialized into progress events (bug fix: previously a missing
                node raised AttributeError; now the tree is sent as None).

        NOTE(review): "update" accumulates while "setter" assigns — the names
        read swapped relative to their behavior; confirm callers expect this.
        """
        if ptype == "update":
            self.progress = int(min(100, self.progress + progress))  # cap at 100
            tree = master_node_for_send.build_tree_structure() if master_node_for_send is not None else None
            writer({"event": "progress", "data": {"progress": self.progress, **message, "research_tree": tree}})
        elif ptype == "setter":
            self.progress = int(min(100, progress))  # cap at 100
            tree = master_node_for_send.build_tree_structure() if master_node_for_send is not None else None
            writer({"event": "progress", "data": {"progress": self.progress, **message, "research_tree": tree}})
        elif ptype == "result":
            self.progress = 100
            writer({"event": "result", "data": message})
 
106
  token_count: int
107
 
108
 
109
async def research_plan_node(state: ResearchState) -> ResearchState:
    """Entry node of the graph: generate the research plan for the topic.

    The LLM is only consulted on the first visit (empty plan). On revisits
    from `should_continue` the existing plan is kept unchanged.
    """
    writer = get_stream_writer()

    if len(state["research_plan"]) == 0:
        topic = state["topic"]
        plan = llm.with_structured_output(ResearchPlan).invoke(RESEARCH_PLAN_PROMPT.format(topic=topic), config={"temperature": 1.5})
        # Bug fix: `steps` was only bound when the LLM output contained a
        # "steps" key, raising NameError below otherwise. Default to [].
        steps = plan.get("steps", [])

        logger.info(f"Research plan:\n{json.dumps(steps, indent=2)}")
        state["progress"].send(writer, 0, {"message": "Starting research..."}, ptype="setter", master_node_for_send=state["master_node"])

        return {"research_plan": steps}
    # Plan already exists: explicit empty update instead of implicit None.
    return {}
 
 
 
122
 
123
 
124
  async def scrape_node(state: ResearchState) -> ResearchState:
125
+ # TODO: idx_research_plan index error here
126
+ query = (
127
+ llm.with_structured_output(SearchQuery)
128
+ .invoke(
129
+ SEARCH_QUERY_PROMPT.format(
130
+ vertical=state["research_plan"][state["idx_research_plan"]],
131
+ topic=state["topic"],
132
+ research_plan="\n".join([f"[done] {step}" for i, step in enumerate(state["research_plan"]) if i < state["idx_research_plan"]]),
133
+ past_queries="\n".join([f"[done] {query}" for query in state["current_node"].get_path_to_root()[1:]]),
134
+ ctx_manager="\n\n---\n\n".join(state["ctx_manager"]),
135
+ n=1,
136
+ ),
137
+ config={"temperature": 1.5},
 
 
 
 
 
 
 
138
  )
139
+ .get("branches", [""])[0]
140
+ )
141
+
142
+ new_master = ResearchNode.deep_copy_tree(state["master_node"])
143
+ curr_node = ResearchNode(query)
144
+ # Add a new vertical node
145
+ if state["current_node"].depth >= state["max_depth"]:
146
  new_master.add_child(curr_node.query, node=curr_node)
147
+ # Add a branch to the current node
148
  else:
149
+ old_curr_node = new_master.find_node(state["current_node"].id)
150
+ old_curr_node.add_child(curr_node.query, node=curr_node)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
+ data = await state["scraper"].search_and_scrape(query, state["num_sites_per_query"])
153
+ curr_node.data = data
 
 
 
 
 
154
  # Add data to context
155
  # src [1] : https://...
156
  # content...
 
163
  upd_ctx_manager = state["ctx_manager"]
164
  if state["current_node"].data:
165
  for idx in range(0, len(state["current_node"].data), 3):
166
+ summary = llm.invoke(
167
+ SITE_SUMMARY_PROMPT.format(query=state["current_node"].query, findings=state["ctx_researcher"][-1]), config={"temperature": 0.2}
168
+ ).text()
169
+ upd_ctx_manager.append(summary)
170
  return {"ctx_manager": upd_ctx_manager}
171
 
172
 
173
+ async def should_continue_node(state: ResearchState) -> Command[Literal["plan", "scrape", "gen_report"]]:
174
+ print( # TODO: Remove this print statement
175
+ json.dumps(
176
+ {
177
+ "current_node": {"query": state["current_node"].query, "depth": state["current_node"].depth},
178
+ "max_depth": state["max_depth"],
179
+ "idx_research_plan": state["idx_research_plan"],
180
+ },
181
+ indent=2,
182
+ )
183
+ )
184
+ writer = get_stream_writer()
185
+ target_progress_for_step = (state["idx_research_plan"] + 1) * (100.0 / (len(state["research_plan"]) if state["research_plan"] else 1))
186
+ state["progress"].send(
187
+ writer,
188
+ target_progress_for_step,
189
+ {"message": f"{state['research_plan'][state['idx_research_plan']]}"},
190
+ ptype="update",
191
+ master_node_for_send=state["master_node"],
192
+ )
193
+
194
  # If max depth is reached and we are at the last step of the research plan, generate report
195
+ if state["current_node"].depth >= state["max_depth"] and state["idx_research_plan"] >= len(state["research_plan"]) - 1:
196
  logger.info(f"Branch decision '{state['current_node'].query}': False")
197
  return Command(goto="gen_report")
198
 
199
  # If max depth is reached and we are not at the last step of the research plan, continue with the next step
200
+ if state["current_node"].depth >= state["max_depth"] and state["idx_research_plan"] < len(state["research_plan"]) - 1:
201
  logger.info(f"Branch decision '{state['current_node'].query}': False")
202
+ return Command(goto="plan", update={"idx_research_plan": state["idx_research_plan"] + 1, "current_node": state["master_node"]})
203
 
204
  # If we have not reached max depth and not on last step of the research plan, continue with the next step
205
  decision = llm.with_structured_output(ContinueBranch).invoke(
 
211
  )
212
  )
213
  logger.info(f"Branch decision '{state['current_node'].query}': {decision['decision']}")
214
+ return Command(goto="scrape", update={"idx_research_plan": state["idx_research_plan"] + 0 if decision["decision"] else 1})
215
 
216
 
217
async def gen_report_node(state: ResearchState) -> ResearchState:
    """Final node: write the markdown report, collate media from all scraped
    sources, and emit the terminal "result" event through the stream writer."""
    writer = get_stream_writer()
    state["progress"].send(writer, 0, {"message": "Generating report..."}, ptype="setter", master_node_for_send=state["master_node"])
    findings = "\n\n------\n\n".join(state["ctx_manager"])
    # Debug dump of the accumulated research context.
    with open("ctx_manager.log.txt", "w", encoding="utf-8") as f:
        f.write(findings)

    # Generate report outline
    outline = llm.with_structured_output(ReportOutline).invoke(REPORT_OUTLINE_PROMPT.format(topic=state["topic"], ctx_manager=findings))
    logger.info(f"Report outline:\n{json.dumps(outline, indent=2)}")
    report = []
    raster_report = f"# {outline['title']}\n\n"

    # Fill in the outline one heading at a time
    for i, heading in enumerate(outline["headings"]):
        state["progress"].send(
            writer,
            100 / (len(outline["headings"]) + 1),
            {"message": "Generating report..."},
            ptype="update",
            master_node_for_send=state["master_node"],
        )
        # NOTE(review): this marks headings *after* the current one as [done];
        # confirm the prompt actually expects remaining (not completed) sections.
        # (Was `for _, h in enumerate(...) if i < _` — `_` misused as an index.)
        outline_progress = ["[done] " + outline["title"]] + [f"[done] {h}" for j, h in enumerate(outline["headings"]) if i < j]
        content = llm.with_structured_output(ReportFillin).invoke(
            REPORT_FILLIN_PROMPT.format(
                topic=state["topic"],
                ctx_manager=findings,
                report_progress=raster_report,
                report_outline=outline_progress,
                slot=heading,
            ),
        )["content"]
        # Remove the heading if the LLM echoed it despite instructions
        idx_heading = content.find(heading)
        if idx_heading != -1:
            content = content[idx_heading + len(heading) :].strip()
        report.append({"heading": heading, "content": content})
        raster_report += f"\n\n## {heading}\n\n{content}"

    # Collate multimedia content from every scraped source in the tree
    media_content = {"images": [], "videos": [], "links": []}
    all_sources_data = state["master_node"].get_all_data()
    for data in all_sources_data:
        if data.get("images"):
            media_content["images"].extend(data["images"])
        if data.get("videos"):
            media_content["videos"].extend(data["videos"])
        if data.get("links"):
            media_content["links"].extend([{"url": link["href"], "text": link["text"]} for link in data["links"]])
    # Dedupe deterministically: dict.fromkeys keeps first-seen order, unlike
    # list(set(...)) which reordered the media arbitrarily on every run.
    media_content["images"] = list(dict.fromkeys(media_content["images"]))
    media_content["videos"] = list(dict.fromkeys(media_content["videos"]))
    media_content["links"] = [json.loads(d) for d in dict.fromkeys(json.dumps(d, sort_keys=True) for d in media_content["links"])]

    result = {
        "topic": state["topic"],
        "timestamp": datetime.now().isoformat(),
        "content": raster_report,
        "media": media_content,
        "research_tree": state["master_node"].build_tree_structure(),
        "metadata": {
            "total_queries": state["master_node"].total_children(),
            "total_sources": len(all_sources_data),
            "max_depth_reached": state["master_node"].max_depth(),
            "total_tokens": state["token_count"],
        },
    }
    # Debug dump of the final result payload.
    with open("output.log.json", "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2)
    state["progress"].send(
        writer,
        100,
        result,
        ptype="result",
    )
292
 
293
 
294
  # --- Main research logic using LangGraph ---
 
297
  graph = StateGraph(state_schema=ResearchState)
298
  graph.add_node("plan", research_plan_node)
299
  graph.add_node("scrape", scrape_node)
300
+ graph.add_node("summarize", summarize_node)
301
+ graph.add_node("should_continue", should_continue_node)
302
  graph.add_node("gen_report", gen_report_node)
303
 
304
  graph.add_edge("plan", "scrape")
305
+ graph.add_edge("scrape", "summarize")
306
+ graph.add_edge("summarize", "should_continue")
307
  graph.add_edge("gen_report", END)
308
  graph.set_entry_point("plan")
309
  graph = graph.compile()
310
  print(graph.get_graph().draw_mermaid())
311
 
312
  master_node = ResearchNode()
313
+ initial_current_node = master_node
314
+
315
  state: ResearchState = {
316
  "scraper": scraper,
317
+ "progress": ResearchProgress(),
318
  "topic": topic,
319
  "max_depth": max_depth,
320
  "num_sites_per_query": num_sites_per_query,
321
  "master_node": master_node,
322
+ "current_node": initial_current_node,
323
  "research_plan": [],
324
  "idx_research_plan": 0,
325
  "ctx_researcher": [],
 
327
  "raster_report": "",
328
  "token_count": 0,
329
  }
330
+ async for update in graph.astream(state, {"recursion_limit": 1000}, stream_mode="custom"):
331
  yield update
332
 
333
 
 
364
  return {"status": "aborted"}
365
 
366
 
 
 
367
  if __name__ == "__main__":
368
  logger.info("Starting KnowledgeNet server...")
369
  import uvicorn
langgraph_backend/research_node.py CHANGED
@@ -75,3 +75,19 @@ class ResearchNode:
75
  "children": [child.build_tree_structure() for child in self.children],
76
  }
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  "children": [child.build_tree_structure() for child in self.children],
76
  }
77
 
78
+ # Return deep copy with node pointers | Isolated function
79
+ def deep_copy_tree(root: Optional[Self] = None) -> Self:
80
+ """
81
+ Returns a deep copy of the tree starting from this node.
82
+ """
83
+ if root is None:
84
+ return None
85
+ new_node = ResearchNode(root.query, depth=root.depth)
86
+ new_node.id = root.id
87
+ new_node.data = copy.deepcopy(root.data)
88
+ for child in root.children:
89
+ new_child = ResearchNode.deep_copy_tree(child)
90
+ new_child.parent = new_node
91
+ new_node.children.append(new_child)
92
+ return new_node
93
+
langgraph_backend/scraper.py CHANGED
@@ -188,8 +188,8 @@ class CrawlForAIScraper:
188
  all_videos = list(set(all_videos + media_videos))
189
 
190
  data = {
191
- "url": result.url,
192
- "text": result.markdown,
193
  "images": all_images,
194
  "videos": all_videos,
195
  "links": self._extract_links(result.links["external"]),
 
188
  all_videos = list(set(all_videos + media_videos))
189
 
190
  data = {
191
+ "url": str(result.url),
192
+ "text": str(result.markdown),
193
  "images": all_images,
194
  "videos": all_videos,
195
  "links": self._extract_links(result.links["external"]),