Soham Waghmare committed · fbfef4e
Parent(s): 87d5bfc

feat: restructure and add nodes

Files changed:
- langgraph_backend/app.py +38 -162
- langgraph_backend/prompts.py +107 -0
langgraph_backend/app.py
CHANGED
@@ -3,7 +3,6 @@ import json
 import logging
 import os
 from datetime import datetime
-from textwrap import dedent
 from typing import Any, Dict, List, Optional, TypedDict
 
 from dotenv import load_dotenv
@@ -14,7 +13,8 @@ from langchain_google_genai import ChatGoogleGenerativeAI
 from langgraph.graph import END, StateGraph
 from sse_starlette.sse import EventSourceResponse
 
-from 
+from prompts import RESEARCH_PLAN_PROMPT, SEARCH_QUERY_PROMPT
+from schema import ResearchPlan, SearchQuery
 from scraper import CrawlForAIScraper
 
 load_dotenv()
@@ -41,54 +41,6 @@ async def health_check():
     return {"status": "ok"}
 
 
-# --- Prompt templates ---
-RESEARCH_PLAN_PROMPT = dedent("""You are an expert Deep Research agent, part of a Multiagent system.
-
-<User query>
-{topic}
-</User query>
-
----
-Generate few very high level steps on which other agents can do info collection runs. Provide only data collection steps, no data identification, summarization, manipulation, selection, etc.
-Do not presume any knowledge about the topic.
-Return a string array of steps.""")
-
-REPORT_OUTLINE_PROMPT = dedent("""Generate a outline for a report based on the findings:
-<Original user query>
-{topic}
-</Original user query>
-
-<Findings>
-{ctx_manager}
-</Findings>
-
-Deduplicate, reorganize and analyze the findings to create the outline.
-If there are multiple comparisons, use a table instead of multiple headings.
-The outline should include:
-- Title
-- List of h2 headings
-Do not include hashtags""")
-
-REPORT_FILLIN_PROMPT = dedent("""Fill in the content for the current outline heading based on the findings:
-<Findings>
-{ctx_manager}
-</Findings>
-
-<The outline>
-{report_outline}
-</The outline>
-
-<Current outline heading to fill in>
-## {slot}
-...
-</Current outline heading to fill in>
-
-Assume [done] headings have their respective content.
-The content should be comprehensive, detailed and well-structured, providing detailed information on current heading.
-If needed use tables, lists. Do not include subheadings.
-""")
-
 # --- LangChain LLM setup (Gemini, correct usage) ---
 llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=os.getenv("GOOGLE_API_KEY"))
 
@@ -96,121 +48,48 @@ llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=os.getenv(
 # --- State schema for LangGraph ---
 class ResearchState(TypedDict, total=False):
     topic: str
-
+
+    research_plan: list[str]
+    idx_research_plan: int
+    ctx_researcher: list[str]
+    ctx_manager: list[str]
+    token_count: int
+
+    scraper: CrawlForAIScraper
     max_depth: int
     num_sites_per_query: int
-
-
-
-    progress: int
-    message: str
-    timestamp: str
-    content: str
-    media: dict
-    research_tree: dict
-    metadata: dict
-
-
-# --- LangGraph node: LLM step for research plan ---
-async def research_plan_node(state: dict) -> dict:
+
+
+async def research_plan_node(state: ResearchState) -> ResearchPlan:
     topic = state["topic"]
-
-
-
-        steps = json.loads(result.content) if hasattr(result, "content") else json.loads(str(result))
-        # TODO: split this module another knet module to handle global state
-    except Exception:
-        steps = [str(result)]
+    plan = await llm.with_structured_output(ResearchPlan).ainvoke(RESEARCH_PLAN_PROMPT.format(topic=topic), temperature=1.5)
+    if hasattr(plan, "steps"):
+        steps = plan["steps"]
     logger.info(f"Research plan:\n{json.dumps(steps, indent=2)}")
-    return
+    return steps
 
 
-
-
-    steps = state["steps"]
+async def scrape_node(state: ResearchState) -> ResearchState:
+    topic = state["topic"]
     scraper = state["scraper"]
+    max_depth = state["max_depth"]
     num_sites_per_query = state["num_sites_per_query"]
-    findings = []
-    for idx, step in enumerate(steps):
-        scraped = await scraper.search_and_scrape(step, num_sites=num_sites_per_query)
-        findings.append({"step": step, "data": scraped})
-    return {"findings": findings, "progress": 70, "message": "Scraping complete"}
 
+    # Generate initial search query
+    query = llm.with_structured_output(SearchQuery).invoke(
+        SEARCH_QUERY_PROMPT.format(
+            vertical=state["research_plan"][state["idx_research_plan"]], topic=topic, research_plan="None", past_queries="None", ctx_manager="None", n=1
+        ),
+        temperature=1.5,
+    )
+
+    # Search and scrape
+    data = await state["scraper"].search_and_scrape(
+        query, num_sites_per_query
+    )  # node -> data = [{url:...}, {url:...}, ...]
+    state["ctx_researcher"].append(json.dumps(data, indent=2))
+    pass
+    # TODO: Implement the scraping logic and update the state with the scraped data
-    #
-
-
-
-
-
-
-        outline = result.content if hasattr(result, "content") else str(result)
-    return {"outline": outline, "progress": 90, "message": "Generated report outline"}
-
-
-# --- LangGraph node: Fill in report content for each heading ---
-async def fillin_node(state: dict) -> dict:
-    findings = state["findings"]
-    outline = state["outline"]
-    topic = state["topic"]
-    # Try to parse outline as JSON, else fallback to text splitting
-    try:
-        outline_obj = json.loads(outline)
-        title = outline_obj["title"]
-        headings = outline_obj["headings"]
-    except Exception:
-        # Fallback: try to extract headings from text
-        lines = outline.splitlines()
-        title = lines[0].strip("# ") if lines else topic
-        headings = [line.strip("# ") for line in lines if line.strip().startswith("## ")]
-    findings_text = json.dumps(findings, indent=2)
-    report = f"# {title}\n\n"
-    for idx, heading in enumerate(headings):
-        prompt = REPORT_FILLIN_PROMPT.format(
-            findings=findings_text,
-            outline=outline,
-            slot=heading,
-        )
-        result = await llm.ainvoke(prompt)
-        content = result.content if hasattr(result, "content") else str(result)
-        # Remove heading if LLM included it
-        if content.strip().startswith(heading):
-            content = content.strip()[len(heading) :].strip()
-        report += f"\n\n## {heading}\n\n{content}\n"
-    return {"content": report, "progress": 95, "message": "Filled in report content"}
-
-
-# --- LangGraph node: Finalize report ---
-def finalize_node(state: dict) -> dict:
-    findings = state.get("findings", [])
-    media = {"images": [], "videos": [], "links": []}
-    for step in findings:
-        for site in step.get("data", []):
-            media["images"].extend(site.get("images", []))
-            media["videos"].extend(site.get("videos", []))
-            media["links"].extend(site.get("links", []))
-    # Dedupe
-    media["images"] = list(set(media["images"]))
-    media["videos"] = list(set(media["videos"]))
-    # Links: dedupe by URL
-    seen_links = set()
-    deduped_links = []
-    for link in media["links"]:
-        url = link["href"] if isinstance(link, dict) and "href" in link else str(link)
-        if url not in seen_links:
-            seen_links.add(url)
-            deduped_links.append(link)
-    media["links"] = deduped_links
-    return {
-        "topic": state["topic"],
-        "timestamp": datetime.now().isoformat(),
-        "content": state["content"],
-        "media": media,
-        "research_tree": {},
-        "metadata": {"steps": state.get("steps", [])},
-        "progress": 100,
-        "message": "Research complete!",
-    }
 
 
 # --- Main research logic using LangGraph ---
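The rewritten `research_plan_node` and `scrape_node` rely on `ResearchPlan` and `SearchQuery` from a `schema` module that is not part of this commit. A minimal sketch of what those structured-output models could look like, assuming Pydantic; the field names are assumptions, apart from `steps` (read by `research_plan_node`) and `query` (the property requested by `SEARCH_QUERY_PROMPT`):

```python
# Hypothetical sketch of the (not-included) schema module.
from pydantic import BaseModel, Field


class ResearchPlan(BaseModel):
    # High-level data-collection steps produced from RESEARCH_PLAN_PROMPT.
    steps: list[str] = Field(description="High-level research steps")


class SearchQuery(BaseModel):
    # A single Google search query, matching SEARCH_QUERY_PROMPT's output format.
    query: str = Field(description="A Google search query")


# Usage as in research_plan_node. Note: when given a Pydantic class,
# with_structured_output returns a model instance, so the result exposes
# attributes (plan.steps) rather than the plan["steps"] subscript used in
# the committed code.
# plan = await llm.with_structured_output(ResearchPlan).ainvoke(
#     RESEARCH_PLAN_PROMPT.format(topic=topic)
# )
# steps = plan.steps
```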
@@ -219,17 +98,14 @@ async def run_research(topic, scraper, max_depth, num_sites_per_query):
     graph = StateGraph(state_schema=ResearchState)
     graph.add_node("plan", research_plan_node)
     graph.add_node("scrape", scrape_node)
-    graph.add_node("
-    graph.add_node("fillin", fillin_node)
-    graph.add_node("finalize", finalize_node)
+    graph.add_node("gen_report", gen_report_node)
 
     graph.add_edge("plan", "scrape")
-    graph.add_edge("scrape", "
-    graph.add_edge("
-    graph.add_edge("fillin", "finalize")
-    graph.add_edge("finalize", END)
+    graph.add_edge("scrape", "conditional", "plan", "gen_report")
+    graph.add_edge("gen_report", END)
     graph.set_entry_point("plan")
     graph = graph.compile()
+    print(graph.get_graph().draw_mermaid())
 
     state = {
         "topic": topic,
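The new wiring passes four arguments to `graph.add_edge("scrape", "conditional", "plan", "gen_report")`, but LangGraph's `add_edge` takes a start node and an end node; a "loop back to plan or go generate the report" branch is normally declared with `add_conditional_edges`. A sketch of that pattern, assuming app.py's `graph`, `ResearchState`, and `END`; the `should_continue` router and its stopping condition are assumptions, not part of this commit:

```python
# Hypothetical router for the scrape -> (plan | gen_report) branch.
def should_continue(state: ResearchState) -> str:
    # Keep looping while unexplored steps remain in the research plan (assumed condition).
    if state["idx_research_plan"] + 1 < len(state["research_plan"]):
        return "plan"
    return "gen_report"


# Conditional branch out of "scrape", then terminate after the report node.
graph.add_conditional_edges("scrape", should_continue, {"plan": "plan", "gen_report": "gen_report"})
graph.add_edge("gen_report", END)
```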
langgraph_backend/prompts.py
ADDED
@@ -0,0 +1,107 @@
+from textwrap import dedent
+
+# --- Prompt templates ---
+RESEARCH_PLAN_PROMPT = dedent("""You are an expert Deep Research agent, part of a Multiagent system.
+
+<User query>
+{topic}
+</User query>
+
+---
+Generate few very high level steps on which other agents can do info collection runs. Provide only data collection steps, no data identification, summarization, manipulation, selection, etc.
+Do not presume any knowledge about the topic.
+Return a string array of steps.""")
+
+SITE_SUMMARY_PROMPT = dedent("""Extract specific verbatim key information from the following content that is related to the topic "{query}". No small talk.
+<Findings>
+{findings}
+</Findings>
+""")
+
+CONTINUE_BRANCH_PROMPT = dedent("""Given the current state of research, decide whether to continue exploring the current branch or not.
+<Global Research Plan>
+{research_plan}
+</Global Research Plan>
+
+Current Topic: {query}
+
+<Past Searched Queries>
+{past_queries}
+</Past Searched Queries>
+
+<Findings under current topic>
+{ctx_manager}
+</Findings under current topic>
+
+Consider:
+- Information saturation
+- Information duplication
+- Coverage of current topic
+- Potential for new insights
+
+Return only decision: true/false""")
+
+SEARCH_QUERY_PROMPT = dedent("""Based on the following findings on topic {vertical}, create google search queries
+<Original user query>
+{topic}
+</Original user query>
+
+<Global Research Plan>
+{research_plan}
+</Global Research Plan>
+
+<Past Searched Queries>
+{past_queries}
+</Past Searched Queries>
+
+<Findings under current topic>
+{ctx_manager}
+</Findings under current topic>
+
+Suggest {n} specific google search queries that:
+- Covers what has not been covered yet
+- Builds upon these findings
+- Explores different aspects
+- Goes deeper into important details
+
+- Do not do quote searches
+- Queries should be generic and short
+- Do not presume any knowledge about the topic
+Return as JSON array of objects with properties:
+- query (string)""")
+
+REPORT_OUTLINE_PROMPT = dedent("""Generate a outline for a report based on the findings:
+<Original user query>
+{topic}
+</Original user query>
+
+<Findings>
+{ctx_manager}
+</Findings>
+
+Deduplicate, reorganize and analyze the findings to create the outline.
+If there are multiple comparisons, use a table instead of multiple headings.
+The outline should include:
+- Title
+- List of h2 headings
+Do not include hashtags""")
+
+REPORT_FILLIN_PROMPT = dedent("""Fill in the content for the current outline heading based on the findings:
+<Findings>
+{ctx_manager}
+</Findings>
+
+<The outline>
+{report_outline}
+</The outline>
+
+<Current outline heading to fill in>
+## {slot}
+...
+</Current outline heading to fill in>
+
+Assume [done] headings have their respective content.
+The content should be comprehensive, detailed and well-structured, providing detailed information on current heading.
+If needed use tables, lists. Do not include subheadings.
+Do not include the heading in the content.
+""")
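`REPORT_OUTLINE_PROMPT` and `REPORT_FILLIN_PROMPT` are added here but not yet consumed: the graph registers a `gen_report` node whose `gen_report_node` function does not appear in this diff. A minimal sketch of how such a node might chain the two templates, assuming app.py's `llm` and `ResearchState`; the heading parsing and the `report` state key are invented for illustration:

```python
# Hypothetical gen_report node: outline first, then fill in each heading.
from prompts import REPORT_OUTLINE_PROMPT, REPORT_FILLIN_PROMPT


async def gen_report_node(state: ResearchState) -> dict:
    ctx = "\n".join(state["ctx_manager"])

    # First pass: produce the outline (title + h2 headings) from the findings.
    outline = (await llm.ainvoke(
        REPORT_OUTLINE_PROMPT.format(topic=state["topic"], ctx_manager=ctx)
    )).content

    # Second pass: fill in each heading against the same findings.
    # Crude heading extraction: every non-empty line after the title.
    headings = [line.strip() for line in outline.splitlines() if line.strip()][1:]
    report = ""
    for heading in headings:
        section = (await llm.ainvoke(
            REPORT_FILLIN_PROMPT.format(ctx_manager=ctx, report_outline=outline, slot=heading)
        )).content
        report += f"\n\n## {heading}\n\n{section}"

    return {"report": report}  # hypothetical state key, not in the committed ResearchState
```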