Soham Waghmare committed on
Commit
fbfef4e
·
1 Parent(s): 87d5bfc

feat: restructure and add nodes

Browse files
Files changed (2) hide show
  1. langgraph_backend/app.py +38 -162
  2. langgraph_backend/prompts.py +107 -0
langgraph_backend/app.py CHANGED
@@ -3,7 +3,6 @@ import json
3
  import logging
4
  import os
5
  from datetime import datetime
6
- from textwrap import dedent
7
  from typing import Any, Dict, List, Optional, TypedDict
8
 
9
  from dotenv import load_dotenv
@@ -14,7 +13,8 @@ from langchain_google_genai import ChatGoogleGenerativeAI
14
  from langgraph.graph import END, StateGraph
15
  from sse_starlette.sse import EventSourceResponse
16
 
17
- from schema import ResearchPlan
 
18
  from scraper import CrawlForAIScraper
19
 
20
  load_dotenv()
@@ -41,54 +41,6 @@ async def health_check():
41
  return {"status": "ok"}
42
 
43
 
44
- # --- Prompt templates ---
45
- RESEARCH_PLAN_PROMPT = dedent("""You are an expert Deep Research agent, part of a Multiagent system.
46
-
47
- <User query>
48
- {topic}
49
- </User query>
50
-
51
- ---
52
- Generate few very high level steps on which other agents can do info collection runs. Provide only data collection steps, no data identification, summarization, manipulation, selection, etc.
53
- Do not presume any knowledge about the topic.
54
- Return a string array of steps.""")
55
-
56
- REPORT_OUTLINE_PROMPT = dedent("""Generate a outline for a report based on the findings:
57
- <Original user query>
58
- {topic}
59
- </Original user query>
60
-
61
- <Findings>
62
- {ctx_manager}
63
- </Findings>
64
-
65
- Deduplicate, reorganize and analyze the findings to create the outline.
66
- If there are multiple comparisons, use a table instead of multiple headings.
67
- The outline should include:
68
- - Title
69
- - List of h2 headings
70
- Do not include hashtags""")
71
-
72
- REPORT_FILLIN_PROMPT = dedent("""Fill in the content for the current outline heading based on the findings:
73
- <Findings>
74
- {ctx_manager}
75
- </Findings>
76
-
77
- <The outline>
78
- {report_outline}
79
- </The outline>
80
-
81
- <Current outline heading to fill in>
82
- ## {slot}
83
- ...
84
- </Current outline heading to fill in>
85
-
86
- Assume [done] headings have their respective content.
87
- The content should be comprehensive, detailed and well-structured, providing detailed information on current heading.
88
- If needed use tables, lists. Do not include subheadings.
89
- Do not include the heading in the content.
90
- """)
91
-
92
  # --- LangChain LLM setup (Gemini, correct usage) ---
93
  llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=os.getenv("GOOGLE_API_KEY"))
94
 
@@ -96,121 +48,48 @@ llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=os.getenv(
96
  # --- State schema for LangGraph ---
97
  class ResearchState(TypedDict, total=False):
98
  topic: str
99
- scraper: Any
 
 
 
 
 
 
 
100
  max_depth: int
101
  num_sites_per_query: int
102
- steps: List[str]
103
- findings: Any
104
- outline: str
105
- progress: int
106
- message: str
107
- timestamp: str
108
- content: str
109
- media: dict
110
- research_tree: dict
111
- metadata: dict
112
-
113
-
114
- # --- LangGraph node: LLM step for research plan ---
115
- async def research_plan_node(state: dict) -> dict:
116
  topic = state["topic"]
117
- prompt = RESEARCH_PLAN_PROMPT.format(topic=topic)
118
- result = await llm.with_structured_output(ResearchPlan).ainvoke(prompt)
119
- try:
120
- steps = json.loads(result.content) if hasattr(result, "content") else json.loads(str(result))
121
- # TODO: split this module another knet module to handle global state
122
- except Exception:
123
- steps = [str(result)]
124
  logger.info(f"Research plan:\n{json.dumps(steps, indent=2)}")
125
- return {"progress": 10, "message": "Generated research plan"}
126
 
127
 
128
- # --- LangGraph node: Scrape for each step ---
129
- async def scrape_node(state: dict) -> dict:
130
- steps = state["steps"]
131
  scraper = state["scraper"]
 
132
  num_sites_per_query = state["num_sites_per_query"]
133
- findings = []
134
- for idx, step in enumerate(steps):
135
- scraped = await scraper.search_and_scrape(step, num_sites=num_sites_per_query)
136
- findings.append({"step": step, "data": scraped})
137
- return {"findings": findings, "progress": 70, "message": "Scraping complete"}
138
 
 
 
 
 
 
 
 
139
 
140
- # --- LangGraph node: Generate report outline ---
141
- async def outline_node(state: dict) -> dict:
142
- topic = state["topic"]
143
- findings = state["findings"]
144
- findings_text = json.dumps(findings, indent=2)
145
- prompt = REPORT_OUTLINE_PROMPT.format(topic=topic, findings=findings_text)
146
- result = await llm.ainvoke(prompt)
147
- outline = result.content if hasattr(result, "content") else str(result)
148
- return {"outline": outline, "progress": 90, "message": "Generated report outline"}
149
-
150
-
151
- # --- LangGraph node: Fill in report content for each heading ---
152
- async def fillin_node(state: dict) -> dict:
153
- findings = state["findings"]
154
- outline = state["outline"]
155
- topic = state["topic"]
156
- # Try to parse outline as JSON, else fallback to text splitting
157
- try:
158
- outline_obj = json.loads(outline)
159
- title = outline_obj["title"]
160
- headings = outline_obj["headings"]
161
- except Exception:
162
- # Fallback: try to extract headings from text
163
- lines = outline.splitlines()
164
- title = lines[0].strip("# ") if lines else topic
165
- headings = [line.strip("# ") for line in lines if line.strip().startswith("## ")]
166
- findings_text = json.dumps(findings, indent=2)
167
- report = f"# {title}\n\n"
168
- for idx, heading in enumerate(headings):
169
- prompt = REPORT_FILLIN_PROMPT.format(
170
- findings=findings_text,
171
- outline=outline,
172
- slot=heading,
173
- )
174
- result = await llm.ainvoke(prompt)
175
- content = result.content if hasattr(result, "content") else str(result)
176
- # Remove heading if LLM included it
177
- if content.strip().startswith(heading):
178
- content = content.strip()[len(heading) :].strip()
179
- report += f"\n\n## {heading}\n\n{content}\n"
180
- return {"content": report, "progress": 95, "message": "Filled in report content"}
181
-
182
-
183
- # --- LangGraph node: Finalize report ---
184
- def finalize_node(state: dict) -> dict:
185
- findings = state.get("findings", [])
186
- media = {"images": [], "videos": [], "links": []}
187
- for step in findings:
188
- for site in step.get("data", []):
189
- media["images"].extend(site.get("images", []))
190
- media["videos"].extend(site.get("videos", []))
191
- media["links"].extend(site.get("links", []))
192
- # Dedupe
193
- media["images"] = list(set(media["images"]))
194
- media["videos"] = list(set(media["videos"]))
195
- # Links: dedupe by URL
196
- seen_links = set()
197
- deduped_links = []
198
- for link in media["links"]:
199
- url = link["href"] if isinstance(link, dict) and "href" in link else str(link)
200
- if url not in seen_links:
201
- seen_links.add(url)
202
- deduped_links.append(link)
203
- media["links"] = deduped_links
204
- return {
205
- "topic": state["topic"],
206
- "timestamp": datetime.now().isoformat(),
207
- "content": state["content"],
208
- "media": media,
209
- "research_tree": {},
210
- "metadata": {"steps": state.get("steps", [])},
211
- "progress": 100,
212
- "message": "Research complete!",
213
- }
214
 
215
 
216
  # --- Main research logic using LangGraph ---
@@ -219,17 +98,14 @@ async def run_research(topic, scraper, max_depth, num_sites_per_query):
219
  graph = StateGraph(state_schema=ResearchState)
220
  graph.add_node("plan", research_plan_node)
221
  graph.add_node("scrape", scrape_node)
222
- graph.add_node("outline_node", outline_node)
223
- graph.add_node("fillin", fillin_node)
224
- graph.add_node("finalize", finalize_node)
225
 
226
  graph.add_edge("plan", "scrape")
227
- graph.add_edge("scrape", "outline_node")
228
- graph.add_edge("outline_node", "fillin")
229
- graph.add_edge("fillin", "finalize")
230
- graph.add_edge("finalize", END)
231
  graph.set_entry_point("plan")
232
  graph = graph.compile()
 
233
 
234
  state = {
235
  "topic": topic,
 
3
  import logging
4
  import os
5
  from datetime import datetime
 
6
  from typing import Any, Dict, List, Optional, TypedDict
7
 
8
  from dotenv import load_dotenv
 
13
  from langgraph.graph import END, StateGraph
14
  from sse_starlette.sse import EventSourceResponse
15
 
16
+ from prompts import RESEARCH_PLAN_PROMPT, SEARCH_QUERY_PROMPT
17
+ from schema import ResearchPlan, SearchQuery
18
  from scraper import CrawlForAIScraper
19
 
20
  load_dotenv()
 
41
  return {"status": "ok"}
42
 
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  # --- LangChain LLM setup (Gemini, correct usage) ---
45
  llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=os.getenv("GOOGLE_API_KEY"))
46
 
 
48
  # --- State schema for LangGraph ---
49
  class ResearchState(TypedDict, total=False):
50
  topic: str
51
+
52
+ research_plan: list[str]
53
+ idx_research_plan: int
54
+ ctx_researcher: list[str]
55
+ ctx_manager: list[str]
56
+ token_count: int
57
+
58
+ scraper: CrawlForAIScraper
59
  max_depth: int
60
  num_sites_per_query: int
61
+
62
+
63
async def research_plan_node(state: ResearchState) -> ResearchState:
    """Ask the LLM for a high-level research plan for ``state["topic"]``.

    Args:
        state: Current graph state; only ``topic`` is read.

    Returns:
        A partial state update ``{"research_plan": [...]}``. A LangGraph node
        must return a mapping of state keys to new values, not the bare list
        of steps.

    Raises:
        ValueError: If the structured LLM output exposes no ``steps``.
    """
    topic = state["topic"]
    plan = await llm.with_structured_output(ResearchPlan).ainvoke(
        RESEARCH_PLAN_PROMPT.format(topic=topic),
        temperature=1.5,
    )

    # The structured result may be a model instance (attribute access) or a
    # plain mapping (item access); support both instead of mixing a
    # ``hasattr`` check with subscripting, which fails for either shape.
    if isinstance(plan, dict):
        steps = plan.get("steps")
    else:
        steps = getattr(plan, "steps", None)
    if steps is None:
        raise ValueError(f"Research plan response has no 'steps': {plan!r}")

    steps = list(steps)
    logger.info("Research plan:\n%s", json.dumps(steps, indent=2))
    return {"research_plan": steps}
70
 
71
 
72
async def scrape_node(state: ResearchState) -> ResearchState:
    """Generate one search query for the current plan step and scrape it.

    Args:
        state: Current graph state. Reads ``topic``, ``scraper``,
            ``num_sites_per_query``, ``research_plan`` and
            ``idx_research_plan``; ``ctx_researcher`` is optional.

    Returns:
        A partial state update whose ``ctx_researcher`` is the previous list
        (empty if absent) plus the JSON-serialised scrape result.
    """
    topic = state["topic"]
    scraper = state["scraper"]
    num_sites_per_query = state["num_sites_per_query"]
    vertical = state["research_plan"][state["idx_research_plan"]]

    # Generate initial search query. Awaiting ``ainvoke`` (instead of the
    # blocking ``invoke``) keeps the event loop free during the LLM call.
    query = await llm.with_structured_output(SearchQuery).ainvoke(
        SEARCH_QUERY_PROMPT.format(
            vertical=vertical,
            topic=topic,
            research_plan="None",
            past_queries="None",
            ctx_manager="None",
            n=1,
        ),
        temperature=1.5,
    )

    # Search and scrape.
    # NOTE(review): ``query`` is the structured SearchQuery result and is
    # passed through unchanged; confirm ``search_and_scrape`` accepts it
    # rather than a plain query string.
    data = await scraper.search_and_scrape(query, num_sites_per_query)
    # presumably data = [{url:...}, {url:...}, ...] - verify against scraper

    # Build a new list rather than appending in place: the key may be missing
    # (the state is ``total=False``) and the update has to be returned for the
    # graph to record it.
    ctx_researcher = [*state.get("ctx_researcher", []), json.dumps(data, indent=2)]

    # TODO: Implement the scraping logic and update the state with the scraped data
    return {"ctx_researcher": ctx_researcher}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
 
95
  # --- Main research logic using LangGraph ---
 
98
  graph = StateGraph(state_schema=ResearchState)
99
  graph.add_node("plan", research_plan_node)
100
  graph.add_node("scrape", scrape_node)
101
+ graph.add_node("gen_report", gen_report_node)
 
 
102
 
103
  graph.add_edge("plan", "scrape")
104
+ graph.add_edge("scrape", "conditional", "plan", "gen_report")
105
+ graph.add_edge("gen_report", END)
 
 
106
  graph.set_entry_point("plan")
107
  graph = graph.compile()
108
+ print(graph.get_graph().draw_mermaid())
109
 
110
  state = {
111
  "topic": topic,
langgraph_backend/prompts.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from textwrap import dedent
2
+
3
+ # --- Prompt templates ---
4
+ RESEARCH_PLAN_PROMPT = dedent("""You are an expert Deep Research agent, part of a Multiagent system.
5
+
6
+ <User query>
7
+ {topic}
8
+ </User query>
9
+
10
+ ---
11
+ Generate few very high level steps on which other agents can do info collection runs. Provide only data collection steps, no data identification, summarization, manipulation, selection, etc.
12
+ Do not presume any knowledge about the topic.
13
+ Return a string array of steps.""")
14
+
15
+ SITE_SUMMARY_PROMPT = dedent("""Extract specific verbatim key information from the following content that is related to the topic "{query}". No small talk.
16
+ <Findings>
17
+ {findings}
18
+ </Findings>
19
+ """)
20
+
21
+ CONTINUE_BRANCH_PROMPT = dedent("""Given the current state of research, decide whether to continue exploring the current branch or not.
22
+ <Global Research Plan>
23
+ {research_plan}
24
+ </Global Research Plan>
25
+
26
+ Current Topic: {query}
27
+
28
+ <Past Searched Queries>
29
+ {past_queries}
30
+ </Past Searched Queries>
31
+
32
+ <Findings under current topic>
33
+ {ctx_manager}
34
+ </Findings under current topic>
35
+
36
+ Consider:
37
+ - Information saturation
38
+ - Information duplication
39
+ - Coverage of current topic
40
+ - Potential for new insights
41
+
42
+ Return only decision: true/false""")
43
+
44
+ SEARCH_QUERY_PROMPT = dedent("""Based on the following findings on topic {vertical}, create google search queries
45
+ <Original user query>
46
+ {topic}
47
+ </Original user query>
48
+
49
+ <Global Research Plan>
50
+ {research_plan}
51
+ </Global Research Plan>
52
+
53
+ <Past Searched Queries>
54
+ {past_queries}
55
+ </Past Searched Queries>
56
+
57
+ <Findings under current topic>
58
+ {ctx_manager}
59
+ </Findings under current topic>
60
+
61
+ Suggest {n} specific google search queries that:
62
+ - Covers what has not been covered yet
63
+ - Builds upon these findings
64
+ - Explores different aspects
65
+ - Goes deeper into important details
66
+
67
+ - Do not do quote searches
68
+ - Queries should be generic and short
69
+ - Do not presume any knowledge about the topic
70
+ Return as JSON array of objects with properties:
71
+ - query (string)""")
72
+
73
+ REPORT_OUTLINE_PROMPT = dedent("""Generate a outline for a report based on the findings:
74
+ <Original user query>
75
+ {topic}
76
+ </Original user query>
77
+
78
+ <Findings>
79
+ {ctx_manager}
80
+ </Findings>
81
+
82
+ Deduplicate, reorganize and analyze the findings to create the outline.
83
+ If there are multiple comparisons, use a table instead of multiple headings.
84
+ The outline should include:
85
+ - Title
86
+ - List of h2 headings
87
+ Do not include hashtags""")
88
+
89
+ REPORT_FILLIN_PROMPT = dedent("""Fill in the content for the current outline heading based on the findings:
90
+ <Findings>
91
+ {ctx_manager}
92
+ </Findings>
93
+
94
+ <The outline>
95
+ {report_outline}
96
+ </The outline>
97
+
98
+ <Current outline heading to fill in>
99
+ ## {slot}
100
+ ...
101
+ </Current outline heading to fill in>
102
+
103
+ Assume [done] headings have their respective content.
104
+ The content should be comprehensive, detailed and well-structured, providing detailed information on current heading.
105
+ If needed use tables, lists. Do not include subheadings.
106
+ Do not include the heading in the content.
107
+ """)