Soham Waghmare committed on
Commit
dcbc875
·
1 Parent(s): 1986dac

feat: enhance research agent with structured system message and update search tool integration

Browse files
langgraph_backend/agent_tools.py CHANGED
@@ -1,5 +1,6 @@
1
  import logging
2
  import os
 
3
 
4
  from dotenv import load_dotenv
5
  from langchain_core.messages.ai import AIMessage
@@ -9,29 +10,73 @@ from langgraph.checkpoint.memory import MemorySaver
9
  from langgraph.prebuilt import create_react_agent
10
  from langgraph.types import Command, interrupt
11
 
12
- from tools_tools import calc, scrape
13
 
14
# Module-level wiring: logger, .env loading, in-memory conversation
# checkpointer, the agent's tool belt, and the Gemini chat model.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
load_dotenv()

# MemorySaver keeps per-thread agent state in process memory only.
checkpointer = MemorySaver()
tools = [calc, scrape]

# --- LangChain LLM setup (Gemini, correct usage) ---
model = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=os.getenv("GOOGLE_API_KEY"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
# ReAct-style agent wired with the Gemini model, tool belt, and in-memory
# checkpointer for per-thread conversation state.
agent = create_react_agent(
    model=model,
    tools=tools,
    checkpointer=checkpointer,
)

# Usage example
# NOTE(review): this module-level `config` is shadowed by the local one in
# invoke_agent below, so it only serves as documentation of the shape.
config = {"configurable": {"thread_id": "research_session_1"}}


async def invoke_agent(message: str, thread_id: str):
    """Stream the agent's events for one user message on the given thread.

    Args:
        message: The user's query/content.
        thread_id: Conversation thread key for the checkpointer, so repeated
            calls with the same id continue the same session.
    """
    # Per-call config keyed by thread_id so the checkpointer resumes state.
    config = {"configurable": {"thread_id": thread_id}}

    async for event in agent.astream({"messages": [{"role": "user", "content": message}]}, config=config):
        print(event)
 
1
  import logging
2
  import os
3
+ from textwrap import dedent
4
 
5
  from dotenv import load_dotenv
6
  from langchain_core.messages.ai import AIMessage
 
10
  from langgraph.prebuilt import create_react_agent
11
  from langgraph.types import Command, interrupt
12
 
13
+ from tools_tools import search
14
 
15
# Module-level wiring: logger, .env loading, in-memory conversation
# checkpointer, the single search tool, and the Gemini chat model.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
load_dotenv()

# MemorySaver keeps per-thread agent state in process memory only.
checkpointer = MemorySaver()
tools = [search]

# --- LangChain LLM setup (Gemini, correct usage) ---
model = ChatGoogleGenerativeAI(model="gemini-2.5-flash", google_api_key=os.getenv("GOOGLE_API_KEY"))
24
+
25
# System message for the research agent.
# Defines a strict four-step research protocol plus report-format and
# disambiguation guidelines. Fix: the two guideline items were previously
# numbered "5." and "6." directly after "Step 4", which conflicted with the
# step numbering; they are now a separate "Additional Guidelines" section,
# and Step 1/Step 4 gained the same "State:" line the other steps have.
SYSTEM_MESSAGE = dedent(
    """You are KnowledgeNet, an expert deep research agent designed to help users gather comprehensive information on any topic.

Your Operating Protocol:

You operate as a state machine and MUST follow these steps in order. In every response, you must first state which step you are currently performing.

Step 1: Initial Exploration
- State: "Currently in Step 1: Initial Exploration."
- Your first action is ALWAYS to call the search tool with a broad query to understand the topic.
- After the tool returns, analyze the results and proceed to Step 2.

Step 2: Analysis & Branching
- State: "Currently in Step 2: Analysis & Branching."
- Based on the results from Step 1, identify 2-3 key entities, questions, or claims.
- Formulate and execute new, more specific search queries for each of these identified points. This step MUST involve multiple tool calls.
- Once you have explored these branches, proceed to Step 3.

Step 3: Verification & Synthesis
- State: "Currently in Step 3: Verification & Synthesis."
- Review all the information gathered from all previous steps.
- If there are any unverified or conflicting claims, perform one final, targeted search to try and resolve them.
- If all information is gathered, state that you are ready to generate the final report and do not call any more tools.

Step 4: Final Report Generation
- State: "Currently in Step 4: Final Report Generation."
- Once you have completed all research steps, generate the final report according to the specified structure. Do not generate this report until all other steps are complete.

Additional Guidelines:

- **Structured Final Report:** For your final answer, provide a comprehensive report structured with the following headings:
  - **Summary:** A one-paragraph executive summary of the findings.
  - **Detailed Findings:** A detailed, point-by-point breakdown of the information discovered.
  - **Supporting Evidence:** Use specific data points, timelines, direct quotes, and version numbers where available. Cite your sources clearly.
  - **Conclusion:** A final conclusion that directly answers the user's query or explains why it cannot be answered.
  - **Actionable Recommendation:** In the **Conclusion**, after presenting the technical facts, you MUST provide a single, clear, and actionable recommendation for the user. If multiple options are technically equivalent, choose the one that is the most direct, simplest, or officially recommended. State your reasoning for this choice (e.g., "While performance is similar, the standalone installer is recommended as it has no dependencies and is the most direct installation method.").
- **Disambiguation:** If a search result appears to be about a different topic with the same name (e.g., 'UV' for spectroscopy vs. 'uv' for a package manager), you must explicitly state that you are discarding it as irrelevant and refine your subsequent search queries to be more specific (e.g., search for "uv package manager" instead of just "uv").

Suggested Research Avenues (Check multiple types):
- Official websites, blogs, and product documentation for authoritative information.
- Academic databases (e.g., Google Scholar, ArXiv) for scholarly articles.
- Developer forums and communities (e.g., Reddit, Stack Overflow, Hacker News) for user discussions and practical insights.
- Official GitHub repositories (check Issues, Pull Requests, and Discussions) for project updates and roadmap discussions.
- Recent news articles and press releases for current updates.
- Social media (e.g., Twitter/X) for real-time announcements from official accounts or key developers.
- Government and institutional reports for reliable grey literature."""
)
69
+
70
# ReAct-style research agent: Gemini model + search tool, driven by the
# structured SYSTEM_MESSAGE protocol, with per-thread conversation state
# persisted via the in-memory checkpointer.
agent = create_react_agent(
    model=model,
    tools=tools,
    checkpointer=checkpointer,
    prompt=SYSTEM_MESSAGE,
)
76
 
 
 
 
77
 
78
async def invoke_agent(message: str, thread_id: str):
    """Stream the research agent's events for one user message.

    Args:
        message: The user's query/content.
        thread_id: Conversation thread key for the checkpointer, so repeated
            calls with the same id continue the same session.
    """
    # recursion_limit is raised to 50 because the multi-step research
    # protocol issues many tool calls and can exceed the default limit.
    config = {"configurable": {"thread_id": thread_id}, "recursion_limit": 50}

    async for event in agent.astream({"messages": [{"role": "user", "content": message}]}, config=config):
        # Fix: use the module logger (configured above) instead of a bare
        # print, so event output respects the configured log level/handlers.
        logger.info("%s", event)
langgraph_backend/prompts.py CHANGED
@@ -12,12 +12,39 @@ Generate few very high level steps on which other agents can do info collection
12
  Do not presume any knowledge about the topic.
13
  Return a string array of steps.""")
14
 
15
# Prompt for extracting verbatim key information on a topic from scraped
# page content. Placeholders: {query} — the topic; {findings} — the content.
SITE_SUMMARY_PROMPT = dedent("""Extract specific verbatim key information from the following content that is related to the topic "{query}". No small talk.
<Findings>
{findings}
</Findings>
""")
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  CONTINUE_BRANCH_PROMPT = dedent("""Given the current state of research, decide whether to continue exploring the current branch or not.
22
  <Global Research Plan>
23
  {research_plan}
 
12
  Do not presume any knowledge about the topic.
13
  Return a string array of steps.""")
14
 
15
# Prompt for compressing raw search results into important verbatim facts.
# Placeholders: {query} — the search query; {findings} — the raw results.
SITE_SUMMARY_PROMPT = dedent("""Extract and filter the following search results from this query "{query}" to get important verbatim information. No small talk.
<findings>
{findings}
</findings>
""")
20
 
21
# Stricter extraction prompt (v3): per-document relevance filtering,
# verbatim-only extraction, source attribution, and a fixed markdown output
# shape. Placeholders: {query} — the original user query; {findings} — the
# raw search results inserted into <search_results>.
SITE_SUMMARY_PROMPT_V3 = dedent("""
You are a specialized data extraction component for a research agent.
Your goal is to process a list of web search results and extract only the most critical, relevant, and verbatim information related to the user's query.

**Original User Query:** "{query}"

**Processing Instructions:**
For each document provided in the `<search_results>`:
1. **Analyze Relevance:** Read the document content and determine if it contains information that directly addresses or relates to the user's query.
2. **Verbatim Extraction:** If relevant, extract the key sentences, data points, commands, or quotes verbatim. Do not rephrase. Focus on concrete facts, not general descriptions.
3. **Maintain Source:** Ensure every piece of extracted information is clearly attributed to its source URL.
4. **Handle Irrelevance:** If a document is completely irrelevant, ignore it in the output. If NONE of the documents are relevant, return an empty response.

**Output Format:**
You MUST format your entire response in structured markdown. For each source that contains relevant information, create a section with the following format:

---
**Source:** [URL of the source]
* Verbatim fact or quote 1.
* Verbatim fact or quote 2.
* ...

**Search Results to Process:**
<search_results>
{findings}
</search_results>""")
47
+
48
  CONTINUE_BRANCH_PROMPT = dedent("""Given the current state of research, decide whether to continue exploring the current branch or not.
49
  <Global Research Plan>
50
  {research_plan}
langgraph_backend/tools_tools.py CHANGED
@@ -6,42 +6,34 @@ from dotenv import load_dotenv
6
  from langchain_core.tools import tool
7
  from langchain_google_genai import ChatGoogleGenerativeAI
8
 
9
- from prompts import SITE_SUMMARY_PROMPT
10
  from scraper import CrawlForAIScraper
11
 
12
  load_dotenv()
13
  scraper_inst = CrawlForAIScraper()
14
- model = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=os.getenv("GOOGLE_API_KEY"))
15
 
16
 
17
@tool
def calc(a: int, b: int) -> int:
    """
    Takes in two integers and returns their integer sum.
    """
    # Fix: previously returned str(a + b), contradicting both the declared
    # -> int return type and the docstring.
    return a + b
23
-
24
-
25
@tool
async def scrape(query: str, num_sites_per_query: int) -> str:
    """
    Search in a search engine.

    Args:
        query: string query for the search engine.
        num_sites_per_query: number of sites to read after searching.

    Returns:
        Summarized results related to the search, joined into one string.
        (Annotation fixed: the function returns a str, not a list of dicts.)
    """
    sites = await scraper_inst.search_and_scrape(query, num_sites_per_query)
    summ_sites_ctx = []
    # Summarize the scraped sites in chunks of 3 per LLM call.
    # Fix: the loop previously ignored idx and re-summarized the FULL
    # aggregate once per chunk; it also passed a one-element list (not a
    # string) as the {findings} placeholder.
    for idx in range(0, len(sites), 3):
        chunk = sites[idx:idx + 3]
        # Context layout per site:
        # src [1] : https://...
        # content...
        chunk_ctx = "\n\n---\n\n".join(
            f"src [{idx + i + 1}] : {d['url']}\n{d['text']}" for i, d in enumerate(chunk)
        )
        summary = model.invoke(
            SITE_SUMMARY_PROMPT.format(query=query, findings=chunk_ctx),
            config={"temperature": 0.2},
        ).text()
        summ_sites_ctx.append(summary)

    return "\n\n---\n\n".join(summ_sites_ctx)
 
6
  from langchain_core.tools import tool
7
  from langchain_google_genai import ChatGoogleGenerativeAI
8
 
9
+ from prompts import SITE_SUMMARY_PROMPT_V3
10
  from scraper import CrawlForAIScraper
11
 
12
# Load API keys from .env, create the shared scraper instance, and a
# flash-lite Gemini model used here only for summarizing scraped pages.
load_dotenv()
scraper_inst = CrawlForAIScraper()
model = ChatGoogleGenerativeAI(model="gemini-2.0-flash-lite", google_api_key=os.getenv("GOOGLE_API_KEY"))
15
 
16
 
17
@tool
async def search(query: str) -> str:
    """
    Search in a search engine.
    Always call this tool if there is any knowledge gap in performing the task.

    Args:
        query: string query for the search engine.

    Returns:
        Summarized results related to the search, joined into one string.
        (Annotation fixed: the function returns a str, not a list of dicts.)
    """
    # Fixed fan-out of 5 sites per query keeps the tool signature simple
    # for the calling LLM.
    sites = await scraper_inst.search_and_scrape(query, 5)
    summ_sites_ctx = []
    # Summarize the scraped sites in chunks of 3 per LLM call.
    # Fix: the loop previously ignored idx and re-summarized the FULL
    # aggregate once per chunk; it also passed a one-element list (not a
    # string) as the {findings} placeholder.
    for idx in range(0, len(sites), 3):
        chunk = sites[idx:idx + 3]
        # Context layout per site:
        # src [1] : https://...
        # content...
        chunk_ctx = "\n\n---\n\n".join(
            f"src [{idx + i + 1}] : {d['url']}\n{d['text']}" for i, d in enumerate(chunk)
        )
        summary = model.invoke(
            SITE_SUMMARY_PROMPT_V3.format(query=query, findings=chunk_ctx),
            config={"temperature": 0.5},
        ).text()
        summ_sites_ctx.append(summary)

    return "\n\n---\n\n".join(summ_sites_ctx)