Soham Waghmare committed on
Commit
dcbc875
·
1 Parent(s): 1986dac

feat: enhance research agent with structured system message and update search tool integration

Browse files
langgraph_backend/agent_tools.py CHANGED
@@ -1,5 +1,6 @@
1
  import logging
2
  import os
 
3
 
4
  from dotenv import load_dotenv
5
  from langchain_core.messages.ai import AIMessage
@@ -9,29 +10,73 @@ from langgraph.checkpoint.memory import MemorySaver
9
  from langgraph.prebuilt import create_react_agent
10
  from langgraph.types import Command, interrupt
11
 
12
- from tools_tools import calc, scrape
13
 
14
# Module-level wiring: logger, .env loading, in-memory conversation
# checkpointer, the agent's tool belt, and the Gemini chat model.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
load_dotenv()

# MemorySaver keeps per-thread agent state in process memory only.
checkpointer = MemorySaver()
tools = [calc, scrape]

# --- LangChain LLM setup (Gemini, correct usage) ---
model = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=os.getenv("GOOGLE_API_KEY"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
# ReAct-style agent wired with the Gemini model, tool belt, and in-memory
# checkpointer for per-thread conversation state.
agent = create_react_agent(
    model=model,
    tools=tools,
    checkpointer=checkpointer,
)

# Usage example
# NOTE(review): this module-level `config` is shadowed by the local one in
# invoke_agent below, so it only serves as documentation of the shape.
config = {"configurable": {"thread_id": "research_session_1"}}


async def invoke_agent(message: str, thread_id: str):
    """Stream the agent's events for one user message on the given thread.

    Args:
        message: The user's query/content.
        thread_id: Conversation thread key for the checkpointer, so repeated
            calls with the same id continue the same session.
    """
    # Per-call config keyed by thread_id so the checkpointer resumes state.
    config = {"configurable": {"thread_id": thread_id}}

    async for event in agent.astream({"messages": [{"role": "user", "content": message}]}, config=config):
        print(event)
 
1
  import logging
2
  import os
3
+ from textwrap import dedent
4
 
5
  from dotenv import load_dotenv
6
  from langchain_core.messages.ai import AIMessage
 
10
  from langgraph.prebuilt import create_react_agent
11
  from langgraph.types import Command, interrupt
12
 
13
+ from tools_tools import search
14
 
15
# Module-level wiring: logger, .env loading, in-memory conversation
# checkpointer, the single search tool, and the Gemini chat model.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
load_dotenv()

# MemorySaver keeps per-thread agent state in process memory only.
checkpointer = MemorySaver()
tools = [search]

# --- LangChain LLM setup (Gemini, correct usage) ---
model = ChatGoogleGenerativeAI(model="gemini-2.5-flash", google_api_key=os.getenv("GOOGLE_API_KEY"))
24
+
25
# System message for the research agent.
# Defines a strict four-step research protocol plus report-format and
# disambiguation guidelines. Fix: the two guideline items were previously
# numbered "5." and "6." directly after "Step 4", which conflicted with the
# step numbering; they are now a separate "Additional Guidelines" section,
# and Step 1/Step 4 gained the same "State:" line the other steps have.
SYSTEM_MESSAGE = dedent(
    """You are KnowledgeNet, an expert deep research agent designed to help users gather comprehensive information on any topic.

Your Operating Protocol:

You operate as a state machine and MUST follow these steps in order. In every response, you must first state which step you are currently performing.

Step 1: Initial Exploration
- State: "Currently in Step 1: Initial Exploration."
- Your first action is ALWAYS to call the search tool with a broad query to understand the topic.
- After the tool returns, analyze the results and proceed to Step 2.

Step 2: Analysis & Branching
- State: "Currently in Step 2: Analysis & Branching."
- Based on the results from Step 1, identify 2-3 key entities, questions, or claims.
- Formulate and execute new, more specific search queries for each of these identified points. This step MUST involve multiple tool calls.
- Once you have explored these branches, proceed to Step 3.

Step 3: Verification & Synthesis
- State: "Currently in Step 3: Verification & Synthesis."
- Review all the information gathered from all previous steps.
- If there are any unverified or conflicting claims, perform one final, targeted search to try and resolve them.
- If all information is gathered, state that you are ready to generate the final report and do not call any more tools.

Step 4: Final Report Generation
- State: "Currently in Step 4: Final Report Generation."
- Once you have completed all research steps, generate the final report according to the specified structure. Do not generate this report until all other steps are complete.

Additional Guidelines:

- **Structured Final Report:** For your final answer, provide a comprehensive report structured with the following headings:
  - **Summary:** A one-paragraph executive summary of the findings.
  - **Detailed Findings:** A detailed, point-by-point breakdown of the information discovered.
  - **Supporting Evidence:** Use specific data points, timelines, direct quotes, and version numbers where available. Cite your sources clearly.
  - **Conclusion:** A final conclusion that directly answers the user's query or explains why it cannot be answered.
  - **Actionable Recommendation:** In the **Conclusion**, after presenting the technical facts, you MUST provide a single, clear, and actionable recommendation for the user. If multiple options are technically equivalent, choose the one that is the most direct, simplest, or officially recommended. State your reasoning for this choice (e.g., "While performance is similar, the standalone installer is recommended as it has no dependencies and is the most direct installation method.").
- **Disambiguation:** If a search result appears to be about a different topic with the same name (e.g., 'UV' for spectroscopy vs. 'uv' for a package manager), you must explicitly state that you are discarding it as irrelevant and refine your subsequent search queries to be more specific (e.g., search for "uv package manager" instead of just "uv").

Suggested Research Avenues (Check multiple types):
- Official websites, blogs, and product documentation for authoritative information.
- Academic databases (e.g., Google Scholar, ArXiv) for scholarly articles.
- Developer forums and communities (e.g., Reddit, Stack Overflow, Hacker News) for user discussions and practical insights.
- Official GitHub repositories (check Issues, Pull Requests, and Discussions) for project updates and roadmap discussions.
- Recent news articles and press releases for current updates.
- Social media (e.g., Twitter/X) for real-time announcements from official accounts or key developers.
- Government and institutional reports for reliable grey literature."""
)
69
+
70
# ReAct-style research agent: Gemini model + search tool, driven by the
# structured SYSTEM_MESSAGE protocol, with per-thread conversation state
# persisted via the in-memory checkpointer.
agent = create_react_agent(
    model=model,
    tools=tools,
    checkpointer=checkpointer,
    prompt=SYSTEM_MESSAGE,
)
76
 
 
 
 
77
 
78
async def invoke_agent(message: str, thread_id: str):
    """Stream the research agent's events for one user message.

    Args:
        message: The user's query/content.
        thread_id: Conversation thread key for the checkpointer, so repeated
            calls with the same id continue the same session.
    """
    # recursion_limit is raised to 50 because the multi-step research
    # protocol issues many tool calls and can exceed the default limit.
    config = {"configurable": {"thread_id": thread_id}, "recursion_limit": 50}

    async for event in agent.astream({"messages": [{"role": "user", "content": message}]}, config=config):
        # Fix: use the module logger (configured above) instead of a bare
        # print, so event output respects the configured log level/handlers.
        logger.info("%s", event)
langgraph_backend/prompts.py CHANGED
@@ -12,12 +12,39 @@ Generate few very high level steps on which other agents can do info collection
12
  Do not presume any knowledge about the topic.
13
  Return a string array of steps.""")
14
 
15
# Prompt for extracting verbatim key information on a topic from scraped
# page content. Placeholders: {query} — the topic; {findings} — the content.
SITE_SUMMARY_PROMPT = dedent("""Extract specific verbatim key information from the following content that is related to the topic "{query}". No small talk.
<Findings>
{findings}
</Findings>
""")
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  CONTINUE_BRANCH_PROMPT = dedent("""Given the current state of research, decide whether to continue exploring the current branch or not.
22
  <Global Research Plan>
23
  {research_plan}
 
12
  Do not presume any knowledge about the topic.
13
  Return a string array of steps.""")
14
 
15
# Prompt for compressing raw search results into important verbatim facts.
# Placeholders: {query} — the search query; {findings} — the raw results.
SITE_SUMMARY_PROMPT = dedent("""Extract and filter the following search results from this query "{query}" to get important verbatim information. No small talk.
<findings>
{findings}
</findings>
""")
20
 
21
# Stricter extraction prompt (v3): per-document relevance filtering,
# verbatim-only extraction, source attribution, and a fixed markdown output
# shape. Placeholders: {query} — the original user query; {findings} — the
# raw search results inserted into <search_results>.
SITE_SUMMARY_PROMPT_V3 = dedent("""
You are a specialized data extraction component for a research agent.
Your goal is to process a list of web search results and extract only the most critical, relevant, and verbatim information related to the user's query.

**Original User Query:** "{query}"

**Processing Instructions:**
For each document provided in the `<search_results>`:
1. **Analyze Relevance:** Read the document content and determine if it contains information that directly addresses or relates to the user's query.
2. **Verbatim Extraction:** If relevant, extract the key sentences, data points, commands, or quotes verbatim. Do not rephrase. Focus on concrete facts, not general descriptions.
3. **Maintain Source:** Ensure every piece of extracted information is clearly attributed to its source URL.
4. **Handle Irrelevance:** If a document is completely irrelevant, ignore it in the output. If NONE of the documents are relevant, return an empty response.

**Output Format:**
You MUST format your entire response in structured markdown. For each source that contains relevant information, create a section with the following format:

---
**Source:** [URL of the source]
* Verbatim fact or quote 1.
* Verbatim fact or quote 2.
* ...

**Search Results to Process:**
<search_results>
{findings}
</search_results>""")
47
+
48
  CONTINUE_BRANCH_PROMPT = dedent("""Given the current state of research, decide whether to continue exploring the current branch or not.
49
  <Global Research Plan>
50
  {research_plan}
langgraph_backend/tools_tools.py CHANGED
@@ -6,42 +6,34 @@ from dotenv import load_dotenv
6
  from langchain_core.tools import tool
7
  from langchain_google_genai import ChatGoogleGenerativeAI
8
 
9
- from prompts import SITE_SUMMARY_PROMPT
10
  from scraper import CrawlForAIScraper
11
 
12
  load_dotenv()
13
  scraper_inst = CrawlForAIScraper()
14
- model = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=os.getenv("GOOGLE_API_KEY"))
15
 
16
 
17
@tool
def calc(a: int, b: int) -> int:
    """
    Takes in two integers and returns their integer sum.
    """
    # Fix: previously returned str(a + b), contradicting both the declared
    # -> int return type and the docstring.
    return a + b
23
-
24
-
25
@tool
async def scrape(query: str, num_sites_per_query: int) -> str:
    """
    Search in a search engine.

    Args:
        query: string query for the search engine.
        num_sites_per_query: number of sites to read after searching.

    Returns:
        Summarized results related to the search, joined into one string.
        (Annotation fixed: the function returns a str, not a list of dicts.)
    """
    sites = await scraper_inst.search_and_scrape(query, num_sites_per_query)
    summ_sites_ctx = []
    # Summarize the scraped sites in chunks of 3 per LLM call.
    # Fix: the loop previously ignored idx and re-summarized the FULL
    # aggregate once per chunk; it also passed a one-element list (not a
    # string) as the {findings} placeholder.
    for idx in range(0, len(sites), 3):
        chunk = sites[idx:idx + 3]
        # Context layout per site:
        # src [1] : https://...
        # content...
        chunk_ctx = "\n\n---\n\n".join(
            f"src [{idx + i + 1}] : {d['url']}\n{d['text']}" for i, d in enumerate(chunk)
        )
        summary = model.invoke(
            SITE_SUMMARY_PROMPT.format(query=query, findings=chunk_ctx),
            config={"temperature": 0.2},
        ).text()
        summ_sites_ctx.append(summary)

    return "\n\n---\n\n".join(summ_sites_ctx)
 
6
  from langchain_core.tools import tool
7
  from langchain_google_genai import ChatGoogleGenerativeAI
8
 
9
+ from prompts import SITE_SUMMARY_PROMPT_V3
10
  from scraper import CrawlForAIScraper
11
 
12
# Load API keys from .env, create the shared scraper instance, and a
# flash-lite Gemini model used here only for summarizing scraped pages.
load_dotenv()
scraper_inst = CrawlForAIScraper()
model = ChatGoogleGenerativeAI(model="gemini-2.0-flash-lite", google_api_key=os.getenv("GOOGLE_API_KEY"))
15
 
16
 
17
@tool
async def search(query: str) -> str:
    """
    Search in a search engine.
    Always call this tool if there is any knowledge gap in performing the task.

    Args:
        query: string query for the search engine.

    Returns:
        Summarized results related to the search, joined into one string.
        (Annotation fixed: the function returns a str, not a list of dicts.)
    """
    # Fixed fan-out of 5 sites per query keeps the tool signature simple
    # for the calling LLM.
    sites = await scraper_inst.search_and_scrape(query, 5)
    summ_sites_ctx = []
    # Summarize the scraped sites in chunks of 3 per LLM call.
    # Fix: the loop previously ignored idx and re-summarized the FULL
    # aggregate once per chunk; it also passed a one-element list (not a
    # string) as the {findings} placeholder.
    for idx in range(0, len(sites), 3):
        chunk = sites[idx:idx + 3]
        # Context layout per site:
        # src [1] : https://...
        # content...
        chunk_ctx = "\n\n---\n\n".join(
            f"src [{idx + i + 1}] : {d['url']}\n{d['text']}" for i, d in enumerate(chunk)
        )
        summary = model.invoke(
            SITE_SUMMARY_PROMPT_V3.format(query=query, findings=chunk_ctx),
            config={"temperature": 0.5},
        ).text()
        summ_sites_ctx.append(summary)

    return "\n\n---\n\n".join(summ_sites_ctx)