Final_Assignment_Template

Paused

App Files Files Community

cacaprog committed on May 13, 2025

Commit

d3204ae

verified ·

1 Parent(s): 052a991

Updated app.py with langchain

Browse files

Files changed (1) hide show

app.py +114 -116

app.py CHANGED Viewed

@@ -1,25 +1,24 @@
 import os
 import gradio as gr
 import requests
-import inspect
-import pandas as pd
 import json
-from llama_index.agent.react import ReActAgent
-from llama_index.agent.workflow import AgentWorkflow
-from llama_index.llms.openai import OpenAI
-from llama_index.core.tools import FunctionTool, QueryEngineTool
-from llama_index.core import VectorStoreIndex
-from llama_index.vector_stores.chroma import ChromaVectorStore
-from llama_index.embeddings.huggingface import HuggingFaceEmbedding
-from llama_index.core.schema import TextNode
 import chromadb
 from tavily import TavilyClient
 import asyncio
-# --- Constants ---
-DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 # Load environment variables
 from dotenv import load_dotenv
@@ -27,57 +26,44 @@ load_dotenv()
 TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 class ResearchAgent:
     def __init__(self):
         print("Initializing ResearchAgent...")
         self.tavily = TavilyClient(api_key=TAVILY_API_KEY)
-        self.llm = OpenAI(model="gpt-4")
-        self.workflow = self.initialize_workflow()
         print("ResearchAgent initialized successfully.")
-    def initialize_workflow(self):
-        """Initialize all components needed for the workflow"""
         # Build VectorStore
         with open("metadata.jsonl", "r") as f:
             json_QA = [json.loads(line) for line in f]
-        # Initialize ChromaDB
-        chroma_client = chromadb.PersistentClient(path="./chroma_db")
-        chroma_collection = chroma_client.get_or_create_collection("qa_documents")
-        # Set up embeddings
-        embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-mpnet-base-v2")
-        # Prepare nodes for indexing
-        nodes = []
         for sample in json_QA:
             content = f"Question: {sample['Question']}\n\nFinal answer: {sample['Final answer']}"
-            node = TextNode(
-                text=content,
-                metadata={
-                    "source": sample['task_id'],
-                    "level": sample['Level'],
-                    "final_answer": sample['Final answer'],
-                    "steps": sample['Annotator Metadata']['Steps'],
-                    "number_of_steps": sample['Annotator Metadata']['Number of steps'],
-                    "how_long_did_this_take": sample['Annotator Metadata']['How long did this take?'],
-                    "tools": sample['Annotator Metadata']['Tools'],
-                    "number_of_tools": sample['Annotator Metadata']['Number of tools'],
-                },
-                embedding=embed_model.get_text_embedding(content)
-            )
-            nodes.append(node)
-        # Create and populate vector store
-        vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
-        index = VectorStoreIndex(
-            nodes=nodes,
-            embed_model=embed_model,
-            vector_store=vector_store,
-            store_nodes_override=True
-        )
-        # Custom Tavily search function
         def tavily_search(query: str, include_raw_content: bool = False) -> str:
             """Search the web using Tavily. Returns a summary or raw content."""
             response = self.tavily.search(
@@ -87,7 +73,6 @@ class ResearchAgent:
             )
             return str(response)
-        # arXiv search tool
         def search_arxiv(query: str, date_range: str = None) -> str:
             """Search arXiv for papers. Date format: '2022-06-01 TO 2022-07-01'."""
             base_url = "http://export.arxiv.org/api/query?"
@@ -97,74 +82,87 @@ class ResearchAgent:
             response = requests.get(base_url, params=params)
             return response.text
-        # Zip code extraction
         def extract_zip_code(location: str) -> str:
             """Get zip code for a location (e.g., 'Fred Howard Park, Florida')."""
             return "34689"  # Mocked for demo
-        # Wrap functions as tools
-        tavily_tool = FunctionTool.from_defaults(fn=tavily_search)
-        arxiv_tool = FunctionTool.from_defaults(fn=search_arxiv)
-        zip_tool = FunctionTool.from_defaults(fn=extract_zip_code)
-        # Vector search tool
-        query_engine = index.as_query_engine(similarity_top_k=2)
-        vector_tool = QueryEngineTool.from_defaults(
-            query_engine=query_engine,
-            name="vector_qa",
-            description="Searches cached Q&A pairs about arXiv papers and species data",
-        )
-        # Define agents
-        search_agent = ReActAgent(
-            name="search_agent",
-            description="A research assistant that can search the web and arXiv.",
-            tools=[tavily_tool, arxiv_tool, vector_tool],
-            llm=self.llm,
-            system_prompt="You are a research assistant. First check cached Q&As. Use tools to find answers.",
-            verbose=True,
-        )
-        data_agent = ReActAgent(
-            name="data_agent",
-            description="A data extraction agent that can extract and format data.",
-            tools=[zip_tool],
-            llm=self.llm,
-            system_prompt="You extract and format data (e.g., zip codes).",
-            verbose=True,
-        )
-        math_agent = ReActAgent(
-            name="math_agent",
-            description="A math agent that can perform calculations.",
-            tools=[],
-            llm=self.llm,
-            system_prompt="You perform calculations and provide answers.",
-            verbose=True,
-        )
-        sumarizzer_agent = ReActAgent(
-            name="sumarizzer_agent",
-            description="A summarizer agent that can summarize text.",
-            tools=[],
-            llm=self.llm,
-            system_prompt="""I will summarize the answer. Your final answer should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.""",
-            verbose=True,
-        )
-        # Create workflow
-        workflow = AgentWorkflow(
-            agents=[search_agent, data_agent, math_agent, sumarizzer_agent],
-            root_agent="search_agent",
-        )
-        return workflow
     async def process_query_async(self, question: str) -> str:
         """Process user query using the workflow (async version)"""
         try:
-            response = await self.workflow.run(user_msg=question)
-            return str(response)
         except Exception as e:
             return f"An error occurred: {str(e)}"
@@ -172,7 +170,6 @@ class ResearchAgent:
         """Synchronous wrapper for the async query processing"""
         print(f"Agent received question (first 50 chars): {question[:50]}...")
         try:
-            # Run the async function in a new event loop
             loop = asyncio.new_event_loop()
             asyncio.set_event_loop(loop)
             answer = loop.run_until_complete(self.process_query_async(question))
@@ -183,6 +180,7 @@ class ResearchAgent:
             print(error_msg)
             return error_msg
 def run_and_submit_all(profile: gr.OAuthProfile | None):
     """
     Fetches all questions, runs the ResearchAgent on them, submits all answers,

 import os
 import gradio as gr
 import requests
 import json
+import pandas as pd
 import chromadb
 from tavily import TavilyClient
 import asyncio
+from typing import List, Dict, Any
+# LangChain imports
+from langchain.agents import AgentExecutor, Tool, create_react_agent
+from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+from langchain_core.messages import HumanMessage, AIMessage
+from langchain.chains import LLMChain
+from langchain_community.vectorstores import Chroma
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_core.documents import Document
+from langchain_openai import ChatOpenAI
+from langchain.schema import SystemMessage
+from langchain.agents import AgentType
 # Load environment variables
 from dotenv import load_dotenv
 TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 class ResearchAgent:
     def __init__(self):
         print("Initializing ResearchAgent...")
         self.tavily = TavilyClient(api_key=TAVILY_API_KEY)
+        self.llm = ChatOpenAI(model="gpt-4", temperature=0)
+        self.agents = self.initialize_agents()
         print("ResearchAgent initialized successfully.")
+    def initialize_agents(self) -> Dict[str, AgentExecutor]:
+        """Initialize all agents needed for the workflow"""
         # Build VectorStore
         with open("metadata.jsonl", "r") as f:
             json_QA = [json.loads(line) for line in f]
+        # Prepare documents for Chroma
+        documents = []
         for sample in json_QA:
             content = f"Question: {sample['Question']}\n\nFinal answer: {sample['Final answer']}"
+            metadata = {
+                "source": sample['task_id'],
+                "level": sample['Level'],
+                "final_answer": sample['Final answer'],
+                "steps": sample['Annotator Metadata']['Steps'],
+                "number_of_steps": sample['Annotator Metadata']['Number of steps'],
+                "how_long_did_this_take": sample['Annotator Metadata']['How long did this take?'],
+                "tools": sample['Annotator Metadata']['Tools'],
+                "number_of_tools": sample['Annotator Metadata']['Number of tools'],
+            }
+            documents.append(Document(page_content=content, metadata=metadata))
+        # Initialize Chroma with HuggingFace embeddings
+        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
+        vectorstore = Chroma.from_documents(documents, embeddings, persist_directory="./chroma_db")
+        retriever = vectorstore.as_retriever(search_kwargs={"k": 2})
+        # Define tools
         def tavily_search(query: str, include_raw_content: bool = False) -> str:
             """Search the web using Tavily. Returns a summary or raw content."""
             response = self.tavily.search(
             )
             return str(response)
         def search_arxiv(query: str, date_range: str = None) -> str:
             """Search arXiv for papers. Date format: '2022-06-01 TO 2022-07-01'."""
             base_url = "http://export.arxiv.org/api/query?"
             response = requests.get(base_url, params=params)
             return response.text
         def extract_zip_code(location: str) -> str:
             """Get zip code for a location (e.g., 'Fred Howard Park, Florida')."""
             return "34689"  # Mocked for demo
+        # Create tools
+        tools = [
+            Tool(
+                name="tavily_search",
+                func=tavily_search,
+                description="Search the web using Tavily. Returns a summary or raw content."
+            ),
+            Tool(
+                name="arxiv_search",
+                func=search_arxiv,
+                description="Search arXiv for papers. Date format: '2022-06-01 TO 2022-07-01'."
+            ),
+            Tool(
+                name="vector_search",
+                func=lambda q: str(retriever.get_relevant_documents(q)),
+                description="Searches cached Q&A pairs about arXiv papers and species data"
+            ),
+            Tool(
+                name="zip_code_extractor",
+                func=extract_zip_code,
+                description="Get zip code for a location (e.g., 'Fred Howard Park, Florida')."
+            )
+        ]
+        # Define agent prompts
+        search_prompt = ChatPromptTemplate.from_messages([
+            SystemMessage(content="You are a research assistant. First check cached Q&As. Use tools to find answers."),
+            MessagesPlaceholder(variable_name="chat_history"),
+            ("human", "{input}"),
+            MessagesPlaceholder(variable_name="agent_scratchpad")
+        ])
+        data_prompt = ChatPromptTemplate.from_messages([
+            SystemMessage(content="You extract and format data (e.g., zip codes)."),
+            MessagesPlaceholder(variable_name="chat_history"),
+            ("human", "{input}"),
+            MessagesPlaceholder(variable_name="agent_scratchpad")
+        ])
+        math_prompt = ChatPromptTemplate.from_messages([
+            SystemMessage(content="You perform calculations and provide answers."),
+            ("human", "{input}")
+        ])
+        summarizer_prompt = ChatPromptTemplate.from_messages([
+            SystemMessage(content="""I will summarize the answer. Your final answer should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string."""),
+            ("human", "{input}")
+        ])
+        # Create agents
+        search_agent = create_react_agent(self.llm, [tools[0], tools[1], tools[2]], search_prompt)
+        data_agent = create_react_agent(self.llm, [tools[3]], data_prompt)
+        math_agent = LLMChain(llm=self.llm, prompt=math_prompt)
+        summarizer_agent = LLMChain(llm=self.llm, prompt=summarizer_prompt)
+        return {
+            "search": AgentExecutor(agent=search_agent, tools=[tools[0], tools[1], tools[2]], verbose=True),
+            "data": AgentExecutor(agent=data_agent, tools=[tools[3]], verbose=True),
+            "math": math_agent,
+            "summarizer": summarizer_agent
+        }
     async def process_query_async(self, question: str) -> str:
         """Process user query using the workflow (async version)"""
         try:
+            # First try search agent
+            response = await self.agents["search"].ainvoke({"input": question, "chat_history": []})
+            # If needed, pass to other agents
+            if "zip code" in question.lower():
+                response = await self.agents["data"].ainvoke({"input": question, "chat_history": []})
+            elif any(word in question.lower() for word in ["calculate", "math", "sum", "total"]):
+                response = await self.agents["math"].ainvoke({"input": question})
+            # Always pass through summarizer
+            summarized = await self.agents["summarizer"].ainvoke({"input": response["output"]})
+            return summarized["text"]
         except Exception as e:
             return f"An error occurred: {str(e)}"
         """Synchronous wrapper for the async query processing"""
         print(f"Agent received question (first 50 chars): {question[:50]}...")
         try:
             loop = asyncio.new_event_loop()
             asyncio.set_event_loop(loop)
             answer = loop.run_until_complete(self.process_query_async(question))
             print(error_msg)
             return error_msg
 def run_and_submit_all(profile: gr.OAuthProfile | None):
     """
     Fetches all questions, runs the ResearchAgent on them, submits all answers,