José Enrique committed on
Commit
d4bb43c
·
1 Parent(s): 9ccff9e

langgraph agent

Browse files
evaluation_langgraph.py CHANGED
@@ -14,9 +14,13 @@ from langgraph_agent import build_agents
14
  from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage
15
 
16
  from langfuse import Langfuse
 
17
  # Load environment variables
18
  load_dotenv()
 
 
19
  langfuse = Langfuse()
 
20
  # Initialize OpenTelemetry Tracer
21
  #trace_provider = TracerProvider()
22
  #trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter()))
@@ -50,20 +54,26 @@ def add_image(metadata)->list:
50
 
51
  #@observe()
52
  def run_agent(agent, question,trace_name,metadata):
 
53
  # with tracer.start_as_current_span(trace_name) as span:
54
  # span.set_attribute("langfuse.tag", "dataset-run")
55
  # span.set_attribute("langfuse.input", question)
56
  # if the question has attachments:
57
  # find file under /attachments with the same task_id
58
- images = add_image(metadata)
59
-
60
  question = question + " The task_id is: " + metadata["task_id"]
61
  messages = [HumanMessage(content=question )]
62
 
63
- try:
 
64
  messages = agent.invoke(
65
- {"messages": messages}
66
  )
 
 
 
 
67
  except Exception as e:
68
  print(f"Error running agent: {e}")
69
  output = f"Error running agent: {e}"
@@ -81,7 +91,7 @@ def run_agent(agent, question,trace_name,metadata):
81
  # input=question,
82
  # output=output
83
  # )
84
- return output
85
  def simple_evaluation(output, expected_output):
86
 
87
  trimmed_output = str(output).strip().strip('"').strip("$")
@@ -117,7 +127,7 @@ def simple_evaluation(output, expected_output):
117
  return common_items_count
118
 
119
 
120
- def run_evaluation(agent,langfuse_dataset,run_name,model_id,trace_name):
121
  dataset = langfuse.get_dataset(langfuse_dataset)
122
  responses = []
123
  # Run our agent against each dataset item (limited to first 10 above)
@@ -126,18 +136,23 @@ def run_evaluation(agent,langfuse_dataset,run_name,model_id,trace_name):
126
  with item.run(
127
  run_name = run_name
128
  ) as root_span:
129
- root_span.update(input=item.input)
 
130
  task_id = item.metadata["task_id"]
131
- if task_id == "a1e91b78-d3d8-4675-bb8d-62741b4b68a6":
132
  try:
 
133
  output = run_agent(agent,item.input,trace_name,item.metadata)
134
  responses.append({"task_id": task_id, "submitted_answer": output})
135
- root_span.update(output=output)
 
136
  except Exception as e:
 
137
  output = f"Error running agent: {e}"
138
 
139
  # score the result against the expected output
140
- root_span.score_trace(name="exact_match", value = simple_evaluation(output, item.expected_output))
 
141
 
142
  # Link the trace to the dataset item for analysis
143
  # item.link(
@@ -154,7 +169,8 @@ def run_evaluation(agent,langfuse_dataset,run_name,model_id,trace_name):
154
  # )
155
 
156
  # Flush data to ensure all telemetry is sent
157
- langfuse.flush()
 
158
 
159
  # Save the responses to a JSON lines file
160
  print("Saving responses to file...")
@@ -168,16 +184,13 @@ def run_evaluation(agent,langfuse_dataset,run_name,model_id,trace_name):
168
 
169
 
170
  def evaluate():
 
 
171
  print("Starting agent...")
172
  agent = build_agents()
173
  print("Agent built successfully.")
174
- run_evaluation(agent,"GAIA_Evaluation_Dataset","Single Langraph agent","OpenAI gpt4o","langraph-trace")
175
- # simple_evaluation("Dimitry","Clasu")
176
- # print("comparison", simple_evaluation("Dimitry","Clasu"))
177
- # print("sain", simple_evaluation('"Saint Petersburg"',"Saint Petersburg"))
178
- # print("pages", simple_evaluation('"132,133,136,195,245"',"132, 133, 134, 197, 245"))
179
- # print("veg", simple_evaluation('"cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries"',"cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries"))
180
- # print("right", simple_evaluation('"right"',"Right"))
181
 
182
  if __name__ == "__main__":
183
  evaluate()
 
14
  from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage
15
 
16
  from langfuse import Langfuse
17
+
18
  # Load environment variables
19
  load_dotenv()
20
+ print("Environment variables loaded.")
21
+ # initialize langfuse
22
  langfuse = Langfuse()
23
+
24
  # Initialize OpenTelemetry Tracer
25
  #trace_provider = TracerProvider()
26
  #trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter()))
 
54
 
55
  #@observe()
56
  def run_agent(agent, question,trace_name,metadata):
57
+ print("Running agent with question:", question)
58
  # with tracer.start_as_current_span(trace_name) as span:
59
  # span.set_attribute("langfuse.tag", "dataset-run")
60
  # span.set_attribute("langfuse.input", question)
61
  # if the question has attachments:
62
  # find file under /attachments with the same task_id
63
+ #images = add_image(metadata)
64
+ print("Running agent with question:")
65
  question = question + " The task_id is: " + metadata["task_id"]
66
  messages = [HumanMessage(content=question )]
67
 
68
+ try:
69
+ print("Invoking agent with question:", question)
70
  messages = agent.invoke(
71
+ {"messages": messages, "input_file": None}
72
  )
73
+ print("Agent messages")
74
+ # Show the messages
75
+ for m in messages['messages']:
76
+ m.pretty_print()
77
  except Exception as e:
78
  print(f"Error running agent: {e}")
79
  output = f"Error running agent: {e}"
 
91
  # input=question,
92
  # output=output
93
  # )
94
+ return messages
95
  def simple_evaluation(output, expected_output):
96
 
97
  trimmed_output = str(output).strip().strip('"').strip("$")
 
127
  return common_items_count
128
 
129
 
130
+ def run_evaluation(agent,langfuse_dataset,run_name,model_id,trace_name,update_dataset=True):
131
  dataset = langfuse.get_dataset(langfuse_dataset)
132
  responses = []
133
  # Run our agent against each dataset item (limited to first 10 above)
 
136
  with item.run(
137
  run_name = run_name
138
  ) as root_span:
139
+ if update_dataset:
140
+ root_span.update(input=item.input)
141
  task_id = item.metadata["task_id"]
142
+ if task_id == "5a0c1adf-205e-4841-a666-7c3ef95def9d":
143
  try:
144
+ print("Running agent")
145
  output = run_agent(agent,item.input,trace_name,item.metadata)
146
  responses.append({"task_id": task_id, "submitted_answer": output})
147
+ if update_dataset:
148
+ root_span.update(output=output)
149
  except Exception as e:
150
+ print(f"Error running agent: {e}")
151
  output = f"Error running agent: {e}"
152
 
153
  # score the result against the expected output
154
+ if update_dataset:
155
+ root_span.score_trace(name="exact_match", value = simple_evaluation(output, item.expected_output))
156
 
157
  # Link the trace to the dataset item for analysis
158
  # item.link(
 
169
  # )
170
 
171
  # Flush data to ensure all telemetry is sent
172
+ if update_dataset:
173
+ langfuse.flush()
174
 
175
  # Save the responses to a JSON lines file
176
  print("Saving responses to file...")
 
184
 
185
 
186
def evaluate():
    """Entry point: build the LangGraph agent and run it on the GAIA dataset."""
    print("Starting agent...")
    agent = build_agents()
    print("Agent built successfully.")
    # Evaluate without pushing traces/scores back to Langfuse (update_dataset=False).
    run_evaluation(
        agent,
        "GAIA_Evaluation_Dataset",
        "Single Langraph agent",
        "OpenAI gpt4o",
        "langraph-trace",
        update_dataset=False,
    )
 
 
 
 
 
194
 
195
  if __name__ == "__main__":
196
  evaluate()
langgraph_agent.py CHANGED
@@ -5,26 +5,20 @@ from langchain_openai import ChatOpenAI
5
  from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage
6
  from langgraph.graph.message import add_messages
7
  from langgraph.prebuilt import ToolNode, tools_condition
8
- from tools.searchTools import wiki_search, mini_web_search, arvix_search
9
 
10
  class AgentState(TypedDict):
11
  input_file: Optional[str]
12
  messages: Annotated[list[AnyMessage], add_messages]
13
 
14
 
15
- # add toools:
16
  tools = [
17
- wiki_search,
18
- mini_web_search,
19
- arvix_search,
20
 
21
- ]
22
-
23
-
24
- # LLM model and tools
25
- vision_llm = ChatOpenAI(model="gpt-4o")
26
- llm = ChatOpenAI(model="gpt-4o")
27
- llm_withtools = llm.bind_tools(tools, parallel_tool_calls = False)
28
 
29
  def agent(state:AgentState):
30
  tools_description = """
@@ -56,6 +50,14 @@ def agent(state:AgentState):
56
  If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
57
  """)
58
 
 
 
 
 
 
 
 
 
59
  return {
60
  "input_file": state["input_file"],
61
  "messages": [llm_withtools.invoke([sys_message]+ state["messages"])]
 
5
  from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage
6
  from langgraph.graph.message import add_messages
7
  from langgraph.prebuilt import ToolNode, tools_condition
8
+ from tools.searchTools_lg import wiki_search, mini_web_search, arvix_search
9
 
10
  class AgentState(TypedDict):
11
  input_file: Optional[str]
12
  messages: Annotated[list[AnyMessage], add_messages]
13
 
14
 
15
# add tools: search tools exposed to the agent (bound to the LLM via bind_tools)
tools = [
    wiki_search,
    mini_web_search,
    arvix_search,
]
 
 
 
 
 
 
22
 
23
  def agent(state:AgentState):
24
  tools_description = """
 
50
  If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
51
  """)
52
 
53
+
54
+
55
+
56
+ # LLM model and tools
57
+ vision_llm = ChatOpenAI(model="gpt-4o")
58
+ llm = ChatOpenAI(model="gpt-4o")
59
+ llm_withtools = llm.bind_tools(tools, parallel_tool_calls = False)
60
+
61
  return {
62
  "input_file": state["input_file"],
63
  "messages": [llm_withtools.invoke([sys_message]+ state["messages"])]
requirements.txt CHANGED
@@ -18,6 +18,8 @@ langchain-huggingface
18
  langchain-groq
19
  langchain-tavily
20
  langchain-chroma
 
 
21
  arxiv
22
  pymupdf
23
  wikipedia
 
18
  langchain-groq
19
  langchain-tavily
20
  langchain-chroma
21
+ langchain_openai
22
+ langgraph
23
  arxiv
24
  pymupdf
25
  wikipedia
responses_GAIA_Evaluation_DatasetSingle Langraph agent.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "task_id": "5a0c1adf-205e-4841-a666-7c3ef95def9d",
4
+ "submitted_answer": {
5
+ "input_file": null,
6
+ "messages": [
7
+
tests/search_tools_tests.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+ from dotenv import load_dotenv
5
+
6
+ import unittest
7
+ from unittest.mock import patch, MagicMock
8
+ from tools.searchTools_lg import wiki_search, mini_web_search, arvix_search
9
+
10
class TestSearchToolsLG(unittest.TestCase):
    """Tests for tools.searchTools_lg.

    The first three tests patch the underlying loaders/clients so they run
    offline; the *_integration tests call the live services and therefore
    need network access and valid API keys in the environment.
    """

    @patch("tools.searchTools_lg.WikipediaLoader")
    def test_wiki_search(self, mock_loader):
        # One fake document is enough to exercise the result formatting.
        mock_doc = MagicMock()
        mock_doc.metadata = {"source": "Wikipedia", "page": "1"}
        mock_doc.page_content = "Test Wikipedia content"
        mock_loader.return_value.load.return_value = [mock_doc]
        # NOTE(review): calls the @tool-decorated function directly; newer
        # langchain versions may require .invoke(...) instead — confirm.
        result = wiki_search("test query")
        self.assertIn("wiki_results", result)
        self.assertIn("Test Wikipedia content", result["wiki_results"])

    @patch("tools.searchTools_lg.TavilySearch")
    def test_mini_web_search(self, mock_tavily):
        # Realistic Tavily payload captured from a live query, attached to the
        # mock as sample data. mini_web_search returns the mocked invoke()
        # value unchanged, so only non-emptiness is asserted here.
        mock_doc = MagicMock()
        mock_doc.result = [
            {'url': 'https://www.python.org/',
            'title': 'Welcome to Python.org',
            'content': '**Notice:** While JavaScript is not essential for this website, your interaction with the content will be limited. # Python 3: Fibonacci series up to n More about defining functions in Python\xa03 # Python 3: List comprehensions Lists (known as arrays in other languages) are one of the compound data types that Python understands. More about lists in Python\xa03 # Python 3: Simple arithmetic More about simple math functions in Python\xa03. >>> print("Hello, I\'m Python!") Hello, I\'m Python! Python Hi, Python. Experienced programmers in any other language can pick up Python very quickly, and beginners find the clean syntax and indentation structure easy to learn. Latest: Python 3.13.7 docs.python.org jobs.python.org * Python 3.14.0rc2 and 3.13.7 are go!',
            'score': 0.98583,
            'raw_content': None},
            {'url': 'https://www.w3schools.com/python/python_intro.asp',
            'title': 'Introduction to Python - W3Schools',
            'content': 'PYTHON # Python Introduction ## What is Python? Python is a popular programming language. ### What can Python do? ## Why Python? * Python has syntax that allows developers to write programs with fewer lines than some other programming languages. * In this tutorial Python will be written in a text editor. ### Python Syntax compared to other programming languages ##### Top Tutorials HTML Tutorial CSS Tutorial How To Tutorial Python Tutorial W3.CSS Tutorial ##### Top References HTML Reference CSS Reference Python Reference W3.CSS Reference ##### Top Examples HTML Examples CSS Examples How To Examples Python Examples W3.CSS Examples CSS Certificate Python Certificate Tutorials, references, and examples are constantly reviewed to avoid errors, but we cannot warrant full correctness ',
            'score': 0.98365,
            'raw_content': None},
            {'url': 'https://en.wikipedia.org/wiki/Python_(programming_language)',
            'title': 'Python (programming language) - Wikipedia', 'content': 'Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation.',
            'score': 0.97793,
            'raw_content': None}]

        mock_tavily.return_value.invoke.return_value = [mock_doc]
        result = mini_web_search("test query")
        self.assertTrue(len(result) > 0)

    @patch("tools.searchTools_lg.ArxivLoader")
    def test_arvix_search(self, mock_arxiv):
        # Realistic arXiv document fixture; arvix_search returns the loaded
        # documents unchanged, so only non-emptiness is asserted.
        mock_doc = MagicMock()
        mock_doc.metadata = {'Published': '2024-04-05', 'Title': 'Egglog Python: A Pythonic Library for E-graphs', 'Authors': 'Saul Shanabrook', 'Summary': 'E-graphs have emerged as a versatile data structure with applications in\nsynthesis, optimization, and verification through techniques such as equality\nsaturation. This paper introduces Python bindings for the experimental egglog\nlibrary (previously called egg-smol), which aims to bring the benefits of\ne-graphs to the Python ecosystem. The bindings offer a high-level, Pythonic API\nproviding an accessible and familiar interface for Python users. By integrating\ne-graph techniques with Python, we hope to enable collaboration and innovation\nacross various domains in the scientific computing and machine learning\ncommunities. We discuss the advantages of using Python bindings for both Python\nand existing egg-smol users, as well as possible future directions for\ndevelopment.'}
        mock_doc.page_content = "'Egg-smol Python: A Pythonic Library for E-graphs\nSaul Shanabrook\ns.shanabrook@gmail.com\nAbstract\nE-graphs have emerged as a versatile data structure with ap-\nplications in synthesis, optimization, and verification through\ntechniques such as equality saturation."
        mock_arxiv.return_value.load.return_value = [mock_doc]
        result = arvix_search("Python programming")
        self.assertTrue(len(result) > 0)

    def test_wiki_search_integration(self):
        # Live call: requires network access.
        result = wiki_search("Python programming")
        self.assertIn("wiki_results", result)
        self.assertTrue(len(result["wiki_results"]) > 0)
        #print("Wiki search result:", result["wiki_results"][:200])

    def test_mini_web_search_integration(self):
        # Live call: requires network access and a Tavily API key.
        # Asserts the Tavily response dict carries a non-empty "results" list.
        result = mini_web_search("Python programming")
        self.assertIn("results", result)
        self.assertTrue(len(result["results"]) > 0)

    def test_arvix_search_integration(self):
        # Live call: requires network access to arXiv.
        result = arvix_search("Python programming")
        self.assertTrue(len(result) > 0)
71
+
72
+
73
+
74
if __name__ == "__main__":
    # load environment variables from .env file (API keys for the live
    # integration tests), then run the full unittest suite
    load_dotenv()
    unittest.main()
tools/searchTools_lg.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_core.tools import tool
2
+ from langchain_tavily import TavilySearch
3
+ from langchain_community.document_loaders import WikipediaLoader
4
+ from langchain_community.document_loaders import ArxivLoader
5
+ from langchain_community.vectorstores import SupabaseVectorStore
6
+
7
@tool
def wiki_search(query: str) -> dict:
    """Search Wikipedia for a query and return maximum 2 results.

    Args:
        query: The search query.
    """
    # Fix: the original annotated the return type as `str`, but the function
    # returns a dict with key "wiki_results" (see the literal below).
    search_docs = WikipediaLoader(query=query, load_max_docs=2).load()
    # Join each document into a pseudo-XML envelope so the LLM can see the
    # source metadata alongside the page content.
    formatted_search_docs = "\n\n---\n\n".join(
        f'<Document source="{doc.metadata["source"]}" page="{doc.metadata.get("page", "")}"/>\n{doc.page_content}\n</Document>'
        for doc in search_docs
    )
    return {"wiki_results": formatted_search_docs}
20
+
21
@tool
def mini_web_search(query: str) -> dict:
    """Search Tavily for a query and return maximum 3 results.

    Args:
        query: The search query.
    """
    # Fix: the original annotated the return type as `str`, but it returns the
    # raw TavilySearch response unchanged — presumably a dict with a "results"
    # list (the integration test asserts `"results" in result`) — TODO confirm.
    # Dead commented-out formatting code removed.
    search_docs = TavilySearch(max_results=3, topic="general").invoke({"query": query})
    return search_docs
35
@tool
def arvix_search(query: str) -> list:
    """Search for scientific papers in Arxiv for a query and return maximum 3 result.

    Args:
        query: The search query.
    """
    # Fix: the original annotated the return type as `str`, but it returns the
    # list of Documents from ArxivLoader.load() unchanged.
    # NOTE(review): unlike wiki_search this returns raw Document objects, not a
    # formatted string payload (the formatting code was commented out) —
    # confirm downstream tool-call handling accepts both shapes.
    search_docs = ArxivLoader(query=query, load_max_docs=3).load()
    return search_docs