José Enrique committed on
Commit
d4bb43c
·
1 Parent(s): 9ccff9e

langgraph agent

Browse files
evaluation_langgraph.py CHANGED
@@ -14,9 +14,13 @@ from langgraph_agent import build_agents
14
  from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage
15
 
16
  from langfuse import Langfuse
 
17
  # Load environment variables
18
  load_dotenv()
 
 
19
  langfuse = Langfuse()
 
20
  # Initialize OpenTelemetry Tracer
21
  #trace_provider = TracerProvider()
22
  #trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter()))
@@ -50,20 +54,26 @@ def add_image(metadata)->list:
50
 
51
  #@observe()
52
  def run_agent(agent, question,trace_name,metadata):
 
53
  # with tracer.start_as_current_span(trace_name) as span:
54
  # span.set_attribute("langfuse.tag", "dataset-run")
55
  # span.set_attribute("langfuse.input", question)
56
  # if the question has attachments:
57
  # find file under /attachments with the same task_id
58
- images = add_image(metadata)
59
-
60
  question = question + " The task_id is: " + metadata["task_id"]
61
  messages = [HumanMessage(content=question )]
62
 
63
- try:
 
64
  messages = agent.invoke(
65
- {"messages": messages}
66
  )
 
 
 
 
67
  except Exception as e:
68
  print(f"Error running agent: {e}")
69
  output = f"Error running agent: {e}"
@@ -81,7 +91,7 @@ def run_agent(agent, question,trace_name,metadata):
81
  # input=question,
82
  # output=output
83
  # )
84
- return output
85
  def simple_evaluation(output, expected_output):
86
 
87
  trimmed_output = str(output).strip().strip('"').strip("$")
@@ -117,7 +127,7 @@ def simple_evaluation(output, expected_output):
117
  return common_items_count
118
 
119
 
120
- def run_evaluation(agent,langfuse_dataset,run_name,model_id,trace_name):
121
  dataset = langfuse.get_dataset(langfuse_dataset)
122
  responses = []
123
  # Run our agent against each dataset item (limited to first 10 above)
@@ -126,18 +136,23 @@ def run_evaluation(agent,langfuse_dataset,run_name,model_id,trace_name):
126
  with item.run(
127
  run_name = run_name
128
  ) as root_span:
129
- root_span.update(input=item.input)
 
130
  task_id = item.metadata["task_id"]
131
- if task_id == "a1e91b78-d3d8-4675-bb8d-62741b4b68a6":
132
  try:
 
133
  output = run_agent(agent,item.input,trace_name,item.metadata)
134
  responses.append({"task_id": task_id, "submitted_answer": output})
135
- root_span.update(output=output)
 
136
  except Exception as e:
 
137
  output = f"Error running agent: {e}"
138
 
139
  # score the result against the expected output
140
- root_span.score_trace(name="exact_match", value = simple_evaluation(output, item.expected_output))
 
141
 
142
  # Link the trace to the dataset item for analysis
143
  # item.link(
@@ -154,7 +169,8 @@ def run_evaluation(agent,langfuse_dataset,run_name,model_id,trace_name):
154
  # )
155
 
156
  # Flush data to ensure all telemetry is sent
157
- langfuse.flush()
 
158
 
159
  # Save the responses to a JSON lines file
160
  print("Saving responses to file...")
@@ -168,16 +184,13 @@ def run_evaluation(agent,langfuse_dataset,run_name,model_id,trace_name):
168
 
169
 
170
  def evaluate():
 
 
171
  print("Starting agent...")
172
  agent = build_agents()
173
  print("Agent built successfully.")
174
- run_evaluation(agent,"GAIA_Evaluation_Dataset","Single Langraph agent","OpenAI gpt4o","langraph-trace")
175
- # simple_evaluation("Dimitry","Clasu")
176
- # print("comparison", simple_evaluation("Dimitry","Clasu"))
177
- # print("sain", simple_evaluation('"Saint Petersburg"',"Saint Petersburg"))
178
- # print("pages", simple_evaluation('"132,133,136,195,245"',"132, 133, 134, 197, 245"))
179
- # print("veg", simple_evaluation('"cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries"',"cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries"))
180
- # print("right", simple_evaluation('"right"',"Right"))
181
 
182
  if __name__ == "__main__":
183
  evaluate()
 
14
  from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage
15
 
16
  from langfuse import Langfuse
17
+
18
  # Load environment variables
19
  load_dotenv()
20
+ print("Environment variables loaded.")
21
+ # initialize langfuse
22
  langfuse = Langfuse()
23
+
24
  # Initialize OpenTelemetry Tracer
25
  #trace_provider = TracerProvider()
26
  #trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter()))
 
54
 
55
  #@observe()
56
  def run_agent(agent, question,trace_name,metadata):
57
+ print("Running agent with question:", question)
58
  # with tracer.start_as_current_span(trace_name) as span:
59
  # span.set_attribute("langfuse.tag", "dataset-run")
60
  # span.set_attribute("langfuse.input", question)
61
  # if the question has attachments:
62
  # find file under /attachments with the same task_id
63
+ #images = add_image(metadata)
64
+ print("Running agent with question:")
65
  question = question + " The task_id is: " + metadata["task_id"]
66
  messages = [HumanMessage(content=question )]
67
 
68
+ try:
69
+ print("Invoking agent with question:", question)
70
  messages = agent.invoke(
71
+ {"messages": messages, "input_file": None}
72
  )
73
+ print("Agent messages")
74
+ # Show the messages
75
+ for m in messages['messages']:
76
+ m.pretty_print()
77
  except Exception as e:
78
  print(f"Error running agent: {e}")
79
  output = f"Error running agent: {e}"
 
91
  # input=question,
92
  # output=output
93
  # )
94
+ return messages
95
  def simple_evaluation(output, expected_output):
96
 
97
  trimmed_output = str(output).strip().strip('"').strip("$")
 
127
  return common_items_count
128
 
129
 
130
+ def run_evaluation(agent,langfuse_dataset,run_name,model_id,trace_name,update_dataset=True):
131
  dataset = langfuse.get_dataset(langfuse_dataset)
132
  responses = []
133
  # Run our agent against each dataset item (limited to first 10 above)
 
136
  with item.run(
137
  run_name = run_name
138
  ) as root_span:
139
+ if update_dataset:
140
+ root_span.update(input=item.input)
141
  task_id = item.metadata["task_id"]
142
+ if task_id == "5a0c1adf-205e-4841-a666-7c3ef95def9d":
143
  try:
144
+ print("Running agent")
145
  output = run_agent(agent,item.input,trace_name,item.metadata)
146
  responses.append({"task_id": task_id, "submitted_answer": output})
147
+ if update_dataset:
148
+ root_span.update(output=output)
149
  except Exception as e:
150
+ print(f"Error running agent: {e}")
151
  output = f"Error running agent: {e}"
152
 
153
  # score the result against the expected output
154
+ if update_dataset:
155
+ root_span.score_trace(name="exact_match", value = simple_evaluation(output, item.expected_output))
156
 
157
  # Link the trace to the dataset item for analysis
158
  # item.link(
 
169
  # )
170
 
171
  # Flush data to ensure all telemetry is sent
172
+ if update_dataset:
173
+ langfuse.flush()
174
 
175
  # Save the responses to a JSON lines file
176
  print("Saving responses to file...")
 
184
 
185
 
186
def evaluate():
    """Entry point: build the LangGraph agent and run it on the GAIA dataset."""
    print("Starting agent...")
    agent = build_agents()
    print("Agent built successfully.")
    # Evaluate without pushing traces/scores back to Langfuse (update_dataset=False).
    run_evaluation(
        agent,
        "GAIA_Evaluation_Dataset",
        "Single Langraph agent",
        "OpenAI gpt4o",
        "langraph-trace",
        update_dataset=False,
    )
 
 
 
 
 
194
 
195
  if __name__ == "__main__":
196
  evaluate()
langgraph_agent.py CHANGED
@@ -5,26 +5,20 @@ from langchain_openai import ChatOpenAI
5
  from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage
6
  from langgraph.graph.message import add_messages
7
  from langgraph.prebuilt import ToolNode, tools_condition
8
- from tools.searchTools import wiki_search, mini_web_search, arvix_search
9
 
10
  class AgentState(TypedDict):
11
  input_file: Optional[str]
12
  messages: Annotated[list[AnyMessage], add_messages]
13
 
14
 
15
- # add toools:
16
  tools = [
17
- wiki_search,
18
- mini_web_search,
19
- arvix_search,
20
 
21
- ]
22
-
23
-
24
- # LLM model and tools
25
- vision_llm = ChatOpenAI(model="gpt-4o")
26
- llm = ChatOpenAI(model="gpt-4o")
27
- llm_withtools = llm.bind_tools(tools, parallel_tool_calls = False)
28
 
29
  def agent(state:AgentState):
30
  tools_description = """
@@ -56,6 +50,14 @@ def agent(state:AgentState):
56
  If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
57
  """)
58
 
 
 
 
 
 
 
 
 
59
  return {
60
  "input_file": state["input_file"],
61
  "messages": [llm_withtools.invoke([sys_message]+ state["messages"])]
 
5
  from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage
6
  from langgraph.graph.message import add_messages
7
  from langgraph.prebuilt import ToolNode, tools_condition
8
+ from tools.searchTools_lg import wiki_search, mini_web_search, arvix_search
9
 
10
  class AgentState(TypedDict):
11
  input_file: Optional[str]
12
  messages: Annotated[list[AnyMessage], add_messages]
13
 
14
 
15
# add tools: search tools exposed to the agent (bound to the LLM via bind_tools)
tools = [
    wiki_search,
    mini_web_search,
    arvix_search,
]
 
 
 
 
 
 
22
 
23
  def agent(state:AgentState):
24
  tools_description = """
 
50
  If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
51
  """)
52
 
53
+
54
+
55
+
56
+ # LLM model and tools
57
+ vision_llm = ChatOpenAI(model="gpt-4o")
58
+ llm = ChatOpenAI(model="gpt-4o")
59
+ llm_withtools = llm.bind_tools(tools, parallel_tool_calls = False)
60
+
61
  return {
62
  "input_file": state["input_file"],
63
  "messages": [llm_withtools.invoke([sys_message]+ state["messages"])]
requirements.txt CHANGED
@@ -18,6 +18,8 @@ langchain-huggingface
18
  langchain-groq
19
  langchain-tavily
20
  langchain-chroma
 
 
21
  arxiv
22
  pymupdf
23
  wikipedia
 
18
  langchain-groq
19
  langchain-tavily
20
  langchain-chroma
21
+ langchain_openai
22
+ langgraph
23
  arxiv
24
  pymupdf
25
  wikipedia
responses_GAIA_Evaluation_DatasetSingle Langraph agent.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "task_id": "5a0c1adf-205e-4841-a666-7c3ef95def9d",
4
+ "submitted_answer": {
5
+ "input_file": null,
6
+ "messages": [
7
+
tests/search_tools_tests.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+ from dotenv import load_dotenv
5
+
6
+ import unittest
7
+ from unittest.mock import patch, MagicMock
8
+ from tools.searchTools_lg import wiki_search, mini_web_search, arvix_search
9
+
10
class TestSearchToolsLG(unittest.TestCase):
    """Tests for tools.searchTools_lg.

    The first three tests patch the underlying loaders/clients so they run
    offline; the *_integration tests call the live services and therefore
    need network access and valid API keys in the environment.
    """

    @patch("tools.searchTools_lg.WikipediaLoader")
    def test_wiki_search(self, mock_loader):
        # One fake document is enough to exercise the result formatting.
        mock_doc = MagicMock()
        mock_doc.metadata = {"source": "Wikipedia", "page": "1"}
        mock_doc.page_content = "Test Wikipedia content"
        mock_loader.return_value.load.return_value = [mock_doc]
        # NOTE(review): calls the @tool-decorated function directly; newer
        # langchain versions may require .invoke(...) instead — confirm.
        result = wiki_search("test query")
        self.assertIn("wiki_results", result)
        self.assertIn("Test Wikipedia content", result["wiki_results"])

    @patch("tools.searchTools_lg.TavilySearch")
    def test_mini_web_search(self, mock_tavily):
        # Realistic Tavily payload captured from a live query, attached to the
        # mock as sample data. mini_web_search returns the mocked invoke()
        # value unchanged, so only non-emptiness is asserted here.
        mock_doc = MagicMock()
        mock_doc.result = [
            {'url': 'https://www.python.org/',
            'title': 'Welcome to Python.org',
            'content': '**Notice:** While JavaScript is not essential for this website, your interaction with the content will be limited. # Python 3: Fibonacci series up to n More about defining functions in Python\xa03 # Python 3: List comprehensions Lists (known as arrays in other languages) are one of the compound data types that Python understands. More about lists in Python\xa03 # Python 3: Simple arithmetic More about simple math functions in Python\xa03. >>> print("Hello, I\'m Python!") Hello, I\'m Python! Python Hi, Python. Experienced programmers in any other language can pick up Python very quickly, and beginners find the clean syntax and indentation structure easy to learn. Latest: Python 3.13.7 docs.python.org jobs.python.org * Python 3.14.0rc2 and 3.13.7 are go!',
            'score': 0.98583,
            'raw_content': None},
            {'url': 'https://www.w3schools.com/python/python_intro.asp',
            'title': 'Introduction to Python - W3Schools',
            'content': 'PYTHON # Python Introduction ## What is Python? Python is a popular programming language. ### What can Python do? ## Why Python? * Python has syntax that allows developers to write programs with fewer lines than some other programming languages. * In this tutorial Python will be written in a text editor. ### Python Syntax compared to other programming languages ##### Top Tutorials HTML Tutorial CSS Tutorial How To Tutorial Python Tutorial W3.CSS Tutorial ##### Top References HTML Reference CSS Reference Python Reference W3.CSS Reference ##### Top Examples HTML Examples CSS Examples How To Examples Python Examples W3.CSS Examples CSS Certificate Python Certificate Tutorials, references, and examples are constantly reviewed to avoid errors, but we cannot warrant full correctness ',
            'score': 0.98365,
            'raw_content': None},
            {'url': 'https://en.wikipedia.org/wiki/Python_(programming_language)',
            'title': 'Python (programming language) - Wikipedia', 'content': 'Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation.',
            'score': 0.97793,
            'raw_content': None}]

        mock_tavily.return_value.invoke.return_value = [mock_doc]
        result = mini_web_search("test query")
        self.assertTrue(len(result) > 0)

    @patch("tools.searchTools_lg.ArxivLoader")
    def test_arvix_search(self, mock_arxiv):
        # Realistic arXiv document fixture; arvix_search returns the loaded
        # documents unchanged, so only non-emptiness is asserted.
        mock_doc = MagicMock()
        mock_doc.metadata = {'Published': '2024-04-05', 'Title': 'Egglog Python: A Pythonic Library for E-graphs', 'Authors': 'Saul Shanabrook', 'Summary': 'E-graphs have emerged as a versatile data structure with applications in\nsynthesis, optimization, and verification through techniques such as equality\nsaturation. This paper introduces Python bindings for the experimental egglog\nlibrary (previously called egg-smol), which aims to bring the benefits of\ne-graphs to the Python ecosystem. The bindings offer a high-level, Pythonic API\nproviding an accessible and familiar interface for Python users. By integrating\ne-graph techniques with Python, we hope to enable collaboration and innovation\nacross various domains in the scientific computing and machine learning\ncommunities. We discuss the advantages of using Python bindings for both Python\nand existing egg-smol users, as well as possible future directions for\ndevelopment.'}
        mock_doc.page_content = "'Egg-smol Python: A Pythonic Library for E-graphs\nSaul Shanabrook\ns.shanabrook@gmail.com\nAbstract\nE-graphs have emerged as a versatile data structure with ap-\nplications in synthesis, optimization, and verification through\ntechniques such as equality saturation."
        mock_arxiv.return_value.load.return_value = [mock_doc]
        result = arvix_search("Python programming")
        self.assertTrue(len(result) > 0)

    def test_wiki_search_integration(self):
        # Live call: requires network access.
        result = wiki_search("Python programming")
        self.assertIn("wiki_results", result)
        self.assertTrue(len(result["wiki_results"]) > 0)
        #print("Wiki search result:", result["wiki_results"][:200])

    def test_mini_web_search_integration(self):
        # Live call: requires network access and a Tavily API key.
        # Asserts the Tavily response dict carries a non-empty "results" list.
        result = mini_web_search("Python programming")
        self.assertIn("results", result)
        self.assertTrue(len(result["results"]) > 0)

    def test_arvix_search_integration(self):
        # Live call: requires network access to arXiv.
        result = arvix_search("Python programming")
        self.assertTrue(len(result) > 0)
71
+
72
+
73
+
74
if __name__ == "__main__":
    # load environment variables from .env file (API keys for the live
    # integration tests), then run the full unittest suite
    load_dotenv()
    unittest.main()
tools/searchTools_lg.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_core.tools import tool
2
+ from langchain_tavily import TavilySearch
3
+ from langchain_community.document_loaders import WikipediaLoader
4
+ from langchain_community.document_loaders import ArxivLoader
5
+ from langchain_community.vectorstores import SupabaseVectorStore
6
+
7
@tool
def wiki_search(query: str) -> dict:
    """Search Wikipedia for a query and return maximum 2 results.

    Args:
        query: The search query.
    """
    # Fix: the original annotated the return type as `str`, but the function
    # returns a dict with key "wiki_results" (see the literal below).
    search_docs = WikipediaLoader(query=query, load_max_docs=2).load()
    # Join each document into a pseudo-XML envelope so the LLM can see the
    # source metadata alongside the page content.
    formatted_search_docs = "\n\n---\n\n".join(
        f'<Document source="{doc.metadata["source"]}" page="{doc.metadata.get("page", "")}"/>\n{doc.page_content}\n</Document>'
        for doc in search_docs
    )
    return {"wiki_results": formatted_search_docs}
20
+
21
@tool
def mini_web_search(query: str) -> dict:
    """Search Tavily for a query and return maximum 3 results.

    Args:
        query: The search query.
    """
    # Fix: the original annotated the return type as `str`, but it returns the
    # raw TavilySearch response unchanged — presumably a dict with a "results"
    # list (the integration test asserts `"results" in result`) — TODO confirm.
    # Dead commented-out formatting code removed.
    search_docs = TavilySearch(max_results=3, topic="general").invoke({"query": query})
    return search_docs
35
@tool
def arvix_search(query: str) -> list:
    """Search for scientific papers in Arxiv for a query and return maximum 3 result.

    Args:
        query: The search query.
    """
    # Fix: the original annotated the return type as `str`, but it returns the
    # list of Documents from ArxivLoader.load() unchanged.
    # NOTE(review): unlike wiki_search this returns raw Document objects, not a
    # formatted string payload (the formatting code was commented out) —
    # confirm downstream tool-call handling accepts both shapes.
    search_docs = ArxivLoader(query=query, load_max_docs=3).load()
    return search_docs