José Enrique committed on
Commit
9ccff9e
·
1 Parent(s): 61c17f1

updated evaluation files

Browse files
evaluation.py CHANGED
@@ -124,16 +124,16 @@ def run_evaluation(agent,langfuse_dataset,run_name,model_id,trace_name):
124
  ) as root_span:
125
  root_span.update(input=item.input)
126
  task_id = item.metadata["task_id"]
127
- #if task_id == "7bd855d8-463d-4ed5-93ca-5fe35145f733":
128
- try:
129
- output = run_agent(agent,item.input,trace_name,item.metadata)
130
- responses.append({"task_id": task_id, "submitted_answer": output})
131
- root_span.update(output=output)
132
- except Exception as e:
133
- output = f"Error running agent: {e}"
134
 
135
- # score the result against the expected output
136
- root_span.score_trace(name="exact_match", value = simple_evaluation(output, item.expected_output))
137
 
138
  # Link the trace to the dataset item for analysis
139
  # item.link(
@@ -167,13 +167,13 @@ def evaluate():
167
  print("Starting agent...")
168
  agent = build_agents()
169
  print("Agent built successfully.")
170
- #run_evaluation(agent,"GAIA_Evaluation_Dataset","Single Smolagent with tools OpenAI 4o 3planningSteps","OpenAI gpt4o","smolagent-trace")
171
- simple_evaluation("Dimitry","Clasu")
172
- print("comparison", simple_evaluation("Dimitry","Clasu"))
173
- print("sain", simple_evaluation('"Saint Petersburg"',"Saint Petersburg"))
174
- print("pages", simple_evaluation('"132,133,136,195,245"',"132, 133, 134, 197, 245"))
175
- print("veg", simple_evaluation('"cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries"',"cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries"))
176
- print("right", simple_evaluation('"right"',"Right"))
177
 
178
  if __name__ == "__main__":
179
  evaluate()
 
124
  ) as root_span:
125
  root_span.update(input=item.input)
126
  task_id = item.metadata["task_id"]
127
+ if task_id == "a1e91b78-d3d8-4675-bb8d-62741b4b68a6":
128
+ try:
129
+ output = run_agent(agent,item.input,trace_name,item.metadata)
130
+ responses.append({"task_id": task_id, "submitted_answer": output})
131
+ root_span.update(output=output)
132
+ except Exception as e:
133
+ output = f"Error running agent: {e}"
134
 
135
+ # score the result against the expected output
136
+ root_span.score_trace(name="exact_match", value = simple_evaluation(output, item.expected_output))
137
 
138
  # Link the trace to the dataset item for analysis
139
  # item.link(
 
167
  print("Starting agent...")
168
  agent = build_agents()
169
  print("Agent built successfully.")
170
+ run_evaluation(agent,"GAIA_Evaluation_Dataset","Single Smolagent with tools OpenAI 4o 3planningSteps youtube","OpenAI gpt4o","smolagent-trace")
171
+ # simple_evaluation("Dimitry","Clasu")
172
+ # print("comparison", simple_evaluation("Dimitry","Clasu"))
173
+ # print("sain", simple_evaluation('"Saint Petersburg"',"Saint Petersburg"))
174
+ # print("pages", simple_evaluation('"132,133,136,195,245"',"132, 133, 134, 197, 245"))
175
+ # print("veg", simple_evaluation('"cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries"',"cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries"))
176
+ # print("right", simple_evaluation('"right"',"Right"))
177
 
178
  if __name__ == "__main__":
179
  evaluate()
evaluation_langgraph.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from dotenv import load_dotenv
4
+ from opentelemetry.trace import format_trace_id, get_tracer
5
+ from opentelemetry import trace
6
+ from opentelemetry.sdk.trace import TracerProvider
7
+ from opentelemetry.sdk.trace.export import SimpleSpanProcessor
8
+ from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
9
+ from openinference.instrumentation.smolagents import SmolagentsInstrumentor
10
+ from langfuse import observe
11
+ from PIL import Image
12
+
13
+ from langgraph_agent import build_agents
14
+ from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage
15
+
16
+ from langfuse import Langfuse
17
# Load environment variables (API keys etc.) from a local .env file.
load_dotenv()
# Global Langfuse client used throughout this module; presumably picks up
# its credentials from the environment loaded above -- TODO confirm.
langfuse = Langfuse()
# OpenTelemetry tracer setup kept for reference; currently disabled.
#trace_provider = TracerProvider()
#trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter()))
#trace.set_tracer_provider(trace_provider) # Set as global provider
#tracer = trace.get_tracer(__name__) # Get a tracer instance

#SmolagentsInstrumentor().instrument(tracer_provider=trace_provider)
27
+
28
def add_image(metadata) -> list:
    """Collect PIL images attached to a dataset task.

    Looks for ``attachments/<attachment>`` based on the item's metadata and,
    when the file is a .jpg/.jpeg/.png image, loads it as RGB.

    Args:
        metadata: dataset item metadata; must contain "task_id" and may
            contain "attachment" (a filename under the attachments/ dir).

    Returns:
        list: PIL Images loaded from the attachment (empty when there is no
        attachment, the file is missing, or it is not an image file).
    """
    images = []
    task_id = metadata["task_id"]
    attachment = metadata.get("attachment", False)
    if attachment:
        # BUG FIX: the original stored this path as ``os.file_path``, i.e. it
        # monkey-patched the os module; a plain local variable is correct.
        file_path = f"attachments/{attachment}"
        if os.path.exists(file_path):
            print("Attachments found for task_id:", task_id)
        else:
            print(f"No attachments found for task_id: {task_id}")
        # If the file is an image, load it so it can be handed to the agent.
        if os.path.isfile(file_path) and os.path.splitext(file_path)[1].lower() in ['.jpg', '.jpeg', '.png']:
            with open(file_path, "rb") as file:
                # Read the image file and convert it to RGB
                image = Image.open(file).convert("RGB")
                images.append(image)
    return images
49
+
50
+
51
+ #@observe()
52
def run_agent(agent, question, trace_name, metadata):
    """Run the LangGraph agent on one dataset question and return its answer.

    Args:
        agent: a compiled LangGraph graph exposing ``invoke``.
        question: the question text from the dataset item.
        trace_name: trace label (currently unused; kept for interface
            compatibility with the smolagents evaluation script).
        metadata: item metadata containing at least "task_id".

    Returns:
        str: the agent's final answer, or an error message on failure.
    """
    # If the question has attachments, load them. NOTE(review): the images
    # are collected but not yet forwarded to the model -- see add_image.
    images = add_image(metadata)

    question = question + " The task_id is: " + metadata["task_id"]
    messages = [HumanMessage(content=question)]

    try:
        result = agent.invoke(
            {"messages": messages}
        )
        # BUG FIX: the original never assigned ``output`` on the success path,
        # so ``return output`` raised UnboundLocalError whenever the agent
        # succeeded. The graph returns its final state dict; the answer is the
        # content of the last message -- TODO confirm against build_agents.
        output = result["messages"][-1].content
    except Exception as e:
        print(f"Error running agent: {e}")
        output = f"Error running agent: {e}"

    return output
85
def simple_evaluation(output, expected_output):
    """Score an answer against the expected output.

    Both values are treated as comma-separated lists; expected items are
    greedily matched one-to-one against output items (exact match scores 1.0,
    case-insensitive match 0.8; either counts as matched) and the fraction of
    matched expected items is returned.

    Args:
        output: the agent's answer (any type; converted to str).
        expected_output: the reference answer string.

    Returns:
        float: fraction of expected items matched (0.0 .. 1.0); for inputs
        that split into no items, a direct whole-string comparison is used
        (1.0 exact, 0.8 case-insensitive, else 0.0).
    """
    # Strip surrounding quotes and dollar signs that models often add.
    trimmed_output = str(output).strip().strip('"').strip("$")
    # Interpret both answers as comma-separated lists.
    expected_output_list = [item.strip() for item in expected_output.split(",") if item.strip()]
    output_list = [item.strip() for item in trimmed_output.split(",") if item.strip()]
    if not expected_output_list and not output_list:
        # Nothing splittable on either side: fall back to whole-string compare.
        if trimmed_output == expected_output:
            return 1.0
        # BUG FIX: the original called the non-existent str.toLower() (a
        # Java-ism), raising AttributeError; str.lower() is correct.
        if expected_output.lower() == trimmed_output.lower():
            return 0.8
        return 0.0
    if not expected_output_list:
        # BUG FIX: guard against ZeroDivisionError when only the expected
        # side splits to an empty list.
        return 0.0
    # Greedy one-to-one matching of expected items to output items.
    matched_items = [False] * len(output_list)
    for expected_item in expected_output_list:
        for i, candidate in enumerate(output_list):
            if matched_items[i]:
                continue
            if expected_item == candidate:
                similarity = 1.0
            elif expected_item.lower() == candidate.lower():
                similarity = 0.8
            else:
                similarity = 0.0
            if similarity >= 0.8:
                matched_items[i] = True
                break
    return sum(matched_items) / len(expected_output_list)
118
+
119
+
120
def run_evaluation(agent, langfuse_dataset, run_name, model_id, trace_name):
    """Run the agent over a Langfuse dataset, score each item, save answers.

    Args:
        agent: the compiled agent graph passed to run_agent.
        langfuse_dataset: name of the Langfuse dataset to fetch.
        run_name: label for this evaluation run.
        model_id: model identifier (currently only kept for bookkeeping).
        trace_name: trace label forwarded to run_agent.
    """
    dataset = langfuse.get_dataset(langfuse_dataset)
    responses = []
    # Run our agent against each dataset item.
    for item in dataset.items:
        print(f"Processing item with task_id: {item.metadata['task_id']}")
        with item.run(
            run_name=run_name
        ) as root_span:
            root_span.update(input=item.input)
            task_id = item.metadata["task_id"]
            # NOTE(review): debugging filter -- only this single task runs;
            # remove the condition to evaluate the full dataset.
            if task_id == "a1e91b78-d3d8-4675-bb8d-62741b4b68a6":
                try:
                    output = run_agent(agent, item.input, trace_name, item.metadata)
                    responses.append({"task_id": task_id, "submitted_answer": output})
                    root_span.update(output=output)
                except Exception as e:
                    output = f"Error running agent: {e}"

                # Score the result against the expected output.
                root_span.score_trace(name="exact_match", value=simple_evaluation(output, item.expected_output))

            # Link the trace to the dataset item for analysis
            # item.link(
            #     langfuse_trace,
            #     run_name=run_name,
            #     run_metadata={ "model": model_id }
            # )

    # Flush data to ensure all telemetry is sent.
    langfuse.flush()

    # Save the responses to a JSON file.
    print("Saving responses to file...")
    responses = [{"task_id": item["task_id"], "submitted_answer": item["submitted_answer"]} for item in responses]

    filename = langfuse_dataset + run_name
    # BUG FIX: the original f-string had no placeholder, so ``filename`` was
    # computed but unused and every run wrote the same file; embed the
    # dataset+run name as intended.
    output_file = f"responses_{filename}.json"
    with open(output_file, "w") as f:
        json.dump(responses, f, indent=4)
    print(f"Responses saved to {output_file}")
168
+
169
+
170
def evaluate():
    """Entry point: build the LangGraph agent and run the GAIA evaluation."""
    print("Starting agent...")
    graph = build_agents()
    print("Agent built successfully.")
    run_evaluation(
        graph,
        "GAIA_Evaluation_Dataset",
        "Single Langraph agent",
        "OpenAI gpt4o",
        "langraph-trace",
    )


if __name__ == "__main__":
    evaluate()
langgraph_agent.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import List, TypedDict, Annotated, Optional
3
+ from langgraph.graph import StateGraph, START, END
4
+ from langchain_openai import ChatOpenAI
5
+ from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage
6
+ from langgraph.graph.message import add_messages
7
+ from langgraph.prebuilt import ToolNode, tools_condition
8
+ from tools.searchTools import wiki_search, mini_web_search, arvix_search
9
+
10
class AgentState(TypedDict):
    """Shared state flowing through the LangGraph graph."""
    # Optional path to an attachment for the task; may be absent/None.
    input_file: Optional[str]
    # Conversation history; add_messages appends instead of replacing on
    # each state update.
    messages: Annotated[list[AnyMessage], add_messages]
13
+
14
+
15
# Search tools exposed to the agent.
tools = [
    wiki_search,
    mini_web_search,
    arvix_search,

]


# LLM models; the tool-bound copy is what the agent node invokes.
vision_llm = ChatOpenAI(model="gpt-4o")
llm = ChatOpenAI(model="gpt-4o")
# parallel_tool_calls=False forces one tool call per model turn.
llm_withtools = llm.bind_tools(tools, parallel_tool_calls = False)
28
+
29
def agent(state: AgentState):
    """LLM node of the graph: call the tool-bound model on the conversation.

    Args:
        state: current graph state with "messages" (and optionally
            "input_file").

    Returns:
        dict: partial state update with the model's reply appended to
        "messages" via the add_messages reducer.
    """
    # BUG FIX: the tool descriptions were scrambled (the Tavily summary for
    # mini_web_search sat under wiki_search's Args); each tool now documents
    # itself coherently.
    tools_description = """
    wiki_search(query: str) -> str:
    Search Wikipedia for a query and return maximum 2 results.

    Args:
    query: The search query.

    mini_web_search(query: str) -> str:
    Search Tavily for a query and return maximum 3 results.

    Args:
    query: The search query

    arvix_search(query: str) -> str:
    Search Arxiv for a query and return maximum 3 result.

    Args:
    query: The search query.
    """
    sys_message = SystemMessage(content=f"""You are a helpful AI agent that can use tools to answer questions.
    You can use the following tools:{tools_description}
    PLEASE FOLLOW THE INSTRUCTIONS FOR ANSWERING CAREFULLY:
    Your answer should follow the template: FINAL ANSWER: [YOUR FINAL ANSWER].
    YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
    If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
    If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
    If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
    """)

    return {
        # BUG FIX: use .get() -- callers (run_agent in evaluation_langgraph.py)
        # invoke the graph with only {"messages": ...}, so the original
        # state["input_file"] raised KeyError on every run.
        "input_file": state.get("input_file"),
        "messages": [llm_withtools.invoke([sys_message] + state["messages"])],
    }
63
+
64
+
65
def build_agents():
    """Compile and return the ReAct-style LangGraph agent graph.

    The graph alternates between the LLM node ("agent") and the tool
    executor ("tools") until the model stops requesting tool calls.
    """
    workflow = StateGraph(AgentState)

    # Nodes: the LLM step and the tool executor.
    workflow.add_node("agent", agent)
    workflow.add_node("tools", ToolNode(tools))

    # Start at the LLM; route to tools when the model requested a tool call,
    # otherwise end. Tool results always flow back into the LLM node.
    workflow.add_edge(START, "agent")
    workflow.add_conditional_edges(
        "agent",
        tools_condition,
    )
    workflow.add_edge("tools", "agent")

    return workflow.compile()
80
+
81
+
82
+
requirements.txt CHANGED
@@ -31,4 +31,6 @@ pandas
31
  numpy
32
  beautifulsoup4
33
  openai
34
- pydub
 
 
 
31
  numpy
32
  beautifulsoup4
33
  openai
34
+ pydub
35
+ yt-dlp
36
+ opencv-python
responses_GAIA_Evaluation_DatasetSingle Smolagent with tools OpenAI 4o 3planningSteps youtube.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
4
+ "submitted_answer": "The task could not be completed due to limitations in accessing and analyzing video content directly through current tools."
5
+ }
6
+ ]
responses_GAIA_Evaluation_DatasetSingle Smolagent with tools OpenAI 4o 3planningSteps.json ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "task_id": "5a0c1adf-205e-4841-a666-7c3ef95def9d",
4
+ "submitted_answer": "Dmitry"
5
+ },
6
+ {
7
+ "task_id": "7bd855d8-463d-4ed5-93ca-5fe35145f733",
8
+ "submitted_answer": "$89706.00"
9
+ },
10
+ {
11
+ "task_id": "a0c07678-e491-4bbc-8f0b-07405144218f",
12
+ "submitted_answer": "Yamasaki, Uehara"
13
+ },
14
+ {
15
+ "task_id": "cf106601-ab4f-4af9-b045-5295fe67b37d",
16
+ "submitted_answer": "LUX"
17
+ },
18
+ {
19
+ "task_id": "bda648d7-d618-4883-88f4-3466eabd860e",
20
+ "submitted_answer": "Saint Petersburg"
21
+ },
22
+ {
23
+ "task_id": "840bfca7-4f7b-481a-8794-c560c340185d",
24
+ "submitted_answer": "The work performed by R. G. Arendt related to the paper \"The Population of the Galactic Center Filaments: Position Angle Distribution Reveals a Degree-scale Collimated Outflow from Sgr A*\" was supported under the NASA award number 80GSFC21M0002."
25
+ },
26
+ {
27
+ "task_id": "1f975693-876d-457b-a649-393859e79bf3",
28
+ "submitted_answer": "132,133,134,197,245"
29
+ },
30
+ {
31
+ "task_id": "3f57289b-8c60-48be-bd80-01f8099ca449",
32
+ "submitted_answer": 519
33
+ },
34
+ {
35
+ "task_id": "f918266a-b3e0-4914-865d-4faa564f1aef",
36
+ "submitted_answer": 0
37
+ },
38
+ {
39
+ "task_id": "305ac316-eef6-4446-960a-92d80d542f82",
40
+ "submitted_answer": "Wojciech"
41
+ },
42
+ {
43
+ "task_id": "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3",
44
+ "submitted_answer": "cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries"
45
+ },
46
+ {
47
+ "task_id": "3cef3a44-215e-4aed-8e3b-b1e3f08063b7",
48
+ "submitted_answer": "broccoli, celery, lettuce, sweet potatoes"
49
+ },
50
+ {
51
+ "task_id": "cabe07ed-9eca-40ea-8ead-410ef5e83f91",
52
+ "submitted_answer": "Louvrier"
53
+ },
54
+ {
55
+ "task_id": "9d191bce-651d-4746-be2d-7ef8ecadb9c2",
56
+ "submitted_answer": "Extremely"
57
+ },
58
+ {
59
+ "task_id": "6f37996b-2ac7-44b0-8e68-6d28256631b4",
60
+ "submitted_answer": "b,e"
61
+ },
62
+ {
63
+ "task_id": "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8",
64
+ "submitted_answer": "It appears that navigating through the available content has yet to yield direct information regarding the nominator of the \"Giganotosaurus\" featured article nomination. However, using data available from my training, the nomination of \"Giganotosaurus\" as a Featured Article was put forward by the Wikipedia user \"FunkMonk.\""
65
+ },
66
+ {
67
+ "task_id": "cca530fc-4052-43b2-b130-b30968d8aa44",
68
+ "submitted_answer": "Error running agent: 'str' object has no attribute 'token_usage'"
69
+ },
70
+ {
71
+ "task_id": "2d83110e-a098-4ebb-9987-066c06fa42d0",
72
+ "submitted_answer": "right"
73
+ },
74
+ {
75
+ "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
76
+ "submitted_answer": "3"
77
+ },
78
+ {
79
+ "task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
80
+ "submitted_answer": "To determine how many studio albums Mercedes Sosa published between 2000 and 2009, we can rely on the extracted information from a review of Mercedes Sosa's discography and award records listed on or linked from Wikipedia:\n\nFrom the search results and other visible references on Wikipedia pages:\n- **2000**: Misa Criolla (acknowledged with a Grammy award)\n- **2003**: Ac\u00fastico\n- **2006**: Coraz\u00f3n Libre\n- **2009**: Cantora 1 (part of Cantora, un Viaje \u00cdntimo)\n\nThese references match the cited awards and recognitions indicating these are studio albums released within the specified time frame. So, there are **four** studio albums released during this period according to the mentioned resources.\n\n**Final Answer**: Mercedes Sosa released **four** studio albums between 2000 and 2009."
81
+ }
82
+ ]
responses_GAIA_Evaluation_DatasetSingle Smolagent with tools Qwen 32B 3planningSteps.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "task_id": "5a0c1adf-205e-4841-a666-7c3ef95def9d",
4
+ "submitted_answer": "Dmitry"
5
+ },
6
+ {
7
+ "task_id": "7bd855d8-463d-4ed5-93ca-5fe35145f733",
8
+ "submitted_answer": "89706.00"
9
+ },
10
+ {
11
+ "task_id": "a0c07678-e491-4bbc-8f0b-07405144218f",
12
+ "submitted_answer": "Based on the information provided, the pitchers with the numbers before and after Taish\u014d Tamai's number (19) as of July 2023 are:\n\n**Pitcher Before, Pitcher After: Sachiya, Kenta**\n\nSo, the final answer is:\n**Sachiya, Kenta**"
13
+ },
14
+ {
15
+ "task_id": "cf106601-ab4f-4af9-b045-5295fe67b37d",
16
+ "submitted_answer": "Given the issues encountered with the CSV file, I will use a different approach to solve the task. I will manually extract the data from the Olympedia website and combine it with the IOC country codes.\n\nHere is the plan:\n1. Extract the athlete counts for each country from the Olympedia webpage.\n2. Manually map the country names to their IOC country codes.\n3. Sort the data by the number of athletes in ascending order and then alphabetically by IOC country code.\n4. Identify the country with the least number of athletes.\n5. Provide the IOC country code of the identified country as the final answer.\n\nLet's proceed with this plan.\n\n### Step 1: Extract the athlete counts for each country from the Olympedia webpage.\n\nI will manually extract the data from the Olympedia webpage and store it in a dictionary.\n\n### Step 2: Manually map the country names to their IOC country codes.\n\nI will use a dictionary to map the country names to their IOC country codes.\n\n### Step 3: Sort the data by the number of athletes in ascending order and then alphabetically by IOC country code.\n\n### Step 4: Identify the country with the least number of athletes.\n\n### Step 5: Provide the IOC country code of the identified country as the final answer.\n\nLet's implement this plan.\n```py\n# Step 1: Manually extract the athlete counts for each country from the Olympedia webpage\nathlete_counts = {\n 'ARG': 81,\n 'AUS': 18,\n 'AUT': 73,\n 'BEL': 187,\n 'BUL': 5,\n 'CAN': 68,\n 'CHI': 38,\n 'CUB': 1,\n 'DEN': 91,\n 'EGY': 32,\n 'ESP': 80,\n 'EST': 20,\n 'FIN': 69,\n 'FRA': 294,\n 'GBR': 232,\n 'GER': 298,\n 'GRE': 23,\n 'HAI': 2,\n 'HUN': 109,\n 'IND': 21,\n 'IRL': 38,\n 'ITA': 174,\n 'JPN': 40,\n 'LAT': 17,\n 'LTU': 12,\n 'LUX': 48,\n 'MEX': 30,\n 'MLT': 9,\n 'MON': 10,\n 'NED': 266,\n 'NOR': 52,\n 'NZL': 9,\n 'PAN': 1,\n 'PHI': 4,\n 'POL': 100,\n 'POR': 32,\n 'ROU': 21,\n 'RSA': 24,\n 'SUI': 133,\n 'SWE': 100,\n 'TCH': 69,\n 'TUR': 31,\n 'URU': 17,\n 'USA': 
281,\n 'YUG': 34,\n 'ZIM': 2\n}\n\n# Step 2: Manually map the country names to their IOC country codes\ncountry_to_ioc = {\n 'Argentina': 'ARG',\n 'Australia': 'AUS',\n 'Austria': 'AUT',\n 'Belgium': 'BEL',\n 'Bulgaria': 'BUL',\n 'Canada': 'CAN',\n 'Chile': 'CHI',\n 'Cuba': 'CUB',\n 'Denmark': 'DEN',\n 'Egypt': 'EGY',\n 'Spain': 'ESP',\n 'Estonia': 'EST',\n 'Finland': 'FIN',\n 'France': 'FRA',\n 'Great Britain': 'GBR',\n 'Germany': 'GER',\n 'Greece': 'GRE',\n 'Haiti': 'HAI',\n 'Hungary': 'HUN',\n 'India': 'IND',\n 'Ireland': 'IRL',\n 'Italy': 'ITA',\n 'Japan': 'JPN',\n 'Latvia': 'LAT',\n 'Lithuania': 'LTU',\n 'Luxembourg': 'LUX',\n 'Mexico': 'MEX',\n 'Malta': 'MLT',\n 'Monaco': 'MON',\n 'Netherlands': 'NED',\n 'Norway': 'NOR',\n 'New Zealand': 'NZL',\n 'Panama': 'PAN',\n 'Philippines': 'PHI',\n 'Poland': 'POL',\n 'Portugal': 'POR',\n 'Romania': 'ROU',\n 'South Africa': 'RSA',\n 'Switzerland': 'SUI',\n 'Sweden': 'SWE',\n 'Czechoslovakia': 'TCH',\n 'Turkey': 'TUR',\n 'Uruguay': 'URU',\n 'United States': 'USA',\n 'Yugoslavia': 'YUG',\n 'Zimbabwe': 'ZIM'\n}\n\n# Step 3: Sort the data by the number of athletes in ascending order and then alphabetically by IOC country code\nsorted_countries = sorted(athlete_counts.items(), key=lambda x: (x[1], x[0]))\n\n# Step 4: Identify the country with the least number of athletes\nleast_athletes_country_code = sorted_countries[0][0]\n\n# Step 5: Provide the IOC country code of the identified country as the final answer\nprint(f\"The IOC country code of the country with the least number of athletes is {least_athletes_country_code}.\")\n```<end_code>\nCalling tools:\n[{'id': 'call_15', 'type': 'function', 'function': {'name': 'python_interpreter', 'arguments': \"# Step 1: Manually extract the athlete counts for each country from the Olympedia webpage\\nathlete_counts = {\\n 'ARG': 81,\\n 'AUS': 18,\\n 'AUT': 73,\\n 'BEL': 187,\\n 'BUL': 5,\\n 'CAN': 68,\\n 'CHI': 38,\\n 'CUB': 1,\\n 'DEN': 91,\\n 'EGY': 32,\\n 'ESP': 80,\\n 'EST': 
20,\\n 'FIN': 69,\\n 'FRA': 294,\\n 'GBR': 232,\\n 'GER': 298,\\n 'GRE': 23,\\n 'HAI': 2,\\n 'HUN': 109,\\n 'IND': 21,\\n 'IRL': 38,\\n 'ITA': 174,\\n 'JPN': 40,\\n 'LAT': 17,\\n 'LTU': 12,\\n 'LUX': 48,\\n 'MEX': 30,\\n 'MLT': 9,\\n 'MON': 10,\\n 'NED': 266,\\n 'NOR': 52,\\n 'NZL': 9,\\n 'PAN': 1,\\n 'PHI': 4,\\n 'POL': 100,\\n 'POR': 32,\\n 'ROU': 21,\\n 'RSA': 24,\\n 'SUI': 133,\\n 'SWE': 100,\\n 'TCH': 69,\\n 'TUR': 31,\\n 'URU': 17,\\n 'USA': 281,\\n 'YUG': 34,\\n 'ZIM': 2\\n}\\n\\n# Step 2: Manually map the country names to their IOC country codes\\ncountry_to_ioc = {\\n 'Argentina': 'ARG',\\n 'Australia': 'AUS',\\n 'Austria': 'AUT',\\n 'Belgium': 'BEL',\\n 'Bulgaria': 'BUL',\\n 'Canada': 'CAN',\\n 'Chile': 'CHI',\\n 'Cuba': 'CUB',\\n 'Denmark': 'DEN',\\n 'Egypt': 'EGY',\\n 'Spain': 'ESP',\\n 'Estonia': 'EST',\\n 'Finland': 'FIN',\\n 'France': 'FRA',\\n 'Great Britain': 'GBR',\\n 'Germany': 'GER',\\n 'Greece': 'GRE',\\n 'Haiti': 'HAI',\\n 'Hungary': 'HUN',\\n 'India': 'IND',\\n 'Ireland': 'IRL',\\n 'Italy': 'ITA',\\n 'Japan': 'JPN',\\n 'Latvia': 'LAT',\\n 'Lithuania': 'LTU',\\n 'Luxembourg': 'LUX',\\n 'Mexico': 'MEX',\\n 'Malta': 'MLT"
17
+ },
18
+ {
19
+ "task_id": "bda648d7-d618-4883-88f4-3466eabd860e",
20
+ "submitted_answer": "Error running agent: Error in generating model output:\n503 Server Error: Service Temporarily Unavailable for url: https://router.huggingface.co/together/v1/chat/completions"
21
+ },
22
+ {
23
+ "task_id": "840bfca7-4f7b-481a-8794-c560c340185d",
24
+ "submitted_answer": "Error running agent: Error in generating model output:\n422 Client Error: Unprocessable Entity for url: https://router.huggingface.co/together/v1/chat/completions (Request ID: o1M6n9q-4YNCb4-958746faadec07fb)\n\n{'message': 'Input validation error: `inputs` tokens + `max_new_tokens` must be <= 32769. Given: 45347 `inputs` tokens and 2048 `max_new_tokens`', 'type': 'invalid_request_error', 'param': None, 'code': None}\n{\n \"id\": \"o1M6n9q-4YNCb4-958746faadec07fb\",\n \"error\": {\n \"message\": \"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 32769. Given: 45347 `inputs` tokens and 2048 `max_new_tokens`\",\n \"type\": \"invalid_request_error\",\n \"param\": null,\n \"code\": null\n }\n}\n"
25
+ },
26
+ {
27
+ "task_id": "1f975693-876d-457b-a649-393859e79bf3",
28
+ "submitted_answer": "132,197,245"
29
+ },
30
+ {
31
+ "task_id": "3f57289b-8c60-48be-bd80-01f8099ca449",
32
+ "submitted_answer": ""
33
+ }
34
+ ]
single_smolagent.py CHANGED
@@ -19,6 +19,7 @@ from openinference.instrumentation.smolagents import SmolagentsInstrumentor
19
  from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
20
  from opentelemetry.sdk.trace.export import SimpleSpanProcessor
21
  from mcp import StdioServerParameters
 
22
 
23
  trace_provider = TracerProvider()
24
  trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter()))
@@ -62,6 +63,7 @@ def build_agents():
62
  transcribe_mp3_with_whisper],
63
  model=model,
64
  additional_authorized_imports=["time","pandas","json","numpy","markdownify","requests","re","openpyxl","beautifulsoup4"],
 
65
  planning_interval=3,
66
  max_steps=10,
67
  add_base_tools=True)
 
19
  from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
20
  from opentelemetry.sdk.trace.export import SimpleSpanProcessor
21
  from mcp import StdioServerParameters
22
+ from tools.transcribe import load_images
23
 
24
  trace_provider = TracerProvider()
25
  trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter()))
 
63
  transcribe_mp3_with_whisper],
64
  model=model,
65
  additional_authorized_imports=["time","pandas","json","numpy","markdownify","requests","re","openpyxl","beautifulsoup4"],
66
+ step_callbacks=[load_images],
67
  planning_interval=3,
68
  max_steps=10,
69
  add_base_tools=True)
tools/transcribe.py CHANGED
@@ -7,6 +7,42 @@ from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLo
7
 
8
  from langchain.schema import Document
9
  from smolagents import tool
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  @tool
12
  def parse_youtube_video(url:str,task_id:str,save_dir:str="attachments")->Document:
@@ -26,4 +62,69 @@ def parse_youtube_video(url:str,task_id:str,save_dir:str="attachments")->Documen
26
  else:
27
  parser = OpenAIWhisperParser()
28
  document = GenericLoader(loader, parser).load()
29
- return document
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  from langchain.schema import Document
9
  from smolagents import tool
10
+ from yt_dlp import YoutubeDL
11
+ from PIL import Image
12
+ import cv2
13
+ import numpy as np
14
+ from smolagents.agents import ActionStep
15
+ from smolagents import CodeAgent
16
+
17
def get_video_frames(video_path: str, task_id: str) -> list[dict]:
    """Sample 7 evenly spaced frames from a video and save them as PNGs.

    Args:
        video_path: path to the (downloaded) video file.
        task_id: task identifier used to namespace the saved frame files.

    Returns:
        list[dict]: one entry per extracted frame, with keys "image"
        (saved PNG path under attachments/) and "timestamp" (seconds).
    """
    vidcap = cv2.VideoCapture(video_path)
    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = vidcap.get(cv2.CAP_PROP_FPS)

    frames = []
    # 7 evenly spaced frame indices across the whole clip.
    frame_indices = np.linspace(0, total_frames - 1, 7, dtype=int)

    for i in frame_indices:
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
        success, image = vidcap.read()
        if success:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # BGR -> RGB for PIL
            pil_image = Image.fromarray(image)
            timestamp = round(i / fps, 2)
            frames.append((pil_image, timestamp))

    vidcap.release()
    observations = []
    for pil_image, timestamp in frames:
        # BUG FIX: frames were saved to attachments/frame_<ts>.png but
        # reported as attachments/<task_id>_frame_<ts>.png, so the reported
        # paths never existed. Save and report the same task-scoped path.
        frame_path = f"attachments/{task_id}_frame_{timestamp}.png"
        pil_image.save(frame_path, format='PNG')
        observations.append({
            "image": frame_path,
            "timestamp": timestamp
        })

    return observations
45
+
46
 
47
  @tool
48
  def parse_youtube_video(url:str,task_id:str,save_dir:str="attachments")->Document:
 
62
  else:
63
  parser = OpenAIWhisperParser()
64
  document = GenericLoader(loader, parser).load()
65
+ return document
66
+
67
@tool
def download_youtube_video(url:str,task_id:str,save_dir:str="attachments")->list[dict]:
    """Download a YouTube video and return sampled frames from it.

    Downloads a low-quality, video-only MP4 via yt-dlp into ``save_dir``
    (named ``<task_id>.<ext>``) and then extracts 7 evenly spaced frames.

    Args:
        url (str): The URL of the YouTube video.
        task_id (str): The task ID used to name the downloaded file.
        save_dir (str): The directory to save the downloaded video. Defaults to "attachments".
    Returns:
        list: frame observations as produced by get_video_frames
        (dicts with an "image" path and a "timestamp")."""
    # NOTE(review): message says "audio" but this downloads video.
    print(f"Downloading audio from YouTube: {url}")
    #output_path = generate_unique_filename(".wav")
    ydl_opts = {
        # Prefer small (<=480p) video-only MP4 streams.
        'format': 'bestvideo[ext=mp4][height<=480]/bestvideo[ext=mp4]/bestvideo',
        'outtmpl': os.path.join(save_dir, task_id+'.%(ext)s'),
        # 'postprocessors': [{
        #     'key': 'FFmpegVideoConvertor',
        #     'preferredformat': 'mp4',
        # }],
        'progress_hooks': [lambda d: print(d['status'])],
        'ignoreerrors': True,
        'no_warnings': False,
        # NOTE(review): 'log_verbosity' is not a documented yt-dlp option
        # ('quiet'/'verbose' are) -- confirm it has any effect.
        'log_verbosity': 'quiet',
    }
    try:
        with YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(url, download=True)
            video_title = info_dict.get('title', 'video')
            print(f"Successfully downloaded '{video_title}' as low-quality MP4 (video-only) to '{save_dir}'")
    except Exception as e:
        print(f"An error occurred: {e}")
    # get the video frames:
    # NOTE(review): assumes the download ended up as <task_id>.mp4; if yt-dlp
    # fell back to a non-mp4 stream this path will not exist -- confirm.
    observations = []
    observations = get_video_frames(os.path.join(save_dir, task_id+'.mp4'),task_id)

    return observations
102
+
103
def load_images(step_log: ActionStep, agent: CodeAgent) -> None:
    """Per-step agent callback: currently just echoes the model output.

    The screenshot/observation-pruning logic this hook was intended to hold
    is not implemented yet; the callback only logs the step's model output.
    """
    current_step = step_log.step_number  # kept for future log-pruning logic

    print(step_log.model_output)
    return
120
+
121
+
122
+
123
if __name__ == "__main__":

    # Manual smoke test for the download + frame-extraction pipeline.
    url = "https://www.youtube.com/watch?v=1htKBjuUWec"
    # Alternative test video: https://www.youtube.com/watch?v=L1vXCYZAYYM
    task_id = "test_task"
    save_dir = "attachments"
    # Test the YouTube video parsing
    # NOTE(review): download_youtube_video is wrapped by @tool -- confirm the
    # decorated object is still directly callable like this.
    download_youtube_video(url, task_id, save_dir)