José Enrique committed on
Commit
9ccff9e
·
1 Parent(s): 61c17f1

updated evaluation files

Browse files
evaluation.py CHANGED
@@ -124,16 +124,16 @@ def run_evaluation(agent,langfuse_dataset,run_name,model_id,trace_name):
124
  ) as root_span:
125
  root_span.update(input=item.input)
126
  task_id = item.metadata["task_id"]
127
- #if task_id == "7bd855d8-463d-4ed5-93ca-5fe35145f733":
128
- try:
129
- output = run_agent(agent,item.input,trace_name,item.metadata)
130
- responses.append({"task_id": task_id, "submitted_answer": output})
131
- root_span.update(output=output)
132
- except Exception as e:
133
- output = f"Error running agent: {e}"
134
 
135
- # score the result against the expected output
136
- root_span.score_trace(name="exact_match", value = simple_evaluation(output, item.expected_output))
137
 
138
  # Link the trace to the dataset item for analysis
139
  # item.link(
@@ -167,13 +167,13 @@ def evaluate():
167
  print("Starting agent...")
168
  agent = build_agents()
169
  print("Agent built successfully.")
170
- #run_evaluation(agent,"GAIA_Evaluation_Dataset","Single Smolagent with tools OpenAI 4o 3planningSteps","OpenAI gpt4o","smolagent-trace")
171
- simple_evaluation("Dimitry","Clasu")
172
- print("comparison", simple_evaluation("Dimitry","Clasu"))
173
- print("sain", simple_evaluation('"Saint Petersburg"',"Saint Petersburg"))
174
- print("pages", simple_evaluation('"132,133,136,195,245"',"132, 133, 134, 197, 245"))
175
- print("veg", simple_evaluation('"cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries"',"cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries"))
176
- print("right", simple_evaluation('"right"',"Right"))
177
 
178
  if __name__ == "__main__":
179
  evaluate()
 
124
  ) as root_span:
125
  root_span.update(input=item.input)
126
  task_id = item.metadata["task_id"]
127
+ if task_id == "a1e91b78-d3d8-4675-bb8d-62741b4b68a6":
128
+ try:
129
+ output = run_agent(agent,item.input,trace_name,item.metadata)
130
+ responses.append({"task_id": task_id, "submitted_answer": output})
131
+ root_span.update(output=output)
132
+ except Exception as e:
133
+ output = f"Error running agent: {e}"
134
 
135
+ # score the result against the expected output
136
+ root_span.score_trace(name="exact_match", value = simple_evaluation(output, item.expected_output))
137
 
138
  # Link the trace to the dataset item for analysis
139
  # item.link(
 
167
  print("Starting agent...")
168
  agent = build_agents()
169
  print("Agent built successfully.")
170
+ run_evaluation(agent,"GAIA_Evaluation_Dataset","Single Smolagent with tools OpenAI 4o 3planningSteps youtube","OpenAI gpt4o","smolagent-trace")
171
+ # simple_evaluation("Dimitry","Clasu")
172
+ # print("comparison", simple_evaluation("Dimitry","Clasu"))
173
+ # print("sain", simple_evaluation('"Saint Petersburg"',"Saint Petersburg"))
174
+ # print("pages", simple_evaluation('"132,133,136,195,245"',"132, 133, 134, 197, 245"))
175
+ # print("veg", simple_evaluation('"cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries"',"cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries"))
176
+ # print("right", simple_evaluation('"right"',"Right"))
177
 
178
  if __name__ == "__main__":
179
  evaluate()
evaluation_langgraph.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from dotenv import load_dotenv
4
+ from opentelemetry.trace import format_trace_id, get_tracer
5
+ from opentelemetry import trace
6
+ from opentelemetry.sdk.trace import TracerProvider
7
+ from opentelemetry.sdk.trace.export import SimpleSpanProcessor
8
+ from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
9
+ from openinference.instrumentation.smolagents import SmolagentsInstrumentor
10
+ from langfuse import observe
11
+ from PIL import Image
12
+
13
+ from langgraph_agent import build_agents
14
+ from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage
15
+
16
+ from langfuse import Langfuse
17
# Load environment variables (API keys etc.) from a local .env file.
load_dotenv()
# Global Langfuse client used throughout this module; presumably picks up
# its credentials from the environment loaded above -- TODO confirm.
langfuse = Langfuse()
# OpenTelemetry tracer setup kept for reference; currently disabled.
#trace_provider = TracerProvider()
#trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter()))
#trace.set_tracer_provider(trace_provider) # Set as global provider
#tracer = trace.get_tracer(__name__) # Get a tracer instance

#SmolagentsInstrumentor().instrument(tracer_provider=trace_provider)
27
+
28
def add_image(metadata) -> list:
    """Collect PIL images attached to a dataset task.

    Looks for ``attachments/<attachment>`` based on the item's metadata and,
    when the file is a .jpg/.jpeg/.png image, loads it as RGB.

    Args:
        metadata: dataset item metadata; must contain "task_id" and may
            contain "attachment" (a filename under the attachments/ dir).

    Returns:
        list: PIL Images loaded from the attachment (empty when there is no
        attachment, the file is missing, or it is not an image file).
    """
    images = []
    task_id = metadata["task_id"]
    attachment = metadata.get("attachment", False)
    if attachment:
        # BUG FIX: the original stored this path as ``os.file_path``, i.e. it
        # monkey-patched the os module; a plain local variable is correct.
        file_path = f"attachments/{attachment}"
        if os.path.exists(file_path):
            print("Attachments found for task_id:", task_id)
        else:
            print(f"No attachments found for task_id: {task_id}")
        # If the file is an image, load it so it can be handed to the agent.
        if os.path.isfile(file_path) and os.path.splitext(file_path)[1].lower() in ['.jpg', '.jpeg', '.png']:
            with open(file_path, "rb") as file:
                # Read the image file and convert it to RGB
                image = Image.open(file).convert("RGB")
                images.append(image)
    return images
49
+
50
+
51
+ #@observe()
52
def run_agent(agent, question, trace_name, metadata):
    """Run the LangGraph agent on one dataset question and return its answer.

    Args:
        agent: a compiled LangGraph graph exposing ``invoke``.
        question: the question text from the dataset item.
        trace_name: trace label (currently unused; kept for interface
            compatibility with the smolagents evaluation script).
        metadata: item metadata containing at least "task_id".

    Returns:
        str: the agent's final answer, or an error message on failure.
    """
    # If the question has attachments, load them. NOTE(review): the images
    # are collected but not yet forwarded to the model -- see add_image.
    images = add_image(metadata)

    question = question + " The task_id is: " + metadata["task_id"]
    messages = [HumanMessage(content=question)]

    try:
        result = agent.invoke(
            {"messages": messages}
        )
        # BUG FIX: the original never assigned ``output`` on the success path,
        # so ``return output`` raised UnboundLocalError whenever the agent
        # succeeded. The graph returns its final state dict; the answer is the
        # content of the last message -- TODO confirm against build_agents.
        output = result["messages"][-1].content
    except Exception as e:
        print(f"Error running agent: {e}")
        output = f"Error running agent: {e}"

    return output
85
def simple_evaluation(output, expected_output):
    """Score an answer against the expected output.

    Both values are treated as comma-separated lists; expected items are
    greedily matched one-to-one against output items (exact match scores 1.0,
    case-insensitive match 0.8; either counts as matched) and the fraction of
    matched expected items is returned.

    Args:
        output: the agent's answer (any type; converted to str).
        expected_output: the reference answer string.

    Returns:
        float: fraction of expected items matched (0.0 .. 1.0); for inputs
        that split into no items, a direct whole-string comparison is used
        (1.0 exact, 0.8 case-insensitive, else 0.0).
    """
    # Strip surrounding quotes and dollar signs that models often add.
    trimmed_output = str(output).strip().strip('"').strip("$")
    # Interpret both answers as comma-separated lists.
    expected_output_list = [item.strip() for item in expected_output.split(",") if item.strip()]
    output_list = [item.strip() for item in trimmed_output.split(",") if item.strip()]
    if not expected_output_list and not output_list:
        # Nothing splittable on either side: fall back to whole-string compare.
        if trimmed_output == expected_output:
            return 1.0
        # BUG FIX: the original called the non-existent str.toLower() (a
        # Java-ism), raising AttributeError; str.lower() is correct.
        if expected_output.lower() == trimmed_output.lower():
            return 0.8
        return 0.0
    if not expected_output_list:
        # BUG FIX: guard against ZeroDivisionError when only the expected
        # side splits to an empty list.
        return 0.0
    # Greedy one-to-one matching of expected items to output items.
    matched_items = [False] * len(output_list)
    for expected_item in expected_output_list:
        for i, candidate in enumerate(output_list):
            if matched_items[i]:
                continue
            if expected_item == candidate:
                similarity = 1.0
            elif expected_item.lower() == candidate.lower():
                similarity = 0.8
            else:
                similarity = 0.0
            if similarity >= 0.8:
                matched_items[i] = True
                break
    return sum(matched_items) / len(expected_output_list)
118
+
119
+
120
def run_evaluation(agent, langfuse_dataset, run_name, model_id, trace_name):
    """Run the agent over a Langfuse dataset, score each item, save answers.

    Args:
        agent: the compiled agent graph passed to run_agent.
        langfuse_dataset: name of the Langfuse dataset to fetch.
        run_name: label for this evaluation run.
        model_id: model identifier (currently only kept for bookkeeping).
        trace_name: trace label forwarded to run_agent.
    """
    dataset = langfuse.get_dataset(langfuse_dataset)
    responses = []
    # Run our agent against each dataset item.
    for item in dataset.items:
        print(f"Processing item with task_id: {item.metadata['task_id']}")
        with item.run(
            run_name=run_name
        ) as root_span:
            root_span.update(input=item.input)
            task_id = item.metadata["task_id"]
            # NOTE(review): debugging filter -- only this single task runs;
            # remove the condition to evaluate the full dataset.
            if task_id == "a1e91b78-d3d8-4675-bb8d-62741b4b68a6":
                try:
                    output = run_agent(agent, item.input, trace_name, item.metadata)
                    responses.append({"task_id": task_id, "submitted_answer": output})
                    root_span.update(output=output)
                except Exception as e:
                    output = f"Error running agent: {e}"

                # Score the result against the expected output.
                root_span.score_trace(name="exact_match", value=simple_evaluation(output, item.expected_output))

            # Link the trace to the dataset item for analysis
            # item.link(
            #     langfuse_trace,
            #     run_name=run_name,
            #     run_metadata={ "model": model_id }
            # )

    # Flush data to ensure all telemetry is sent.
    langfuse.flush()

    # Save the responses to a JSON file.
    print("Saving responses to file...")
    responses = [{"task_id": item["task_id"], "submitted_answer": item["submitted_answer"]} for item in responses]

    filename = langfuse_dataset + run_name
    # BUG FIX: the original f-string had no placeholder, so ``filename`` was
    # computed but unused and every run wrote the same file; embed the
    # dataset+run name as intended.
    output_file = f"responses_{filename}.json"
    with open(output_file, "w") as f:
        json.dump(responses, f, indent=4)
    print(f"Responses saved to {output_file}")
168
+
169
+
170
def evaluate():
    """Entry point: build the LangGraph agent and run the GAIA evaluation."""
    print("Starting agent...")
    graph = build_agents()
    print("Agent built successfully.")
    run_evaluation(
        graph,
        "GAIA_Evaluation_Dataset",
        "Single Langraph agent",
        "OpenAI gpt4o",
        "langraph-trace",
    )


if __name__ == "__main__":
    evaluate()
langgraph_agent.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import List, TypedDict, Annotated, Optional
3
+ from langgraph.graph import StateGraph, START, END
4
+ from langchain_openai import ChatOpenAI
5
+ from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage
6
+ from langgraph.graph.message import add_messages
7
+ from langgraph.prebuilt import ToolNode, tools_condition
8
+ from tools.searchTools import wiki_search, mini_web_search, arvix_search
9
+
10
class AgentState(TypedDict):
    """Shared state flowing through the LangGraph graph."""
    # Optional path to an attachment for the task; may be absent/None.
    input_file: Optional[str]
    # Conversation history; add_messages appends instead of replacing on
    # each state update.
    messages: Annotated[list[AnyMessage], add_messages]
13
+
14
+
15
# Search tools exposed to the agent.
tools = [
    wiki_search,
    mini_web_search,
    arvix_search,

]


# LLM models; the tool-bound copy is what the agent node invokes.
vision_llm = ChatOpenAI(model="gpt-4o")
llm = ChatOpenAI(model="gpt-4o")
# parallel_tool_calls=False forces one tool call per model turn.
llm_withtools = llm.bind_tools(tools, parallel_tool_calls = False)
28
+
29
def agent(state: AgentState):
    """LLM node of the graph: call the tool-bound model on the conversation.

    Args:
        state: current graph state with "messages" (and optionally
            "input_file").

    Returns:
        dict: partial state update with the model's reply appended to
        "messages" via the add_messages reducer.
    """
    # BUG FIX: the tool descriptions were scrambled (the Tavily summary for
    # mini_web_search sat under wiki_search's Args); each tool now documents
    # itself coherently.
    tools_description = """
    wiki_search(query: str) -> str:
    Search Wikipedia for a query and return maximum 2 results.

    Args:
    query: The search query.

    mini_web_search(query: str) -> str:
    Search Tavily for a query and return maximum 3 results.

    Args:
    query: The search query

    arvix_search(query: str) -> str:
    Search Arxiv for a query and return maximum 3 result.

    Args:
    query: The search query.
    """
    sys_message = SystemMessage(content=f"""You are a helpful AI agent that can use tools to answer questions.
    You can use the following tools:{tools_description}
    PLEASE FOLLOW THE INSTRUCTIONS FOR ANSWERING CAREFULLY:
    Your answer should follow the template: FINAL ANSWER: [YOUR FINAL ANSWER].
    YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
    If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
    If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
    If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
    """)

    return {
        # BUG FIX: use .get() -- callers (run_agent in evaluation_langgraph.py)
        # invoke the graph with only {"messages": ...}, so the original
        # state["input_file"] raised KeyError on every run.
        "input_file": state.get("input_file"),
        "messages": [llm_withtools.invoke([sys_message] + state["messages"])],
    }
63
+
64
+
65
def build_agents():
    """Compile and return the ReAct-style LangGraph agent graph.

    The graph alternates between the LLM node ("agent") and the tool
    executor ("tools") until the model stops requesting tool calls.
    """
    workflow = StateGraph(AgentState)

    # Nodes: the LLM step and the tool executor.
    workflow.add_node("agent", agent)
    workflow.add_node("tools", ToolNode(tools))

    # Start at the LLM; route to tools when the model requested a tool call,
    # otherwise end. Tool results always flow back into the LLM node.
    workflow.add_edge(START, "agent")
    workflow.add_conditional_edges(
        "agent",
        tools_condition,
    )
    workflow.add_edge("tools", "agent")

    return workflow.compile()
80
+
81
+
82
+
requirements.txt CHANGED
@@ -31,4 +31,6 @@ pandas
31
  numpy
32
  beautifulsoup4
33
  openai
34
- pydub
 
 
 
31
  numpy
32
  beautifulsoup4
33
  openai
34
+ pydub
35
+ yt-dlp
36
+ opencv-python
responses_GAIA_Evaluation_DatasetSingle Smolagent with tools OpenAI 4o 3planningSteps youtube.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
4
+ "submitted_answer": "The task could not be completed due to limitations in accessing and analyzing video content directly through current tools."
5
+ }
6
+ ]
responses_GAIA_Evaluation_DatasetSingle Smolagent with tools OpenAI 4o 3planningSteps.json ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "task_id": "5a0c1adf-205e-4841-a666-7c3ef95def9d",
4
+ "submitted_answer": "Dmitry"
5
+ },
6
+ {
7
+ "task_id": "7bd855d8-463d-4ed5-93ca-5fe35145f733",
8
+ "submitted_answer": "$89706.00"
9
+ },
10
+ {
11
+ "task_id": "a0c07678-e491-4bbc-8f0b-07405144218f",
12
+ "submitted_answer": "Yamasaki, Uehara"
13
+ },
14
+ {
15
+ "task_id": "cf106601-ab4f-4af9-b045-5295fe67b37d",
16
+ "submitted_answer": "LUX"
17
+ },
18
+ {
19
+ "task_id": "bda648d7-d618-4883-88f4-3466eabd860e",
20
+ "submitted_answer": "Saint Petersburg"
21
+ },
22
+ {
23
+ "task_id": "840bfca7-4f7b-481a-8794-c560c340185d",
24
+ "submitted_answer": "The work performed by R. G. Arendt related to the paper \"The Population of the Galactic Center Filaments: Position Angle Distribution Reveals a Degree-scale Collimated Outflow from Sgr A*\" was supported under the NASA award number 80GSFC21M0002."
25
+ },
26
+ {
27
+ "task_id": "1f975693-876d-457b-a649-393859e79bf3",
28
+ "submitted_answer": "132,133,134,197,245"
29
+ },
30
+ {
31
+ "task_id": "3f57289b-8c60-48be-bd80-01f8099ca449",
32
+ "submitted_answer": 519
33
+ },
34
+ {
35
+ "task_id": "f918266a-b3e0-4914-865d-4faa564f1aef",
36
+ "submitted_answer": 0
37
+ },
38
+ {
39
+ "task_id": "305ac316-eef6-4446-960a-92d80d542f82",
40
+ "submitted_answer": "Wojciech"
41
+ },
42
+ {
43
+ "task_id": "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3",
44
+ "submitted_answer": "cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries"
45
+ },
46
+ {
47
+ "task_id": "3cef3a44-215e-4aed-8e3b-b1e3f08063b7",
48
+ "submitted_answer": "broccoli, celery, lettuce, sweet potatoes"
49
+ },
50
+ {
51
+ "task_id": "cabe07ed-9eca-40ea-8ead-410ef5e83f91",
52
+ "submitted_answer": "Louvrier"
53
+ },
54
+ {
55
+ "task_id": "9d191bce-651d-4746-be2d-7ef8ecadb9c2",
56
+ "submitted_answer": "Extremely"
57
+ },
58
+ {
59
+ "task_id": "6f37996b-2ac7-44b0-8e68-6d28256631b4",
60
+ "submitted_answer": "b,e"
61
+ },
62
+ {
63
+ "task_id": "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8",
64
+ "submitted_answer": "It appears that navigating through the available content has yet to yield direct information regarding the nominator of the \"Giganotosaurus\" featured article nomination. However, using data available from my training, the nomination of \"Giganotosaurus\" as a Featured Article was put forward by the Wikipedia user \"FunkMonk.\""
65
+ },
66
+ {
67
+ "task_id": "cca530fc-4052-43b2-b130-b30968d8aa44",
68
+ "submitted_answer": "Error running agent: 'str' object has no attribute 'token_usage'"
69
+ },
70
+ {
71
+ "task_id": "2d83110e-a098-4ebb-9987-066c06fa42d0",
72
+ "submitted_answer": "right"
73
+ },
74
+ {
75
+ "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
76
+ "submitted_answer": "3"
77
+ },
78
+ {
79
+ "task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
80
+ "submitted_answer": "To determine how many studio albums Mercedes Sosa published between 2000 and 2009, we can rely on the extracted information from a review of Mercedes Sosa's discography and award records listed on or linked from Wikipedia:\n\nFrom the search results and other visible references on Wikipedia pages:\n- **2000**: Misa Criolla (acknowledged with a Grammy award)\n- **2003**: Ac\u00fastico\n- **2006**: Coraz\u00f3n Libre\n- **2009**: Cantora 1 (part of Cantora, un Viaje \u00cdntimo)\n\nThese references match the cited awards and recognitions indicating these are studio albums released within the specified time frame. So, there are **four** studio albums released during this period according to the mentioned resources.\n\n**Final Answer**: Mercedes Sosa released **four** studio albums between 2000 and 2009."
81
+ }
82
+ ]
responses_GAIA_Evaluation_DatasetSingle Smolagent with tools Qwen 32B 3planningSteps.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "task_id": "5a0c1adf-205e-4841-a666-7c3ef95def9d",
4
+ "submitted_answer": "Dmitry"
5
+ },
6
+ {
7
+ "task_id": "7bd855d8-463d-4ed5-93ca-5fe35145f733",
8
+ "submitted_answer": "89706.00"
9
+ },
10
+ {
11
+ "task_id": "a0c07678-e491-4bbc-8f0b-07405144218f",
12
+ "submitted_answer": "Based on the information provided, the pitchers with the numbers before and after Taish\u014d Tamai's number (19) as of July 2023 are:\n\n**Pitcher Before, Pitcher After: Sachiya, Kenta**\n\nSo, the final answer is:\n**Sachiya, Kenta**"
13
+ },
14
+ {
15
+ "task_id": "cf106601-ab4f-4af9-b045-5295fe67b37d",
16
+ "submitted_answer": "Given the issues encountered with the CSV file, I will use a different approach to solve the task. I will manually extract the data from the Olympedia website and combine it with the IOC country codes.\n\nHere is the plan:\n1. Extract the athlete counts for each country from the Olympedia webpage.\n2. Manually map the country names to their IOC country codes.\n3. Sort the data by the number of athletes in ascending order and then alphabetically by IOC country code.\n4. Identify the country with the least number of athletes.\n5. Provide the IOC country code of the identified country as the final answer.\n\nLet's proceed with this plan.\n\n### Step 1: Extract the athlete counts for each country from the Olympedia webpage.\n\nI will manually extract the data from the Olympedia webpage and store it in a dictionary.\n\n### Step 2: Manually map the country names to their IOC country codes.\n\nI will use a dictionary to map the country names to their IOC country codes.\n\n### Step 3: Sort the data by the number of athletes in ascending order and then alphabetically by IOC country code.\n\n### Step 4: Identify the country with the least number of athletes.\n\n### Step 5: Provide the IOC country code of the identified country as the final answer.\n\nLet's implement this plan.\n```py\n# Step 1: Manually extract the athlete counts for each country from the Olympedia webpage\nathlete_counts = {\n 'ARG': 81,\n 'AUS': 18,\n 'AUT': 73,\n 'BEL': 187,\n 'BUL': 5,\n 'CAN': 68,\n 'CHI': 38,\n 'CUB': 1,\n 'DEN': 91,\n 'EGY': 32,\n 'ESP': 80,\n 'EST': 20,\n 'FIN': 69,\n 'FRA': 294,\n 'GBR': 232,\n 'GER': 298,\n 'GRE': 23,\n 'HAI': 2,\n 'HUN': 109,\n 'IND': 21,\n 'IRL': 38,\n 'ITA': 174,\n 'JPN': 40,\n 'LAT': 17,\n 'LTU': 12,\n 'LUX': 48,\n 'MEX': 30,\n 'MLT': 9,\n 'MON': 10,\n 'NED': 266,\n 'NOR': 52,\n 'NZL': 9,\n 'PAN': 1,\n 'PHI': 4,\n 'POL': 100,\n 'POR': 32,\n 'ROU': 21,\n 'RSA': 24,\n 'SUI': 133,\n 'SWE': 100,\n 'TCH': 69,\n 'TUR': 31,\n 'URU': 17,\n 'USA': 
281,\n 'YUG': 34,\n 'ZIM': 2\n}\n\n# Step 2: Manually map the country names to their IOC country codes\ncountry_to_ioc = {\n 'Argentina': 'ARG',\n 'Australia': 'AUS',\n 'Austria': 'AUT',\n 'Belgium': 'BEL',\n 'Bulgaria': 'BUL',\n 'Canada': 'CAN',\n 'Chile': 'CHI',\n 'Cuba': 'CUB',\n 'Denmark': 'DEN',\n 'Egypt': 'EGY',\n 'Spain': 'ESP',\n 'Estonia': 'EST',\n 'Finland': 'FIN',\n 'France': 'FRA',\n 'Great Britain': 'GBR',\n 'Germany': 'GER',\n 'Greece': 'GRE',\n 'Haiti': 'HAI',\n 'Hungary': 'HUN',\n 'India': 'IND',\n 'Ireland': 'IRL',\n 'Italy': 'ITA',\n 'Japan': 'JPN',\n 'Latvia': 'LAT',\n 'Lithuania': 'LTU',\n 'Luxembourg': 'LUX',\n 'Mexico': 'MEX',\n 'Malta': 'MLT',\n 'Monaco': 'MON',\n 'Netherlands': 'NED',\n 'Norway': 'NOR',\n 'New Zealand': 'NZL',\n 'Panama': 'PAN',\n 'Philippines': 'PHI',\n 'Poland': 'POL',\n 'Portugal': 'POR',\n 'Romania': 'ROU',\n 'South Africa': 'RSA',\n 'Switzerland': 'SUI',\n 'Sweden': 'SWE',\n 'Czechoslovakia': 'TCH',\n 'Turkey': 'TUR',\n 'Uruguay': 'URU',\n 'United States': 'USA',\n 'Yugoslavia': 'YUG',\n 'Zimbabwe': 'ZIM'\n}\n\n# Step 3: Sort the data by the number of athletes in ascending order and then alphabetically by IOC country code\nsorted_countries = sorted(athlete_counts.items(), key=lambda x: (x[1], x[0]))\n\n# Step 4: Identify the country with the least number of athletes\nleast_athletes_country_code = sorted_countries[0][0]\n\n# Step 5: Provide the IOC country code of the identified country as the final answer\nprint(f\"The IOC country code of the country with the least number of athletes is {least_athletes_country_code}.\")\n```<end_code>\nCalling tools:\n[{'id': 'call_15', 'type': 'function', 'function': {'name': 'python_interpreter', 'arguments': \"# Step 1: Manually extract the athlete counts for each country from the Olympedia webpage\\nathlete_counts = {\\n 'ARG': 81,\\n 'AUS': 18,\\n 'AUT': 73,\\n 'BEL': 187,\\n 'BUL': 5,\\n 'CAN': 68,\\n 'CHI': 38,\\n 'CUB': 1,\\n 'DEN': 91,\\n 'EGY': 32,\\n 'ESP': 80,\\n 'EST': 
20,\\n 'FIN': 69,\\n 'FRA': 294,\\n 'GBR': 232,\\n 'GER': 298,\\n 'GRE': 23,\\n 'HAI': 2,\\n 'HUN': 109,\\n 'IND': 21,\\n 'IRL': 38,\\n 'ITA': 174,\\n 'JPN': 40,\\n 'LAT': 17,\\n 'LTU': 12,\\n 'LUX': 48,\\n 'MEX': 30,\\n 'MLT': 9,\\n 'MON': 10,\\n 'NED': 266,\\n 'NOR': 52,\\n 'NZL': 9,\\n 'PAN': 1,\\n 'PHI': 4,\\n 'POL': 100,\\n 'POR': 32,\\n 'ROU': 21,\\n 'RSA': 24,\\n 'SUI': 133,\\n 'SWE': 100,\\n 'TCH': 69,\\n 'TUR': 31,\\n 'URU': 17,\\n 'USA': 281,\\n 'YUG': 34,\\n 'ZIM': 2\\n}\\n\\n# Step 2: Manually map the country names to their IOC country codes\\ncountry_to_ioc = {\\n 'Argentina': 'ARG',\\n 'Australia': 'AUS',\\n 'Austria': 'AUT',\\n 'Belgium': 'BEL',\\n 'Bulgaria': 'BUL',\\n 'Canada': 'CAN',\\n 'Chile': 'CHI',\\n 'Cuba': 'CUB',\\n 'Denmark': 'DEN',\\n 'Egypt': 'EGY',\\n 'Spain': 'ESP',\\n 'Estonia': 'EST',\\n 'Finland': 'FIN',\\n 'France': 'FRA',\\n 'Great Britain': 'GBR',\\n 'Germany': 'GER',\\n 'Greece': 'GRE',\\n 'Haiti': 'HAI',\\n 'Hungary': 'HUN',\\n 'India': 'IND',\\n 'Ireland': 'IRL',\\n 'Italy': 'ITA',\\n 'Japan': 'JPN',\\n 'Latvia': 'LAT',\\n 'Lithuania': 'LTU',\\n 'Luxembourg': 'LUX',\\n 'Mexico': 'MEX',\\n 'Malta': 'MLT"
17
+ },
18
+ {
19
+ "task_id": "bda648d7-d618-4883-88f4-3466eabd860e",
20
+ "submitted_answer": "Error running agent: Error in generating model output:\n503 Server Error: Service Temporarily Unavailable for url: https://router.huggingface.co/together/v1/chat/completions"
21
+ },
22
+ {
23
+ "task_id": "840bfca7-4f7b-481a-8794-c560c340185d",
24
+ "submitted_answer": "Error running agent: Error in generating model output:\n422 Client Error: Unprocessable Entity for url: https://router.huggingface.co/together/v1/chat/completions (Request ID: o1M6n9q-4YNCb4-958746faadec07fb)\n\n{'message': 'Input validation error: `inputs` tokens + `max_new_tokens` must be <= 32769. Given: 45347 `inputs` tokens and 2048 `max_new_tokens`', 'type': 'invalid_request_error', 'param': None, 'code': None}\n{\n \"id\": \"o1M6n9q-4YNCb4-958746faadec07fb\",\n \"error\": {\n \"message\": \"Input validation error: `inputs` tokens + `max_new_tokens` must be <= 32769. Given: 45347 `inputs` tokens and 2048 `max_new_tokens`\",\n \"type\": \"invalid_request_error\",\n \"param\": null,\n \"code\": null\n }\n}\n"
25
+ },
26
+ {
27
+ "task_id": "1f975693-876d-457b-a649-393859e79bf3",
28
+ "submitted_answer": "132,197,245"
29
+ },
30
+ {
31
+ "task_id": "3f57289b-8c60-48be-bd80-01f8099ca449",
32
+ "submitted_answer": ""
33
+ }
34
+ ]
single_smolagent.py CHANGED
@@ -19,6 +19,7 @@ from openinference.instrumentation.smolagents import SmolagentsInstrumentor
19
  from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
20
  from opentelemetry.sdk.trace.export import SimpleSpanProcessor
21
  from mcp import StdioServerParameters
 
22
 
23
  trace_provider = TracerProvider()
24
  trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter()))
@@ -62,6 +63,7 @@ def build_agents():
62
  transcribe_mp3_with_whisper],
63
  model=model,
64
  additional_authorized_imports=["time","pandas","json","numpy","markdownify","requests","re","openpyxl","beautifulsoup4"],
 
65
  planning_interval=3,
66
  max_steps=10,
67
  add_base_tools=True)
 
19
  from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
20
  from opentelemetry.sdk.trace.export import SimpleSpanProcessor
21
  from mcp import StdioServerParameters
22
+ from tools.transcribe import load_images
23
 
24
  trace_provider = TracerProvider()
25
  trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter()))
 
63
  transcribe_mp3_with_whisper],
64
  model=model,
65
  additional_authorized_imports=["time","pandas","json","numpy","markdownify","requests","re","openpyxl","beautifulsoup4"],
66
+ step_callbacks=[load_images],
67
  planning_interval=3,
68
  max_steps=10,
69
  add_base_tools=True)
tools/transcribe.py CHANGED
@@ -7,6 +7,42 @@ from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLo
7
 
8
  from langchain.schema import Document
9
  from smolagents import tool
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  @tool
12
  def parse_youtube_video(url:str,task_id:str,save_dir:str="attachments")->Document:
@@ -26,4 +62,69 @@ def parse_youtube_video(url:str,task_id:str,save_dir:str="attachments")->Documen
26
  else:
27
  parser = OpenAIWhisperParser()
28
  document = GenericLoader(loader, parser).load()
29
- return document
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  from langchain.schema import Document
9
  from smolagents import tool
10
+ from yt_dlp import YoutubeDL
11
+ from PIL import Image
12
+ import cv2
13
+ import numpy as np
14
+ from smolagents.agents import ActionStep
15
+ from smolagents import CodeAgent
16
+
17
def get_video_frames(video_path: str, task_id: str) -> list[dict]:
    """Sample 7 evenly spaced frames from a video and save them as PNGs.

    Args:
        video_path: path to the (downloaded) video file.
        task_id: task identifier used to namespace the saved frame files.

    Returns:
        list[dict]: one entry per extracted frame, with keys "image"
        (saved PNG path under attachments/) and "timestamp" (seconds).
    """
    vidcap = cv2.VideoCapture(video_path)
    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = vidcap.get(cv2.CAP_PROP_FPS)

    frames = []
    # 7 evenly spaced frame indices across the whole clip.
    frame_indices = np.linspace(0, total_frames - 1, 7, dtype=int)

    for i in frame_indices:
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
        success, image = vidcap.read()
        if success:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # BGR -> RGB for PIL
            pil_image = Image.fromarray(image)
            timestamp = round(i / fps, 2)
            frames.append((pil_image, timestamp))

    vidcap.release()
    observations = []
    for pil_image, timestamp in frames:
        # BUG FIX: frames were saved to attachments/frame_<ts>.png but
        # reported as attachments/<task_id>_frame_<ts>.png, so the reported
        # paths never existed. Save and report the same task-scoped path.
        frame_path = f"attachments/{task_id}_frame_{timestamp}.png"
        pil_image.save(frame_path, format='PNG')
        observations.append({
            "image": frame_path,
            "timestamp": timestamp
        })

    return observations
45
+
46
 
47
  @tool
48
  def parse_youtube_video(url:str,task_id:str,save_dir:str="attachments")->Document:
 
62
  else:
63
  parser = OpenAIWhisperParser()
64
  document = GenericLoader(loader, parser).load()
65
+ return document
66
+
67
@tool
def download_youtube_video(url:str,task_id:str,save_dir:str="attachments")->list[dict]:
    """Download a YouTube video and return sampled frames from it.

    Downloads a low-quality, video-only MP4 via yt-dlp into ``save_dir``
    (named ``<task_id>.<ext>``) and then extracts 7 evenly spaced frames.

    Args:
        url (str): The URL of the YouTube video.
        task_id (str): The task ID used to name the downloaded file.
        save_dir (str): The directory to save the downloaded video. Defaults to "attachments".
    Returns:
        list: frame observations as produced by get_video_frames
        (dicts with an "image" path and a "timestamp")."""
    # NOTE(review): message says "audio" but this downloads video.
    print(f"Downloading audio from YouTube: {url}")
    #output_path = generate_unique_filename(".wav")
    ydl_opts = {
        # Prefer small (<=480p) video-only MP4 streams.
        'format': 'bestvideo[ext=mp4][height<=480]/bestvideo[ext=mp4]/bestvideo',
        'outtmpl': os.path.join(save_dir, task_id+'.%(ext)s'),
        # 'postprocessors': [{
        #     'key': 'FFmpegVideoConvertor',
        #     'preferredformat': 'mp4',
        # }],
        'progress_hooks': [lambda d: print(d['status'])],
        'ignoreerrors': True,
        'no_warnings': False,
        # NOTE(review): 'log_verbosity' is not a documented yt-dlp option
        # ('quiet'/'verbose' are) -- confirm it has any effect.
        'log_verbosity': 'quiet',
    }
    try:
        with YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(url, download=True)
            video_title = info_dict.get('title', 'video')
            print(f"Successfully downloaded '{video_title}' as low-quality MP4 (video-only) to '{save_dir}'")
    except Exception as e:
        print(f"An error occurred: {e}")
    # get the video frames:
    # NOTE(review): assumes the download ended up as <task_id>.mp4; if yt-dlp
    # fell back to a non-mp4 stream this path will not exist -- confirm.
    observations = []
    observations = get_video_frames(os.path.join(save_dir, task_id+'.mp4'),task_id)

    return observations
102
+
103
def load_images(step_log: ActionStep, agent: CodeAgent) -> None:
    """Per-step agent callback: currently just echoes the model output.

    The screenshot/observation-pruning logic this hook was intended to hold
    is not implemented yet; the callback only logs the step's model output.
    """
    current_step = step_log.step_number  # kept for future log-pruning logic

    print(step_log.model_output)
    return
120
+
121
+
122
+
123
if __name__ == "__main__":

    # Manual smoke test for the download + frame-extraction pipeline.
    url = "https://www.youtube.com/watch?v=1htKBjuUWec"
    # Alternative test video: https://www.youtube.com/watch?v=L1vXCYZAYYM
    task_id = "test_task"
    save_dir = "attachments"
    # Test the YouTube video parsing
    # NOTE(review): download_youtube_video is wrapped by @tool -- confirm the
    # decorated object is still directly callable like this.
    download_youtube_video(url, task_id, save_dir)