Final_Assignment_Template

Runtime error

App Files Files Community

silasyl committed on May 8, 2025

Commit

ecbc0b3

1 Parent(s): 2705160

Initial commit with LFS-tracked files

Browse files

Files changed (13) hide show

.gitattributes +1 -0
README.md +7 -4
app.py +262 -0
files/1f975693-876d-457b-a649-393859e79bf3.mp3 +3 -0
files/7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx +0 -0
files/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3 +3 -0
files/cca530fc-4052-43b2-b130-b30968d8aa44.png +0 -0
files/f918266a-b3e0-4914-865d-4faa564f1aef.py +35 -0
final_answer_llm.py +50 -0
questions_data.json +122 -0
requirements.txt +0 -0
tools.py +386 -0
vision_llm.py +78 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,12 +1,15 @@
 ---
 title: Template Final Assignment
-emoji: 💻
-colorFrom: red
-colorTo: pink
 sdk: gradio
 sdk_version: 5.25.2
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Template Final Assignment
+emoji: 🕵🏻‍♂️
+colorFrom: indigo
+colorTo: indigo
 sdk: gradio
 sdk_version: 5.25.2
 app_file: app.py
 pinned: false
+hf_oauth: true
+# optional, default duration is 8 hours/480 minutes. Max duration is 30 days/43200 minutes.
+hf_oauth_expiration_minutes: 480
 ---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,262 @@

+import os
+import gradio as gr
+import inspect
+import json
+import numpy as np
+import pandas as pd
+import requests
+from smolagents import CodeAgent, DuckDuckGoSearchTool, OpenAIServerModel, Tool, VisitWebpageTool
+from tools import WikipediaSummaryTool, WikipediaPageTool, YouTubeVisionAnalyzer, YouTubeTranscriptTool, AudioFileTranscriptTool, PythonFileReader, ExcelFileLoader
+from vision_llm import call_vision_llm
+from final_answer_llm import check_final_answer
+# (Keep Constants as is)
+# --- Constants ---
+# Base URL of the scoring service; run_and_submit_all derives the
+# "/questions" and "/submit" endpoints from it.
+DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
+# --- Basic Agent Definition ---
+class BasicAgent:
+    """Question-answering agent built on a smolagents ``CodeAgent`` and an OpenAI model.
+
+    Instances are callable: they take one question record (a dict with
+    ``task_id``, ``question`` and ``file_name`` keys) and return the answer
+    after it has been condensed by ``check_final_answer``.
+    """
+    def __init__(self, api_url):
+        """Create the LLM and the tool-equipped main agent.
+
+        Args:
+            api_url: Base URL of the scoring API. It is handed to the agent as
+                the ``file_url`` of questions that come with an attached file.
+
+        Raises:
+            ValueError: If the ``OPENAI_API_KEY`` environment variable is unset
+                or empty.
+        """
+        # OpenAI API credentials. Fail early with a clear message: assigning
+        # None to os.environ would otherwise raise an opaque TypeError.
+        openai_api_key = os.getenv('OPENAI_API_KEY')
+        if not openai_api_key:
+            raise ValueError("OPENAI_API_KEY environment variable is not set.")
+        os.environ["OPENAI_API_KEY"] = openai_api_key
+        self.api_url = api_url
+        # Initialize the LLM
+        model = OpenAIServerModel(
+            api_key=openai_api_key,
+            model_id='gpt-4o-mini',
+            temperature=0,
+        )
+        self.model = model
+        # Create main agent with all the tools
+        self.main_agent = CodeAgent(
+            tools=[
+                DuckDuckGoSearchTool(),
+                VisitWebpageTool(),
+                WikipediaSummaryTool(),
+                WikipediaPageTool(),
+                YouTubeVisionAnalyzer(),
+                YouTubeTranscriptTool(),
+                AudioFileTranscriptTool(),
+                PythonFileReader(),
+                ExcelFileLoader(),
+                call_vision_llm,
+            ],
+            model=model,
+            max_steps=15,
+            planning_interval=5,
+            additional_authorized_imports=[
+                "pandas",
+                "json",
+                "numpy",
+            ],
+        )
+        print("BasicAgent initialized.")
+    def __call__(self, question_data: dict) -> str:
+        """Answer one question record.
+
+        Args:
+            question_data: Dict with ``task_id``, ``question`` and
+                ``file_name`` entries. ``question`` must be a string.
+
+        Returns:
+            The value returned by ``check_final_answer`` for the agent's response.
+        """
+        task_id = question_data.get("task_id")
+        question = question_data.get("question")
+        file_name = question_data.get("file_name")
+        print(f"Agent received question (first 50 chars): {question[:50]}...")
+        # Only questions with an attached file get the download metadata.
+        # A truthiness test also covers a missing "file_name" key (None),
+        # which the former `!= ''` comparison treated as "has a file".
+        if file_name:
+            # Add metadata for file download
+            question = f"User query:\n{question}\n\nfile_id:\n{task_id}\n\nfile_url:\n{self.api_url}"
+        response = self.main_agent.run(question)
+        final_response = check_final_answer(question, response)
+        print(f"Agent returning response: {final_response}")
+        return final_response
+def run_and_submit_all(profile: gr.OAuthProfile | None):
+    """
+    Fetch the questions, run the BasicAgent on a debug subset of them and
+    report the collected answers. The submission request itself is currently
+    disabled (its code is kept commented out below).
+
+    Args:
+        profile: OAuth profile of the logged-in Hugging Face user, or None
+            when nobody is logged in.
+
+    Returns:
+        A ``(status_message, results)`` tuple. ``results`` is a pandas
+        DataFrame with one row per processed question, or None when the run
+        stops before any question is processed.
+    """
+    # --- Determine HF Space Runtime URL and Repo URL ---
+    space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
+    if profile:
+        username= f"{profile.username}"
+        print(f"User logged in: {username}")
+    else:
+        print("User not logged in.")
+        return "Please Login to Hugging Face with the button.", None
+    api_url = DEFAULT_API_URL
+    questions_url = f"{api_url}/questions"
+    submit_url = f"{api_url}/submit"
+    # 1. Instantiate Agent ( modify this part to create your agent)
+    try:
+        agent = BasicAgent(api_url)
+    except Exception as e:
+        print(f"Error instantiating agent: {e}")
+        return f"Error initializing agent: {e}", None
+    # When the app runs as a Hugging Face Space, this link points to your codebase (useful for others, so please keep it public)
+    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
+    print(agent_code)
+    # 2. Fetch Questions
+    print(f"Fetching questions from: {questions_url}")
+    try:
+        response = requests.get(questions_url, timeout=15)
+        response.raise_for_status()
+        questions_data = response.json()
+        if not questions_data:
+            print("Fetched questions list is empty.")
+            return "Fetched questions list is empty or invalid format.", None
+        print(f"Fetched {len(questions_data)} questions.")
+    except requests.exceptions.RequestException as e:
+        # Network/HTTP failure: fall back to the question list bundled with the repo.
+        print(f"Error fetching questions: {e}")
+        print("Trying to load questions from backup JSON...")
+        try:
+            with open("questions_data.json", "r", encoding="utf-8") as f:
+                questions_data = json.load(f)
+                print(f"Loaded {len(questions_data)} questions.")
+        except Exception as json_e:
+            print(f"Failed to load backup questions: {json_e}")
+            return f"Error fetching from API and backup failed: {json_e}", None
+    except requests.exceptions.JSONDecodeError as e:
+        # NOTE(review): requests' JSONDecodeError appears to derive from
+        # RequestException, so the handler above would catch decode errors
+        # first and this branch would never run - confirm which behavior
+        # (backup fallback vs. error message) is intended.
+        print(f"Error decoding JSON response from questions endpoint: {e}")
+        print(f"Response text: {response.text[:500]}")
+        return f"Error decoding server response for questions: {e}", None
+    except Exception as e:
+        print(f"An unexpected error occurred fetching questions: {e}")
+        return f"An unexpected error occurred fetching questions: {e}", None
+    # 3. Run your Agent
+    results_log = []
+    answers_payload = []
+    # Debug subset (chess png, mp3 recipe, py file, xlsx). Remove this later.
+    # Indices beyond the end of the list are skipped so that a shorter
+    # question list cannot raise an uncaught IndexError.
+    debug_indices = (3, 9, 11, 18)
+    questions_data = [questions_data[i] for i in debug_indices if i < len(questions_data)]
+    print(f"Running agent on {len(questions_data)} questions...")
+    for item in questions_data:
+        task_id = item.get("task_id")
+        question_text = item.get("question")
+        if not task_id or question_text is None:
+            print(f"Skipping item with missing task_id or question: {item}")
+            continue
+        try:
+            submitted_answer = agent(item)
+            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
+            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
+        except Exception as e:
+            # Keep going: one failing question must not abort the whole run.
+            print(f"Error running agent on task {task_id}: {e}")
+            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
+    if not answers_payload:
+        print("Agent did not produce any answers to submit.")
+        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
+    # 4. Prepare Submission
+    submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
+    status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
+    print(status_update)
+    # 5. Submit (the request below is disabled; nothing is sent to the server)
+    print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
+#    try:
+#        response = requests.post(submit_url, json=submission_data, timeout=60)
+#        response.raise_for_status()
+#        result_data = response.json()
+#        final_status = (
+#            f"Submission Successful!\n"
+#            f"User: {result_data.get('username')}\n"
+#            f"Overall Score: {result_data.get('score', 'N/A')}% "
+#            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
+#            f"Message: {result_data.get('message', 'No message received.')}"
+#        )
+#        print("Submission successful.")
+#        results_df = pd.DataFrame(results_log)
+#        return final_status, results_df
+#    except requests.exceptions.HTTPError as e:
+#        error_detail = f"Server responded with status {e.response.status_code}."
+#        try:
+#            error_json = e.response.json()
+#            error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
+#        except requests.exceptions.JSONDecodeError:
+#            error_detail += f" Response: {e.response.text[:500]}"
+#        status_message = f"Submission Failed: {error_detail}"
+#        print(status_message)
+#        results_df = pd.DataFrame(results_log)
+#        return status_message, results_df
+#    except requests.exceptions.Timeout:
+#        status_message = "Submission Failed: The request timed out."
+#        print(status_message)
+#        results_df = pd.DataFrame(results_log)
+#        return status_message, results_df
+#    except requests.exceptions.RequestException as e:
+#        status_message = f"Submission Failed: Network error - {e}"
+#        print(status_message)
+#        results_df = pd.DataFrame(results_log)
+#        return status_message, results_df
+#    except Exception as e:
+#        status_message = f"An unexpected error occurred during submission: {e}"
+#        print(status_message)
+#        results_df = pd.DataFrame(results_log)
+#        return status_message, results_df
+    # Return the collected answers so the results table is populated even
+    # while submission is disabled (an empty DataFrame was returned before).
+    return "finished", pd.DataFrame(results_log)
+# --- Build Gradio Interface using Blocks ---
+# Components are created inside the Blocks context in the order written here:
+# title, instructions, login button, run button, status box, results table.
+with gr.Blocks() as demo:
+    gr.Markdown("# Basic Agent Evaluation Runner")
+    gr.Markdown(
+        """
+        **Instructions:**
+        1.  Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
+        2.  Log in to your Hugging Face account using the button below. This uses your HF username for submission.
+        3.  Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
+        ---
+        **Disclaimers:**
+        Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
+        This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
+        """
+    )
+    gr.LoginButton()
+    run_button = gr.Button("Run Evaluation & Submit All Answers")
+    # Read-only text box that receives the status string returned by run_and_submit_all.
+    status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
+    # Removed max_rows=10 from DataFrame constructor
+    # Table that receives the second value returned by run_and_submit_all.
+    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
+    # No explicit inputs are wired to the click handler; its two return values
+    # map onto the status box and the results table, in that order.
+    run_button.click(
+        fn=run_and_submit_all,
+        outputs=[status_output, results_table]
+    )
+if __name__ == "__main__":
+    # Startup banner: lists the debug question subset, then environment info.
+    print("tests 3 (chess png), 9 (mp3 recipe), 11 (py), 18 (xlsx)")
+    print("\n" + "-"*30 + " App Starting " + "-"*30)
+    # Report the Space runtime host when the environment provides one.
+    space_host_startup = os.getenv("SPACE_HOST")
+    if not space_host_startup:
+        print("ℹ️  SPACE_HOST environment variable not found (running locally?).")
+    else:
+        print(f"✅ SPACE_HOST found: {space_host_startup}")
+        print(f"   Runtime URL should be: https://{space_host_startup}.hf.space")
+    # Report the repository URLs when the Space id is known.
+    space_id_startup = os.getenv("SPACE_ID")
+    if not space_id_startup:
+        print("ℹ️  SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
+    else:
+        print(f"✅ SPACE_ID found: {space_id_startup}")
+        print(f"   Repo URL: https://huggingface.co/spaces/{space_id_startup}")
+        print(f"   Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
+    # Closing rule, as wide as the opening banner line.
+    print("-"*(60 + len(" App Starting ")) + "\n")
+    print("Launching Gradio Interface for Basic Agent Evaluation...")
+    demo.launch(debug=True, share=False)

files/1f975693-876d-457b-a649-393859e79bf3.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:200f767e732b49efef5c05d128903ee4d2c34e66fdce7f5593ac123b2e637673
+size 280868

files/7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx ADDED Viewed

Binary file (5.29 kB). View file

files/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b218c951c1f888f0bbe6f46c080f57afc7c9348fffc7ba4da35749ff1e2ac40f
+size 179304

files/cca530fc-4052-43b2-b130-b30968d8aa44.png ADDED Viewed

files/f918266a-b3e0-4914-865d-4faa564f1aef.py ADDED Viewed

	@@ -0,0 +1,35 @@

+from random import randint
+import time
+# Raised by Hmm.Yeah() when the drawn value is not zero.
+class UhOh(Exception):
+    pass
+class Hmm:
+    """Holds one random integer drawn at construction time."""
+    def __init__(self):
+        # randint includes both endpoints, so value is in [-100, 100].
+        self.value = randint(-100, 100)
+    def Yeah(self):
+        """Return True when the value is 0; raise UhOh for any other value."""
+        if self.value == 0:
+            return True
+        else:
+            raise UhOh()
+def Okay():
+    """Endless generator that yields a freshly constructed Hmm on every step."""
+    while True:
+        yield Hmm()
+def keep_trying(go, first_try=True):
+    """Draw objects from the iterator ``go`` until one passes its Yeah() check.
+
+    Returns the ``value`` of the first object whose Yeah() returns a truthy
+    result. On UhOh it waits 0.1 s and retries recursively, so every failed
+    attempt adds one recursion frame.
+    """
+    maybe = next(go)
+    try:
+        if maybe.Yeah():
+            return maybe.value
+    except UhOh:
+        # The progress messages are printed only for the first failed attempt.
+        if first_try:
+            print("Working...")
+            print("Please wait patiently...")
+        time.sleep(0.1)
+        return keep_trying(go, first_try=False)
+# Script entry point: print the value returned by keep_trying.
+if __name__ == "__main__":
+    go = Okay()
+    print(f"{keep_trying(go)}")

final_answer_llm.py ADDED Viewed

	@@ -0,0 +1,50 @@

+import os
+from smolagents import OpenAIServerModel
+# System prompt for the answer-condensing call made in check_final_answer:
+# the model may only shorten the given answer, never change or extend it.
+system_text = """
+Your task is to make the given answer as concise as possible **without changing its meaning** or introducing any new information.
+Only shorten it by removing redundancy or unnecessary phrasing. You must preserve the original answer's facts and structure. Do not generate new content.
+Only output the revised answer. Do not include explanations or formatting like "Answer:" or "Final Answer".
+"""
+def check_final_answer(question: str, answer: str) -> str:
+    """
+    Ask an LLM to condense the agent's answer so it suits GAIA answer comparison.
+    Args:
+        question: The question that was asked.
+        answer: The agent's original final answer.
+    Returns:
+        The content of the model's reply.
+    """
+    def _text_message(role, text):
+        # Wrap plain text in the role/content structure handed to the model.
+        return {"role": role, "content": [{"type": "text", "text": text}]}
+    condenser = OpenAIServerModel(
+        api_key=os.getenv('OPENAI_API_KEY'),
+        model_id='gpt-4o-mini',
+        temperature=0,
+    )
+    # System prompt first, then the question together with the answer to shorten.
+    conversation = [
+        _text_message("system", system_text),
+        _text_message("user", f"Question:\n{question}\n\nOriginal Answer:\n{answer}"),
+    ]
+    reply = condenser(conversation)
+    return reply.content

questions_data.json ADDED Viewed

	@@ -0,0 +1,122 @@

+[
+    {
+        "task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
+        "question": "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.",
+        "Level": "1",
+        "file_name": ""
+    },
+    {
+        "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
+        "question": "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",
+        "Level": "1",
+        "file_name": ""
+    },
+    {
+        "task_id": "2d83110e-a098-4ebb-9987-066c06fa42d0",
+        "question": ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",
+        "Level": "1",
+        "file_name": ""
+    },
+    {
+        "task_id": "cca530fc-4052-43b2-b130-b30968d8aa44",
+        "question": "Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.",
+        "Level": "1",
+        "file_name": "cca530fc-4052-43b2-b130-b30968d8aa44.png"
+    },
+    {
+        "task_id": "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8",
+        "question": "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
+        "Level": "1",
+        "file_name": ""
+    },
+    {
+        "task_id": "6f37996b-2ac7-44b0-8e68-6d28256631b4",
+        "question": "Given this table defining * on the set S = {a, b, c, d, e}\n\n|*|a|b|c|d|e|\n|---|---|---|---|---|---|\n|a|a|b|c|b|d|\n|b|b|c|a|e|c|\n|c|c|a|b|b|a|\n|d|b|e|b|e|d|\n|e|d|b|a|d|c|\n\nprovide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order.",
+        "Level": "1",
+        "file_name": ""
+    },
+    {
+        "task_id": "9d191bce-651d-4746-be2d-7ef8ecadb9c2",
+        "question": "Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.\n\nWhat does Teal'c say in response to the question \"Isn't that hot?\"",
+        "Level": "1",
+        "file_name": ""
+    },
+    {
+        "task_id": "cabe07ed-9eca-40ea-8ead-410ef5e83f91",
+        "question": "What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?",
+        "Level": "1",
+        "file_name": ""
+    },
+    {
+        "task_id": "3cef3a44-215e-4aed-8e3b-b1e3f08063b7",
+        "question": "I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler when it comes to categorizing things. I need to add different foods to different categories on the grocery list, but if I make a mistake, she won't buy anything inserted in the wrong category. Here's the list I have so far:\n\nmilk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts\n\nI need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list.",
+        "Level": "1",
+        "file_name": ""
+    },
+    {
+        "task_id": "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3",
+        "question": "Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.\n\nIn your response, please only list the ingredients, not any measurements. So if the recipe calls for \"a pinch of salt\" or \"two cups of ripe strawberries\" the ingredients on the list would be \"salt\" and \"ripe strawberries\".\n\nPlease format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients.",
+        "Level": "1",
+        "file_name": "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3"
+    },
+    {
+        "task_id": "305ac316-eef6-4446-960a-92d80d542f82",
+        "question": "Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.",
+        "Level": "1",
+        "file_name": ""
+    },
+    {
+        "task_id": "f918266a-b3e0-4914-865d-4faa564f1aef",
+        "question": "What is the final numeric output from the attached Python code?",
+        "Level": "1",
+        "file_name": "f918266a-b3e0-4914-865d-4faa564f1aef.py"
+    },
+    {
+        "task_id": "3f57289b-8c60-48be-bd80-01f8099ca449",
+        "question": "How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?",
+        "Level": "1",
+        "file_name": ""
+    },
+    {
+        "task_id": "1f975693-876d-457b-a649-393859e79bf3",
+        "question": "Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(\n\nCould you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.",
+        "Level": "1",
+        "file_name": "1f975693-876d-457b-a649-393859e79bf3.mp3"
+    },
+    {
+        "task_id": "840bfca7-4f7b-481a-8794-c560c340185d",
+        "question": "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?",
+        "Level": "1",
+        "file_name": ""
+    },
+    {
+        "task_id": "bda648d7-d618-4883-88f4-3466eabd860e",
+        "question": "Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually deposited? Just give me the city name without abbreviations.",
+        "Level": "1",
+        "file_name": ""
+    },
+    {
+        "task_id": "cf106601-ab4f-4af9-b045-5295fe67b37d",
+        "question": "What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.",
+        "Level": "1",
+        "file_name": ""
+    },
+    {
+        "task_id": "a0c07678-e491-4bbc-8f0b-07405144218f",
+        "question": "Who are the pitchers with the number before and after Taishō Tamai's number as of July 2023? Give them to me in the form Pitcher Before, Pitcher After, use their last names only, in Roman characters.",
+        "Level": "1",
+        "file_name": ""
+    },
+    {
+        "task_id": "7bd855d8-463d-4ed5-93ca-5fe35145f733",
+        "question": "The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.",
+        "Level": "1",
+        "file_name": "7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx"
+    },
+    {
+        "task_id": "5a0c1adf-205e-4841-a666-7c3ef95def9d",
+        "question": "What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?",
+        "Level": "1",
+        "file_name": ""
+    }
+]

requirements.txt ADDED Viewed

Binary file (4.21 kB). View file

tools.py ADDED Viewed

	@@ -0,0 +1,386 @@

+import base64
+import cv2
+import io
+import os
+import requests
+import whisper
+import wikipedia
+import yt_dlp
+from dotenv import load_dotenv
+from PIL import Image
+from smolagents import CodeAgent, DuckDuckGoSearchTool, OpenAIServerModel, Tool, VisitWebpageTool
+from youtube_transcript_api import YouTubeTranscriptApi
+# Load variables from a local .env file (if any) before reading the key.
+load_dotenv()
+# OpenAI API credentials
+OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
+# os.environ only accepts strings: assigning None raises TypeError and would
+# make this module fail at import time, so re-export the key only when set.
+if OPENAI_API_KEY is not None:
+    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
+def get_file_content(file_id: str, url: str = None):
+    """Return the raw bytes of the local file whose name starts with ``file_id``.
+
+    This simulates a download: files are read from the local ``files`` folder
+    and matched by id regardless of extension.
+
+    Args:
+        file_id: Identifier the file name must start with.
+        url: Unused by the simulated download; kept so callers can pass the
+            download URL.
+
+    Returns:
+        The file content as bytes, or None when no file name matches.
+
+    Raises:
+        FileNotFoundError: If the ``files`` folder does not exist.
+    """
+    folder_path = "files"
+    # os.listdir returns names in arbitrary order; sorting makes the choice
+    # deterministic when several files share the same id prefix.
+    for filename in sorted(os.listdir(folder_path)):
+        if filename.startswith(file_id):
+            with open(os.path.join(folder_path, filename), "rb") as f:
+                # Same bytes a response.content would carry.
+                return f.read()
+    return None
+class WikipediaSummaryTool(Tool):
+    """Tool that returns the Wikipedia summary for a topic."""
+    name = "wikipedia_summary"
+    description = "Fetches a summary of a topic from Wikipedia."
+    inputs = {
+        "query": {"type": "string", "description": "The topic to search on Wikipedia."},
+    }
+    output_type = "string"
+    def __init__(self):
+        # Queries are sent to the English-language Wikipedia.
+        wikipedia.set_lang("en")
+    def is_initialized(self) -> bool:
+        """Always report the tool as initialized."""
+        return True
+    def forward(self, query: str):
+        """Return the summary text the wikipedia package gives for ``query``."""
+        return wikipedia.summary(query)
+class WikipediaPageTool(Tool):
+    """Tool that returns the full text content of a Wikipedia page."""
+    name = "wikipedia_page"
+    description = "Fetches the complete page of a topic from Wikipedia."
+    inputs = {
+        "query": {"type": "string", "description": "The topic to search on Wikipedia."},
+    }
+    output_type = "string"
+    def __init__(self):
+        # Queries are sent to the English-language Wikipedia.
+        wikipedia.set_lang("en")
+    def is_initialized(self) -> bool:
+        """Always report the tool as initialized."""
+        return True
+    def forward(self, query: str):
+        """Look up the page for ``query`` and return its ``content``."""
+        return wikipedia.page(query).content
+class YouTubeVisionAnalyzer(Tool):
+    """Tool that answers a query about a YouTube video from sampled frames.
+
+    The video is downloaded, one frame is saved roughly every 5 seconds, the
+    frames are sent to a vision LLM in batches, and the partial answers are
+    merged into one final answer. Temporary files are removed afterwards.
+    """
+    name = "youtube_vision_analyzer"
+    description = "Analyzes visual content from YouTube videos by extracting and processing frames. It does not process audio or subtitles, and is best used for tasks involving objects, scenes, or visual patterns appearing in the video."
+    inputs = {
+        "video_url": {
+            "type": "string",
+            "description": "The URL of the YouTube video to process."
+        },
+        "user_query": {
+            "type": "string",
+            "description": "The user's query."
+        }
+    }
+    output_type = "string"
+    def __init__(self):
+        pass
+    def is_initialized(self) -> bool:
+        """Always report the tool as initialized."""
+        return True
+    @staticmethod
+    def download_youtube_video(url: str):
+        """Download ``url`` with yt-dlp and return the local file name."""
+        # Download the video using yt-dlp (saves as youtube_video.mp4)
+        ydl_opts = {
+            'format': 'mp4',
+            'outtmpl': 'youtube_video.mp4'
+        }
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            ydl.download([url])
+        return 'youtube_video.mp4'
+    @staticmethod
+    def extract_frames(video_path: str, output_dir="frames"):
+        """Save one frame per ~5 seconds of video as ``frame_NNN.jpg``.
+
+        Args:
+            video_path: Path of the video file to read.
+            output_dir: Folder the frames are written to (created if missing).
+
+        Returns:
+            ``output_dir``.
+        """
+        os.makedirs(output_dir, exist_ok=True)
+        cap = cv2.VideoCapture(video_path)
+        try:
+            fps = cap.get(cv2.CAP_PROP_FPS)
+            # A reported frame rate below 0.2 (including 0) would give an
+            # interval of 0 and a ZeroDivisionError in the modulo below.
+            frame_interval = max(1, int(fps * 5))  # 5 seconds
+            frame_count = 0
+            saved_count = 0
+            while cap.isOpened():
+                ret, frame = cap.read()
+                if not ret:
+                    break
+                if frame_count % frame_interval == 0:
+                    frame_filename = os.path.join(output_dir, f"frame_{saved_count:03d}.jpg")
+                    cv2.imwrite(frame_filename, frame)
+                    saved_count += 1
+                frame_count += 1
+        finally:
+            # Release the capture even when reading or writing a frame fails.
+            cap.release()
+        return output_dir
+    @staticmethod
+    def encode_image(image_path:str, new_size=512):
+        """Return the image as base64 JPEG text, scaled so its longer side is ``new_size`` pixels."""
+        with Image.open(image_path) as image:
+            original_width, original_height = image.size
+            # Scale by the longer side so the aspect ratio is preserved.
+            if original_width > original_height:
+                ratio = new_size / original_width
+            else:
+                ratio = new_size / original_height
+            new_width = int(original_width * ratio)
+            new_height = int(original_height * ratio)
+            resized_image = image.resize((new_width, new_height))
+            buffered = io.BytesIO()
+            resized_image.save(buffered, format='JPEG')
+            return base64.b64encode(buffered.getvalue()).decode('utf-8')
+    @staticmethod
+    def call_vision_llm(folder_path: str, user_query: str):
+        """Answer ``user_query`` from the ``.jpg`` frames found in ``folder_path``.
+
+        Frames are sent to the model in batches of 12; a final model call
+        merges the per-batch answers into one response.
+
+        Returns:
+            The content of the final model reply.
+        """
+        responses = []
+        model = OpenAIServerModel(
+            api_key=OPENAI_API_KEY,
+            model_id='gpt-4o-mini',
+            temperature=0,
+        )
+        # Sorted file names keep the frames in the order they were extracted.
+        encoded_images = [
+            YouTubeVisionAnalyzer.encode_image(os.path.join(folder_path, filename))
+            for filename in sorted(os.listdir(folder_path))
+            if filename.endswith(".jpg")
+        ]
+        batch_size = 12
+        for i in range(0, len(encoded_images), batch_size):
+            batch = encoded_images[i:i+batch_size]
+            messages = [
+                {
+                    "role": "system",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "You are an assistant analyzing image frames extracted from a video. If the user query refers to a video, remember these are frames from the video. Do not provide extra information or external inference.",
+                        }
+                    ]
+                },
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": user_query,
+                        },
+                        *[
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": f"data:image/jpeg;base64,{encoded_image}",
+                                    "detail": "low"
+                                }
+                            }
+                            for encoded_image in batch
+                        ]
+                    ]
+                }
+            ]
+            responses.append(model(messages).content)
+        # Second pass: merge the per-batch observations into a single answer.
+        messages = [
+            {
+                "role": "system",
+                "content": "You are a helpful assistant that summarizes and extracts the correct answer from multiple partial observations. Each partial response comes from analyzing a batch of video frames. Given the user's query and the list of partial responses, your task is to provide the best final answer to the user's query. Be concise in the final answer."
+            },
+            {
+                "role": "user",
+                "content": f"User's query:\n{user_query}.\n\nPartial responses:\n" + "\n".join(f"- {response}" for response in responses)
+            }
+        ]
+        final_response = model(messages).content
+        return final_response
+    @staticmethod
+    def delete_video_file(video_path: str, folder_path: str):
+        """Remove the downloaded video and every ``.jpg`` file in ``folder_path``."""
+        if os.path.exists(video_path):
+            os.remove(video_path)
+        if os.path.exists(folder_path):
+            for filename in os.listdir(folder_path):
+                if filename.endswith(".jpg"):
+                    file_path = os.path.join(folder_path, filename)
+                    os.remove(file_path)
+    def forward(self, video_url: str, user_query: str):
+        """Download the video, sample frames, query the vision LLM and clean up.
+
+        Args:
+            video_url: URL of the YouTube video.
+            user_query: Question to answer from the video frames.
+
+        Returns:
+            The final answer produced by ``call_vision_llm``.
+        """
+        frames_dir = "frames"
+        video_path = YouTubeVisionAnalyzer.download_youtube_video(video_url)
+        try:
+            YouTubeVisionAnalyzer.extract_frames(video_path, frames_dir)
+            return YouTubeVisionAnalyzer.call_vision_llm(frames_dir, user_query)
+        finally:
+            # Clean up even on failure: frames left behind would otherwise be
+            # picked up by the analysis of the next video.
+            YouTubeVisionAnalyzer.delete_video_file(video_path, frames_dir)
+class YouTubeTranscriptTool(Tool):
+    name = "youtube_transcript_tool"
+    description = "Extracts textual transcripts (captions) from YouTube videos to analyze spoken content. This tool is useful for identifying what is said in the video, such as dialogue, spoken instructions, or narration. It does not analyze visual elements like scenes or objects. Pay attention because transcriptions may be truncated."
+    inputs = {
+        "video_url": {
+            "type": "string",
+            "description": "The YouTube video URL."
+        }
+    }
+    output_type = "string"
+    def __init__(self):
+        pass
+    def is_initialized(self) -> bool:
+        return True
+    def forward(self, video_url: str):
+        # Extract the video ID from the URL
+        video_id = video_url.split("v=")[-1]
+        try:
+            # Fetch the transcript using YouTubeTranscriptApi
+            transcript = YouTubeTranscriptApi.get_transcript(video_id)
+            return transcript
+        except Exception as e:
+            return str(e)
+class AudioFileTranscriptTool(Tool):
+    name = "audio_file_transcript_tool"
+    description = "Extracts text transcripts from uploaded audio files (e.g., MP3, WAV). Use this tool to analyze spoken content from user-provided files, not from YouTube or video links. It only processes audio, not visual information."
+    inputs = {
+        "file_id": {
+            "type": "string",
+            "description": "Metadata required to download the audio."
+        },
+        "file_url": {
+            "type": "string",
+            "description": "Metadata required to download the audio."
+        },
+    }
+    output_type = "string"
+    def __init__(self):
+        # Load Whisper model
+        self.whisper_model = whisper.load_model("base", device="cpu")
+    def is_initialized(self) -> bool:
+        return True
+    def forward(self, file_id: str, file_url: str):
+        # Downloads an audio file and transcript it to text
+        #questions_files = f"{file_url}/files"
+        #response = requests.get(f"{questions_files}/{file_id}", timeout=15)
+        response = get_file_content(file_id, file_url)
+        # Save MP3 bytes to a file
+        with open("audio.mp3", "wb") as f:
+            f.write(response.content)
+        # Transcribe the audio
+        result = self.whisper_model.transcribe("audio.mp3", language="en", fp16=False)
+        # Remove file
+        os.remove("audio.mp3")
+        return result['text']
+class PythonFileReader(Tool):
+    name = "python_file_reader"
+    description = "Extracts the full text content of a Python (.py) file so that it can be analyzed by the agent."
+    inputs = {
+        "file_id": {
+            "type": "string",
+            "description": "Metadata required to download the file."
+        },
+        "file_url": {
+            "type": "string",
+            "description": "Metadata required to download the file."
+        },
+    }
+    output_type = "string"
+    def __init__(self):
+        pass
+    def is_initialized(self) -> bool:
+        return True
+    def forward(self, file_id: str, file_url: str):
+        # Downloads a python file and decode it
+        #questions_files = f"{file_url}/files"
+        #response = requests.get(f"{questions_files}/{file_id}", timeout=15)
+        response = get_file_content(file_id, file_url)
+        # Decode bytes to text
+        code_content = response.content.decode("utf-8")
+        return code_content
+class ExcelFileLoader(Tool):
+    name = "excel_file_loader"
+    description = "Downloads and stores an Excel spreadsheet (.xlsx) locally as 'sheet.xlsx' so it can be programmatically analyzed by the agent using tools like pandas. This tool does not interpret or summarize the data itself — it only ensures the file is available in the environment."
+    inputs = {
+        "file_id": {
+            "type": "string",
+            "description": "Metadata required to download the file."
+        },
+        "file_url": {
+            "type": "string",
+            "description": "Metadata required to download the file."
+        },
+    }
+    output_type = "string"
+    def __init__(self):
+        pass
+    def is_initialized(self) -> bool:
+        return True
+    def forward(self, file_id: str, file_url: str):
+        # Downloads a spreadsheet and saves it
+        #questions_files = f"{file_url}/files"
+        #response = requests.get(f"{questions_files}/{file_id}", timeout=15)
+        response = get_file_content(file_id, file_url)
+        # Save bytes to a spreadsheet file
+        with open("sheet.xlsx", "wb") as f:
+            f.write(response.content)
+        return "sheet.xlsx"

vision_llm.py ADDED Viewed

	@@ -0,0 +1,78 @@

+import io
+import base64
+import os
+import requests
+from PIL import Image
+from smolagents import tool, OpenAIServerModel
+from tools import get_file_content
+def encode_image(image_bytes: bytes, new_size=512):
+    # Resize image to upper 512 pixels and return in base64 format
+    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
+    original_width, original_height = image.size
+    if original_width > original_height:
+        ratio = new_size / original_width
+    else:
+        ratio = new_size / original_height
+    new_width = int(original_width * ratio)
+    new_height = int(original_height * ratio)
+    resized_image = image.resize((new_width, new_height))
+    buffered = io.BytesIO()
+    resized_image.save(buffered, format='JPEG')
+    return base64.b64encode(buffered.getvalue()).decode('utf-8')
+def download_image(task_id: str, api_url: str) -> None:
+    # Downloads an image file and encode it in base64 format
+    #questions_files = f"{api_url}/files"
+    #response = requests.get(f"{questions_files}/{task_id}", timeout=15)
+    response = get_file_content(task_id, api_url)
+    encoded_image = encode_image(response.content)
+    return encoded_image
+@tool
+def call_vision_llm(user_query: str, file_id: str, file_url: str) -> str:
+    """
+    Downloads the image using the file_id and file_url, then analyzes it using a vision-based LLM, following user query.
+    Args:
+        user_query: User request on image.
+        file_id: metadata required to download the image.
+        file_url: metadata required to download the image.
+    """
+    encoded_image = download_image(file_id, file_url)
+    OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
+    vision_model = OpenAIServerModel(
+        api_key=OPENAI_API_KEY,
+        model_id='gpt-4o-mini',
+        temperature=0,
+    )
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": user_query,
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/jpeg;base64,{encoded_image}",
+                        "detail": "low"
+                    }
+                }
+            ]
+        }
+    ]
+    response = vision_model(messages).content
+    return response