test agent
Signed-off-by: giulia fontanella <giulia.fontanella@secomind.com>
- app.py +66 -42
- notebooks/test.ipynb +203 -0
- requirements.txt +4 -0
- src/__init__.py +0 -0
- agent.py → src/agent.py +87 -33
- tools.py → src/tools.py +136 -138
app.py
CHANGED

@@ -1,32 +1,34 @@
import inspect
import os

import gradio as gr
import pandas as pd
import requests
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from langchain_openai import ChatOpenAI

from src.agent import SmartAgent

# --- Constants ---
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

MODEL = "gpt-4o"  # "gpt-4o", "meta-llama/Llama-3.1-8B-Instruct", ...
PROVIDER_TYPE = "openai"  # "openai" or "huggingface"


def run_and_submit_all(profile: gr.OAuthProfile | None):
    """Run the agent and submit the results.

    Fetches all questions, runs the SmartAgent on them, submits all answers,
    and displays the results.
    """
    # --- Determine HF Space Runtime URL and Repo URL ---
    space_id = os.getenv("SPACE_ID")  # Get the SPACE_ID for sending link to the code

    if profile:
        username = f"{profile.username}"
        print(f"User logged in: {username}")
    else:
        print("User not logged in.")

@@ -36,27 +38,28 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"

    # 1. Instantiate Agent
    try:
        if PROVIDER_TYPE == "huggingface":
            llm = HuggingFaceEndpoint(
                repo_id=MODEL,
                huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
            )
            chat = ChatHuggingFace(llm=llm, verbose=True)
        elif PROVIDER_TYPE == "openai":
            chat = ChatOpenAI(model=MODEL, temperature=0.2)
        else:
            print(f"Provider {PROVIDER_TYPE} not supported.")
            return f"Provider {PROVIDER_TYPE} not supported", None

        agent = SmartAgent(chat)

    except Exception as e:
        print(f"Error instantiating agent: {e}")
        return f"Error initializing agent: {e}", None

    # In the case of an app running as a Hugging Face Space,
    # this link points toward your codebase
    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
    print(agent_code)

@@ -67,16 +70,16 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
        response.raise_for_status()
        questions_data = response.json()
        if not questions_data:
            print("Fetched questions list is empty.")
            return "Fetched questions list is empty or invalid format.", None
        print(f"Fetched {len(questions_data)} questions.")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching questions: {e}")
        return f"Error fetching questions: {e}", None
    except requests.exceptions.JSONDecodeError as e:
        print(f"Error decoding JSON response from questions endpoint: {e}")
        print(f"Response text: {response.text[:500]}")
        return f"Error decoding server response for questions: {e}", None
    except Exception as e:
        print(f"An unexpected error occurred fetching questions: {e}")
        return f"An unexpected error occurred fetching questions: {e}", None

@@ -89,10 +92,10 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
        task_id = item.get("task_id")
        question_text = item.get("question")
        file_name = item.get("file_name")
        if file_name != "":
            files_url = f"{api_url}/files/{task_id}"
            file = requests.get(files_url, timeout=15)
            with open(file_name, "wb") as f:
                f.write(file.content)
            print(f"Downloaded {files_url}.")
        if not task_id or question_text is None:

@@ -100,18 +103,36 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
            continue
        try:
            submitted_answer = agent(question_text, file_name)
            answers_payload.append(
                {"task_id": task_id, "submitted_answer": submitted_answer}
            )
            results_log.append(
                {
                    "Task ID": task_id,
                    "Question": question_text,
                    "Submitted Answer": submitted_answer,
                }
            )
        except Exception as e:
            print(f"Error running agent on task {task_id}: {e}")
            results_log.append(
                {
                    "Task ID": task_id,
                    "Question": question_text,
                    "Submitted Answer": f"AGENT ERROR: {e}",
                }
            )

    if not answers_payload:
        print("Agent did not produce any answers to submit.")
        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)

    # 4. Prepare Submission
    submission_data = {
        "username": username.strip(),
        "agent_code": agent_code,
        "answers": answers_payload,
    }
    status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
    print(status_update)

@@ -180,20 +201,19 @@ with gr.Blocks() as demo:
    run_button = gr.Button("Run Evaluation & Submit All Answers")

    status_output = gr.Textbox(
        label="Run Status / Submission Result", lines=5, interactive=False
    )
    # Removed max_rows=10 from DataFrame constructor
    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)

    run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])

if __name__ == "__main__":
    print("\n" + "-" * 30 + " App Starting " + "-" * 30)
    # Check for SPACE_HOST and SPACE_ID at startup for information
    space_host_startup = os.getenv("SPACE_HOST")
    space_id_startup = os.getenv("SPACE_ID")  # Get SPACE_ID at startup

    if space_host_startup:
        print(f"✅ SPACE_HOST found: {space_host_startup}")

@@ -201,14 +221,18 @@ if __name__ == "__main__":
    else:
        print("ℹ️ SPACE_HOST environment variable not found (running locally?).")

    if space_id_startup:  # Print repo URLs if SPACE_ID is found
        print(f"✅ SPACE_ID found: {space_id_startup}")
        print(f"   Repo URL: https://huggingface.co/spaces/{space_id_startup}")
        print(
            f"   Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main"
        )
    else:
        print(
            "ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined."
        )

    print("-" * (60 + len(" App Starting ")) + "\n")

    print("Launching Gradio Interface for Agent Evaluation...")
    demo.launch(debug=True, share=False)
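For orientation, a minimal local smoke run of the pieces this file wires together, bypassing Gradio and OAuth. This is not part of the commit: the question string is invented, and it assumes OPENAI_API_KEY plus the Langfuse keys read by SmartAgent's telemetry are already set in the environment.

# Hypothetical local check; run from the repo root so `src` is importable.
from langchain_openai import ChatOpenAI

from src.agent import SmartAgent

chat = ChatOpenAI(model="gpt-4o", temperature=0.2)  # mirrors the "openai" branch
agent = SmartAgent(chat)

# An empty file_name skips the file-download branch in __call__.
print(agent("What is the capital of France?", ""))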
notebooks/test.ipynb
ADDED

@@ -0,0 +1,203 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "abf90ca5",
      "metadata": {},
      "outputs": [],
      "source": [
        "import os\n",
        "import requests\n",
        "from langchain_openai import ChatOpenAI\n",
        "from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "b4299f37",
      "metadata": {},
      "outputs": [],
      "source": [
        "import sys\n",
        "\n",
        "sys.path.append(os.path.abspath(\"../src\"))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "73b38064",
      "metadata": {},
      "outputs": [],
      "source": [
        "from agent import SmartAgent"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "0f925adb",
      "metadata": {},
      "outputs": [],
      "source": [
        "# --- Constants ---\n",
        "DEFAULT_API_URL = \"https://agents-course-unit4-scoring.hf.space\"\n",
        "HUGGINGFACEHUB_API_TOKEN = os.getenv(\"HUGGINGFACEHUB_API_TOKEN\")\n",
        "OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n",
        "\n",
        "REPO_ID = \"meta-llama/Llama-3.1-8B-Instruct\"\n",
        "PROVIDER_TYPE = \"openai\"  # \"openai\" or \"huggingface\""
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "541ebb1b",
      "metadata": {},
      "outputs": [],
      "source": [
        "TAVILY_API_KEY = os.getenv(\"TAVILY_API_KEY\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "320e99b7",
      "metadata": {},
      "outputs": [],
      "source": [
        "api_url = DEFAULT_API_URL\n",
        "questions_url = f\"{api_url}/questions\"\n",
        "submit_url = f\"{api_url}/submit\""
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "f31b88db",
      "metadata": {},
      "outputs": [],
      "source": [
        "# 1. Instantiate Agent\n",
        "try:\n",
        "    if PROVIDER_TYPE == \"huggingface\":\n",
        "        llm = HuggingFaceEndpoint(\n",
        "            repo_id=REPO_ID,\n",
        "            huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,\n",
        "        )\n",
        "        chat = ChatHuggingFace(llm=llm, verbose=True)\n",
        "    elif PROVIDER_TYPE == \"openai\":\n",
        "        chat = ChatOpenAI(model=\"gpt-4o\", temperature=0.2)\n",
        "    else:\n",
        "        print(f\"Provider {PROVIDER_TYPE} not supported.\")\n",
        "\n",
        "    agent = SmartAgent(chat)\n",
        "\n",
        "except Exception as e:\n",
        "    print(f\"Error instantiating agent: {e}\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "b4d18d12",
      "metadata": {},
      "outputs": [],
      "source": [
        "# 2. Fetch Questions\n",
        "print(f\"Fetching questions from: {questions_url}\")\n",
        "try:\n",
        "    response = requests.get(questions_url, timeout=15)\n",
        "    response.raise_for_status()\n",
        "    questions_data = response.json()\n",
        "    if not questions_data:\n",
        "        print(\"Fetched questions list is empty.\")\n",
        "    print(f\"Fetched {len(questions_data)} questions.\")\n",
        "except requests.exceptions.RequestException as e:\n",
        "    print(f\"Error fetching questions: {e}\")\n",
        "except requests.exceptions.JSONDecodeError as e:\n",
        "    print(f\"Error decoding JSON response from questions endpoint: {e}\")\n",
        "    print(f\"Response text: {response.text[:500]}\")\n",
        "except Exception as e:\n",
        "    print(f\"An unexpected error occurred fetching questions: {e}\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "9627e327",
      "metadata": {},
      "outputs": [],
      "source": [
        "# 3. Run your Agent\n",
        "results_log = []\n",
        "answers_payload = []\n",
        "\n",
        "item = questions_data[0]\n",
        "print(f\"Running agent on question: {item}\")\n",
        "\n",
        "task_id = item.get(\"task_id\")\n",
        "question_text = item.get(\"question\")\n",
        "file_name = item.get(\"file_name\")\n",
        "if file_name != \"\":\n",
        "    files_url = f\"{api_url}/files/{task_id}\"\n",
        "    file = requests.get(files_url, timeout=15)\n",
        "    with open(file_name, \"wb\") as f:\n",
        "        f.write(file.content)\n",
        "    print(f\"Downloaded {files_url}.\")\n",
        "if not task_id or question_text is None:\n",
        "    print(f\"Skipping item with missing task_id or question: {item}\")\n",
        "try:\n",
        "    submitted_answer = agent(question_text, file_name)\n",
        "    answers_payload.append({\"task_id\": task_id, \"submitted_answer\": submitted_answer})\n",
        "    results_log.append(\n",
        "        {\n",
        "            \"Task ID\": task_id,\n",
        "            \"Question\": question_text,\n",
        "            \"Submitted Answer\": submitted_answer,\n",
        "        }\n",
        "    )\n",
        "except Exception as e:\n",
        "    print(f\"Error running agent on task {task_id}: {e}\")\n",
        "    results_log.append(\n",
        "        {\n",
        "            \"Task ID\": task_id,\n",
        "            \"Question\": question_text,\n",
        "            \"Submitted Answer\": f\"AGENT ERROR: {e}\",\n",
        "        }\n",
        "    )"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "699cba0f",
      "metadata": {},
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": ".venv",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.10.12"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}
requirements.txt
CHANGED

@@ -11,3 +11,7 @@ wikipedia
 arxiv
 pymupdf
 feedparser
+ffmpeg-python
+yt_dlp
+openpyxl
+openai-whisper
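One caveat worth noting: ffmpeg-python and openai-whisper are bindings around the ffmpeg program itself, so the runtime (the Space image or the local machine) also needs an ffmpeg binary on PATH for extract_audio_from_video and transcribe_audio to work.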
src/__init__.py
ADDED

File without changes
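The empty src/__init__.py is what turns src into an importable package: it lets app.py use the absolute import from src.agent import SmartAgent, and lets src/agent.py use the relative import from .tools.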
agent.py → src/agent.py
RENAMED

@@ -1,35 +1,65 @@
import os
from typing import Annotated, TypedDict

from langchain.tools import tool
from langchain_core.messages import AnyMessage, HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI
from langfuse import Langfuse
from langfuse.langchain import CallbackHandler
from langgraph.graph import START, StateGraph
from langgraph.graph.message import add_messages
from langgraph.prebuilt import ToolNode, tools_condition

from .tools import (
    DescribeImage,
    ExtractTextFromImage,
    arxiv_search,
    download_youtube_video,
    extract_audio_from_video,
    read_excel,
    read_python,
    transcribe_audio,
    web_search,
    wiki_search,
)


class AgentState(TypedDict):
    """Class representing the state of the agent graph."""

    messages: Annotated[list[AnyMessage], add_messages]


class SmartAgent:
    def __init__(self, chat):
        """Initialize the agent, the multimodal model and the tools."""
        self.multimodal_model = ChatOpenAI(model="gpt-4o")

        extract_text_from_image = tool(
            ExtractTextFromImage(self.multimodal_model).__call_extract_text_from_image__
        )
        describe_image = tool(
            DescribeImage(self.multimodal_model).__call_describe_image__
        )

        self.tools = [
            extract_text_from_image,
            describe_image,
            transcribe_audio,
            read_excel,
            read_python,
            wiki_search,
            web_search,
            arxiv_search,
            download_youtube_video,
            extract_audio_from_video,
        ]
        self.chat_with_tools = chat.bind_tools(self.tools)
        self._initialize_graph()
        self._initialize_telemetry()

    def _initialize_graph(self):
        """Initialize and compile the agent graph."""
        builder = StateGraph(AgentState)

        # Define nodes

@@ -38,7 +68,7 @@ class BasicAgent():
        # Define edges
        builder.add_edge(START, "assistant")
        builder.add_conditional_edges("assistant", tools_condition)
        builder.add_edge("tools", "assistant")

        # Compile the graph

@@ -46,41 +76,65 @@ class BasicAgent():
        print("Agent initialized.")

    def _initialize_telemetry(self):
        """Initialize Langfuse telemetry using CallbackHandler."""
        LANGFUSE_PUBLIC_KEY = os.getenv("LANGFUSE_PUBLIC_KEY")
        LANGFUSE_SECRET_KEY = os.getenv("LANGFUSE_SECRET_KEY")
        LANGFUSE_HOST = "https://cloud.langfuse.com"

        langfuse = Langfuse(
            public_key=LANGFUSE_PUBLIC_KEY,
            secret_key=LANGFUSE_SECRET_KEY,
            host=LANGFUSE_HOST,  # or your custom host if applicable
        )

        # Create a Langchain callback handler using the initialized client
        self.langfuse_handler = CallbackHandler()
        print("Telemetry initialized.")

    def __call__(self, question: str, file_name: str) -> str:
        """Call the agent, passing the system prompt and the optional file name."""
        sys_msg = SystemMessage(
            content="""You are a general AI assistant. You will be asked a factual question.

1. Reason step by step and search for the information using available tools if needed.
2. Finish your response with this exact format:
FINAL ANSWER: [YOUR FINAL ANSWER]

IMPORTANT RULES for [YOUR FINAL ANSWER]:
- If the answer is a number, provide only the number, with no commas, units, or symbols; do not write it as a string.
- If the answer is a string, provide only the core noun phrase with no articles or abbreviations.
- If the answer is a list, return a comma-separated list applying the above rules per item.
- DO NOT include any other text before or after the final answer.
- DO NOT explain or justify the answer after it is given.
- DO NOT repeat the question.
- DO NOT include the words 'FINAL ANSWER: '.

Strictly follow these formatting rules.
"""
        )

        print(f"Agent received question: {question}.")

        if file_name is not None and file_name != "":
            print(f"Provided file: {file_name}.")
            messages = [sys_msg] + [
                HumanMessage(
                    content=f"{question}. The file you have access to is {file_name}."
                )
            ]
        else:
            messages = [sys_msg] + [HumanMessage(content=question)]

        response = self.agent.invoke(
            {"messages": messages}, config={"callbacks": [self.langfuse_handler]}
        )
        answer = response["messages"][-1].content
        print(f"Agent returning answer: {answer}")
        return answer

    def assistant(self, state: AgentState):
        """Assistant node, which calls the model initialized with tools."""
        response = self.chat_with_tools.invoke(state["messages"])
        return {
            "messages": state["messages"] + [response],
        }
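The add_node calls of _initialize_graph sit in unchanged context that the diff elides, so only the edges are visible above. As a rough, self-contained sketch of the loop those edges imply — the node names, the stand-in echo tool, and the final builder.compile() assignment are inferred here, not quoted from the commit:

from typing import Annotated, TypedDict

from langchain.tools import tool
from langchain_core.messages import AnyMessage
from langchain_openai import ChatOpenAI
from langgraph.graph import START, StateGraph
from langgraph.graph.message import add_messages
from langgraph.prebuilt import ToolNode, tools_condition


class AgentState(TypedDict):
    messages: Annotated[list[AnyMessage], add_messages]


@tool
def echo(text: str) -> str:
    """Return the input unchanged (stand-in for the real tool list)."""
    return text


tools = [echo]
chat_with_tools = ChatOpenAI(model="gpt-4o").bind_tools(tools)


def assistant(state: AgentState):
    # Mirrors SmartAgent.assistant: invoke the tool-bound model on the history.
    return {"messages": state["messages"] + [chat_with_tools.invoke(state["messages"])]}


builder = StateGraph(AgentState)
builder.add_node("assistant", assistant)  # node name assumed from the edges
builder.add_node("tools", ToolNode(tools))  # node name assumed from the edges
builder.add_edge(START, "assistant")
builder.add_conditional_edges("assistant", tools_condition)  # tools or END
builder.add_edge("tools", "assistant")
agent = builder.compile()  # SmartAgent presumably stores this as self.agent

tools_condition routes back to the tools node whenever the last assistant message contains tool calls, and to END otherwise, which is what lets __call__ read the final answer from response["messages"][-1].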
tools.py → src/tools.py
RENAMED

@@ -1,17 +1,18 @@
import base64

import ffmpeg
import pandas as pd
import whisper
import yt_dlp
from langchain.tools import tool
from langchain.tools.tavily_search import TavilySearchResults
from langchain_community.document_loaders import ArxivLoader, WikipediaLoader
from langchain_core.messages import HumanMessage


@tool
def read_excel(file_path: str) -> str:
    """Extract readable text from an Excel file (.xlsx or .xls).

    Args:
        file_path: Path to the Excel file.

@@ -23,9 +24,15 @@ def read_excel(file_path: str) -> str:
        df_dict = pd.read_excel(file_path, sheet_name=None)  # Read all sheets
        result = []
        for sheet_name, sheet_df in df_dict.items():
            sheet_text = sheet_df.to_json(orient="records", lines=False)
            result.append({f"Sheet: {sheet_name}": sheet_text})

        full_text = ""
        for sheet in result:
            for sheet_name, sheet_data in sheet.items():
                full_text += f"{sheet_name}\n{sheet_data}\n\n"

        return full_text

    except Exception as e:
        return f"Error reading Excel file: {str(e)}"

@@ -33,8 +40,7 @@ def read_excel(file_path: str) -> str:

@tool
def read_python(file_path: str) -> str:
    """Extract source code from a Python (.py) file.

    Args:
        file_path: Path to the Python file.

@@ -48,29 +54,31 @@ def read_python(file_path: str) -> str:
    except Exception as e:
        return f"Error reading Python file: {str(e)}"


class ExtractTextFromImage:
    """Class to initialize the extract_text_from_image tool."""

    def __init__(self, multimodal_model):
        """Initialize the multimodal model."""
        self.multimodal_model = multimodal_model

    def __call_extract_text_from_image__(self, img_path: str) -> str:
        """Extract text from an image file.

        Args:
            img_path: A string representing the path to an image (e.g., PNG, JPEG).

        Returns:
            A single string containing the concatenated text extracted from the image.
        """
        all_text = ""
        try:
            # Read image and encode as base64
            with open(img_path, "rb") as image_file:
                image_bytes = image_file.read()

            image_base64 = base64.b64encode(image_bytes).decode("utf-8")

            # Prepare the prompt including the base64 image data
            message = [
                HumanMessage(

@@ -91,13 +99,13 @@ class ExtractTextFromImage:
                    ]
                )
            ]

            # Call the vision-capable model
            response = self.multimodal_model.invoke(message)

            # Append extracted text
            all_text += response.content + "\n\n"

            return all_text.strip()
        except Exception as e:
            error_msg = f"Error extracting text: {str(e)}"

@@ -106,21 +114,24 @@ class ExtractTextFromImage:


class DescribeImage:
    """Class to initialize the describe_image tool."""

    def __init__(self, multimodal_model):
        """Initialize the multimodal model."""
        self.multimodal_model = multimodal_model

    def __call_describe_image__(self, img_path: str, query: str) -> str:
        """Generate a detailed description of an image.

        This function reads an image from a path, encodes it, and sends it to a
        vision-capable language model to obtain a comprehensive, natural language
        description of the image's content, including its objects, actions, and context,
        following a specific query.

        Args:
            img_path: A string representing the path to an image (e.g., PNG, JPEG).
            query: Information to extract from the image.

        Returns:
            A single string containing a detailed description of the image.
        """

@@ -128,9 +139,9 @@ class DescribeImage:
            # Read image and encode as base64
            with open(img_path, "rb") as image_file:
                image_bytes = image_file.read()

            image_base64 = base64.b64encode(image_bytes).decode("utf-8")

            # Prepare message payload
            message = [
                HumanMessage(

@@ -138,7 +149,8 @@ class DescribeImage:
                    {
                        "type": "text",
                        "text": (
                            f"Describe this image in rich detail. Include objects, people, setting, background elements, and any inferred actions or context. Avoid technical jargon. In particular, extract the following information: {query}"
                        ),
                    },
                    {
                        "type": "image_url",

@@ -151,151 +163,137 @@ class DescribeImage:
            ]
            response = self.multimodal_model.invoke(message)
            return response.content.strip()

        except Exception as e:
            error_msg = f"Error describing image: {str(e)}"
            print(error_msg)
            return ""


@tool
def transcribe_audio(audio_path: str) -> str:
    """Transcribe an MP3 file.

    Args:
        audio_path: Path to the MP3 audio file.

    Returns:
        Transcribed text as a string.
    """
    try:
        model = whisper.load_model("small")  # or "tiny", "small", "medium", "large"
        result = model.transcribe(audio_path)
        return result["text"]  # transcribe() returns a dict; the text is under "text"

    except Exception as e:
        error_msg = f"Error transcribing audio: {str(e)}"
        print(error_msg)
        return ""


@tool
def download_youtube_video(youtube_url: str, output_path: str) -> str:
    """Download a YouTube video as an MP4 file.

    Args:
        youtube_url: The YouTube video URL.
        output_path: Desired output path for the downloaded MP4 file.

    Returns:
        Path to the saved video file.
    """
    ydl_opts = {
        "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
        "outtmpl": output_path,
        "merge_output_format": "mp4",
        "quiet": True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([youtube_url])
    return output_path


@tool
def extract_audio_from_video(video_path: str, audio_output: str) -> str:
    """Extract audio from an MP4 video file and save it as MP3.

    Args:
        video_path: Path to the input MP4 video file.
        audio_output: Path for the output MP3 file.

    Returns:
        Path to the audio file.
    """
    try:
        (
            ffmpeg.input(video_path)
            .output(
                audio_output, format="mp3", acodec="libmp3lame", t=60
            )  # limit to 60 sec
            .overwrite_output()
            .run(quiet=True)
        )
        return audio_output
    except Exception as e:
        error_msg = f"Error extracting audio: {str(e)}"
        print(error_msg)
        return ""


@tool
def wiki_search(query: str) -> str:
    """Search Wikipedia for a query and return maximum 2 results.

    Args:
        query: The search query.
    """
    search_docs = WikipediaLoader(query=query, load_max_docs=2).load()
    formatted_search_docs = "\n\n---\n\n".join(
        [
            f'<Document source="{doc.metadata["source"]}" page="{doc.metadata.get("page", "")}"/>\n{doc.page_content}\n</Document>'
            for doc in search_docs
        ]
    )
    return {"wiki_results": formatted_search_docs}


@tool
def web_search(query: str) -> str:
    """Search Tavily for a query and return maximum 3 results.

    Args:
        query: The search query.
    """
    search_docs = TavilySearchResults(max_results=3).invoke(query)
    formatted_search_docs = "\n\n---\n\n".join(
        [
            f'<Document source="{doc["url"]}" title="{doc["title"]}" score="{doc.get("score", "")}">\n{doc["content"]}\n</Document>'
            for doc in search_docs
        ]
    )
    return {"web_results": formatted_search_docs}


@tool
def arxiv_search(query: str) -> str:
    """Search Arxiv for a query and return maximum 2 results.

    Args:
        query: The search query.
    """
    search_docs = ArxivLoader(query=query, load_max_docs=2).load()
    formatted_search_docs = "\n\n---\n\n".join(
        [
            (
                f'<Document title="{doc.metadata.get("Title", "")}" '
                f'published="{doc.metadata.get("Published", "")}" '
                f'authors="{doc.metadata.get("Authors", "")}">\n'
                f'Summary: {doc.metadata.get("Summary", "")}\n\n'
                f"{doc.page_content}\n"
                f"</Document>"
            )
            for doc in search_docs
        ]
    )
    return {"arxiv_results": formatted_search_docs}
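Finally, a quick hypothetical way to exercise the relocated tools from a REPL; the file paths below are invented for illustration. Functions decorated with @tool become LangChain structured tools, so direct calls go through .invoke with a dict of arguments:

from src.tools import read_excel, read_python, transcribe_audio

# Hypothetical paths; substitute files that actually exist.
print(read_excel.invoke({"file_path": "data/example.xlsx"}))
print(read_python.invoke({"file_path": "src/agent.py"}))
print(transcribe_audio.invoke({"audio_path": "data/example.mp3"}))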