Final_Assignment_Template

Sleeping

App Files Files Community

dalybuilds committed on Jul 19, 2025

Commit

3a1e7f5

verified ·

1 Parent(s): 991459e

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -53

app.py CHANGED Viewed

@@ -4,12 +4,14 @@ import requests
 import pandas as pd
 from io import BytesIO
 import re
-# --- Video & Audio Tool Imports ---
 from pytube import YouTube
-import moviepy.editor as mp
-# --- LangChain & Dependency Imports ---
 from groq import Groq
 from langchain_groq import ChatGroq
 from langchain.agents import AgentExecutor, create_tool_calling_agent
@@ -17,18 +19,13 @@ from langchain_tavily import TavilySearchResults
 from langchain_core.prompts import ChatPromptTemplate
 from langchain.tools import Tool
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 TEMP_DIR = "/tmp"
 # --- Tool Definition: Audio File Transcription ---
 def transcribe_audio_file(task_id: str) -> str:
-    """
-    Downloads an audio file (.mp3) for a given task_id, transcribes it, and returns the text.
-    Use this tool ONLY when a question explicitly mentions an audio file, .mp3, recording, or voice memo.
-    """
     print(f"Tool 'transcribe_audio_file' called with task_id: {task_id}")
     try:
         file_url = f"{DEFAULT_API_URL}/files/{task_id}"
@@ -36,31 +33,25 @@ def transcribe_audio_file(task_id: str) -> str:
         audio_response.raise_for_status()
         audio_bytes = BytesIO(audio_response.content)
         audio_bytes.name = f"{task_id}.mp3"
         client = Groq(api_key=os.getenv("GROQ_API_KEY"))
         transcription = client.audio.transcriptions.create(file=audio_bytes, model="whisper-large-v3", response_format="text")
         return str(transcription)
     except Exception as e:
         return f"Error during audio file transcription: {e}"
-# --- Tool Definition: Video Transcription ---
 def transcribe_youtube_video(video_url: str) -> str:
-    """
-    Downloads a YouTube video from a URL, extracts its audio, and transcribes it to text.
-    Use this tool ONLY when a question provides a youtube.com URL.
-    """
-    print(f"Tool 'transcribe_youtube_video' called with URL: {video_url}")
     video_path, audio_path = None, None
     try:
         os.makedirs(TEMP_DIR, exist_ok=True)
         yt = YouTube(video_url)
-        stream = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first()
         video_path = stream.download(output_path=TEMP_DIR)
-        video_clip = mp.VideoFileClip(video_path)
-        audio_path = os.path.join(TEMP_DIR, "temp_audio.mp3")
-        video_clip.audio.write_audiofile(audio_path, codec='mp3', logger=None)
         client = Groq(api_key=os.getenv("GROQ_API_KEY"))
         with open(audio_path, "rb") as audio_file:
             transcription = client.audio.transcriptions.create(file=audio_file, model="whisper-large-v3", response_format="text")
@@ -71,43 +62,61 @@ def transcribe_youtube_video(video_url: str) -> str:
         if video_path and os.path.exists(video_path): os.remove(video_path)
         if audio_path and os.path.exists(audio_path): os.remove(audio_path)
 # --- Agent Definition ---
 class LangChainAgent:
-    def __init__(self, groq_api_key: str, tavily_api_key: str):
         self.llm = ChatGroq(model_name="llama3-70b-8192", groq_api_key=groq_api_key, temperature=0.0)
         self.tools = [
-            TavilySearchResults(
-                name="web_search",
-                max_results=3,
-                tavily_api_key=tavily_api_key,
-                description="A search engine for finding up-to-date information, facts, and news on the internet."
-            ),
-            Tool(
-                name="audio_file_transcriber",
-                func=transcribe_audio_file,
-                description="Use this ONLY for questions mentioning an audio file (.mp3, recording). Input MUST be the task_id.",
-            ),
-            Tool(
-                name="youtube_video_transcriber",
-                func=transcribe_youtube_video,
-                description="Use this ONLY for questions providing a youtube.com URL. Input MUST be the URL.",
-            ),
         ]
         prompt = ChatPromptTemplate.from_messages([
             ("system", (
                 "You are a powerful problem-solving agent. Your goal is to answer the user's question accurately. "
-                "You have access to a web search tool, an audio file transcriber, and a YouTube video transcriber.\n\n"
                 "**REASONING PROCESS:**\n"
-                "1.  **Analyze the question:** Is it a general knowledge question, or does it mention a file/URL?\n"
-                "2.  **Select ONE tool:**\n"
-                "    - If the question requires current events, facts, or general knowledge, use `web_search`.\n"
-                "    - If the question *explicitly* mentions an audio file, .mp3, or voice memo, use `audio_file_transcriber` with the provided `task_id`.\n"
-                "    - If the question *explicitly* provides a `youtube.com` URL, use `youtube_video_transcriber` with that URL.\n"
-                "    - If no tool is needed (e.g., math, logic puzzles), answer directly.\n"
-                "3.  **Execute and Answer:** After using a tool, analyze the result and provide ONLY THE FINAL ANSWER. Do not explain your actions or apologize for errors."
             )),
             ("human", "Question: {input}\nTask ID: {task_id}"),
             ("placeholder", "{agent_scratchpad}"),
@@ -121,7 +130,6 @@ class LangChainAgent:
         input_for_agent = {"input": question, "task_id": task_id}
         if urls and "youtube.com" in urls[0]:
             input_for_agent['video_url'] = urls[0]
         try:
             response = self.agent_executor.invoke(input_for_agent)
             return response.get("output", "Agent failed to produce an answer.")
@@ -136,8 +144,9 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
     try:
         groq_api_key = os.getenv("GROQ_API_KEY")
         tavily_api_key = os.getenv("TAVILY_API_KEY")
-        if not all([groq_api_key, tavily_api_key]): raise ValueError("GROQ or TAVILY API key is missing.")
-        agent = LangChainAgent(groq_api_key=groq_api_key, tavily_api_key=tavily_api_key)
     except Exception as e: return f"Error initializing agent: {e}", None
     questions_url = f"{DEFAULT_API_URL}/questions"
@@ -171,8 +180,8 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
 # --- Gradio Interface ---
 with gr.Blocks() as demo:
-    gr.Markdown("# Ultimate Agent Runner (Search + Audio + Video)")
-    gr.Markdown("This agent can search, transcribe audio files, and transcribe YouTube videos.")
     gr.LoginButton()
     run_button = gr.Button("Run Evaluation & Submit All Answers")
     status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
@@ -181,7 +190,7 @@ with gr.Blocks() as demo:
 if __name__ == "__main__":
     print("\n" + "-"*30 + " App Starting " + "-"*30)
-    for key in ["GROQ_API_KEY", "TAVILY_API_KEY"]:
         print(f"✅ {key} secret is set." if os.getenv(key) else f"⚠️ WARNING: {key} secret is not set.")
     print("-"*(60 + len(" App Starting ")) + "\n")
     demo.launch(debug=True, share=False)

 import pandas as pd
 from io import BytesIO
 import re
+import subprocess
+import base64
+# --- Tool-specific Imports ---
 from pytube import YouTube
+from langchain_huggingface import HuggingFaceInferenceAPI
+# --- LangChain & Groq Imports ---
 from groq import Groq
 from langchain_groq import ChatGroq
 from langchain.agents import AgentExecutor, create_tool_calling_agent
 from langchain_core.prompts import ChatPromptTemplate
 from langchain.tools import Tool
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 TEMP_DIR = "/tmp"
 # --- Tool Definition: Audio File Transcription ---
 def transcribe_audio_file(task_id: str) -> str:
+    # (This function is complete and correct from the previous version)
     print(f"Tool 'transcribe_audio_file' called with task_id: {task_id}")
     try:
         file_url = f"{DEFAULT_API_URL}/files/{task_id}"
         audio_response.raise_for_status()
         audio_bytes = BytesIO(audio_response.content)
         audio_bytes.name = f"{task_id}.mp3"
         client = Groq(api_key=os.getenv("GROQ_API_KEY"))
         transcription = client.audio.transcriptions.create(file=audio_bytes, model="whisper-large-v3", response_format="text")
         return str(transcription)
     except Exception as e:
         return f"Error during audio file transcription: {e}"
+# --- Tool Definition: Video Transcription via FFmpeg ---
 def transcribe_youtube_video(video_url: str) -> str:
+    # (This function is complete and correct from the previous version)
+    print(f"Tool 'transcribe_youtube_video' (ffmpeg) called with URL: {video_url}")
     video_path, audio_path = None, None
     try:
         os.makedirs(TEMP_DIR, exist_ok=True)
         yt = YouTube(video_url)
+        stream = yt.streams.filter(only_audio=True).first()
         video_path = stream.download(output_path=TEMP_DIR)
+        audio_path = os.path.join(TEMP_DIR, "output.mp3")
+        command = ["ffmpeg", "-i", video_path, "-y", "-q:a", "0", "-map", "a", audio_path]
+        subprocess.run(command, check=True, capture_output=True, text=True)
         client = Groq(api_key=os.getenv("GROQ_API_KEY"))
         with open(audio_path, "rb") as audio_file:
             transcription = client.audio.transcriptions.create(file=audio_file, model="whisper-large-v3", response_format="text")
         if video_path and os.path.exists(video_path): os.remove(video_path)
         if audio_path and os.path.exists(audio_path): os.remove(audio_path)
+# --- NEW TOOL Definition: Image Analysis ---
+def analyze_image_from_task_id(task_id: str) -> str:
+    """
+    Downloads an image file for a given task_id and analyzes it using a Vision-Language Model.
+    Use this tool ONLY when a question explicitly mentions an image.
+    """
+    print(f"Tool 'analyze_image_from_task_id' called with task_id: {task_id}")
+    try:
+        file_url = f"{DEFAULT_API_URL}/files/{task_id}"
+        print(f"Downloading image from: {file_url}")
+        response = requests.get(file_url)
+        response.raise_for_status()
+        # Initialize the VLM client
+        vlm_client = HuggingFaceInferenceAPI(
+            model_id="llava-hf/llava-1.5-7b-hf",
+            token=os.getenv("HF_TOKEN")
+        )
+        print("Analyzing image with Llava...")
+        # The prompt for the VLM needs to be specific.
+        # We can just ask it to describe the image in detail.
+        text_prompt = "Describe the image in detail."
+        result = vlm_client.image_to_text(image=response.content, prompt=text_prompt)
+        print(f"Image analysis successful. Result: {result}")
+        return result
+    except Exception as e:
+        return f"Error during image analysis: {e}"
 # --- Agent Definition ---
 class LangChainAgent:
+    def __init__(self, groq_api_key: str, tavily_api_key: str, hf_token: str):
         self.llm = ChatGroq(model_name="llama3-70b-8192", groq_api_key=groq_api_key, temperature=0.0)
         self.tools = [
+            TavilySearchResults(name="web_search", max_results=3, tavily_api_key=tavily_api_key, description="A search engine for finding up-to-date information on the internet."),
+            Tool(name="audio_file_transcriber", func=transcribe_audio_file, description="Use this for questions mentioning an audio file (.mp3, recording). Input MUST be the task_id."),
+            Tool(name="youtube_video_transcriber", func=transcribe_youtube_video, description="Use this for questions with a youtube.com URL. Input MUST be the URL."),
+            Tool(name="image_analyzer", func=analyze_image_from_task_id, description="Use this for questions mentioning an image. Input MUST be the task_id."),
         ]
         prompt = ChatPromptTemplate.from_messages([
             ("system", (
                 "You are a powerful problem-solving agent. Your goal is to answer the user's question accurately. "
+                "You have access to a web search tool, an audio file transcriber, a YouTube video transcriber, and an image analyzer.\n\n"
                 "**REASONING PROCESS:**\n"
+                "1.  **Analyze the question:** Determine if a tool is needed. Is it a general knowledge question, or does it mention a specific file type (audio, video, image) or URL?\n"
+                "2.  **Select ONE tool based on the question:**\n"
+                "    - For general knowledge, facts, or current events: use `web_search`.\n"
+                "    - For an audio file, .mp3, or voice memo: use `audio_file_transcriber` with the `task_id`.\n"
+                "    - For a youtube.com URL: use `youtube_video_transcriber` with the URL.\n"
+                "    - For an image: use `image_analyzer` with the `task_id`.\n"
+                "    - For math or simple logic: answer directly.\n"
+                "3.  **Execute and Answer:** After using a tool, analyze the result and provide ONLY THE FINAL ANSWER."
             )),
             ("human", "Question: {input}\nTask ID: {task_id}"),
             ("placeholder", "{agent_scratchpad}"),
         input_for_agent = {"input": question, "task_id": task_id}
         if urls and "youtube.com" in urls[0]:
             input_for_agent['video_url'] = urls[0]
         try:
             response = self.agent_executor.invoke(input_for_agent)
             return response.get("output", "Agent failed to produce an answer.")
     try:
         groq_api_key = os.getenv("GROQ_API_KEY")
         tavily_api_key = os.getenv("TAVILY_API_KEY")
+        hf_token = os.getenv("HF_TOKEN")
+        if not all([groq_api_key, tavily_api_key, hf_token]): raise ValueError("An API key (GROQ, TAVILY, or HF) is missing.")
+        agent = LangChainAgent(groq_api_key=groq_api_key, tavily_api_key=tavily_api_key, hf_token=hf_token)
     except Exception as e: return f"Error initializing agent: {e}", None
     questions_url = f"{DEFAULT_API_URL}/questions"
 # --- Gradio Interface ---
 with gr.Blocks() as demo:
+    gr.Markdown("# Ultimate Agent Runner (Search, Audio, Video, Vision)")
+    gr.Markdown("This agent can search, transcribe audio files, transcribe YouTube videos, and analyze images.")
     gr.LoginButton()
     run_button = gr.Button("Run Evaluation & Submit All Answers")
     status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
 if __name__ == "__main__":
     print("\n" + "-"*30 + " App Starting " + "-"*30)
+    for key in ["GROQ_API_KEY", "TAVILY_API_KEY", "HF_TOKEN"]:
         print(f"✅ {key} secret is set." if os.getenv(key) else f"⚠️ WARNING: {key} secret is not set.")
     print("-"*(60 + len(" App Starting ")) + "\n")
     demo.launch(debug=True, share=False)