pkduongsu committed
Commit 737d955
1 Parent(s): 1131e24

eval 45/100, still cannot access files for questions

Screenshot 2025-05-02 144021.png ADDED
agent.py CHANGED
@@ -1,6 +1,6 @@
 import os
 import sys
-import asyncio
+from typing import List, TypedDict, Annotated, Optional
 from dotenv import load_dotenv
 
 # Add the project root directory to the Python path
@@ -16,17 +16,17 @@ from tools.web_search import web_search # Corrected import alias if needed, or u
 from tools.analyze_csv import analyze_csv
 from tools.analyze_excel import analyze_excel
 from tools.download_file import download_file
-from tools.extract_text_from_image import extract_text_from_image
+from tools.analyze_image import analyze_image
 from tools.read_file import read_and_save_file
+from tools.analyze_audio import analyze_audio
+from tools.analyze_youtube import answer_question_about_youtube_video
 #switch to using gemini 2.0 model
 from langchain_google_genai import ChatGoogleGenerativeAI
-from langchain_core.messages import HumanMessage
-
-#use LangGraph to create the agent
+from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage
+from langgraph.graph.message import add_messages
 from langgraph.graph import START, StateGraph, MessagesState
 from langgraph.prebuilt import ToolNode, tools_condition
 
-from langchain.tools.base import BaseTool
 
 
 load_dotenv()
@@ -41,14 +41,24 @@ tools = [
     analyze_csv,
     analyze_excel,
     download_file,
-    extract_text_from_image,
+    analyze_image,
     read_and_save_file,
-]
+    analyze_audio,
+    answer_question_about_youtube_video,]
+
+with open("system_prompt.txt", "r", encoding="utf-8") as f:
+    system = f.read()
+
+system_message = SystemMessage(content=system)
+
+class AgentState(TypedDict):
+    input_file: Optional[str]  # contains the input file path if there is any
+    messages: Annotated[List[AnyMessage], add_messages]  # messages exchanged between the user and the agent
+
 
 def create_agent(): #build graph
     try:
-        llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash-preview-04-17",
-                                     convert_system_message_to_human=True)
+        llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash")
     except Exception as e:
         print(f"Error initializing LLM: {e}")
         return None
@@ -72,7 +82,6 @@ def create_agent(): #build graph
         builder.add_edge("tools", "assistant")
         react_graph = builder.compile()
 
-        print("Agent created successfully.")
         return react_graph
     except Exception as e:
         print(f"Error creating Agent {e}")
@@ -89,12 +98,20 @@ def main(): # Define an async main function
             if query.lower() == 'quit':
                 break
             if query:
-                input_msg = [HumanMessage(content=query)]
                 # Assuming agent.run is the correct async method for FunctionAgent
-                response = agent.invoke({"messages": input_msg})
-
-                for m in response['messages']:
-                    m.pretty_print()
+                # Construct the initial messages list including the system prompt
+                initial_messages = [
+                    system_message,  # Include the system prompt read earlier
+                    HumanMessage(content=query)
+                ]
+                # Invoke the agent with the messages state
+                response = agent.invoke({"messages": initial_messages})
+
+                # The final response from the graph is in the 'messages' list;
+                # the last message should be the AI's response
+                answer = response["messages"][-1].content
+                # Print only the final answer without the "Agent: " prefix
+                print(answer)
         except EOFError:
             break
         except KeyboardInterrupt:
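
Note: the new AgentState (with its input_file field) is defined above, but the unchanged builder context lines still import and, as far as this diff shows, compile the graph over MessagesState, in which case input_file never reaches the model; that would be consistent with the commit message "still cannot access files for questions". A minimal sketch of building over AgentState instead, assuming the llm, tools, and system_message defined in agent.py (the assistant node body here is illustrative, not the repo's actual one):

# Sketch: build the graph over AgentState so `input_file` reaches the model.
# Assumes `llm`, `tools`, and `system_message` from agent.py.
from typing import List, TypedDict, Annotated, Optional

from langchain_core.messages import AnyMessage, HumanMessage, SystemMessage
from langgraph.graph import START, StateGraph
from langgraph.graph.message import add_messages
from langgraph.prebuilt import ToolNode, tools_condition

class AgentState(TypedDict):
    input_file: Optional[str]
    messages: Annotated[List[AnyMessage], add_messages]

def build_graph(llm, tools, system_message: SystemMessage):
    llm_with_tools = llm.bind_tools(tools)

    def assistant(state: AgentState):
        messages = [system_message] + state["messages"]
        # Surface the attached file path so the model knows to call a file tool on it.
        if state.get("input_file"):
            messages.append(HumanMessage(
                content=f"An input file for this task is available at: {state['input_file']}"
            ))
        return {"messages": [llm_with_tools.invoke(messages)]}

    builder = StateGraph(AgentState)  # AgentState instead of MessagesState
    builder.add_node("assistant", assistant)
    builder.add_node("tools", ToolNode(tools))
    builder.add_edge(START, "assistant")
    builder.add_conditional_edges("assistant", tools_condition)
    builder.add_edge("tools", "assistant")
    return builder.compile()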
app.py CHANGED
@@ -1,6 +1,7 @@
 import os
 import gradio as gr
 import requests
+import re
 import asyncio
 import inspect
 import pandas as pd
@@ -11,10 +12,10 @@ from langchain_core.messages import SystemMessage, HumanMessage
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
 
-with open("system_prompt.txt", "r") as file:
-    system_prompt = file.read()
+with open("system_prompt.txt", "r", encoding="utf-8") as f:
+    system = f.read()
 
-system_message = SystemMessage(content=system_prompt)
+system_message = SystemMessage(content=system)
 
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """
@@ -81,11 +82,44 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
             print(f"Skipping item with missing task_id or question: {item}")
             continue
         try:
-            # Geeting the agent responses
-            input_msg = [system_message, HumanMessage(content=question_text)]
-            agent_response = agent.invoke({"messages": input_msg})
+            # --- Check for associated file ---
+            file_path = None
+            files_url = f"{api_url}/files/{task_id}"
+            try:
+                file_response = requests.get(files_url, timeout=10)
+                if file_response.status_code == 200:
+                    # Assuming the response body directly contains the filename/path
+                    file_path = file_response.text.strip().strip('"')  # Get path and remove potential quotes
+                    print(f"Task {task_id}: Found associated file '{file_path}'")
+                elif file_response.status_code == 404:
+                    print(f"Task {task_id}: No associated file found.")
+                else:
+                    # Log other non-404 errors but don't stop the process
+                    print(f"Task {task_id}: Warning - Error checking for file ({file_response.status_code}): {file_response.text[:100]}")
+            except requests.exceptions.RequestException as file_err:
+                print(f"Task {task_id}: Warning - Network error checking for file: {file_err}")
+
+            # --- Prepare agent input ---
+            agent_input = {
+                "messages": [system_message, HumanMessage(content=question_text)]
+            }
+            if file_path:
+                agent_input["input_file"] = file_path  # Add file path if found
+
+            # --- Invoke Agent ---
+            agent_response = agent.invoke(agent_input)
             answer = agent_response['messages'][-1].content
-            submitted_answer = answer[14:] # Extract string response
+
+            # --- Process Answer ---
+            match = re.search(r"FINAL ANSWER:.*", answer, flags=re.IGNORECASE)
+            answer_line = match.group(0).strip() if match else answer.strip()
+            answer_line = answer_line.replace("FINAL ANSWER:", "").strip()  # Clean up the answer
+
+            submitted_answer = answer_line  # Extract string response
+
+            # response_dict = agent.invoke({"input": question_text})
+            # # Extract the answer, handling potential missing key
+            # submitted_answer = response_dict.get("output", f"AGENT ERROR: No 'output' key in response: {response_dict}")
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
             print(f"Task ID: {task_id}, Question: {question_text}, Submitted Answer: {submitted_answer}")
             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
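
Note: a likely cause of the "cannot access files" symptom is that the code above treats the body of GET /files/{task_id} as a path string, whereas endpoints like this one commonly serve the raw file bytes. A hedged sketch of saving the payload to a local temp file and handing that path to the agent (the Content-Disposition handling is an assumption, not verified against this API):

import os
import tempfile
from typing import Optional

import requests

def fetch_task_file(api_url: str, task_id: str) -> Optional[str]:
    """Download the file attached to a task, if any, and return a local path."""
    resp = requests.get(f"{api_url}/files/{task_id}", timeout=15)
    if resp.status_code == 404:
        return None  # no file attached to this task
    resp.raise_for_status()

    # Prefer the advertised filename, if any; fall back to the task_id.
    filename = task_id
    disposition = resp.headers.get("content-disposition", "")
    if "filename=" in disposition:
        filename = disposition.split("filename=")[-1].strip('" ')

    local_path = os.path.join(tempfile.gettempdir(), filename)
    with open(local_path, "wb") as f:
        f.write(resp.content)  # raw bytes, not a path string
    return local_path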
requirements.txt CHANGED
@@ -6,8 +6,10 @@ python-dotenv
 langchain-tavily
 langchain
 langchain-community
+langgraph
 arxiv
 langchain-google-genai
 langgraph
 gradio[oauth]
-pymupdf
+pymupdf
+yt-dlp
system_prompt.txt CHANGED
@@ -1,49 +1,22 @@
-You are a helpful assistant tasked with answering questions using a set of tools that allow you to directly understand and analyze Youtube videos, images, and other content.
-
-When you receive a question:
-
-1. **Plan (Thought)**
-   - Think through the user’s goal.
-   - Identify what information you already know versus what you need to look up or compute.
-   - Decide whether one or more tools can help you achieve the user’s goal more efficiently or accurately.
-
-2. **Decide on Action**
-   - If you need to retrieve data, run calculations, or fetch external knowledge, choose the appropriate tool and format your call precisely.
-   - If you can answer directly, skip to “Final Answer.”
-
-3. **Act (Tool Invocation)**
-   - Invoke the chosen tool(s) with clearly structured inputs.
-   - E.g., `<<call:search_tool(query="latest sales figures for X")>>` or `<<call:calculator(expression="a + b")>>`.
-
-4. **Observe (Tool Output)**
-   - Read the tool’s response carefully.
-   - Extract the relevant facts or results.
-
-5. **Iterate or Conclude**
-   - If the result fully answers the user’s question, proceed to “Final Answer.”
-   - Otherwise, repeat from **Plan** using the new information.
-
-6. **Final Answer**
-   - Summarize your findings in clear, concise language.
-   - Cite or reference any tool outputs where appropriate.
-   - Offer next steps or alternative suggestions if relevant.
-
-**Remember:**
-- Always consider whether a tool can enhance accuracy or save time before answering from memory.
-- Structure your reasoning explicitly in “Thought,” “Action,” “Observation,” and “Answer” steps (the ReAct pattern).
-- Keep your language user-friendly and your explanations transparent.
-
-Now, I will ask you a question. Report your thoughts, and finish your answer with the following template:
+You are a helpful assistant tasked with answering questions using a set of tools.
+Now, I will ask you a question. Think step-by-step **silently** (do NOT reveal your reasoning), and give your answer with the exact following template:
 FINAL ANSWER: [YOUR FINAL ANSWER].
-YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
+YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma-separated list of numbers and/or strings. If you are asked for a number, don't write it with commas and don't include units such as $ or percent signs unless specified otherwise. If you are asked for a string, don't use articles or abbreviations (e.g. for cities), and write digits in plain text unless specified otherwise. If you are asked for a comma-separated list, apply the rules above to each element (number or string), and make sure there is exactly one space after each comma.
 Your answer should only start with "FINAL ANSWER: ", followed by the answer.
 
 Here are a few examples of questions and final answer:
 
 ---
 Question: What was the actual enrollment count of the clinical trial on H. pylori in acne vulgaris patients from Jan-May 2018 as listed on the NIH website?
-FINAL ANSWER: 90
+Answer: FINAL ANSWER: 90
 
 ---
 Question: In NASA's Astronomy Picture of the Day on 2006 January 21, two astronauts are visible, with one appearing much smaller than the other. As of August 2023, out of the astronauts in the NASA Astronaut Group that the smaller astronaut was a member of, which one spent the least time in space, and how many minutes did he spend in space, rounded to the nearest minute? Exclude any astronauts who did not spend any time in space. Give the last name of the astronaut, separated from the number of minutes by a semicolon.
-FINAL ANSWER: White;5876
+Answer: FINAL ANSWER: White;5876
+
+---
+Question: The attached Excel file contains the sales of menu items for
+a local fast-food chain. What were the total sales that the chain made from
+food (not including drinks)? Express your answer in USD with two decimal
+places.
+Answer: FINAL ANSWER: $89706.00
testing.py ADDED
@@ -0,0 +1,3 @@
+from tools.analyze_excel import analyze_excel
+
+from langchain_google_genai import ChatGoogleGenerativeAI
tools/analyze_audio.py ADDED
@@ -0,0 +1,69 @@
+import base64
+from langchain_core.tools import tool
+from langchain_core.messages import HumanMessage
+from langchain_google_genai import ChatGoogleGenerativeAI
+import httpx
+from dotenv import load_dotenv
+
+load_dotenv()
+
+llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash")
+
+@tool
+def analyze_audio(audio_url: str, question: str) -> str:
+    """
+    Analyze audio data from a URL using a multimodal model.
+    """
+    try:
+        # Fetch audio data
+        response = httpx.get(audio_url)
+        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
+        audio_data = base64.b64encode(response.content).decode("utf-8")
+
+        # Pass to LLM
+        message = [
+            HumanMessage(
+                content=[
+                    {
+                        "type": "text",
+                        "text": "Analyze the audio and answer the following question: " + question,
+                    },
+                    {
+                        "type": "audio",
+                        "source_type": "base64",
+                        "data": audio_data,
+                        "mime_type": "audio/mp3",  # Assuming mp3, might need adjustment based on actual content type
+                    },
+                ],
+            )
+        ]
+
+        llm_response = llm.invoke(message)
+        return llm_response.content.strip()
+
+    except httpx.UnsupportedProtocol:
+        error_msg = f"Error analyzing audio: The provided URL '{audio_url}' is missing the 'http://' or 'https://' protocol. Please provide a complete URL."
+        print(error_msg)
+        return error_msg  # Return the specific error to the agent
+    except httpx.InvalidURL as e:
+        error_msg = f"Error analyzing audio: The provided URL '{audio_url}' is invalid. Details: {str(e)}"
+        print(error_msg)
+        return error_msg  # Return the specific error to the agent
+    except httpx.RequestError as e:
+        # Catch other httpx request errors (network issues, timeouts, etc.)
+        error_msg = f"Error fetching audio from URL '{audio_url}': {str(e)}"
+        print(error_msg)
+        return error_msg  # Return the specific error to the agent
+    except Exception as e:
+        # Catch other potential errors (base64 encoding, LLM invocation, bad status codes, etc.)
+        error_msg = f"An unexpected error occurred during audio analysis: {str(e)}"
+        print(error_msg)
+        return error_msg  # Return the specific error to the agent
+
+if __name__ == "__main__":
+    # Example usage
+    audio_url = "https://www.learningcontainer.com/wp-content/uploads/2020/02/Kalimba.mp3"
+    question = "What is the main topic of this audio?"
+    result = analyze_audio.invoke({"audio_url": audio_url, "question": question})
+    print(result)
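
Note: the tool hard-codes mime_type="audio/mp3" and flags that in its own comment. A small refinement sketch, assuming the server sets a usable Content-Type header, with a fallback to guessing from the URL suffix:

import mimetypes

import httpx

def detect_audio_mime(response: httpx.Response, url: str) -> str:
    # Prefer the server-declared type, e.g. "audio/mpeg; charset=..." -> "audio/mpeg"
    content_type = response.headers.get("content-type", "").split(";")[0].strip()
    if content_type.startswith("audio/"):
        return content_type
    # Otherwise guess from the URL suffix (.mp3, .wav, ...)
    guessed, _ = mimetypes.guess_type(url)
    if guessed and guessed.startswith("audio/"):
        return guessed
    return "audio/mp3"  # last-resort default, matching the tool above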
tools/analyze_excel.py CHANGED
@@ -21,7 +21,7 @@ def analyze_excel(file_path: str, question: str) -> str:
         # Read Excel file
         df = pd.read_excel(file_path)
 
-        # Basic information about the data
+        # Basic information about the data
         total_rows = len(df)
         total_columns = len(df.columns)
         columns = list(df.columns)
@@ -38,4 +38,4 @@ def analyze_excel(file_path: str, question: str) -> str:
         return summary
 
     except Exception as e:
-        return f"Error analyzing Excel file: {str(e)}"
+        return f"Error analyzing Excel file: {str(e)}"
tools/{extract_text_from_image.py → analyze_image.py} RENAMED
@@ -1,10 +1,15 @@
 import base64
-from typing import List, TypedDict, Annotated, Optional
 from langchain_core.tools import tool
 from langchain_core.messages import HumanMessage
+from langchain_google_genai import ChatGoogleGenerativeAI
+from dotenv import load_dotenv
+
+load_dotenv()
+
+llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash")
 
 @tool
-def extract_text_from_image(img_path: str) -> str:
+def analyze_image(img_path: str, question: str) -> str:
     """
     Extract text from an image file using a multimodal model.
     """
@@ -23,8 +28,7 @@ def extract_text_from_image(img_path: str) -> str:
                     {
                         "type": "text",
                         "text": (
-                            "Extract all the text from this image. "
-                            "Return only the extracted text, no explanations."
+                            "Analyze the image and answer the following question: " + question
                         ),
                     },
                     {
@@ -38,7 +42,8 @@ def extract_text_from_image(img_path: str) -> str:
         ]
 
         # Call the vision-capable model
-        response = vision_llm.invoke(message)
+        # Call the vision-capable model with the prepared message list
+        response = llm.invoke(message)
 
         # Append extracted text
         all_text += response.content + "\n\n"
@@ -49,3 +54,11 @@ def extract_text_from_image(img_path: str) -> str:
         error_msg = f"Error extracting text: {str(e)}"
         print(error_msg)
         return ""
+
+if __name__ == "__main__":
+    # Example usage
+    img_path = r"C:\Users\pkduo\OneDrive\Máy tính\HF Agent Course Final\Final_Assignment_Template\Screenshot 2025-05-02 144021.png"
+    question = "Review the chess position provided in the image. It is white's turn. Provide the correct next move for white which guarantees a win. Please provide your response in algebraic notation."
+    # Invoke the tool using the recommended .invoke() method with a dictionary input
+    result = analyze_image.invoke({"img_path": img_path, "question": question})
+    print(result)
tools/analyze_youtube.py ADDED
@@ -0,0 +1,261 @@
+import re
+import json
+import os
+from urllib.parse import urlparse, parse_qs
+from dotenv import load_dotenv
+import yt_dlp
+
+from langchain.tools import tool
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langchain_core.prompts import PromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+
+load_dotenv()
+
+@tool
+def answer_question_about_youtube_video(url: str, question: str) -> str:
+    """
+    Answers a specific question about a YouTube video using its transcript, title, and description.
+
+    Fetches video metadata (title, description) and transcript using yt-dlp.
+    If a transcript is available, it uses an LLM to answer the provided question based on the transcript content,
+    using the title and description as additional context.
+
+    Args:
+        url (str): Full YouTube video URL (or any URL yt-dlp supports).
+        question (str): The specific question to answer about the video's content.
+
+    Returns:
+        str: The answer to the question based on the video's transcript,
+             or a message indicating the transcript was unavailable or an error occurred.
+    """
+    subtitle_filename = None
+    video_id = None
+    try:
+        # 1. Get video info (title, description) and transcript using yt-dlp
+        ydl_opts = {
+            'writesubtitles': True,
+            'subtitleslangs': ['en'],  # Prioritize English
+            'writeautomaticsub': True,  # Also try auto-generated captions
+            'subtitlesformat': 'json3',
+            'skip_download': True,
+            'quiet': True,
+            'outtmpl': '%(id)s',  # Base name for potential subtitle file
+            'noplaylist': True,
+        }
+
+        transcript_text = None  # Initialize as None to clearly indicate if found
+        title = "N/A"
+        description = "N/A"
+
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            # Extract info first to get metadata and ID;
+            # process=False helps retrieve metadata even if a later download step fails
+            info_dict = ydl.extract_info(url, download=False, process=False)
+            video_id = info_dict.get('id')
+            title = info_dict.get('title', 'Title not found')
+            description = info_dict.get('description', 'Description not found')
+
+            if not video_id:
+                # Try extracting ID from URL as a fallback if yt-dlp fails early
+                try:
+                    parsed = urlparse(url)
+                    if parsed.hostname in ("www.youtube.com", "youtube.com"):
+                        video_id = parse_qs(parsed.query).get("v", [None])[0]
+                    elif parsed.hostname == "youtu.be":
+                        video_id = parsed.path.lstrip("/")
+                    if not video_id:
+                        return f"Error: Could not extract video ID from URL: {url}"
+                except Exception:
+                    return f"Error: Could not extract video ID from URL: {url}"
+
+            # Construct expected subtitle filename (best guess, might include lang code later)
+            subtitle_filename_base = f"{video_id}"  # yt-dlp adds lang/format
+
+            # Attempt to download (this will trigger subtitle download)
+            try:
+                # Re-run extract_info with download=True to trigger download actions;
+                # this is often more reliable for getting subtitles written
+                ydl.extract_info(url, download=True)  # Let yt-dlp handle download logic
+            except yt_dlp.utils.DownloadError as de:
+                # Log subtitle-specific errors but continue if possible
+                if "subtitles" in str(de).lower():
+                    print(f"Info: Subtitle download issue for {url}: {de}")
+                else:
+                    # If it's not a subtitle error, it might be more critical
+                    print(f"Warning: Download error for {url}: {de}")
+                    # Decide if you want to return here or proceed without transcript
+
+        # Find the actual downloaded subtitle file (json3 format, English preferred)
+        found_subtitle_file = None
+        transcript_status = "not_found"  # Possible values: not_found, found_but_empty, found_but_error, processed
+
+        # List potential subtitle files matching the pattern
+        potential_files = [f for f in os.listdir('.') if f.startswith(video_id) and f.endswith('.json3')]
+
+        if potential_files:
+            # Prioritize English if available
+            english_file = f"{video_id}.en.json3"
+            if english_file in potential_files:
+                found_subtitle_file = english_file
+            else:
+                # Otherwise, take the first one found (yt-dlp usually names it based on lang)
+                found_subtitle_file = potential_files[0]
+
+            subtitle_filename = found_subtitle_file  # Store the actual found filename for cleanup
+            print(f"Info: Found subtitle file: {found_subtitle_file}")
+
+            try:
+                with open(found_subtitle_file, 'r', encoding='utf-8') as f:
+                    subtitle_data = json.load(f)
+                # Extract text from json3 format
+                segments = []
+                for event in subtitle_data.get('events', []):
+                    if event and 'segs' in event:
+                        for seg in event['segs']:
+                            if seg and 'utf8' in seg:
+                                segments.append(seg['utf8'].strip())
+                processed_text = " ".join(segments)
+
+                if processed_text:
+                    transcript_text = processed_text  # Assign only if text was extracted
+                    transcript_status = "processed"
+                else:
+                    # File exists but no text extracted
+                    print(f"Warning: Transcript file {found_subtitle_file} found but contained no processable text.")
+                    transcript_status = "found_but_empty"
+                    # Keep transcript_text as None
+
+            except json.JSONDecodeError as jde:
+                print(f"Warning: Could not parse JSON in subtitle file {found_subtitle_file}: {jde}")
+                transcript_status = "found_but_error"
+                # Keep transcript_text as None
+            except Exception as e:
+                print(f"Warning: Could not read/process subtitle file {found_subtitle_file}: {e}")
+                transcript_status = "found_but_error"
+                # Keep transcript_text as None
+        # else: transcript_text remains None, transcript_status remains "not_found"
+
+        # 2. Check if transcript is available before proceeding to LLM
+        if transcript_text is None:
+            if transcript_status == "not_found":
+                return f"Transcript not found for video {video_id}. Cannot answer question."
+            elif transcript_status == "found_but_empty":
+                return f"Transcript file found ({subtitle_filename}) but contained no text. Cannot answer question."
+            elif transcript_status == "found_but_error":
+                return f"Transcript file found ({subtitle_filename}) but could not be processed. Cannot answer question."
+            else:  # Should not happen if transcript_text is None, but as a fallback
+                return "Transcript unavailable for an unknown reason. Cannot answer question."
+
+        # 3. Prepare prompt for LLM Q&A
+        qa_prompt_template = """
+You are an assistant designed to answer questions about a YouTube video based *only* on its provided transcript, title, and description.
+
+Video Title: {title}
+Video Description: {description}
+
+Video Transcript:
+---
+{transcript}
+---
+
+Based *only* on the information provided above (primarily the transcript), answer the following question:
+Question: {question}
+
+If the answer cannot be found in the transcript or the provided context, state that clearly (e.g., "The transcript does not contain information about..."). Do not make assumptions or use external knowledge. Provide a concise answer.
+
+Answer:
+"""
+
+        prompt = PromptTemplate(
+            template=qa_prompt_template,
+            input_variables=["title", "description", "transcript", "question"]
+        )
+
+        # 4. Query LLM
+        llm = ChatGoogleGenerativeAI(
+            model="gemini-1.5-flash",  # Or another suitable model like gemini-pro
+            temperature=0.0,  # Keep temperature low for factual Q&A based on context
+        )
+
+        # Create a simple chain: prompt -> llm -> output parser
+        chain = prompt | llm | StrOutputParser()
+
+        # Run the chain with the extracted info
+        answer = chain.invoke({
+            "title": title,
+            "description": description if description else "Not Available",
+            "transcript": transcript_text,  # Pass the extracted transcript
+            "question": question
+        })
+
+        return answer
+
+    except yt_dlp.utils.DownloadError as e:
+        # More specific error for user
+        error_message = f"Error during video data/subtitle download for {url}: {e}. "
+        if "video unavailable" in str(e).lower():
+            error_message += "The video might be private, deleted, or unavailable in your region."
+        elif "subtitles" in str(e).lower():
+            error_message += "Could not fetch subtitles. They might not exist for this video in English."
+        else:
+            error_message += "There was a problem accessing the video data."
+        return error_message
+    except Exception as e:
+        return f"An unexpected error occurred while processing {url}: {e}"
+    finally:
+        # Clean up the downloaded subtitle file if it exists and was identified
+        if subtitle_filename and os.path.exists(subtitle_filename):
+            try:
+                os.remove(subtitle_filename)
+                print(f"Cleaned up subtitle file: {subtitle_filename}")
+            except Exception as e:
+                print(f"Warning: Could not remove subtitle file {subtitle_filename}: {e}")
+        # Attempt cleanup based on video_id if filename wasn't confirmed but ID exists
+        elif video_id:
+            # Check common possible names based on yt-dlp patterns
+            possible_cleanup_files = [f"{video_id}.en.json3", f"{video_id}.json3"]
+            for fname in possible_cleanup_files:
+                if os.path.exists(fname):
+                    try:
+                        os.remove(fname)
+                        print(f"Cleaned up potential subtitle file: {fname}")
+                    except Exception as e:
+                        print(f"Warning: Could not remove potential subtitle file {fname}: {e}")
+
+
+if __name__ == "__main__":
+    # Test case 1: Video with likely available English subtitles
+    test_url_1 = "https://www.youtube.com/watch?v=JGwWNGJdvx8"  # Google I/O Keynote
+    test_question_1 = "What models were mentioned in the Gemini family according to the transcript?"
+
+    # Test case 2: Video likely without subtitles or with non-English ones
+    test_url_2 = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"  # Rick Astley
+    test_question_2 = "Does the transcript mention the singer giving someone up?"
+
+    # Test case 3: Invalid URL (example)
+    # test_url_3 = "https://www.youtube.com/watch?v=invalididxyz"
+    # test_question_3 = "What is this video about?"
+
+    print(f"--- Test 1: Answering Question for: {test_url_1} ---")
+    print(f"Question: {test_question_1}")
+    # Invoke the tool using the .invoke() method with a dictionary input
+    answer1 = answer_question_about_youtube_video.invoke({"url": test_url_1, "question": test_question_1})
+    print(f"\nAnswer 1:\n{answer1}")
+    print("--- End of Test 1 ---")
+
+    print(f"\n--- Test 2: Answering Question for: {test_url_2} ---")
+    print(f"Question: {test_question_2}")
+    # Invoke the tool using the .invoke() method with a dictionary input
+    answer2 = answer_question_about_youtube_video.invoke({"url": test_url_2, "question": test_question_2})
+    print(f"\nAnswer 2:\n{answer2}")
+    print("--- End of Test 2 ---")
+
+    # print(f"\n--- Test 3: Answering Question for: {test_url_3} ---")
+    # print(f"Question: {test_question_3}")
+    # answer3 = answer_question_about_youtube_video(test_url_3, test_question_3)
+    # print(f"\nAnswer 3:\n{answer3}")
+    # print("--- End of Test 3 ---")
tools/download_file.py CHANGED
@@ -29,6 +29,6 @@ def download_file(url: str) -> str:
             for chunk in response.iter_content(chunk_size=8192):
                 f.write(chunk)
 
-        return temp_file_path
+        return f"File downloaded and saved successfully to {temp_file_path}. Read this file to process its content."
     except Exception as e:
         return f"An error occurred while downloading the file: {e}"
tools/read_file.py CHANGED
@@ -23,4 +23,4 @@ def read_and_save_file(file_path: str) -> str:
         temp_file.write(content)
         temp_file.close()
 
-        return temp_file.name
+        return f"File read and saved successfully to {temp_file.name}. Read this file to process its content."