pkduongsu committed
Commit 737d955
1 Parent(s): 1131e24

eval 45/100, still cannot access files for questions

Screenshot 2025-05-02 144021.png ADDED
agent.py CHANGED
@@ -1,6 +1,6 @@
 import os
 import sys
-import asyncio
+from typing import List, TypedDict, Annotated, Optional
 from dotenv import load_dotenv
 
 # Add the project root directory to the Python path
@@ -16,17 +16,17 @@ from tools.web_search import web_search # Corrected import alias if needed, or u
 from tools.analyze_csv import analyze_csv
 from tools.analyze_excel import analyze_excel
 from tools.download_file import download_file
-from tools.extract_text_from_image import extract_text_from_image
+from tools.analyze_image import analyze_image
 from tools.read_file import read_and_save_file
+from tools.analyze_audio import analyze_audio
+from tools.analyze_youtube import answer_question_about_youtube_video
 #switch to using gemini 2.0 model
 from langchain_google_genai import ChatGoogleGenerativeAI
-from langchain_core.messages import HumanMessage
-
-#use LangGraph to create the agent
+from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage
+from langgraph.graph.message import add_messages
 from langgraph.graph import START, StateGraph, MessagesState
 from langgraph.prebuilt import ToolNode, tools_condition
 
-from langchain.tools.base import BaseTool
 
 
 load_dotenv()
@@ -41,14 +41,24 @@ tools = [
     analyze_csv,
     analyze_excel,
     download_file,
-    extract_text_from_image,
+    analyze_image,
     read_and_save_file,
-]
+    analyze_audio,
+    answer_question_about_youtube_video,]
+
+with open("system_prompt.txt", "r", encoding="utf-8") as f:
+    system = f.read()
+
+system_message = SystemMessage(content=system)
+
+class AgentState(TypedDict):
+    input_file: Optional[str]  # contains the input file path if there is any
+    messages: Annotated[List[AnyMessage], add_messages]  # messages exchanged between the user and the agent
+
 
 def create_agent(): #build graph
     try:
-        llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash-preview-04-17",
-                                     convert_system_message_to_human=True)
+        llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash")
     except Exception as e:
         print(f"Error initializing LLM: {e}")
         return None
@@ -72,7 +82,6 @@ def create_agent(): #build graph
         builder.add_edge("tools", "assistant")
         react_graph = builder.compile()
 
-        print("Agent created successfully.")
         return react_graph
     except Exception as e:
         print(f"Error creating Agent {e}")
@@ -89,12 +98,20 @@ def main(): # Define an async main function
             if query.lower() == 'quit':
                 break
             if query:
-                input_msg = [HumanMessage(content=query)]
                 # Assuming agent.run is the correct async method for FunctionAgent
-                response = agent.invoke({"messages": input_msg})
-
-                for m in response['messages']:
-                    m.pretty_print()
+                # Construct the initial messages list including the system prompt
+                initial_messages = [
+                    system_message,  # Include the system prompt read earlier
+                    HumanMessage(content=query)
+                ]
+                # Invoke the agent with the messages state
+                response = agent.invoke({"messages": initial_messages})
+
+                # The final response from the graph is in the 'messages' list;
+                # the last message should be the AI's response
+                answer = response["messages"][-1].content
+                # Print only the final answer without the "Agent: " prefix
+                print(answer)
         except EOFError:
             break
         except KeyboardInterrupt:
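
Note: the new AgentState (with its input_file field) is defined above, but the unchanged builder context lines still import and, as far as this diff shows, compile the graph over MessagesState, in which case input_file never reaches the model; that would be consistent with the commit message "still cannot access files for questions". A minimal sketch of building over AgentState instead, assuming the llm, tools, and system_message defined in agent.py (the assistant node body here is illustrative, not the repo's actual one):

# Sketch: build the graph over AgentState so `input_file` reaches the model.
# Assumes `llm`, `tools`, and `system_message` from agent.py.
from typing import List, TypedDict, Annotated, Optional

from langchain_core.messages import AnyMessage, HumanMessage, SystemMessage
from langgraph.graph import START, StateGraph
from langgraph.graph.message import add_messages
from langgraph.prebuilt import ToolNode, tools_condition

class AgentState(TypedDict):
    input_file: Optional[str]
    messages: Annotated[List[AnyMessage], add_messages]

def build_graph(llm, tools, system_message: SystemMessage):
    llm_with_tools = llm.bind_tools(tools)

    def assistant(state: AgentState):
        messages = [system_message] + state["messages"]
        # Surface the attached file path so the model knows to call a file tool on it.
        if state.get("input_file"):
            messages.append(HumanMessage(
                content=f"An input file for this task is available at: {state['input_file']}"
            ))
        return {"messages": [llm_with_tools.invoke(messages)]}

    builder = StateGraph(AgentState)  # AgentState instead of MessagesState
    builder.add_node("assistant", assistant)
    builder.add_node("tools", ToolNode(tools))
    builder.add_edge(START, "assistant")
    builder.add_conditional_edges("assistant", tools_condition)
    builder.add_edge("tools", "assistant")
    return builder.compile()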
app.py CHANGED
@@ -1,6 +1,7 @@
 import os
 import gradio as gr
 import requests
+import re
 import asyncio
 import inspect
 import pandas as pd
@@ -11,10 +12,10 @@ from langchain_core.messages import SystemMessage, HumanMessage
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
 
-with open("system_prompt.txt", "r") as file:
-    system_prompt = file.read()
+with open("system_prompt.txt", "r", encoding="utf-8") as f:
+    system = f.read()
 
-system_message = SystemMessage(content=system_prompt)
+system_message = SystemMessage(content=system)
 
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """
@@ -81,11 +82,44 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
             print(f"Skipping item with missing task_id or question: {item}")
             continue
         try:
-            # Geeting the agent responses
-            input_msg = [system_message, HumanMessage(content=question_text)]
-            agent_response = agent.invoke({"messages": input_msg})
+            # --- Check for associated file ---
+            file_path = None
+            files_url = f"{api_url}/files/{task_id}"
+            try:
+                file_response = requests.get(files_url, timeout=10)
+                if file_response.status_code == 200:
+                    # Assuming the response body directly contains the filename/path
+                    file_path = file_response.text.strip().strip('"')  # Get path and remove potential quotes
+                    print(f"Task {task_id}: Found associated file '{file_path}'")
+                elif file_response.status_code == 404:
+                    print(f"Task {task_id}: No associated file found.")
+                else:
+                    # Log other non-404 errors but don't stop the process
+                    print(f"Task {task_id}: Warning - Error checking for file ({file_response.status_code}): {file_response.text[:100]}")
+            except requests.exceptions.RequestException as file_err:
+                print(f"Task {task_id}: Warning - Network error checking for file: {file_err}")
+
+            # --- Prepare agent input ---
+            agent_input = {
+                "messages": [system_message, HumanMessage(content=question_text)]
+            }
+            if file_path:
+                agent_input["input_file"] = file_path  # Add file path if found
+
+            # --- Invoke Agent ---
+            agent_response = agent.invoke(agent_input)
             answer = agent_response['messages'][-1].content
-            submitted_answer = answer[14:] # Extract string response
+
+            # --- Process Answer ---
+            match = re.search(r"FINAL ANSWER:.*", answer, flags=re.IGNORECASE)
+            answer_line = match.group(0).strip() if match else answer.strip()
+            answer_line = answer_line.replace("FINAL ANSWER:", "").strip()  # Clean up the answer
+
+            submitted_answer = answer_line  # Extract string response
+
+            # response_dict = agent.invoke({"input": question_text})
+            # # Extract the answer, handling potential missing key
+            # submitted_answer = response_dict.get("output", f"AGENT ERROR: No 'output' key in response: {response_dict}")
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
             print(f"Task ID: {task_id}, Question: {question_text}, Submitted Answer: {submitted_answer}")
             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
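
Note: a likely cause of the "cannot access files" symptom is that the code above treats the body of GET /files/{task_id} as a path string, whereas endpoints like this one commonly serve the raw file bytes. A hedged sketch of saving the payload to a local temp file and handing that path to the agent (the Content-Disposition handling is an assumption, not verified against this API):

import os
import tempfile
from typing import Optional

import requests

def fetch_task_file(api_url: str, task_id: str) -> Optional[str]:
    """Download the file attached to a task, if any, and return a local path."""
    resp = requests.get(f"{api_url}/files/{task_id}", timeout=15)
    if resp.status_code == 404:
        return None  # no file attached to this task
    resp.raise_for_status()

    # Prefer the advertised filename, if any; fall back to the task_id.
    filename = task_id
    disposition = resp.headers.get("content-disposition", "")
    if "filename=" in disposition:
        filename = disposition.split("filename=")[-1].strip('" ')

    local_path = os.path.join(tempfile.gettempdir(), filename)
    with open(local_path, "wb") as f:
        f.write(resp.content)  # raw bytes, not a path string
    return local_path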
requirements.txt CHANGED
@@ -6,8 +6,10 @@ python-dotenv
 langchain-tavily
 langchain
 langchain-community
+langgraph
 arxiv
 langchain-google-genai
 langgraph
 gradio[oauth]
-pymupdf
+pymupdf
+yt-dlp
system_prompt.txt CHANGED
@@ -1,49 +1,22 @@
-You are a helpful assistant tasked with answering questions using a set of tools that allow you to directly understand and analyze Youtube videos, images, and other content.
-
-When you receive a question:
-
-1. **Plan (Thought)**
-   - Think through the user’s goal.
-   - Identify what information you already know versus what you need to look up or compute.
-   - Decide whether one or more tools can help you achieve the user’s goal more efficiently or accurately.
-
-2. **Decide on Action**
-   - If you need to retrieve data, run calculations, or fetch external knowledge, choose the appropriate tool and format your call precisely.
-   - If you can answer directly, skip to “Final Answer.”
-
-3. **Act (Tool Invocation)**
-   - Invoke the chosen tool(s) with clearly structured inputs.
-   - E.g., `<<call:search_tool(query="latest sales figures for X")>>` or `<<call:calculator(expression="a + b")>>`.
-
-4. **Observe (Tool Output)**
-   - Read the tool’s response carefully.
-   - Extract the relevant facts or results.
-
-5. **Iterate or Conclude**
-   - If the result fully answers the user’s question, proceed to “Final Answer.”
-   - Otherwise, repeat from **Plan** using the new information.
-
-6. **Final Answer**
-   - Summarize your findings in clear, concise language.
-   - Cite or reference any tool outputs where appropriate.
-   - Offer next steps or alternative suggestions if relevant.
-
-**Remember:**
-- Always consider whether a tool can enhance accuracy or save time before answering from memory.
-- Structure your reasoning explicitly in “Thought,” “Action,” “Observation,” and “Answer” steps (the ReAct pattern).
-- Keep your language user-friendly and your explanations transparent.
-
-Now, I will ask you a question. Report your thoughts, and finish your answer with the following template:
+You are a helpful assistant tasked with answering questions using a set of tools.
+Now, I will ask you a question. Think step-by-step **silently** (do NOT reveal your reasoning), and give your answer with the exact following template:
 FINAL ANSWER: [YOUR FINAL ANSWER].
-YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
+YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma-separated list of numbers and/or strings. If you are asked for a number, don't write it with commas and don't include units such as $ or percent signs unless specified otherwise. If you are asked for a string, don't use articles or abbreviations (e.g. for cities), and write digits in plain text unless specified otherwise. If you are asked for a comma-separated list, apply the rules above to each element (number or string), and make sure there is exactly one space after each comma.
 Your answer should only start with "FINAL ANSWER: ", followed by the answer.
 
 Here are a few examples of questions and final answer:
 
 ---
 Question: What was the actual enrollment count of the clinical trial on H. pylori in acne vulgaris patients from Jan-May 2018 as listed on the NIH website?
-FINAL ANSWER: 90
+Answer: FINAL ANSWER: 90
 
 ---
 Question: In NASA's Astronomy Picture of the Day on 2006 January 21, two astronauts are visible, with one appearing much smaller than the other. As of August 2023, out of the astronauts in the NASA Astronaut Group that the smaller astronaut was a member of, which one spent the least time in space, and how many minutes did he spend in space, rounded to the nearest minute? Exclude any astronauts who did not spend any time in space. Give the last name of the astronaut, separated from the number of minutes by a semicolon.
-FINAL ANSWER: White;5876
+Answer: FINAL ANSWER: White;5876
+
+---
+Question: The attached Excel file contains the sales of menu items for
+a local fast-food chain. What were the total sales that the chain made from
+food (not including drinks)? Express your answer in USD with two decimal
+places.
+Answer: FINAL ANSWER: $89706.00
testing.py ADDED
@@ -0,0 +1,3 @@
+from tools.analyze_excel import analyze_excel
+
+from langchain_google_genai import ChatGoogleGenerativeAI
tools/analyze_audio.py ADDED
@@ -0,0 +1,69 @@
+import base64
+from langchain_core.tools import tool
+from langchain_core.messages import HumanMessage
+from langchain_google_genai import ChatGoogleGenerativeAI
+import httpx
+from dotenv import load_dotenv
+
+load_dotenv()
+
+llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash")
+
+@tool
+def analyze_audio(audio_url: str, question: str) -> str:
+    """
+    Analyze audio data from a URL using a multimodal model.
+    """
+    try:
+        # Fetch audio data
+        response = httpx.get(audio_url)
+        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
+        audio_data = base64.b64encode(response.content).decode("utf-8")
+
+        # Pass to LLM
+        message = [
+            HumanMessage(
+                content=[
+                    {
+                        "type": "text",
+                        "text": "Analyze the audio and answer the following question: " + question,
+                    },
+                    {
+                        "type": "audio",
+                        "source_type": "base64",
+                        "data": audio_data,
+                        "mime_type": "audio/mp3",  # Assuming mp3, might need adjustment based on actual content type
+                    },
+                ],
+            )
+        ]
+
+        llm_response = llm.invoke(message)
+        return llm_response.content.strip()
+
+    except httpx.UnsupportedProtocol:
+        error_msg = f"Error analyzing audio: The provided URL '{audio_url}' is missing the 'http://' or 'https://' protocol. Please provide a complete URL."
+        print(error_msg)
+        return error_msg  # Return the specific error to the agent
+    except httpx.InvalidURL as e:
+        error_msg = f"Error analyzing audio: The provided URL '{audio_url}' is invalid. Details: {str(e)}"
+        print(error_msg)
+        return error_msg  # Return the specific error to the agent
+    except httpx.RequestError as e:
+        # Catch other httpx request errors (network issues, timeouts, etc.)
+        error_msg = f"Error fetching audio from URL '{audio_url}': {str(e)}"
+        print(error_msg)
+        return error_msg  # Return the specific error to the agent
+    except Exception as e:
+        # Catch other potential errors (base64 encoding, LLM invocation, bad status codes, etc.)
+        error_msg = f"An unexpected error occurred during audio analysis: {str(e)}"
+        print(error_msg)
+        return error_msg  # Return the specific error to the agent
+
+if __name__ == "__main__":
+    # Example usage
+    audio_url = "https://www.learningcontainer.com/wp-content/uploads/2020/02/Kalimba.mp3"
+    question = "What is the main topic of this audio?"
+    result = analyze_audio.invoke({"audio_url": audio_url, "question": question})
+    print(result)
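
Note: the tool hard-codes mime_type="audio/mp3" and flags that in its own comment. A small refinement sketch, assuming the server sets a usable Content-Type header, with a fallback to guessing from the URL suffix:

import mimetypes

import httpx

def detect_audio_mime(response: httpx.Response, url: str) -> str:
    # Prefer the server-declared type, e.g. "audio/mpeg; charset=..." -> "audio/mpeg"
    content_type = response.headers.get("content-type", "").split(";")[0].strip()
    if content_type.startswith("audio/"):
        return content_type
    # Otherwise guess from the URL suffix (.mp3, .wav, ...)
    guessed, _ = mimetypes.guess_type(url)
    if guessed and guessed.startswith("audio/"):
        return guessed
    return "audio/mp3"  # last-resort default, matching the tool above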
tools/analyze_excel.py CHANGED
@@ -21,7 +21,7 @@ def analyze_excel(file_path: str, question: str) -> str:
         # Read Excel file
         df = pd.read_excel(file_path)
 
-        # Basic information about the data
+        # Basic information about the data
         total_rows = len(df)
         total_columns = len(df.columns)
         columns = list(df.columns)
@@ -38,4 +38,4 @@ def analyze_excel(file_path: str, question: str) -> str:
         return summary
 
     except Exception as e:
-        return f"Error analyzing Excel file: {str(e)}"
+        return f"Error analyzing Excel file: {str(e)}"
tools/{extract_text_from_image.py → analyze_image.py} RENAMED
@@ -1,10 +1,15 @@
 import base64
-from typing import List, TypedDict, Annotated, Optional
 from langchain_core.tools import tool
 from langchain_core.messages import HumanMessage
+from langchain_google_genai import ChatGoogleGenerativeAI
+from dotenv import load_dotenv
+
+load_dotenv()
+
+llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash")
 
 @tool
-def extract_text_from_image(img_path: str) -> str:
+def analyze_image(img_path: str, question: str) -> str:
     """
     Extract text from an image file using a multimodal model.
     """
@@ -23,8 +28,7 @@ def extract_text_from_image(img_path: str) -> str:
                     {
                         "type": "text",
                         "text": (
-                            "Extract all the text from this image. "
-                            "Return only the extracted text, no explanations."
+                            "Analyze the image and answer the following question: " + question
                         ),
                     },
                     {
@@ -38,7 +42,8 @@ def extract_text_from_image(img_path: str) -> str:
         ]
 
         # Call the vision-capable model
-        response = vision_llm.invoke(message)
+        # Call the vision-capable model with the prepared message list
+        response = llm.invoke(message)
 
         # Append extracted text
         all_text += response.content + "\n\n"
@@ -49,3 +54,11 @@ def extract_text_from_image(img_path: str) -> str:
         error_msg = f"Error extracting text: {str(e)}"
         print(error_msg)
         return ""
+
+if __name__ == "__main__":
+    # Example usage
+    img_path = r"C:\Users\pkduo\OneDrive\Máy tính\HF Agent Course Final\Final_Assignment_Template\Screenshot 2025-05-02 144021.png"
+    question = "Review the chess position provided in the image. It is white's turn. Provide the correct next move for white which guarantees a win. Please provide your response in algebraic notation."
+    # Invoke the tool using the recommended .invoke() method with a dictionary input
+    result = analyze_image.invoke({"img_path": img_path, "question": question})
+    print(result)
tools/analyze_youtube.py ADDED
@@ -0,0 +1,261 @@
+import re
+import json
+import os
+from urllib.parse import urlparse, parse_qs
+from dotenv import load_dotenv
+import yt_dlp
+
+from langchain.tools import tool
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langchain_core.prompts import PromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+
+load_dotenv()
+
+@tool
+def answer_question_about_youtube_video(url: str, question: str) -> str:
+    """
+    Answers a specific question about a YouTube video using its transcript, title, and description.
+
+    Fetches video metadata (title, description) and transcript using yt-dlp.
+    If a transcript is available, it uses an LLM to answer the provided question based on the transcript content,
+    using the title and description as additional context.
+
+    Args:
+        url (str): Full YouTube video URL (or any URL yt-dlp supports).
+        question (str): The specific question to answer about the video's content.
+
+    Returns:
+        str: The answer to the question based on the video's transcript,
+             or a message indicating the transcript was unavailable or an error occurred.
+    """
+    subtitle_filename = None
+    video_id = None
+    try:
+        # 1. Get video info (title, description) and transcript using yt-dlp
+        ydl_opts = {
+            'writesubtitles': True,
+            'subtitleslangs': ['en'],  # Prioritize English
+            'writeautomaticsub': True,  # Also try auto-generated captions
+            'subtitlesformat': 'json3',
+            'skip_download': True,
+            'quiet': True,
+            'outtmpl': '%(id)s',  # Base name for potential subtitle file
+            'noplaylist': True,
+        }
+
+        transcript_text = None  # Initialize as None to clearly indicate if found
+        title = "N/A"
+        description = "N/A"
+
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            # Extract info first to get metadata and ID;
+            # process=False helps retrieve metadata even if a later download step fails
+            info_dict = ydl.extract_info(url, download=False, process=False)
+            video_id = info_dict.get('id')
+            title = info_dict.get('title', 'Title not found')
+            description = info_dict.get('description', 'Description not found')
+
+            if not video_id:
+                # Try extracting ID from URL as a fallback if yt-dlp fails early
+                try:
+                    parsed = urlparse(url)
+                    if parsed.hostname in ("www.youtube.com", "youtube.com"):
+                        video_id = parse_qs(parsed.query).get("v", [None])[0]
+                    elif parsed.hostname == "youtu.be":
+                        video_id = parsed.path.lstrip("/")
+                    if not video_id:
+                        return f"Error: Could not extract video ID from URL: {url}"
+                except Exception:
+                    return f"Error: Could not extract video ID from URL: {url}"
+
+            # Construct expected subtitle filename (best guess, might include lang code later)
+            subtitle_filename_base = f"{video_id}"  # yt-dlp adds lang/format
+
+            # Attempt to download (this will trigger subtitle download)
+            try:
+                # Re-run extract_info with download=True to trigger download actions;
+                # this is often more reliable for getting subtitles written
+                ydl.extract_info(url, download=True)  # Let yt-dlp handle download logic
+            except yt_dlp.utils.DownloadError as de:
+                # Log subtitle-specific errors but continue if possible
+                if "subtitles" in str(de).lower():
+                    print(f"Info: Subtitle download issue for {url}: {de}")
+                else:
+                    # If it's not a subtitle error, it might be more critical
+                    print(f"Warning: Download error for {url}: {de}")
+                    # Decide if you want to return here or proceed without transcript
+
+        # Find the actual downloaded subtitle file (json3 format, English preferred)
+        found_subtitle_file = None
+        transcript_status = "not_found"  # Possible values: not_found, found_but_empty, found_but_error, processed
+
+        # List potential subtitle files matching the pattern
+        potential_files = [f for f in os.listdir('.') if f.startswith(video_id) and f.endswith('.json3')]
+
+        if potential_files:
+            # Prioritize English if available
+            english_file = f"{video_id}.en.json3"
+            if english_file in potential_files:
+                found_subtitle_file = english_file
+            else:
+                # Otherwise, take the first one found (yt-dlp usually names it based on lang)
+                found_subtitle_file = potential_files[0]
+
+            subtitle_filename = found_subtitle_file  # Store the actual found filename for cleanup
+            print(f"Info: Found subtitle file: {found_subtitle_file}")
+
+            try:
+                with open(found_subtitle_file, 'r', encoding='utf-8') as f:
+                    subtitle_data = json.load(f)
+                # Extract text from json3 format
+                segments = []
+                for event in subtitle_data.get('events', []):
+                    if event and 'segs' in event:
+                        for seg in event['segs']:
+                            if seg and 'utf8' in seg:
+                                segments.append(seg['utf8'].strip())
+                processed_text = " ".join(segments)
+
+                if processed_text:
+                    transcript_text = processed_text  # Assign only if text was extracted
+                    transcript_status = "processed"
+                else:
+                    # File exists but no text extracted
+                    print(f"Warning: Transcript file {found_subtitle_file} found but contained no processable text.")
+                    transcript_status = "found_but_empty"
+                    # Keep transcript_text as None
+
+            except json.JSONDecodeError as jde:
+                print(f"Warning: Could not parse JSON in subtitle file {found_subtitle_file}: {jde}")
+                transcript_status = "found_but_error"
+                # Keep transcript_text as None
+            except Exception as e:
+                print(f"Warning: Could not read/process subtitle file {found_subtitle_file}: {e}")
+                transcript_status = "found_but_error"
+                # Keep transcript_text as None
+        # else: transcript_text remains None, transcript_status remains "not_found"
+
+        # 2. Check if transcript is available before proceeding to LLM
+        if transcript_text is None:
+            if transcript_status == "not_found":
+                return f"Transcript not found for video {video_id}. Cannot answer question."
+            elif transcript_status == "found_but_empty":
+                return f"Transcript file found ({subtitle_filename}) but contained no text. Cannot answer question."
+            elif transcript_status == "found_but_error":
+                return f"Transcript file found ({subtitle_filename}) but could not be processed. Cannot answer question."
+            else:  # Should not happen if transcript_text is None, but as a fallback
+                return "Transcript unavailable for an unknown reason. Cannot answer question."
+
+        # 3. Prepare prompt for LLM Q&A
+        qa_prompt_template = """
+You are an assistant designed to answer questions about a YouTube video based *only* on its provided transcript, title, and description.
+
+Video Title: {title}
+Video Description: {description}
+
+Video Transcript:
+---
+{transcript}
+---
+
+Based *only* on the information provided above (primarily the transcript), answer the following question:
+Question: {question}
+
+If the answer cannot be found in the transcript or the provided context, state that clearly (e.g., "The transcript does not contain information about..."). Do not make assumptions or use external knowledge. Provide a concise answer.
+
+Answer:
+"""
+
+        prompt = PromptTemplate(
+            template=qa_prompt_template,
+            input_variables=["title", "description", "transcript", "question"]
+        )
+
+        # 4. Query LLM
+        llm = ChatGoogleGenerativeAI(
+            model="gemini-1.5-flash",  # Or another suitable model like gemini-pro
+            temperature=0.0,  # Keep temperature low for factual Q&A based on context
+        )
+
+        # Create a simple chain: prompt -> llm -> output parser
+        chain = prompt | llm | StrOutputParser()
+
+        # Run the chain with the extracted info
+        answer = chain.invoke({
+            "title": title,
+            "description": description if description else "Not Available",
+            "transcript": transcript_text,  # Pass the extracted transcript
+            "question": question
+        })
+
+        return answer
+
+    except yt_dlp.utils.DownloadError as e:
+        # More specific error for user
+        error_message = f"Error during video data/subtitle download for {url}: {e}. "
+        if "video unavailable" in str(e).lower():
+            error_message += "The video might be private, deleted, or unavailable in your region."
+        elif "subtitles" in str(e).lower():
+            error_message += "Could not fetch subtitles. They might not exist for this video in English."
+        else:
+            error_message += "There was a problem accessing the video data."
+        return error_message
+    except Exception as e:
+        return f"An unexpected error occurred while processing {url}: {e}"
+    finally:
+        # Clean up the downloaded subtitle file if it exists and was identified
+        if subtitle_filename and os.path.exists(subtitle_filename):
+            try:
+                os.remove(subtitle_filename)
+                print(f"Cleaned up subtitle file: {subtitle_filename}")
+            except Exception as e:
+                print(f"Warning: Could not remove subtitle file {subtitle_filename}: {e}")
+        # Attempt cleanup based on video_id if filename wasn't confirmed but ID exists
+        elif video_id:
+            # Check common possible names based on yt-dlp patterns
+            possible_cleanup_files = [f"{video_id}.en.json3", f"{video_id}.json3"]
+            for fname in possible_cleanup_files:
+                if os.path.exists(fname):
+                    try:
+                        os.remove(fname)
+                        print(f"Cleaned up potential subtitle file: {fname}")
+                    except Exception as e:
+                        print(f"Warning: Could not remove potential subtitle file {fname}: {e}")
+
+
+if __name__ == "__main__":
+    # Test case 1: Video with likely available English subtitles
+    test_url_1 = "https://www.youtube.com/watch?v=JGwWNGJdvx8"  # Google I/O Keynote
+    test_question_1 = "What models were mentioned in the Gemini family according to the transcript?"
+
+    # Test case 2: Video likely without subtitles or with non-English ones
+    test_url_2 = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"  # Rick Astley
+    test_question_2 = "Does the transcript mention the singer giving someone up?"
+
+    # Test case 3: Invalid URL (example)
+    # test_url_3 = "https://www.youtube.com/watch?v=invalididxyz"
+    # test_question_3 = "What is this video about?"
+
+    print(f"--- Test 1: Answering Question for: {test_url_1} ---")
+    print(f"Question: {test_question_1}")
+    # Invoke the tool using the .invoke() method with a dictionary input
+    answer1 = answer_question_about_youtube_video.invoke({"url": test_url_1, "question": test_question_1})
+    print(f"\nAnswer 1:\n{answer1}")
+    print("--- End of Test 1 ---")
+
+    print(f"\n--- Test 2: Answering Question for: {test_url_2} ---")
+    print(f"Question: {test_question_2}")
+    # Invoke the tool using the .invoke() method with a dictionary input
+    answer2 = answer_question_about_youtube_video.invoke({"url": test_url_2, "question": test_question_2})
+    print(f"\nAnswer 2:\n{answer2}")
+    print("--- End of Test 2 ---")
+
+    # print(f"\n--- Test 3: Answering Question for: {test_url_3} ---")
+    # print(f"Question: {test_question_3}")
+    # answer3 = answer_question_about_youtube_video(test_url_3, test_question_3)
+    # print(f"\nAnswer 3:\n{answer3}")
+    # print("--- End of Test 3 ---")
tools/download_file.py CHANGED
@@ -29,6 +29,6 @@ def download_file(url: str) -> str:
             for chunk in response.iter_content(chunk_size=8192):
                 f.write(chunk)
 
-        return temp_file_path
+        return f"File downloaded and saved successfully to {temp_file_path}. Read this file to process its content."
     except Exception as e:
         return f"An error occurred while downloading the file: {e}"
tools/read_file.py CHANGED
@@ -23,4 +23,4 @@ def read_and_save_file(file_path: str) -> str:
         temp_file.write(content)
         temp_file.close()
 
-        return temp_file.name
+        return f"File read and saved successfully to {temp_file.name}. Read this file to process its content."