MickyWin22 committed on
Commit
3413fb8
·
verified ·
1 Parent(s): 0fe12f3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -28
app.py CHANGED
@@ -5,34 +5,42 @@ import pandas as pd
5
  import traceback
6
  import time
7
  import mimetypes
 
8
 
9
  # Import smol-agent and tool components
10
  from smolagents import CodeAgent, LiteLLMModel, tool
11
  from smolagents import DuckDuckGoSearchTool
12
  from unstructured.partition.auto import partition
13
 
 
 
 
 
14
  # --- Constants ---
15
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
16
 
17
- # --- Tool Definition (Updated for Multimodality) ---
18
  @tool
19
  def file_reader(file_path: str) -> str:
20
  """
21
- Reads the content of a file and returns its text content.
22
- This tool supports various file types, including text (PDF, TXT, CSV)
23
- and can perform Optical Character Recognition (OCR) on images (PNG, JPG).
24
- It can be used with either a local path or a web URL.
25
- For non-text/image formats like audio or video, it will return a message
26
- indicating the file type, as it cannot analyze their content directly.
 
27
 
28
  Args:
29
  file_path (str): The local path or web URL of the file to be read.
 
 
30
  """
31
  temp_file_path = None
32
  try:
33
- # Handle web URLs by downloading the file first
34
  if file_path.startswith("http://") or file_path.startswith("https://"):
35
- temp_file_path = "temp_downloaded_file"
36
  response = requests.get(file_path, timeout=20)
37
  response.raise_for_status()
38
  with open(temp_file_path, "wb") as f:
@@ -41,44 +49,62 @@ def file_reader(file_path: str) -> str:
41
  else:
42
  local_path = file_path
43
 
44
- # Gracefully handle unsupported file types (e.g., audio, video)
45
  mime_type, _ = mimetypes.guess_type(local_path)
46
- if mime_type and not (mime_type.startswith('text/') or mime_type.startswith('image/') or mime_type == 'application/pdf' or mime_type == 'application/zip'):
47
- if temp_file_path and os.path.exists(temp_file_path):
48
- os.remove(temp_file_path)
49
- return f"File is of a non-visual, non-text format ({mime_type}). Content analysis is not supported by this tool."
50
 
51
- # Use 'unstructured' which has built-in OCR for images.
52
- # This will extract text from images where possible.
53
- elements = partition(local_path)
 
 
 
 
54
 
55
- # Clean up the temporary file if it was created
56
- if temp_file_path and os.path.exists(temp_file_path):
57
- os.remove(temp_file_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
 
 
59
  return "\n\n".join([str(el) for el in elements])
 
60
  except Exception as e:
61
- # Ensure cleanup even if an error occurs
 
 
62
  if temp_file_path and os.path.exists(temp_file_path):
63
  os.remove(temp_file_path)
64
- return f"Error reading or processing file '{file_path}': {e}"
65
 
66
 
67
- # --- Agent Class (Updated with Native Memory Management) ---
68
  class GaiaSmolAgent:
69
  def __init__(self):
70
  """
71
  Initializes the optimized agent.
72
- Now uses the agent's native conversation memory capabilities.
73
  """
74
  print("Initializing Optimized GaiaSmolAgent...")
75
  api_key = os.getenv("GEMINI_API_KEY")
76
  if not api_key:
77
  raise ValueError("API key 'GEMINI_API_KEY' not found in environment secrets.")
78
 
79
- # Use a faster, more cost-effective model optimized for speed.
80
  model = LiteLLMModel(
81
- model_id="gemini/gemini-1.5-flash-latest",
82
  api_key=api_key,
83
  temperature=0.0,
84
  timeout=120.0, # Add a timeout to prevent hanging
@@ -90,7 +116,7 @@ class GaiaSmolAgent:
90
 
91
  **Available Tools:**
92
  - `duck_duck_go_search(query: str) -> str`: Use this to find information, file URLs, or anything on the web.
93
- - `file_reader(file_path: str) -> str`: Use this to read the contents of a file from a local path or a web URL. It can read text and extract text from images (OCR).
94
 
95
  **Your Thought Process:**
96
  1. **Deconstruct the Goal:** Carefully analyze the question to understand what information is needed, considering the previous turns in the conversation.
@@ -113,7 +139,7 @@ class GaiaSmolAgent:
113
  planning_interval=3 # Re-plan every 3 steps, considering memory.
114
  )
115
 
116
- print("Optimized GaiaSmolAgent initialized successfully with native memory and multimodal capabilities.")
117
 
118
  def __call__(self, question: str, reset_memory: bool = False) -> str:
119
  """
 
5
  import traceback
6
  import time
7
  import mimetypes
8
+ from tempfile import NamedTemporaryFile
9
 
10
  # Import smol-agent and tool components
11
  from smolagents import CodeAgent, LiteLLMModel, tool
12
  from smolagents import DuckDuckGoSearchTool
13
  from unstructured.partition.auto import partition
14
 
15
+ # Imports for advanced file processing
16
+ import speech_recognition as sr
17
+ from moviepy.editor import VideoFileClip
18
+
19
  # --- Constants ---
20
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
21
 
22
+ # --- Tool Definition (Upgraded for Full Multimodality) ---
23
  @tool
24
  def file_reader(file_path: str) -> str:
25
  """
26
+ Reads and analyzes the content of a file and returns relevant text-based information.
27
+ Supports:
28
+ - Text files (PDF, TXT, CSV)
29
+ - Images (PNG, JPG) with OCR
30
+ - Audio (MP3, WAV) via speech recognition
31
+ - Video (MP4, MOV) via speech recognition on audio track
32
+ Can be used with a local file path or a web URL.
33
 
34
  Args:
35
  file_path (str): The local path or web URL of the file to be read.
36
+ Returns:
37
+ str: Extracted or transcribed content as text.
38
  """
39
  temp_file_path = None
40
  try:
41
+ # Download the file if it's a URL
42
  if file_path.startswith("http://") or file_path.startswith("https://"):
43
+ temp_file_path = NamedTemporaryFile(delete=False).name
44
  response = requests.get(file_path, timeout=20)
45
  response.raise_for_status()
46
  with open(temp_file_path, "wb") as f:
 
49
  else:
50
  local_path = file_path
51
 
 
52
  mime_type, _ = mimetypes.guess_type(local_path)
53
+ recognizer = sr.Recognizer()
 
 
 
54
 
55
+ if mime_type:
56
+ # Handle audio files
57
+ if mime_type.startswith("audio/"):
58
+ with sr.AudioFile(local_path) as source:
59
+ audio = recognizer.record(source)
60
+ # Using whisper for robust speech recognition
61
+ return recognizer.recognize_whisper(audio)
62
 
63
+ # Handle video files by extracting audio
64
+ elif mime_type.startswith("video/"):
65
+ # Use a temporary file for the extracted audio
66
+ with NamedTemporaryFile(suffix=".wav", delete=False) as audio_temp:
67
+ audio_temp_path = audio_temp.name
68
+
69
+ clip = VideoFileClip(local_path)
70
+ clip.audio.write_audiofile(audio_temp_path, codec='pcm_s16le')
71
+
72
+ with sr.AudioFile(audio_temp_path) as source:
73
+ audio = recognizer.record(source)
74
+
75
+ # Clean up the temporary audio file
76
+ os.remove(audio_temp_path)
77
+
78
+ # Using whisper for robust speech recognition
79
+ return recognizer.recognize_whisper(audio)
80
 
81
+ # Default to handling text and images with OCR if not audio/video
82
+ elements = partition(local_path)
83
  return "\n\n".join([str(el) for el in elements])
84
+
85
  except Exception as e:
86
+ return f"Error reading or processing file '{file_path}': {e}"
87
+ finally:
88
+ # Clean up the downloaded file if it exists
89
  if temp_file_path and os.path.exists(temp_file_path):
90
  os.remove(temp_file_path)
 
91
 
92
 
93
+ # --- Agent Class (Updated with More Powerful Model and Tools) ---
94
  class GaiaSmolAgent:
95
  def __init__(self):
96
  """
97
  Initializes the optimized agent.
98
+ Now uses a more powerful model and the agent's native conversation memory.
99
  """
100
  print("Initializing Optimized GaiaSmolAgent...")
101
  api_key = os.getenv("GEMINI_API_KEY")
102
  if not api_key:
103
  raise ValueError("API key 'GEMINI_API_KEY' not found in environment secrets.")
104
 
105
+ # Use a more powerful, "clever" model for better reasoning.
106
  model = LiteLLMModel(
107
+ model_id="gemini/gemini-1.5-pro-latest",
108
  api_key=api_key,
109
  temperature=0.0,
110
  timeout=120.0, # Add a timeout to prevent hanging
 
116
 
117
  **Available Tools:**
118
  - `duck_duck_go_search(query: str) -> str`: Use this to find information, file URLs, or anything on the web.
119
+ - `file_reader(file_path: str) -> str`: Use this to read the contents of a file from a local path or a web URL. It can read text, extract text from images (OCR), and transcribe audio from audio/video files.
120
 
121
  **Your Thought Process:**
122
  1. **Deconstruct the Goal:** Carefully analyze the question to understand what information is needed, considering the previous turns in the conversation.
 
139
  planning_interval=3 # Re-plan every 3 steps, considering memory.
140
  )
141
 
142
+ print("Optimized GaiaSmolAgent initialized successfully with native memory and full multimodal capabilities.")
143
 
144
  def __call__(self, question: str, reset_memory: bool = False) -> str:
145
  """