Agents_Course_Final_Assignment

Sleeping

App Files Files Community

Maxenceleguery committed on Apr 24, 2025

Commit

b9e8b68

1 Parent(s): f5d1478

:sparkles: Adding audio modality

Browse files

Files changed (1) hide show

app.py +12 -30

app.py CHANGED Viewed

@@ -97,43 +97,39 @@ def load_file_from_response(response):
             return {"type": "image", "data": Image.open(io.BytesIO(content_bytes))}
         elif "audio/" in content_type:
-            audio_data, sample_rate = sf.read(io.BytesIO(content_bytes))
-            return {
-                "type": "audio",
-                "data": {"array": audio_data, "sample_rate": sample_rate},
-            }
         elif "application/octet-stream" in content_type:
             # Try Excel
             try:
                 excel_data = pd.read_excel(io.BytesIO(content_bytes))
                 return {"type": "excel", "data": excel_data}
-            except Exception:
-                pass
             # Try image
             try:
                 img = Image.open(io.BytesIO(content_bytes))
                 return {"type": "image", "data": img}
             except UnidentifiedImageError:
-                pass
             # Try audio
             try:
-                audio_data, sample_rate = sf.read(io.BytesIO(content_bytes))
-                return {
-                    "type": "audio",
-                    "data": {"array": audio_data, "sample_rate": sample_rate},
-                }
-            except RuntimeError:
-                pass
             # Try UTF-8 text
             try:
                 text = content_bytes.decode("utf-8")
                 return {"type": "text", "data": text}
             except UnicodeDecodeError:
-                pass
             return {"type": "binary", "data": content_bytes}
@@ -158,20 +154,6 @@ def load_image(image_path: str) -> str:
     return f"data:image/jpeg;base64,{encoded}"
-def load_audio(audio_path: str) -> str:
-    """Encodes audio as base64 for GPT-4o (if needed)."""
-    with open(audio_path, "rb") as f:
-        encoded = base64.b64encode(f.read()).decode("utf-8")
-    return f"data:audio/wav;base64,{encoded}"
-def transcribe_audio(audio_path: str) -> str:
-    """Transcribes audio file using OpenAI Whisper model (whisper-1)."""
-    with open(audio_path, "rb") as f:
-        transcript = openai.Audio.transcribe("whisper-1", f)
-    return transcript.get("text", "")
 def describe_image(image_path: str) -> str:
     """Sends image directly to GPT-4o to describe it."""
     image_base64 = load_image(image_path)

             return {"type": "image", "data": Image.open(io.BytesIO(content_bytes))}
         elif "audio/" in content_type:
+            # Transcribe audio using OpenAI Whisper
+            transcript = openai.Audio.transcribe("whisper-1", io.BytesIO(content_bytes))
+            return {"type": "text", "data": transcript.get("text", "")}
         elif "application/octet-stream" in content_type:
             # Try Excel
             try:
                 excel_data = pd.read_excel(io.BytesIO(content_bytes))
                 return {"type": "excel", "data": excel_data}
+            except Exception as e:
+                print(f"Error loading excel")
             # Try image
             try:
                 img = Image.open(io.BytesIO(content_bytes))
                 return {"type": "image", "data": img}
             except UnidentifiedImageError:
+                print(f"Error loading image")
             # Try audio
             try:
+                # Transcribe audio from raw bytes
+                transcript = openai.Audio.transcribe(model="whisper-1", file=io.BytesIO(content_bytes))
+                return {"type": "text", "data": transcript.get("text", "")}
+            except Exception as e:
+                print(f"Error transcribing audio")
             # Try UTF-8 text
             try:
                 text = content_bytes.decode("utf-8")
                 return {"type": "text", "data": text}
             except UnicodeDecodeError:
+                print(f"Error decoding UTF-8")
             return {"type": "binary", "data": content_bytes}
     return f"data:image/jpeg;base64,{encoded}"
 def describe_image(image_path: str) -> str:
     """Sends image directly to GPT-4o to describe it."""
     image_base64 = load_image(image_path)