Maxenceleguery committed on
Commit
b9e8b68
·
1 Parent(s): f5d1478

:sparkles: Adding audio modality

Browse files
Files changed (1) hide show
  1. app.py +12 -30
app.py CHANGED
@@ -97,43 +97,39 @@ def load_file_from_response(response):
97
  return {"type": "image", "data": Image.open(io.BytesIO(content_bytes))}
98
 
99
  elif "audio/" in content_type:
100
- audio_data, sample_rate = sf.read(io.BytesIO(content_bytes))
101
- return {
102
- "type": "audio",
103
- "data": {"array": audio_data, "sample_rate": sample_rate},
104
- }
105
 
106
  elif "application/octet-stream" in content_type:
107
  # Try Excel
108
  try:
109
  excel_data = pd.read_excel(io.BytesIO(content_bytes))
110
  return {"type": "excel", "data": excel_data}
111
- except Exception:
112
- pass
113
 
114
  # Try image
115
  try:
116
  img = Image.open(io.BytesIO(content_bytes))
117
  return {"type": "image", "data": img}
118
  except UnidentifiedImageError:
119
- pass
120
 
121
  # Try audio
122
  try:
123
- audio_data, sample_rate = sf.read(io.BytesIO(content_bytes))
124
- return {
125
- "type": "audio",
126
- "data": {"array": audio_data, "sample_rate": sample_rate},
127
- }
128
- except RuntimeError:
129
- pass
130
 
131
  # Try UTF-8 text
132
  try:
133
  text = content_bytes.decode("utf-8")
134
  return {"type": "text", "data": text}
135
  except UnicodeDecodeError:
136
- pass
137
 
138
  return {"type": "binary", "data": content_bytes}
139
 
@@ -158,20 +154,6 @@ def load_image(image_path: str) -> str:
158
  return f"data:image/jpeg;base64,{encoded}"
159
 
160
 
161
- def load_audio(audio_path: str) -> str:
162
- """Encodes audio as base64 for GPT-4o (if needed)."""
163
- with open(audio_path, "rb") as f:
164
- encoded = base64.b64encode(f.read()).decode("utf-8")
165
- return f"data:audio/wav;base64,{encoded}"
166
-
167
-
168
- def transcribe_audio(audio_path: str) -> str:
169
- """Transcribes audio file using OpenAI Whisper model (whisper-1)."""
170
- with open(audio_path, "rb") as f:
171
- transcript = openai.Audio.transcribe("whisper-1", f)
172
- return transcript.get("text", "")
173
-
174
-
175
  def describe_image(image_path: str) -> str:
176
  """Sends image directly to GPT-4o to describe it."""
177
  image_base64 = load_image(image_path)
 
97
  return {"type": "image", "data": Image.open(io.BytesIO(content_bytes))}
98
 
99
  elif "audio/" in content_type:
100
+ # Transcribe audio using OpenAI Whisper
101
+ transcript = openai.Audio.transcribe("whisper-1", io.BytesIO(content_bytes))
102
+ return {"type": "text", "data": transcript.get("text", "")}
 
 
103
 
104
  elif "application/octet-stream" in content_type:
105
  # Try Excel
106
  try:
107
  excel_data = pd.read_excel(io.BytesIO(content_bytes))
108
  return {"type": "excel", "data": excel_data}
109
+ except Exception as e:
110
+ print(f"Error loading excel")
111
 
112
  # Try image
113
  try:
114
  img = Image.open(io.BytesIO(content_bytes))
115
  return {"type": "image", "data": img}
116
  except UnidentifiedImageError:
117
+ print(f"Error loading image")
118
 
119
  # Try audio
120
  try:
121
+ # Transcribe audio from raw bytes
122
+ transcript = openai.Audio.transcribe(model="whisper-1", file=io.BytesIO(content_bytes))
123
+ return {"type": "text", "data": transcript.get("text", "")}
124
+ except Exception as e:
125
+ print(f"Error transcribing audio")
 
 
126
 
127
  # Try UTF-8 text
128
  try:
129
  text = content_bytes.decode("utf-8")
130
  return {"type": "text", "data": text}
131
  except UnicodeDecodeError:
132
+ print(f"Error decoding UTF-8")
133
 
134
  return {"type": "binary", "data": content_bytes}
135
 
 
154
  return f"data:image/jpeg;base64,{encoded}"
155
 
156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  def describe_image(image_path: str) -> str:
158
  """Sends image directly to GPT-4o to describe it."""
159
  image_base64 = load_image(image_path)