nexusbert committed on
Commit
12f5131
·
1 Parent(s): 4e0cc0a
Files changed (1) hide show
  1. app.py +23 -66
app.py CHANGED
@@ -192,85 +192,42 @@ def preprocess_audio_ffmpeg(audio_data: bytes, target_sr: int = 16000) -> np.nda
192
  logger.error(f"FFmpeg preprocessing failed: {e}")
193
  raise HTTPException(status_code=400, detail="Audio preprocessing failed. Ensure ffmpeg is installed.")
194
 
195
- def _score_transcription_quality(text: str) -> float:
196
- if not text or not text.strip():
197
- return 0.0
198
-
199
- text_lower = text.lower()
200
- score = 0.0
201
-
202
- if len(text.strip()) > 3:
203
- score += 0.3
204
-
205
- if any(char.isalpha() for char in text):
206
- score += 0.2
207
-
208
- if len(text.split()) > 1:
209
- score += 0.2
210
-
211
- if not any(char in text for char in "[]{}()"):
212
- score += 0.1
213
-
214
- if not text.endswith("..."):
215
- score += 0.1
216
-
217
- if len(text.strip()) > 10:
218
- score += 0.1
219
-
220
- return min(score, 1.0)
221
 
222
  def speech_to_text(audio_data: bytes) -> str:
223
  audio_array = preprocess_audio_ffmpeg(audio_data)
224
-
225
- mms_text = ""
226
- igbo_text = ""
227
 
228
  mms_result = _get_mms()
229
  if mms_result and mms_result[0] is not None and mms_result[1] is not None:
230
  mms_model, mms_proc = mms_result
231
  mms_text = _run_mms(mms_model, mms_proc, audio_array)
232
- logger.info(f"MMS result: '{mms_text}'")
 
 
233
 
234
  igbo_result = _get_igbo_asr()
235
  if igbo_result[0] is not None and igbo_result[1] is not None:
236
  igbo_model, igbo_proc = igbo_result
237
  igbo_text = _run_whisper(igbo_model, igbo_proc, audio_array, language="igbo")
238
- logger.info(f"Igbo ASR result: '{igbo_text}'")
239
-
240
- if not mms_text and not igbo_text:
241
- return ""
242
-
243
- if not mms_text:
244
- logger.info("Using Igbo ASR result (MMS failed)")
245
- return igbo_text
246
-
247
- if not igbo_text:
248
- logger.info("Using MMS ASR result (Igbo ASR failed)")
249
- return mms_text
250
-
251
- mms_score = _score_transcription_quality(mms_text)
252
- igbo_score = _score_transcription_quality(igbo_text)
253
-
254
- mms_lang = detect_language(mms_text)
255
- igbo_lang = detect_language(igbo_text)
256
-
257
- logger.info(f"MMS: '{mms_text}' (score: {mms_score:.2f}, lang: {mms_lang})")
258
- logger.info(f"Igbo: '{igbo_text}' (score: {igbo_score:.2f}, lang: {igbo_lang})")
259
-
260
- if igbo_lang == "ig" and mms_lang != "ig":
261
- logger.info("Using Igbo ASR result (detected Igbo language)")
262
- return igbo_text
263
-
264
- if mms_lang == "ig" and igbo_lang != "ig":
265
- logger.info("Using MMS ASR result (Igbo ASR didn't detect Igbo)")
266
- return mms_text
267
-
268
- if igbo_score > mms_score + 0.1:
269
- logger.info("Using Igbo ASR result (higher quality score)")
270
- return igbo_text
271
- else:
272
- logger.info("Using MMS ASR result (higher quality score)")
273
- return mms_text
274
 
275
 
276
  def get_ai_response(text: str, response_language: str = None) -> str:
 
192
  logger.error(f"FFmpeg preprocessing failed: {e}")
193
  raise HTTPException(status_code=400, detail="Audio preprocessing failed. Ensure ffmpeg is installed.")
194
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
 
196
  def speech_to_text(audio_data: bytes) -> str:
197
  audio_array = preprocess_audio_ffmpeg(audio_data)
198
+ candidates = []
 
 
199
 
200
  mms_result = _get_mms()
201
  if mms_result and mms_result[0] is not None and mms_result[1] is not None:
202
  mms_model, mms_proc = mms_result
203
  mms_text = _run_mms(mms_model, mms_proc, audio_array)
204
+ if mms_text:
205
+ candidates.append(("mms", mms_text))
206
+ logger.info(f"MMS result: '{mms_text}'")
207
 
208
  igbo_result = _get_igbo_asr()
209
  if igbo_result[0] is not None and igbo_result[1] is not None:
210
  igbo_model, igbo_proc = igbo_result
211
  igbo_text = _run_whisper(igbo_model, igbo_proc, audio_array, language="igbo")
212
+ if igbo_text:
213
+ candidates.append(("igbo", igbo_text))
214
+ logger.info(f"Igbo ASR result: '{igbo_text}'")
215
+
216
+ for model_name, text in candidates:
217
+ detected_lang = detect_language(text)
218
+ if detected_lang == "ig" and model_name == "igbo":
219
+ logger.info(f"Using {model_name} ASR result (detected {detected_lang} language)")
220
+ return text
221
+ elif detected_lang in ["ha", "yo", "en"] and model_name == "mms":
222
+ logger.info(f"Using {model_name} ASR result (detected {detected_lang} language)")
223
+ return text
224
+
225
+ if candidates:
226
+ best_text = max((t for _, t in candidates), key=lambda s: len(s or ""))
227
+ logger.info(f"Using best result by length: '{best_text}'")
228
+ return best_text
229
+
230
+ return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
 
232
 
233
  def get_ai_response(text: str, response_language: str = None) -> str: