nexusbert committed on
Commit
79cc3f4
·
1 Parent(s): c162be1
Files changed (1) hide show
  1. app.py +156 -21
app.py CHANGED
@@ -133,9 +133,18 @@ def preprocess_audio_ffmpeg(audio_data: bytes, target_sr: int = 16000) -> np.nda
133
  out_path = out_file.name
134
 
135
  ffmpeg_exe = imageio_ffmpeg.get_ffmpeg_exe()
 
136
  subprocess.run([
137
  ffmpeg_exe, '-y', '-i', in_path,
138
- '-ac', '1', '-ar', str(target_sr), out_path
 
 
 
 
 
 
 
 
139
  ], check=True, capture_output=True)
140
 
141
  with open(out_path, 'rb') as f:
@@ -145,15 +154,105 @@ def preprocess_audio_ffmpeg(audio_data: bytes, target_sr: int = 16000) -> np.nda
145
  os.unlink(out_path)
146
 
147
  audio_array, sr = sf.read(io.BytesIO(wav_data))
 
148
  if len(audio_array.shape) > 1:
149
  audio_array = np.mean(audio_array, axis=1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  return audio_array.astype(np.float32)
 
151
  except Exception as e:
152
  logger.error(f"FFmpeg preprocessing failed: {e}")
153
  raise HTTPException(status_code=400, detail="Audio preprocessing failed. Ensure ffmpeg is installed.")
154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  def speech_to_text(audio_data: bytes) -> str:
156
  audio_array = preprocess_audio_ffmpeg(audio_data)
 
 
 
 
 
 
 
 
 
 
157
  candidates = []
158
  for code in ["yo", "ha", "ig", "en"]:
159
  model, proc = _get_asr(code)
@@ -172,6 +271,50 @@ def speech_to_text(audio_data: bytes) -> str:
172
  return max((t for _, t in candidates), key=lambda s: len(s or ""))
173
  return ""
174
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
 
176
  def get_ai_response(text: str) -> str:
177
  try:
@@ -183,12 +326,9 @@ def get_ai_response(text: str) -> str:
183
  logger.error(f"AI request error: {e}")
184
  return f"I'm sorry, I couldn't connect to the AI service. You said: '{text}'."
185
 
186
- # Enhanced keyword lists for language detection
187
  HAUSA_WORDS = [
188
- # Agricultural terms
189
  "aikin", "manoma", "gona", "amfanin", "yanayi", "tsaba", "fasaha", "bisa", "noman", "shuka",
190
  "daji", "rani", "damina", "amfani", "bidi'a", "noma", "bashi", "manure", "tsiro", "gishiri",
191
- # Common Hausa words
192
  "da", "shi", "ta", "su", "mu", "ku", "ni", "kai", "ita", "shi", "ita", "su", "mu", "ku",
193
  "ina", "yana", "tana", "suna", "muna", "kuna", "na", "ka", "ta", "sa", "mu", "ku",
194
  "wani", "wata", "wasu", "wadansu", "wadannan", "wannan", "wancan", "wannan",
@@ -197,9 +337,7 @@ HAUSA_WORDS = [
197
  ]
198
 
199
  YORUBA_WORDS = [
200
- # Agricultural terms
201
  "ilé", "ọmọ", "òun", "awọn", "agbẹ", "oko", "ọgbà", "irugbin", "àkọsílẹ", "omi", "ojo", "àgbàlá", "irọlẹ",
202
- # Common Yoruba words
203
  "ni", "ti", "si", "fun", "lati", "ninu", "lori", "labe", "pelu", "ati", "tabi", "sugbon",
204
  "o", "a", "e", "won", "mi", "re", "wa", "yin", "won", "mi", "re", "wa", "yin",
205
  "kan", "kankan", "die", "pupo", "gbogbo", "kookan", "kookan", "gbogbo",
@@ -208,9 +346,7 @@ YORUBA_WORDS = [
208
  ]
209
 
210
  IGBO_WORDS = [
211
- # Agricultural terms
212
  "ugbo", "akụkọ", "mmiri", "ala", "ọrụ", "ncheta", "ọhụrụ", "ugwu", "nri", "ahụhụ",
213
- # Common Igbo words
214
  "na", "n'", "maka", "n'ihi", "n'ime", "n'elu", "n'okpuru", "na", "na", "na",
215
  "m", "i", "o", "ya", "anyị", "unu", "ha", "m", "i", "o", "ya", "anyị", "unu", "ha",
216
  "otu", "ọtụtụ", "ọtụtụ", "ọtụtụ", "ọtụtụ", "ọtụtụ", "ọtụtụ", "ọtụtụ",
@@ -219,23 +355,17 @@ IGBO_WORDS = [
219
  ]
220
 
221
  def detect_language_keywords(text: str) -> str:
222
- """
223
- Lightweight keyword-based language detection.
224
- Returns language code: 'ha' (Hausa), 'yo' (Yoruba), 'ig' (Igbo), 'en' (English)
225
- """
226
  text_lower = text.lower().strip()
227
 
228
  if not text_lower:
229
- return "en" # Default to English for empty text
230
 
231
- # Count matches for each language
232
  hausa_count = sum(1 for word in HAUSA_WORDS if word in text_lower)
233
  yoruba_count = sum(1 for word in YORUBA_WORDS if word in text_lower)
234
  igbo_count = sum(1 for word in IGBO_WORDS if word in text_lower)
235
 
236
  logger.info(f"Language detection scores - Hausa: {hausa_count}, Yoruba: {yoruba_count}, Igbo: {igbo_count}")
237
 
238
- # Return language with highest count, default to English if no matches
239
  if hausa_count > yoruba_count and hausa_count > igbo_count:
240
  logger.info("Keyword detection: Hausa")
241
  return "ha"
@@ -250,9 +380,6 @@ def detect_language_keywords(text: str) -> str:
250
  return "en"
251
 
252
  def detect_language(text: str) -> str:
253
- """
254
- Main language detection function using lightweight keyword-based approach.
255
- """
256
  logger.info(f"Detecting language for text: '{text[:50]}...'")
257
  return detect_language_keywords(text)
258
 
@@ -282,7 +409,6 @@ def text_to_speech_file(text: str) -> str:
282
  audio_raw = speech_output["audio"]
283
  sampling_rate = int(speech_output["sampling_rate"])
284
 
285
-
286
  if isinstance(audio_raw, torch.Tensor):
287
  audio_np = audio_raw.detach().cpu().numpy()
288
  else:
@@ -292,15 +418,24 @@ def text_to_speech_file(text: str) -> str:
292
  audio_np = audio_np.reshape(-1)
293
  audio_np = audio_np.astype(np.float32, copy=False)
294
 
 
 
 
 
 
 
 
 
 
 
 
295
 
296
  audio_clipped = np.clip(audio_np, -1.0, 1.0)
297
  audio_int16 = (audio_clipped * 32767.0).astype(np.int16)
298
 
299
-
300
  fd, path = tempfile.mkstemp(suffix=".wav")
301
  os.close(fd)
302
 
303
-
304
  sf.write(path, audio_int16, sampling_rate, format='WAV', subtype='PCM_16')
305
  return path
306
 
 
133
  out_path = out_file.name
134
 
135
  ffmpeg_exe = imageio_ffmpeg.get_ffmpeg_exe()
136
+
137
  subprocess.run([
138
  ffmpeg_exe, '-y', '-i', in_path,
139
+ '-ac', '1',
140
+ '-ar', str(target_sr),
141
+ '-af',
142
+ 'highpass=f=80,' +
143
+ 'lowpass=f=8000,' +
144
+ 'dynaudnorm=p=0.95:m=10.0,' +
145
+ 'volume=1.0,' +
146
+ 'aresample=resampler=soxr',
147
+ out_path
148
  ], check=True, capture_output=True)
149
 
150
  with open(out_path, 'rb') as f:
 
154
  os.unlink(out_path)
155
 
156
  audio_array, sr = sf.read(io.BytesIO(wav_data))
157
+
158
  if len(audio_array.shape) > 1:
159
  audio_array = np.mean(audio_array, axis=1)
160
+
161
+ if sr != target_sr:
162
+ logger.warning(f"Audio sampling rate {sr} != target {target_sr}, applying additional resampling...")
163
+ try:
164
+ from scipy import signal
165
+ ratio = target_sr / sr
166
+ audio_array = signal.resample(audio_array, int(len(audio_array) * ratio))
167
+ logger.info(f"Successfully resampled using scipy to {target_sr}Hz")
168
+ except ImportError:
169
+ logger.warning("scipy not available, using numpy interpolation")
170
+ ratio = target_sr / sr
171
+ new_length = int(len(audio_array) * ratio)
172
+ audio_array = np.interp(
173
+ np.linspace(0, len(audio_array), new_length),
174
+ np.arange(len(audio_array)),
175
+ audio_array
176
+ )
177
+
178
+ audio_array = _validate_and_normalize_audio(audio_array)
179
+
180
+ logger.info(f"Audio preprocessing complete: {len(audio_array)} samples at {target_sr}Hz")
181
  return audio_array.astype(np.float32)
182
+
183
  except Exception as e:
184
  logger.error(f"FFmpeg preprocessing failed: {e}")
185
  raise HTTPException(status_code=400, detail="Audio preprocessing failed. Ensure ffmpeg is installed.")
186
 
187
def _validate_and_normalize_audio(audio_array: np.ndarray) -> np.ndarray:
    """Validate audio quality and normalize its level for ASR.

    Warns (but never rejects) on near-silent or possibly clipped input,
    scales the signal toward a target RMS with the gain capped at 2x so
    quiet recordings are not boosted into pure noise, hard-limits the
    samples to [-0.99, 0.99], and removes any DC offset.

    Args:
        audio_array: 1-D float audio samples, presumably in [-1, 1]
            (produced upstream by the ffmpeg preprocessing step).

    Returns:
        The validated / normalized audio as a numpy array.
    """
    # Guard the empty case: np.max on an empty array raises ValueError
    # and np.mean yields NaN, so bail out early.
    if audio_array.size == 0:
        logger.warning("Empty audio array; nothing to normalize")
        return audio_array

    # Compute RMS once and reuse it (the original computed it twice).
    rms = float(np.sqrt(np.mean(audio_array ** 2)))

    # Heuristic quality checks: log-only, input is never rejected here.
    if rms < 0.001:
        logger.warning("Audio appears to be very quiet or silent")

    max_val = float(np.max(np.abs(audio_array)))
    if max_val > 0.95:
        logger.warning(f"Audio may be clipped (max: {max_val:.3f})")

    # Normalize toward the target RMS; cap the gain at 2x.
    target_rms = 0.1
    if rms > 0:
        normalization_factor = min(target_rms / rms, 2.0)
        audio_array = audio_array * normalization_factor
        logger.info(f"Normalized audio RMS from {rms:.4f} to {np.sqrt(np.mean(audio_array**2)):.4f}")

    # Hard-limit, then remove any DC offset.
    audio_array = np.clip(audio_array, -0.99, 0.99)
    audio_array = audio_array - np.mean(audio_array)

    return audio_array
210
+
211
def chunk_audio(audio_array: np.ndarray, chunk_length: float = 10.0, overlap: float = 1.0, sample_rate: int = 16000) -> list:
    """Split audio into fixed-length, overlapping chunks for ASR.

    Each chunk gets a short (50 ms) linear fade-in/out to soften chunk
    boundaries, and the final chunk is zero-padded to the full size so
    every returned array has exactly ``chunk_length * sample_rate``
    samples.

    Args:
        audio_array: 1-D float audio samples.
        chunk_length: Chunk duration in seconds.
        overlap: Overlap between consecutive chunks in seconds.
        sample_rate: Sample rate of ``audio_array`` in Hz.

    Returns:
        List of numpy arrays (copies — the input is left untouched).
    """
    chunk_samples = int(chunk_length * sample_rate)
    overlap_samples = int(overlap * sample_rate)
    step_samples = chunk_samples - overlap_samples

    # Hoist loop invariants: the fade length and ramps never change.
    fade_samples = int(0.05 * sample_rate)
    fade_in = np.linspace(0, 1, fade_samples)
    fade_out = np.linspace(1, 0, fade_samples)

    chunks = []
    start = 0

    while start < len(audio_array):
        end = min(start + chunk_samples, len(audio_array))
        # BUG FIX: the slice is a view into audio_array, so the in-place
        # fades below mutated the caller's buffer and double-faded every
        # overlapped region. Copy before mutating.
        chunk = audio_array[start:end].copy()

        if len(chunk) > 2 * fade_samples:
            chunk[:fade_samples] *= fade_in
            chunk[-fade_samples:] *= fade_out

        if len(chunk) < chunk_samples:
            chunk = np.pad(chunk, (0, chunk_samples - len(chunk)), mode='constant')

        chunk_rms = np.sqrt(np.mean(chunk**2))
        if chunk_rms < 0.001:
            logger.warning(f"Chunk {len(chunks)+1} appears to be very quiet (RMS: {chunk_rms:.6f})")

        chunks.append(chunk)
        start += step_samples

        if end >= len(audio_array):
            break

    logger.info(f"Split audio into {len(chunks)} chunks of {chunk_length}s each with quality preservation")
    return chunks
243
+
244
def speech_to_text(audio_data: bytes) -> str:
    """Transcribe raw audio bytes to text.

    Short clips (<= 15 s) are transcribed in a single pass; longer
    recordings are split into overlapping chunks and transcribed
    chunk by chunk.
    """
    samples = preprocess_audio_ffmpeg(audio_data)

    # Preprocessing resamples to 16 kHz, so duration = samples / 16000.
    duration_s = len(samples) / 16000
    logger.info(f"Audio duration: {duration_s:.2f} seconds")

    handler = _process_single_chunk if duration_s <= 15 else _process_chunked_audio
    return handler(samples)
254
+
255
+ def _process_single_chunk(audio_array: np.ndarray) -> str:
256
  candidates = []
257
  for code in ["yo", "ha", "ig", "en"]:
258
  model, proc = _get_asr(code)
 
271
  return max((t for _, t in candidates), key=lambda s: len(s or ""))
272
  return ""
273
 
274
def _process_chunked_audio(audio_array: np.ndarray) -> str:
    """Transcribe long audio by chunking it and trying every ASR model.

    Runs each language-specific model over all chunks, joins the
    per-chunk texts into one candidate transcript per language, then
    scores each candidate by word count (doubled when keyword language
    detection agrees with the model's language) and returns the
    highest-scoring transcript.
    """
    segments = chunk_audio(audio_array, chunk_length=10.0, overlap=1.0)

    # Candidate transcript per language code.
    transcripts = {}

    for lang in ["yo", "ha", "ig", "en"]:
        model, proc = _get_asr(lang)
        if model is None or proc is None:
            continue

        pieces = []
        for idx, segment in enumerate(segments):
            try:
                raw = _run_whisper(model, proc, segment)
                if raw and raw.strip():
                    pieces.append(raw.strip())
                    logger.info(f"Chunk {idx+1}/{len(segments)} ({lang}): {raw[:50]}...")
            except Exception as e:
                logger.warning(f"Failed to process chunk {idx+1} with {lang}: {e}")
                continue

        if pieces:
            joined = " ".join(pieces)
            transcripts[lang] = joined
            logger.info(f"Combined {lang} result: {joined[:100]}...")

    # Pick the best-scoring transcript; agreement between the model's
    # language and keyword detection doubles the score.
    winner = ""
    best_score = 0

    for lang, candidate in transcripts.items():
        guessed = detect_language(candidate)
        score = len(candidate.split())

        logger.info(f"Language {lang}: detected as {guessed}, confidence: {score}")

        if guessed == lang:
            score *= 2

        if score > best_score:
            best_score = score
            winner = candidate

    return winner if winner else ""
317
+
318
 
319
  def get_ai_response(text: str) -> str:
320
  try:
 
326
  logger.error(f"AI request error: {e}")
327
  return f"I'm sorry, I couldn't connect to the AI service. You said: '{text}'."
328
 
 
329
  HAUSA_WORDS = [
 
330
  "aikin", "manoma", "gona", "amfanin", "yanayi", "tsaba", "fasaha", "bisa", "noman", "shuka",
331
  "daji", "rani", "damina", "amfani", "bidi'a", "noma", "bashi", "manure", "tsiro", "gishiri",
 
332
  "da", "shi", "ta", "su", "mu", "ku", "ni", "kai", "ita", "shi", "ita", "su", "mu", "ku",
333
  "ina", "yana", "tana", "suna", "muna", "kuna", "na", "ka", "ta", "sa", "mu", "ku",
334
  "wani", "wata", "wasu", "wadansu", "wadannan", "wannan", "wancan", "wannan",
 
337
  ]
338
 
339
  YORUBA_WORDS = [
 
340
  "ilé", "ọmọ", "òun", "awọn", "agbẹ", "oko", "ọgbà", "irugbin", "àkọsílẹ", "omi", "ojo", "àgbàlá", "irọlẹ",
 
341
  "ni", "ti", "si", "fun", "lati", "ninu", "lori", "labe", "pelu", "ati", "tabi", "sugbon",
342
  "o", "a", "e", "won", "mi", "re", "wa", "yin", "won", "mi", "re", "wa", "yin",
343
  "kan", "kankan", "die", "pupo", "gbogbo", "kookan", "kookan", "gbogbo",
 
346
  ]
347
 
348
  IGBO_WORDS = [
 
349
  "ugbo", "akụkọ", "mmiri", "ala", "ọrụ", "ncheta", "ọhụrụ", "ugwu", "nri", "ahụhụ",
 
350
  "na", "n'", "maka", "n'ihi", "n'ime", "n'elu", "n'okpuru", "na", "na", "na",
351
  "m", "i", "o", "ya", "anyị", "unu", "ha", "m", "i", "o", "ya", "anyị", "unu", "ha",
352
  "otu", "ọtụtụ", "ọtụtụ", "ọtụtụ", "ọtụtụ", "ọtụtụ", "ọtụtụ", "ọtụtụ",
 
355
  ]
356
 
357
  def detect_language_keywords(text: str) -> str:
 
 
 
 
358
  text_lower = text.lower().strip()
359
 
360
  if not text_lower:
361
+ return "en"
362
 
 
363
  hausa_count = sum(1 for word in HAUSA_WORDS if word in text_lower)
364
  yoruba_count = sum(1 for word in YORUBA_WORDS if word in text_lower)
365
  igbo_count = sum(1 for word in IGBO_WORDS if word in text_lower)
366
 
367
  logger.info(f"Language detection scores - Hausa: {hausa_count}, Yoruba: {yoruba_count}, Igbo: {igbo_count}")
368
 
 
369
  if hausa_count > yoruba_count and hausa_count > igbo_count:
370
  logger.info("Keyword detection: Hausa")
371
  return "ha"
 
380
  return "en"
381
 
382
def detect_language(text: str) -> str:
    """Detect the language of *text* using lightweight keyword matching."""
    preview = text[:50]
    logger.info(f"Detecting language for text: '{preview}...'")
    return detect_language_keywords(text)
385
 
 
409
  audio_raw = speech_output["audio"]
410
  sampling_rate = int(speech_output["sampling_rate"])
411
 
 
412
  if isinstance(audio_raw, torch.Tensor):
413
  audio_np = audio_raw.detach().cpu().numpy()
414
  else:
 
418
  audio_np = audio_np.reshape(-1)
419
  audio_np = audio_np.astype(np.float32, copy=False)
420
 
421
+ target_sr = 16000
422
+ if sampling_rate != target_sr:
423
+ logger.info(f"Resampling TTS audio from {sampling_rate}Hz to {target_sr}Hz")
424
+ ratio = target_sr / sampling_rate
425
+ new_length = int(len(audio_np) * ratio)
426
+ audio_np = np.interp(
427
+ np.linspace(0, len(audio_np), new_length),
428
+ np.arange(len(audio_np)),
429
+ audio_np
430
+ )
431
+ sampling_rate = target_sr
432
 
433
  audio_clipped = np.clip(audio_np, -1.0, 1.0)
434
  audio_int16 = (audio_clipped * 32767.0).astype(np.int16)
435
 
 
436
  fd, path = tempfile.mkstemp(suffix=".wav")
437
  os.close(fd)
438
 
 
439
  sf.write(path, audio_int16, sampling_rate, format='WAV', subtype='PCM_16')
440
  return path
441