nexusbert committed on
Commit
79cc3f4
·
1 Parent(s): c162be1
Files changed (1) hide show
  1. app.py +156 -21
app.py CHANGED
@@ -133,9 +133,18 @@ def preprocess_audio_ffmpeg(audio_data: bytes, target_sr: int = 16000) -> np.nda
133
  out_path = out_file.name
134
 
135
  ffmpeg_exe = imageio_ffmpeg.get_ffmpeg_exe()
 
136
  subprocess.run([
137
  ffmpeg_exe, '-y', '-i', in_path,
138
- '-ac', '1', '-ar', str(target_sr), out_path
 
 
 
 
 
 
 
 
139
  ], check=True, capture_output=True)
140
 
141
  with open(out_path, 'rb') as f:
@@ -145,15 +154,105 @@ def preprocess_audio_ffmpeg(audio_data: bytes, target_sr: int = 16000) -> np.nda
145
  os.unlink(out_path)
146
 
147
  audio_array, sr = sf.read(io.BytesIO(wav_data))
 
148
  if len(audio_array.shape) > 1:
149
  audio_array = np.mean(audio_array, axis=1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  return audio_array.astype(np.float32)
 
151
  except Exception as e:
152
  logger.error(f"FFmpeg preprocessing failed: {e}")
153
  raise HTTPException(status_code=400, detail="Audio preprocessing failed. Ensure ffmpeg is installed.")
154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  def speech_to_text(audio_data: bytes) -> str:
156
  audio_array = preprocess_audio_ffmpeg(audio_data)
 
 
 
 
 
 
 
 
 
 
157
  candidates = []
158
  for code in ["yo", "ha", "ig", "en"]:
159
  model, proc = _get_asr(code)
@@ -172,6 +271,50 @@ def speech_to_text(audio_data: bytes) -> str:
172
  return max((t for _, t in candidates), key=lambda s: len(s or ""))
173
  return ""
174
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
 
176
  def get_ai_response(text: str) -> str:
177
  try:
@@ -183,12 +326,9 @@ def get_ai_response(text: str) -> str:
183
  logger.error(f"AI request error: {e}")
184
  return f"I'm sorry, I couldn't connect to the AI service. You said: '{text}'."
185
 
186
- # Enhanced keyword lists for language detection
187
  HAUSA_WORDS = [
188
- # Agricultural terms
189
  "aikin", "manoma", "gona", "amfanin", "yanayi", "tsaba", "fasaha", "bisa", "noman", "shuka",
190
  "daji", "rani", "damina", "amfani", "bidi'a", "noma", "bashi", "manure", "tsiro", "gishiri",
191
- # Common Hausa words
192
  "da", "shi", "ta", "su", "mu", "ku", "ni", "kai", "ita", "shi", "ita", "su", "mu", "ku",
193
  "ina", "yana", "tana", "suna", "muna", "kuna", "na", "ka", "ta", "sa", "mu", "ku",
194
  "wani", "wata", "wasu", "wadansu", "wadannan", "wannan", "wancan", "wannan",
@@ -197,9 +337,7 @@ HAUSA_WORDS = [
197
  ]
198
 
199
  YORUBA_WORDS = [
200
- # Agricultural terms
201
  "ilé", "ọmọ", "òun", "awọn", "agbẹ", "oko", "ọgbà", "irugbin", "àkọsílẹ", "omi", "ojo", "àgbàlá", "irọlẹ",
202
- # Common Yoruba words
203
  "ni", "ti", "si", "fun", "lati", "ninu", "lori", "labe", "pelu", "ati", "tabi", "sugbon",
204
  "o", "a", "e", "won", "mi", "re", "wa", "yin", "won", "mi", "re", "wa", "yin",
205
  "kan", "kankan", "die", "pupo", "gbogbo", "kookan", "kookan", "gbogbo",
@@ -208,9 +346,7 @@ YORUBA_WORDS = [
208
  ]
209
 
210
  IGBO_WORDS = [
211
- # Agricultural terms
212
  "ugbo", "akụkọ", "mmiri", "ala", "ọrụ", "ncheta", "ọhụrụ", "ugwu", "nri", "ahụhụ",
213
- # Common Igbo words
214
  "na", "n'", "maka", "n'ihi", "n'ime", "n'elu", "n'okpuru", "na", "na", "na",
215
  "m", "i", "o", "ya", "anyị", "unu", "ha", "m", "i", "o", "ya", "anyị", "unu", "ha",
216
  "otu", "ọtụtụ", "ọtụtụ", "ọtụtụ", "ọtụtụ", "ọtụtụ", "ọtụtụ", "ọtụtụ",
@@ -219,23 +355,17 @@ IGBO_WORDS = [
219
  ]
220
 
221
  def detect_language_keywords(text: str) -> str:
222
- """
223
- Lightweight keyword-based language detection.
224
- Returns language code: 'ha' (Hausa), 'yo' (Yoruba), 'ig' (Igbo), 'en' (English)
225
- """
226
  text_lower = text.lower().strip()
227
 
228
  if not text_lower:
229
- return "en" # Default to English for empty text
230
 
231
- # Count matches for each language
232
  hausa_count = sum(1 for word in HAUSA_WORDS if word in text_lower)
233
  yoruba_count = sum(1 for word in YORUBA_WORDS if word in text_lower)
234
  igbo_count = sum(1 for word in IGBO_WORDS if word in text_lower)
235
 
236
  logger.info(f"Language detection scores - Hausa: {hausa_count}, Yoruba: {yoruba_count}, Igbo: {igbo_count}")
237
 
238
- # Return language with highest count, default to English if no matches
239
  if hausa_count > yoruba_count and hausa_count > igbo_count:
240
  logger.info("Keyword detection: Hausa")
241
  return "ha"
@@ -250,9 +380,6 @@ def detect_language_keywords(text: str) -> str:
250
  return "en"
251
 
252
  def detect_language(text: str) -> str:
253
- """
254
- Main language detection function using lightweight keyword-based approach.
255
- """
256
  logger.info(f"Detecting language for text: '{text[:50]}...'")
257
  return detect_language_keywords(text)
258
 
@@ -282,7 +409,6 @@ def text_to_speech_file(text: str) -> str:
282
  audio_raw = speech_output["audio"]
283
  sampling_rate = int(speech_output["sampling_rate"])
284
 
285
-
286
  if isinstance(audio_raw, torch.Tensor):
287
  audio_np = audio_raw.detach().cpu().numpy()
288
  else:
@@ -292,15 +418,24 @@ def text_to_speech_file(text: str) -> str:
292
  audio_np = audio_np.reshape(-1)
293
  audio_np = audio_np.astype(np.float32, copy=False)
294
 
 
 
 
 
 
 
 
 
 
 
 
295
 
296
  audio_clipped = np.clip(audio_np, -1.0, 1.0)
297
  audio_int16 = (audio_clipped * 32767.0).astype(np.int16)
298
 
299
-
300
  fd, path = tempfile.mkstemp(suffix=".wav")
301
  os.close(fd)
302
 
303
-
304
  sf.write(path, audio_int16, sampling_rate, format='WAV', subtype='PCM_16')
305
  return path
306
 
 
133
  out_path = out_file.name
134
 
135
  ffmpeg_exe = imageio_ffmpeg.get_ffmpeg_exe()
136
+
137
  subprocess.run([
138
  ffmpeg_exe, '-y', '-i', in_path,
139
+ '-ac', '1',
140
+ '-ar', str(target_sr),
141
+ '-af',
142
+ 'highpass=f=80,' +
143
+ 'lowpass=f=8000,' +
144
+ 'dynaudnorm=p=0.95:m=10.0,' +
145
+ 'volume=1.0,' +
146
+ 'aresample=resampler=soxr',
147
+ out_path
148
  ], check=True, capture_output=True)
149
 
150
  with open(out_path, 'rb') as f:
 
154
  os.unlink(out_path)
155
 
156
  audio_array, sr = sf.read(io.BytesIO(wav_data))
157
+
158
  if len(audio_array.shape) > 1:
159
  audio_array = np.mean(audio_array, axis=1)
160
+
161
+ if sr != target_sr:
162
+ logger.warning(f"Audio sampling rate {sr} != target {target_sr}, applying additional resampling...")
163
+ try:
164
+ from scipy import signal
165
+ ratio = target_sr / sr
166
+ audio_array = signal.resample(audio_array, int(len(audio_array) * ratio))
167
+ logger.info(f"Successfully resampled using scipy to {target_sr}Hz")
168
+ except ImportError:
169
+ logger.warning("scipy not available, using numpy interpolation")
170
+ ratio = target_sr / sr
171
+ new_length = int(len(audio_array) * ratio)
172
+ audio_array = np.interp(
173
+ np.linspace(0, len(audio_array), new_length),
174
+ np.arange(len(audio_array)),
175
+ audio_array
176
+ )
177
+
178
+ audio_array = _validate_and_normalize_audio(audio_array)
179
+
180
+ logger.info(f"Audio preprocessing complete: {len(audio_array)} samples at {target_sr}Hz")
181
  return audio_array.astype(np.float32)
182
+
183
  except Exception as e:
184
  logger.error(f"FFmpeg preprocessing failed: {e}")
185
  raise HTTPException(status_code=400, detail="Audio preprocessing failed. Ensure ffmpeg is installed.")
186
 
187
def _validate_and_normalize_audio(audio_array: np.ndarray) -> np.ndarray:
    """Validate audio quality and normalize its level for ASR.

    Warns (but never rejects) on near-silent or possibly clipped input,
    scales the signal toward a target RMS with the gain capped at 2x so
    quiet recordings are not boosted into pure noise, hard-limits the
    samples to [-0.99, 0.99], and removes any DC offset.

    Args:
        audio_array: 1-D float audio samples, presumably in [-1, 1]
            (produced upstream by the ffmpeg preprocessing step).

    Returns:
        The validated / normalized audio as a numpy array.
    """
    # Guard the empty case: np.max on an empty array raises ValueError
    # and np.mean yields NaN, so bail out early.
    if audio_array.size == 0:
        logger.warning("Empty audio array; nothing to normalize")
        return audio_array

    # Compute RMS once and reuse it (the original computed it twice).
    rms = float(np.sqrt(np.mean(audio_array ** 2)))

    # Heuristic quality checks: log-only, input is never rejected here.
    if rms < 0.001:
        logger.warning("Audio appears to be very quiet or silent")

    max_val = float(np.max(np.abs(audio_array)))
    if max_val > 0.95:
        logger.warning(f"Audio may be clipped (max: {max_val:.3f})")

    # Normalize toward the target RMS; cap the gain at 2x.
    target_rms = 0.1
    if rms > 0:
        normalization_factor = min(target_rms / rms, 2.0)
        audio_array = audio_array * normalization_factor
        logger.info(f"Normalized audio RMS from {rms:.4f} to {np.sqrt(np.mean(audio_array**2)):.4f}")

    # Hard-limit, then remove any DC offset.
    audio_array = np.clip(audio_array, -0.99, 0.99)
    audio_array = audio_array - np.mean(audio_array)

    return audio_array
210
+
211
def chunk_audio(audio_array: np.ndarray, chunk_length: float = 10.0, overlap: float = 1.0, sample_rate: int = 16000) -> list:
    """Split audio into fixed-length, overlapping chunks for ASR.

    Each chunk gets a short (50 ms) linear fade-in/out to soften chunk
    boundaries, and the final chunk is zero-padded to the full size so
    every returned array has exactly ``chunk_length * sample_rate``
    samples.

    Args:
        audio_array: 1-D float audio samples.
        chunk_length: Chunk duration in seconds.
        overlap: Overlap between consecutive chunks in seconds.
        sample_rate: Sample rate of ``audio_array`` in Hz.

    Returns:
        List of numpy arrays (copies — the input is left untouched).
    """
    chunk_samples = int(chunk_length * sample_rate)
    overlap_samples = int(overlap * sample_rate)
    step_samples = chunk_samples - overlap_samples

    # Hoist loop invariants: the fade length and ramps never change.
    fade_samples = int(0.05 * sample_rate)
    fade_in = np.linspace(0, 1, fade_samples)
    fade_out = np.linspace(1, 0, fade_samples)

    chunks = []
    start = 0

    while start < len(audio_array):
        end = min(start + chunk_samples, len(audio_array))
        # BUG FIX: the slice is a view into audio_array, so the in-place
        # fades below mutated the caller's buffer and double-faded every
        # overlapped region. Copy before mutating.
        chunk = audio_array[start:end].copy()

        if len(chunk) > 2 * fade_samples:
            chunk[:fade_samples] *= fade_in
            chunk[-fade_samples:] *= fade_out

        if len(chunk) < chunk_samples:
            chunk = np.pad(chunk, (0, chunk_samples - len(chunk)), mode='constant')

        chunk_rms = np.sqrt(np.mean(chunk**2))
        if chunk_rms < 0.001:
            logger.warning(f"Chunk {len(chunks)+1} appears to be very quiet (RMS: {chunk_rms:.6f})")

        chunks.append(chunk)
        start += step_samples

        if end >= len(audio_array):
            break

    logger.info(f"Split audio into {len(chunks)} chunks of {chunk_length}s each with quality preservation")
    return chunks
243
+
244
def speech_to_text(audio_data: bytes) -> str:
    """Transcribe raw audio bytes to text.

    Short clips (<= 15 s) are transcribed in a single pass; longer
    recordings are split into overlapping chunks and transcribed
    chunk by chunk.
    """
    samples = preprocess_audio_ffmpeg(audio_data)

    # Preprocessing resamples to 16 kHz, so duration = samples / 16000.
    duration_s = len(samples) / 16000
    logger.info(f"Audio duration: {duration_s:.2f} seconds")

    handler = _process_single_chunk if duration_s <= 15 else _process_chunked_audio
    return handler(samples)
254
+
255
+ def _process_single_chunk(audio_array: np.ndarray) -> str:
256
  candidates = []
257
  for code in ["yo", "ha", "ig", "en"]:
258
  model, proc = _get_asr(code)
 
271
  return max((t for _, t in candidates), key=lambda s: len(s or ""))
272
  return ""
273
 
274
def _process_chunked_audio(audio_array: np.ndarray) -> str:
    """Transcribe long audio by chunking it and trying every ASR model.

    Runs each language-specific model over all chunks, joins the
    per-chunk texts into one candidate transcript per language, then
    scores each candidate by word count (doubled when keyword language
    detection agrees with the model's language) and returns the
    highest-scoring transcript.
    """
    segments = chunk_audio(audio_array, chunk_length=10.0, overlap=1.0)

    # Candidate transcript per language code.
    transcripts = {}

    for lang in ["yo", "ha", "ig", "en"]:
        model, proc = _get_asr(lang)
        if model is None or proc is None:
            continue

        pieces = []
        for idx, segment in enumerate(segments):
            try:
                raw = _run_whisper(model, proc, segment)
                if raw and raw.strip():
                    pieces.append(raw.strip())
                    logger.info(f"Chunk {idx+1}/{len(segments)} ({lang}): {raw[:50]}...")
            except Exception as e:
                logger.warning(f"Failed to process chunk {idx+1} with {lang}: {e}")
                continue

        if pieces:
            joined = " ".join(pieces)
            transcripts[lang] = joined
            logger.info(f"Combined {lang} result: {joined[:100]}...")

    # Pick the best-scoring transcript; agreement between the model's
    # language and keyword detection doubles the score.
    winner = ""
    best_score = 0

    for lang, candidate in transcripts.items():
        guessed = detect_language(candidate)
        score = len(candidate.split())

        logger.info(f"Language {lang}: detected as {guessed}, confidence: {score}")

        if guessed == lang:
            score *= 2

        if score > best_score:
            best_score = score
            winner = candidate

    return winner if winner else ""
317
+
318
 
319
  def get_ai_response(text: str) -> str:
320
  try:
 
326
  logger.error(f"AI request error: {e}")
327
  return f"I'm sorry, I couldn't connect to the AI service. You said: '{text}'."
328
 
 
329
  HAUSA_WORDS = [
 
330
  "aikin", "manoma", "gona", "amfanin", "yanayi", "tsaba", "fasaha", "bisa", "noman", "shuka",
331
  "daji", "rani", "damina", "amfani", "bidi'a", "noma", "bashi", "manure", "tsiro", "gishiri",
 
332
  "da", "shi", "ta", "su", "mu", "ku", "ni", "kai", "ita", "shi", "ita", "su", "mu", "ku",
333
  "ina", "yana", "tana", "suna", "muna", "kuna", "na", "ka", "ta", "sa", "mu", "ku",
334
  "wani", "wata", "wasu", "wadansu", "wadannan", "wannan", "wancan", "wannan",
 
337
  ]
338
 
339
  YORUBA_WORDS = [
 
340
  "ilé", "ọmọ", "òun", "awọn", "agbẹ", "oko", "ọgbà", "irugbin", "àkọsílẹ", "omi", "ojo", "àgbàlá", "irọlẹ",
 
341
  "ni", "ti", "si", "fun", "lati", "ninu", "lori", "labe", "pelu", "ati", "tabi", "sugbon",
342
  "o", "a", "e", "won", "mi", "re", "wa", "yin", "won", "mi", "re", "wa", "yin",
343
  "kan", "kankan", "die", "pupo", "gbogbo", "kookan", "kookan", "gbogbo",
 
346
  ]
347
 
348
  IGBO_WORDS = [
 
349
  "ugbo", "akụkọ", "mmiri", "ala", "ọrụ", "ncheta", "ọhụrụ", "ugwu", "nri", "ahụhụ",
 
350
  "na", "n'", "maka", "n'ihi", "n'ime", "n'elu", "n'okpuru", "na", "na", "na",
351
  "m", "i", "o", "ya", "anyị", "unu", "ha", "m", "i", "o", "ya", "anyị", "unu", "ha",
352
  "otu", "ọtụtụ", "ọtụtụ", "ọtụtụ", "ọtụtụ", "ọtụtụ", "ọtụtụ", "ọtụtụ",
 
355
  ]
356
 
357
  def detect_language_keywords(text: str) -> str:
 
 
 
 
358
  text_lower = text.lower().strip()
359
 
360
  if not text_lower:
361
+ return "en"
362
 
 
363
  hausa_count = sum(1 for word in HAUSA_WORDS if word in text_lower)
364
  yoruba_count = sum(1 for word in YORUBA_WORDS if word in text_lower)
365
  igbo_count = sum(1 for word in IGBO_WORDS if word in text_lower)
366
 
367
  logger.info(f"Language detection scores - Hausa: {hausa_count}, Yoruba: {yoruba_count}, Igbo: {igbo_count}")
368
 
 
369
  if hausa_count > yoruba_count and hausa_count > igbo_count:
370
  logger.info("Keyword detection: Hausa")
371
  return "ha"
 
380
  return "en"
381
 
382
def detect_language(text: str) -> str:
    """Detect the language of *text* using lightweight keyword matching."""
    preview = text[:50]
    logger.info(f"Detecting language for text: '{preview}...'")
    return detect_language_keywords(text)
385
 
 
409
  audio_raw = speech_output["audio"]
410
  sampling_rate = int(speech_output["sampling_rate"])
411
 
 
412
  if isinstance(audio_raw, torch.Tensor):
413
  audio_np = audio_raw.detach().cpu().numpy()
414
  else:
 
418
  audio_np = audio_np.reshape(-1)
419
  audio_np = audio_np.astype(np.float32, copy=False)
420
 
421
+ target_sr = 16000
422
+ if sampling_rate != target_sr:
423
+ logger.info(f"Resampling TTS audio from {sampling_rate}Hz to {target_sr}Hz")
424
+ ratio = target_sr / sampling_rate
425
+ new_length = int(len(audio_np) * ratio)
426
+ audio_np = np.interp(
427
+ np.linspace(0, len(audio_np), new_length),
428
+ np.arange(len(audio_np)),
429
+ audio_np
430
+ )
431
+ sampling_rate = target_sr
432
 
433
  audio_clipped = np.clip(audio_np, -1.0, 1.0)
434
  audio_int16 = (audio_clipped * 32767.0).astype(np.int16)
435
 
 
436
  fd, path = tempfile.mkstemp(suffix=".wav")
437
  os.close(fd)
438
 
 
439
  sf.write(path, audio_int16, sampling_rate, format='WAV', subtype='PCM_16')
440
  return path
441