commit

speech_io.py  CHANGED  (+58 −113)

@@ -1,157 +1,102 @@

Removed (old version):
"""
speech_io.py

Speech-based input/output:
- Speech-to-Text (STT) with Whisper (transformers.pipeline)
- Text-to-Speech (TTS) with MMS-TTS German

This file is 100% stable for HuggingFace Spaces.
"""

from typing import Optional, Tuple

import numpy as np
import soundfile as sf
from scipy.signal import butter, filtfilt  # needed by butter_highpass_filter below

from transformers import pipeline

#
TTS_MODEL_ID = "facebook/mms-tts-deu"

_asr = None
_tts = None

# ========================================================
# STT PIPELINE
# ========================================================
def get_asr_pipeline():
    global _asr
    if _asr is None:
        print(f">>> Loading ASR model: {ASR_MODEL_ID}")
        _asr = pipeline(
            task="automatic-speech-recognition",
            model=ASR_MODEL_ID,
            chunk_length_s=30,  # auto-chunk for long audio
        )
    return _asr
# ========================================================
# TTS PIPELINE
# ========================================================

def get_tts_pipeline():
    global _tts
    if _tts is None:
        print(f">>> Loading TTS model: {TTS_MODEL_ID}")
        _tts = pipeline(
            task="text-to-speech",
            model=TTS_MODEL_ID,
        )
    return _tts

# ========================================================
# AUDIO FILTERS – noise reduction + highpass
# ========================================================

def butter_highpass_filter(data, cutoff=60, fs=16000, order=4):
    # Butterworth highpass, applied forward and backward (zero phase shift)
    nyq = 0.5 * fs
    norm_cutoff = cutoff / nyq
    b, a = butter(order, norm_cutoff, btype="high")
    return filtfilt(b, a, data)

def apply_fade(audio, sr, duration_ms=10):
    # short linear fade-in/out against clicks at the clip edges
    fade_samples = int(sr * duration_ms / 1000)

    if fade_samples * 2 >= len(audio):
        return audio

    fade_in_curve = np.linspace(0, 1, fade_samples)
    audio[:fade_samples] *= fade_in_curve

    fade_out_curve = np.linspace(1, 0, fade_samples)
    audio[-fade_samples:] *= fade_out_curve

    return audio

# ========================================================
# SPEECH-TO-TEXT (STT)
# ========================================================
def transcribe_audio(audio_path: str) -> str:
    """
    audio_path: path to a WAV file (from gr.Audio type="filepath")
    """
    if audio_path is None:
        return ""

    data, sr = sf.read(audio_path)

    # Stereo → mono
    if len(data.shape) > 1:
        data = data.mean(axis=1)

    # Avoid feeding Whisper more than 30 s
    MAX_SAMPLES = sr * 30
    if len(data) > MAX_SAMPLES:
        data = data[:MAX_SAMPLES]

    asr = get_asr_pipeline()

    result = asr(
        {"array": data, "sampling_rate": sr}
    )
    text = result.get("text", "").strip()
    return text

# ========================================================
# TEXT-TO-SPEECH (TTS)
# ========================================================
def synthesize_speech(text: str):
    if not text:
        return None

    tts = get_tts_pipeline()
    out = tts(text)

    # raw audio from MMS (float32 in [-1, 1])
    audio = np.array(out["audio"], dtype=np.float32)
    sr = out.get("sampling_rate", 16000)

    sr = 16000  # MMS-TTS outputs 16 kHz

    # ===== Force mono =====
    if audio.ndim > 1:
        audio = audio.squeeze()
    if audio.ndim > 1:
        audio = audio[:, 0]

    # ===== Noise reduction =====
    try:
        audio = butter_highpass_filter(audio, cutoff=60, fs=sr)
    except Exception:
        pass

    # ===== Normalize =====
    max_val = np.max(np.abs(audio))
    if max_val > 0:
        audio = audio / max_val

    # ===== Fade against pops =====
    audio = apply_fade(audio, sr)

    # ===== int16 =====
    audio_int16 = np.clip(audio * 32767, -32768, 32767).astype(np.int16)

    return (sr, audio_int16)
Added (new version):
import numpy as np
import soundfile as sf
import librosa
from transformers import pipeline

ASR_MODEL_ID = "openai/whisper-small"   # multilingual
TTS_MODEL_ID = "facebook/mms-tts-deu"   # swap this if you want multilingual TTS

_asr = None
_tts = None

# ============================================
# LOAD AUDIO – normalize to 16 kHz mono
# ============================================
def load_audio_16k(path):
    audio, sr = sf.read(path)

    # Stereo → mono
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    # Resample → 16 kHz
    if sr != 16000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        sr = 16000

    return audio.astype(np.float32), sr
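
# Contract check (hypothetical file name): whatever the container's sample
# rate and channel count, the loader returns mono float32 at 16 kHz, which is
# the input format the Whisper feature extractor expects:
#
#   audio, sr = load_audio_16k("sample.wav")
#   assert sr == 16000 and audio.ndim == 1 and audio.dtype == np.float32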

# ============================================
# LOAD WHISPER PIPELINE (multilingual)
# ============================================
def get_asr_pipeline():
    global _asr
    if _asr is None:
        _asr = pipeline(
            task="automatic-speech-recognition",
            model=ASR_MODEL_ID,
            return_timestamps=False,
            chunk_length_s=30,
        )
    return _asr
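
# Note: chunk_length_s=30 enables the pipeline's chunked long-form mode, so
# clips longer than Whisper's 30 s window are split, transcribed with
# overlapping strides, and stitched back together automatically.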

# ============================================
# MULTILINGUAL STT
# ============================================
def transcribe_audio(audio_path: str) -> str:
    if audio_path is None:
        return ""

    audio, sr = load_audio_16k(audio_path)

    # Very short clips make Whisper emit junk characters
    if len(audio) < sr * 0.4:
        return ""

    asr = get_asr_pipeline()

    # No language is set, so Whisper auto-detects it
    result = asr(
        {"array": audio, "sampling_rate": sr},
        generate_kwargs={
            "task": "transcribe",   # transcribe only, keep the source language
            "temperature": 0.0,     # greedy decoding reduces hallucinations like "ვვვ..."
        },
    )

    text = result.get("text", "").strip()

    # Edge case: if Whisper returns only junk characters, drop the result
    if set(text) <= {"ვ", " "}:
        return ""

    return text

# ============================================
# TEXT → SPEECH (not multilingual yet)
# ============================================
def get_tts_pipeline():
    global _tts
    if _tts is None:
        _tts = pipeline(task="text-to-speech", model=TTS_MODEL_ID)
    return _tts


def synthesize_speech(text: str):
    if not text.strip():
        return None

    tts = get_tts_pipeline()
    out = tts(text)

    audio = np.array(out["audio"], dtype=np.float32)
    audio = np.squeeze(audio)  # flatten the (1, n) batch dim MMS outputs
    sr = out.get("sampling_rate", 16000)

    # Peak-normalize; the `or 1.0` guards against all-zero audio
    max_val = np.max(np.abs(audio)) or 1.0
    audio = audio / max_val

    return sr, (audio * 32767).astype(np.int16)
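
A minimal way to wire these helpers into a Spaces app, sketched under
assumptions: Gradio 4.x, a hypothetical app.py, and a plain echo flow (a real
app would put its chat/LLM logic between STT and TTS). The type="filepath"
input matches what transcribe_audio expects, and the (sr, int16) tuple from
synthesize_speech is a valid gr.Audio output value.

# app.py (hypothetical usage sketch, not part of this commit)
import gradio as gr
from speech_io import transcribe_audio, synthesize_speech

def voice_echo(audio_path):
    # STT, then straight back out through TTS
    text = transcribe_audio(audio_path)
    return text, synthesize_speech(text)

demo = gr.Interface(
    fn=voice_echo,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=[gr.Textbox(label="Transcript"), gr.Audio(label="Speech")],
)

demo.launch()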
|