"""
speech_io.py

Speech-based input/output:
- Speech-to-text (STT) with Whisper (transformers.pipeline)
- Text-to-speech (TTS) with MMS-TTS German

Intended for use on HuggingFace Spaces.
"""

from typing import Optional, Tuple

import numpy as np
import soundfile as sf
from scipy.signal import butter, filtfilt
from transformers import pipeline

# Model identifiers
ASR_MODEL_ID = "openai/whisper-small"
TTS_MODEL_ID = "facebook/mms-tts-deu"

# Longest clip (in seconds) handed to the ASR model; longer input is truncated.
MAX_ASR_SECONDS = 30

# Lazily created pipeline singletons (built on first use, then reused).
_asr = None
_tts = None


# ========================================================
# STT PIPELINE
# ========================================================

def get_asr_pipeline():
    """Return the shared speech-recognition pipeline, creating it on first call."""
    global _asr
    if _asr is None:
        print(f">>> Lade ASR Modell: {ASR_MODEL_ID}")
        _asr = pipeline(
            task="automatic-speech-recognition",
            model=ASR_MODEL_ID,
            device="cpu",
            return_timestamps=True,  # important
            chunk_length_s=30,  # automatic chunking for long audio
        )
    return _asr


# ========================================================
# TTS PIPELINE
# ========================================================

def get_tts_pipeline():
    """Return the shared text-to-speech pipeline, creating it on first call."""
    global _tts
    if _tts is None:
        print(f">>> Lade TTS Modell: {TTS_MODEL_ID}")
        _tts = pipeline(
            task="text-to-speech",
            model=TTS_MODEL_ID,
        )
    return _tts


# ========================================================
# AUDIO FILTER - noise reduction + high-pass
# ========================================================

def butter_highpass_filter(data, cutoff=60, fs=16000, order=4):
    """Apply a zero-phase Butterworth high-pass filter to *data*.

    Args:
        data: Signal samples to filter.
        cutoff: Cutoff frequency in Hz.
        fs: Sampling rate of *data* in Hz.
        order: Order of the Butterworth filter.

    Returns:
        The filtered signal as returned by ``scipy.signal.filtfilt``.

    Raises:
        ValueError: Propagated from SciPy, e.g. when the signal is too short
            for the filter's edge padding or the normalised cutoff is invalid.
    """
    nyq = 0.5 * fs
    norm_cutoff = cutoff / nyq
    b, a = butter(order, norm_cutoff, btype="high")
    return filtfilt(b, a, data)


def apply_fade(audio, sr, duration_ms=10):
    """Apply a linear fade-in and fade-out to avoid clicks at the clip edges.

    The input is never modified; the fade is applied to a copy.

    Args:
        audio: Audio samples.
        sr: Sampling rate in Hz.
        duration_ms: Length of each fade in milliseconds.

    Returns:
        A faded copy of *audio*. If the fade length is zero, or the clip is
        too short to hold both fades, *audio* itself is returned unchanged.
    """
    fade_samples = int(sr * duration_ms / 1000)
    # A zero-length fade would make ``audio[-0:]`` select the whole clip and
    # fail to broadcast against an empty ramp, so treat it as "nothing to do".
    if fade_samples <= 0 or fade_samples * 2 >= len(audio):
        return audio

    samples = np.asarray(audio)
    if np.issubdtype(samples.dtype, np.floating):
        faded = samples.copy()
    else:
        # Integer samples cannot be scaled in place by a fractional ramp.
        faded = samples.astype(np.float64)

    faded[:fade_samples] *= np.linspace(0, 1, fade_samples)
    faded[-fade_samples:] *= np.linspace(1, 0, fade_samples)
    return faded


# ========================================================
# SPEECH-TO-TEXT (STT)
# ========================================================

def transcribe_audio(audio_path: Optional[str]) -> str:
    """Transcribe an audio file to text.

    Args:
        audio_path: Path to a WAV file (as produced by ``gr.Audio`` with
            ``type="filepath"``). ``None`` or an empty path yields ``""``.

    Returns:
        The recognised text with surrounding whitespace stripped, or ``""``
        when there is no audio to transcribe.
    """
    if not audio_path:
        return ""

    # Read via soundfile; float32 avoids carrying a float64 copy around.
    data, sr = sf.read(audio_path, dtype="float32")

    # Always mono: keep the first channel of multi-channel recordings.
    if data.ndim > 1:
        data = data[:, 0]

    # Keep the clip within the length limit for Whisper.
    max_samples = int(sr * MAX_ASR_SECONDS)
    if len(data) > max_samples:
        data = data[:max_samples]

    # Nothing to recognise in an empty recording.
    if len(data) == 0:
        return ""

    asr = get_asr_pipeline()

    print(">>> Transkribiere Audio...")
    result = asr(
        {"array": data, "sampling_rate": sr},
    )

    text = (result.get("text") or "").strip()
    print("ASR:", text)
    return text


# ========================================================
# TEXT-TO-SPEECH (TTS)
# ========================================================

def synthesize_speech(text: str) -> Optional[Tuple[int, np.ndarray]]:
    """Synthesise speech for *text*.

    The raw model output is reduced to mono, high-pass filtered (best effort),
    peak-normalised, faded at both ends and converted to 16-bit PCM.

    Args:
        text: Text to speak.

    Returns:
        ``(sampling_rate, samples)`` where *samples* is a 1-D ``np.int16``
        array, or ``None`` when *text* is blank or the model yields no audio.
    """
    if not text or not text.strip():
        return None

    tts = get_tts_pipeline()
    out = tts(text)

    # Raw audio from MMS (float32 in [-1, 1]).
    audio = np.array(out["audio"], dtype=np.float32)
    sr = out.get("sampling_rate", 16000)

    # Fall back to 16 kHz when the reported sampling rate is unusable.
    if sr is None or sr <= 0 or sr > 65535:
        sr = 16000
    sr = int(sr)

    # Force mono.
    if audio.ndim > 1:
        audio = audio.squeeze()
    if audio.ndim > 1:
        audio = audio[:, 0]
    # squeeze() may collapse a single sample to 0-d; keep the array 1-D.
    audio = np.atleast_1d(audio)

    if audio.size == 0:
        return None

    # Noise reduction is best effort: very short clips cannot be filtered.
    try:
        audio = butter_highpass_filter(audio, cutoff=60, fs=sr)
    except (ValueError, np.linalg.LinAlgError) as exc:
        print(f">>> Hochpassfilter übersprungen: {exc}")

    # Normalise to full scale.
    max_val = np.max(np.abs(audio))
    if max_val > 0:
        audio = audio / max_val

    # Fade against pops at the start and end.
    audio = apply_fade(audio, sr)

    # Convert to 16-bit PCM.
    audio_int16 = np.clip(audio * 32767, -32768, 32767).astype(np.int16)

    # Return value: (sr, np.int16 array)
    return (sr, audio_int16)