"""
speech_io.py – local STT + TTS (transformers)
"""
| from typing import Optional, Tuple | |
| import numpy as np | |
| import soundfile as sf | |
| from scipy.signal import butter, filtfilt | |
| from transformers import pipeline | |
# Model identifiers handed to transformers.pipeline() by the loaders below.
ASR_MODEL_ID = "openai/whisper-small"
TTS_MODEL_ID = "facebook/mms-tts-deu"

# Module-level pipeline caches. Both start out empty and are filled by
# get_asr_pipeline() / get_tts_pipeline() the first time they are called.
_asr = _tts = None
def get_asr_pipeline():
    """Return the shared speech-recognition pipeline, building it on first use.

    The pipeline is created once for ``ASR_MODEL_ID`` and stored in the
    module-level ``_asr`` variable; every later call returns that same object.
    """
    global _asr
    if _asr is not None:
        return _asr

    print(f">>> Lade ASR Modell: {ASR_MODEL_ID}")
    asr_options = {
        "task": "automatic-speech-recognition",
        "model": ASR_MODEL_ID,
        "device": "cpu",
        "return_timestamps": True,
        "chunk_length_s": 30,
    }
    _asr = pipeline(**asr_options)
    return _asr
def get_tts_pipeline():
    """Return the shared text-to-speech pipeline, building it on first use.

    The pipeline is created once for ``TTS_MODEL_ID`` and stored in the
    module-level ``_tts`` variable; every later call returns that same object.
    """
    global _tts
    if _tts is not None:
        return _tts

    print(f">>> Lade TTS Modell: {TTS_MODEL_ID}")
    tts_options = {
        "task": "text-to-speech",
        "model": TTS_MODEL_ID,
    }
    _tts = pipeline(**tts_options)
    return _tts
def butter_highpass_filter(data, cutoff=60, fs=16000, order=4):
    """Run ``data`` through a Butterworth high-pass filter.

    The filter is applied with ``scipy.signal.filtfilt`` (forward and
    backward pass).

    Args:
        data: Samples to filter; handed unchanged to ``filtfilt``.
        cutoff: Cutoff frequency, in the same unit as ``fs``.
        fs: Sampling rate of ``data``.
        order: Order of the Butterworth design.

    Returns:
        The filtered signal as returned by ``filtfilt``.
    """
    # butter() takes the cutoff as a fraction of the Nyquist frequency.
    nyquist = fs * 0.5
    numerator, denominator = butter(N=order, Wn=cutoff / nyquist, btype="high")
    return filtfilt(numerator, denominator, data)
def apply_fade(audio, sr, duration_ms=10):
    """Return ``audio`` with a linear fade-in and fade-out applied.

    Args:
        audio: One-dimensional sequence of samples.
        sr: Sampling rate in Hz, used to convert ``duration_ms`` to samples.
        duration_ms: Length of each fade in milliseconds.

    Returns:
        A new array with both ends faded. The input itself is returned
        unchanged when the fade length is not positive or when the two fades
        together would cover the whole signal. The caller's array is never
        modified.
    """
    fade_samples = int(sr * duration_ms / 1000)
    # A zero-length fade must be skipped explicitly: ``audio[-0:]`` would
    # select the whole signal and fail to broadcast against an empty ramp.
    if fade_samples <= 0 or fade_samples * 2 >= len(audio):
        return audio

    samples = np.asarray(audio)
    # Keep float/complex dtypes as they are; integer input is promoted so the
    # fractional ramp can be applied at all.
    if np.issubdtype(samples.dtype, np.inexact):
        target_dtype = samples.dtype
    else:
        target_dtype = np.float64
    # Work on a copy so the caller's buffer is left untouched.
    faded = samples.astype(target_dtype, copy=True)

    faded[:fade_samples] *= np.linspace(0.0, 1.0, fade_samples)
    faded[-fade_samples:] *= np.linspace(1.0, 0.0, fade_samples)
    return faded
def transcribe_audio(audio_path: Optional[str], max_seconds: float = 30.0) -> str:
    """Transcribe the audio file at ``audio_path`` and return the text.

    Args:
        audio_path: Path of a file readable by ``soundfile``. ``None`` or an
            empty string yields an empty transcript.
        max_seconds: Only the first ``max_seconds`` seconds of the recording
            are passed to the recognizer.

    Returns:
        The stripped transcript, or ``""`` when there is nothing to
        transcribe (no path, or a recording without samples).
    """
    if not audio_path:
        return ""

    data, sr = sf.read(audio_path)
    # Multi-channel recordings: only the first channel is transcribed.
    if data.ndim > 1:
        data = data[:, 0]

    # Slicing past the end is a no-op, so no explicit length check is needed.
    sample_limit = max(0, int(sr * max_seconds))
    data = data[:sample_limit]
    if data.size == 0:
        # Nothing to recognize; skip loading and running the model.
        return ""

    asr = get_asr_pipeline()
    print(">>> Transkribiere Audio...")
    result = asr({"array": data, "sampling_rate": sr})
    # Guard against a missing or None text field before stripping.
    text = (result.get("text") or "").strip()
    print("ASR:", text)
    return text
def synthesize_speech(text: str) -> Optional[Tuple[int, np.ndarray]]:
    """Synthesize ``text`` and return ``(sampling_rate, int16_samples)``.

    The raw pipeline output is reduced to one dimension, high-pass filtered
    (best effort), peak-normalized, faded in and out, and converted to
    16-bit integer samples.

    Args:
        text: Text to speak.

    Returns:
        ``None`` when ``text`` is empty or whitespace, or when the model
        produced no samples; otherwise a ``(sampling_rate, samples)`` tuple
        with ``samples`` as an ``int16`` array.
    """
    if not text or not text.strip():
        return None

    tts = get_tts_pipeline()
    out = tts(text)

    audio = np.array(out["audio"], dtype=np.float32)
    sr = out.get("sampling_rate", 16000)
    # Fall back to 16 kHz when the reported rate is missing or out of range.
    if sr is None or sr <= 0 or sr > 65535:
        sr = 16000

    # Drop singleton axes; atleast_1d keeps a single-sample result indexable
    # (a (1, 1) array would otherwise squeeze down to a 0-d scalar).
    audio = np.atleast_1d(audio.squeeze())
    if audio.ndim > 1:
        audio = audio[:, 0]
    if audio.size == 0:
        # np.max below would raise on an empty array; there is nothing to play.
        return None

    try:
        audio = butter_highpass_filter(audio, cutoff=60, fs=sr)
    except ValueError as exc:
        # Filtering is best effort (e.g. the clip is too short for filtfilt's
        # padding); continue with the unfiltered audio but say so.
        print(f">>> Hochpass-Filter fehlgeschlagen, Audio bleibt ungefiltert: {exc}")

    # Peak-normalize unless the signal is pure silence.
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak

    audio = apply_fade(audio, sr)
    audio_int16 = np.clip(audio * 32767, -32768, 32767).astype(np.int16)
    return (sr, audio_int16)