chatbot2 / speech_io.py
Nguyen5's picture
commit
ed084d7
"""
speech_io.py – local speech-to-text (STT) and text-to-speech (TTS) using transformers pipelines
"""
from typing import Optional, Tuple
import numpy as np
import soundfile as sf
from scipy.signal import butter, filtfilt
from transformers import pipeline
# Model identifier passed to transformers.pipeline for speech recognition.
ASR_MODEL_ID = "openai/whisper-small"
# Model identifier passed to transformers.pipeline for speech synthesis.
TTS_MODEL_ID = "facebook/mms-tts-deu"
# Module-level caches for the lazily created pipelines; both stay None until
# get_asr_pipeline() / get_tts_pipeline() is called for the first time.
_asr = None
_tts = None
def get_asr_pipeline():
    """Return the shared speech-recognition pipeline, creating it on first use.

    The pipeline is built once for ``ASR_MODEL_ID`` and stored in the
    module-level ``_asr`` variable; every later call returns that same object.
    """
    global _asr
    if _asr is not None:
        return _asr

    print(f">>> Lade ASR Modell: {ASR_MODEL_ID}")
    asr_options = {
        "task": "automatic-speech-recognition",
        "model": ASR_MODEL_ID,
        "device": "cpu",
        "return_timestamps": True,
        "chunk_length_s": 30,
    }
    _asr = pipeline(**asr_options)
    return _asr
def get_tts_pipeline():
    """Return the shared text-to-speech pipeline, creating it on first use.

    The pipeline is built once for ``TTS_MODEL_ID`` and stored in the
    module-level ``_tts`` variable; every later call returns that same object.
    """
    global _tts
    if _tts is not None:
        return _tts

    print(f">>> Lade TTS Modell: {TTS_MODEL_ID}")
    tts_options = {
        "task": "text-to-speech",
        "model": TTS_MODEL_ID,
    }
    _tts = pipeline(**tts_options)
    return _tts
def butter_highpass_filter(data, cutoff=60, fs=16000, order=4):
    """Apply a zero-phase Butterworth high-pass filter to ``data``.

    Args:
        data: Signal samples to filter.
        cutoff: Cut-off frequency, in the same unit as ``fs``.
        fs: Sampling rate of ``data``.
        order: Order of the Butterworth design.

    Returns:
        The filtered signal as produced by ``scipy.signal.filtfilt``.
    """
    # butter() takes the cut-off as a fraction of the Nyquist frequency.
    nyquist = 0.5 * fs
    numerator, denominator = butter(order, cutoff / nyquist, btype="high")
    # filtfilt runs the filter forward and backward over the signal.
    return filtfilt(numerator, denominator, data)
def apply_fade(audio, sr, duration_ms=10):
    """Apply a linear fade-in and fade-out to the edges of ``audio``.

    Args:
        audio: Samples with time along the first axis; any further axes
            (e.g. channels) are faded identically.
        sr: Sampling rate in Hz.
        duration_ms: Length of each fade in milliseconds.

    Returns:
        A new array with faded edges; the input is never modified. Floating
        point input keeps its dtype, other dtypes are converted to float64.
        When the fade would be empty (zero or negative length) or the two
        fades would not fit into the signal, ``audio`` itself is returned
        unchanged.
    """
    fade_samples = int(sr * duration_ms / 1000)
    # A zero-length fade must be skipped explicitly: ``audio[-0:]`` selects
    # the whole array, so multiplying it by an empty ramp would raise.
    if fade_samples <= 0 or fade_samples * 2 >= len(audio):
        return audio

    samples = np.asarray(audio)
    if np.issubdtype(samples.dtype, np.inexact):
        faded = samples.copy()
    else:
        # Integer samples cannot be scaled in place by a float ramp.
        faded = samples.astype(np.float64)

    # Give the ramps trailing singleton axes so they broadcast over channels.
    ramp_shape = (-1,) + (1,) * (faded.ndim - 1)
    fade_in_curve = np.linspace(0, 1, fade_samples).reshape(ramp_shape)
    fade_out_curve = np.linspace(1, 0, fade_samples).reshape(ramp_shape)

    faded[:fade_samples] *= fade_in_curve
    faded[-fade_samples:] *= fade_out_curve
    return faded
def transcribe_audio(audio_path: Optional[str]) -> str:
    """Transcribe the beginning of an audio file to text.

    Only the first channel and at most the first 30 seconds of the file are
    passed to the speech-recognition pipeline.

    Args:
        audio_path: Path of the audio file; ``None`` or an empty string
            means "no recording".

    Returns:
        The recognised text with surrounding whitespace removed, or ``""``
        when there is no path or the file contains no samples.
    """
    if not audio_path:
        return ""

    data, sr = sf.read(audio_path)
    # Multi-channel files are read as (frames, channels); keep channel 0.
    if data.ndim > 1:
        data = data[:, 0]
    # Nothing to transcribe; avoid handing an empty array to the model.
    if data.size == 0:
        return ""

    max_samples = sr * 30
    if len(data) > max_samples:
        data = data[:max_samples]

    asr = get_asr_pipeline()
    print(">>> Transkribiere Audio...")
    result = asr({"array": data, "sampling_rate": sr})
    # Tolerate a missing or None "text" entry in the pipeline result.
    text = (result.get("text") or "").strip()
    print("ASR:", text)
    return text
def synthesize_speech(text: str) -> Optional[Tuple[int, np.ndarray]]:
    """Convert ``text`` to speech and return it as 16-bit PCM samples.

    The model output is reduced to one channel, high-pass filtered at 60 Hz,
    peak-normalised, faded at both ends and converted to int16.

    Args:
        text: Text to speak.

    Returns:
        ``(sampling_rate, samples)`` where ``samples`` is an int16 array, or
        ``None`` when ``text`` is empty/whitespace or the model produced no
        audio.
    """
    if not text or not text.strip():
        return None

    tts = get_tts_pipeline()
    out = tts(text)

    sr = out.get("sampling_rate", 16000)
    # NOTE(review): the 65535 upper bound is kept from the original code;
    # presumably it rejects implausible rates - confirm the intent.
    if sr is None or sr <= 0 or sr > 65535:
        sr = 16000
    sr = int(sr)

    # Drop singleton axes such as a leading batch/channel dimension, but keep
    # at least one axis so a single-sample result remains indexable.
    audio = np.atleast_1d(np.squeeze(np.array(out["audio"], dtype=np.float32)))
    if audio.ndim > 1:
        # Keep the first channel. The longer axis is assumed to be time, so
        # both (channels, samples) and (samples, channels) layouts work.
        if audio.shape[0] < audio.shape[1]:
            audio = audio[0]
        else:
            audio = audio[:, 0]

    # np.max below has no identity for a zero-size array and would raise.
    if audio.size == 0:
        return None

    try:
        audio = butter_highpass_filter(audio, cutoff=60, fs=sr)
    except Exception as exc:
        # Filtering is best effort (it fails e.g. for very short signals);
        # continue with the unfiltered audio, but do not hide the reason.
        print(f">>> Hochpassfilter fehlgeschlagen, Audio bleibt ungefiltert: {exc}")

    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak

    audio = apply_fade(audio, sr)
    audio_int16 = np.clip(audio * 32767, -32768, 32767).astype(np.int16)
    return (sr, audio_int16)