|
|
""" |
|
|
speech_io.py |
|
|
|
|
|
Sprachbasierte Ein-/Ausgabe: |
|
|
- Speech-to-Text (STT) mit Whisper (transformers.pipeline) |
|
|
- Text-to-Speech (TTS) mit MMS-TTS Deutsch |
|
|
|
|
|
Dieses File ist 100% stabil für HuggingFace Spaces. |
|
|
""" |
|
|
|
|
|
from typing import Optional, Tuple |
|
|
import numpy as np |
|
|
import soundfile as sf |
|
|
from scipy.signal import butter, filtfilt |
|
|
from transformers import pipeline |
|
|
|
|
|
|
|
|
# Model identifiers passed to ``transformers.pipeline`` by the getters below.
ASR_MODEL_ID = "openai/whisper-small"
TTS_MODEL_ID = "facebook/mms-tts-deu"

# Module-level caches for the pipelines; they stay ``None`` until the
# corresponding getter is called for the first time.
_asr = _tts = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_asr_pipeline():
    """Return the shared speech-recognition pipeline, creating it on first use.

    The pipeline is built once for ``ASR_MODEL_ID`` and stored in the
    module-level ``_asr`` variable; subsequent calls return that same object.
    """
    global _asr

    # Fast path: the pipeline has already been created.
    if _asr is not None:
        return _asr

    print(f">>> Lade ASR Modell: {ASR_MODEL_ID}")
    asr_options = {
        "task": "automatic-speech-recognition",
        "model": ASR_MODEL_ID,
        "device": "cpu",
        "return_timestamps": True,
        "chunk_length_s": 30,
    }
    _asr = pipeline(**asr_options)
    return _asr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_tts_pipeline():
    """Return the shared text-to-speech pipeline, creating it on first use.

    The pipeline is built once for ``TTS_MODEL_ID`` and stored in the
    module-level ``_tts`` variable; subsequent calls return that same object.
    """
    global _tts

    # Fast path: the pipeline has already been created.
    if _tts is not None:
        return _tts

    print(f">>> Lade TTS Modell: {TTS_MODEL_ID}")
    tts_options = {
        "task": "text-to-speech",
        "model": TTS_MODEL_ID,
    }
    _tts = pipeline(**tts_options)
    return _tts
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def butter_highpass_filter(data, cutoff=60, fs=16000, order=4):
    """High-pass filter ``data`` with a Butterworth design.

    The filter is applied with ``scipy.signal.filtfilt``, i.e. once forwards
    and once backwards over the signal.

    Args:
        data: Signal to filter (anything ``filtfilt`` accepts).
        cutoff: Cut-off frequency, in the same unit as ``fs``.
        fs: Sampling rate of ``data``.
        order: Order passed to ``scipy.signal.butter``.

    Returns:
        The filtered signal as returned by ``filtfilt``.
    """
    # butter() expects the cut-off as a fraction of the Nyquist frequency.
    nyquist = fs / 2
    coefficients = butter(order, cutoff / nyquist, btype="high")
    return filtfilt(*coefficients, data)
|
|
|
|
|
def apply_fade(audio, sr, duration_ms=10):
    """Apply a linear fade-in and fade-out to the ends of ``audio``.

    Args:
        audio: Array of samples; the fades are applied along its first axis.
        sr: Sampling rate, used to convert ``duration_ms`` into a sample count.
        duration_ms: Length of each fade in milliseconds.

    Returns:
        A faded copy of ``audio``. The input object itself is returned,
        unchanged, when the fade length is zero or negative, or when the two
        fades together would cover the whole signal.
    """
    fade_samples = int(sr * duration_ms / 1000)

    # A zero-length fade must be skipped explicitly: ``audio[-0:]`` selects the
    # WHOLE array, so multiplying it by an empty ramp raises a broadcast error.
    if fade_samples <= 0 or fade_samples * 2 >= len(audio):
        return audio

    # Work on a copy so the caller's buffer is not modified as a side effect.
    faded = np.array(audio, copy=True)
    faded[:fade_samples] *= np.linspace(0, 1, fade_samples)
    faded[-fade_samples:] *= np.linspace(1, 0, fade_samples)
    return faded
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def transcribe_audio(audio_path: Optional[str]) -> str:
    """Transcribe an audio file to text with the shared ASR pipeline.

    Args:
        audio_path: Path to an audio file readable by ``soundfile`` (for
            example the WAV file from ``gr.Audio(type="filepath")``), or
            ``None`` / an empty string when nothing was recorded.

    Returns:
        The recognised text with surrounding whitespace removed, or an empty
        string when there is no path or the file contains no samples.
    """
    if not audio_path:
        return ""

    # float32 halves the buffer compared with soundfile's float64 default.
    data, sr = sf.read(audio_path, dtype="float32")

    # Multi-channel file: keep only the first channel.
    if data.ndim > 1:
        data = data[:, 0]

    # An empty recording has nothing to transcribe; skip the model call.
    if data.size == 0:
        return ""

    # Only the first 30 seconds of the recording are transcribed.
    max_samples = sr * 30
    data = data[:max_samples]

    asr = get_asr_pipeline()

    print(">>> Transkribiere Audio...")
    result = asr({"array": data, "sampling_rate": sr})

    text = (result.get("text") or "").strip()
    print("ASR:", text)
    return text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def synthesize_speech(text: str) -> Optional[Tuple[int, np.ndarray]]:
    """Convert ``text`` to speech and return it as 16-bit PCM samples.

    The raw TTS output is reduced to a single channel, high-pass filtered
    (best effort), peak-normalised, faded at both ends and quantised to
    ``int16``.

    Args:
        text: Text to speak.

    Returns:
        ``(sampling_rate, samples)`` with ``samples`` as an ``int16`` array,
        or ``None`` when ``text`` is empty or whitespace-only, or when the
        model produced no audio.
    """
    if not text or not text.strip():
        return None

    tts = get_tts_pipeline()
    out = tts(text)

    audio = np.array(out["audio"], dtype=np.float32)
    sr = out.get("sampling_rate", 16000)

    # Fall back to 16 kHz when the reported rate is missing or out of range.
    # NOTE(review): the 65535 upper bound is kept from the original code;
    # presumably a 16-bit limit somewhere downstream - confirm.
    if sr is None or sr <= 0 or sr > 65535:
        sr = 16000

    # Drop singleton dimensions; if more than one dimension is still left,
    # keep index 0 of the second axis.
    if audio.ndim > 1:
        audio = audio.squeeze()
        if audio.ndim > 1:
            audio = audio[:, 0]
    # squeeze() yields a 0-d array for a single sample; later steps use len().
    audio = np.atleast_1d(audio)

    # np.max() raises on an empty array, so treat "no audio" like "no text".
    if audio.size == 0:
        return None

    # Best effort: the filter raises e.g. for very short signals. In that case
    # continue with the unfiltered audio, but say so instead of hiding it, and
    # do not swallow KeyboardInterrupt/SystemExit as a bare ``except`` would.
    try:
        audio = butter_highpass_filter(audio, cutoff=60, fs=sr)
    except Exception as exc:
        print(f">>> Hochpassfilter übersprungen: {exc}")

    # Scale so that the loudest sample has magnitude 1 (skip for pure silence).
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak

    audio = apply_fade(audio, sr)

    # Quantise to 16-bit PCM; clip so rounding cannot wrap around.
    audio_int16 = np.clip(audio * 32767, -32768, 32767).astype(np.int16)

    return (sr, audio_int16)
|
|
|