File size: 2,667 Bytes
ed084d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""
speech_io.py – STT + TTS lokal (transformers)
"""

from typing import Optional, Tuple
import numpy as np
import soundfile as sf
from scipy.signal import butter, filtfilt
from transformers import pipeline

ASR_MODEL_ID = "openai/whisper-small"
TTS_MODEL_ID = "facebook/mms-tts-deu"

_asr = None
_tts = None


def get_asr_pipeline():
    """Return the shared ASR pipeline, creating it on first use.

    Lazily builds a Whisper automatic-speech-recognition pipeline on CPU
    (with timestamps and 30 s chunking enabled) and caches it in the
    module-level ``_asr`` singleton so the model loads only once.
    """
    global _asr
    if _asr is not None:
        return _asr
    print(f">>> Lade ASR Modell: {ASR_MODEL_ID}")
    _asr = pipeline(
        task="automatic-speech-recognition",
        model=ASR_MODEL_ID,
        device="cpu",
        return_timestamps=True,
        chunk_length_s=30,
    )
    return _asr


def get_tts_pipeline():
    """Return the shared TTS pipeline, creating it on first use.

    Lazily builds the MMS German text-to-speech pipeline and caches it
    in the module-level ``_tts`` singleton so the model loads only once.
    """
    global _tts
    if _tts is not None:
        return _tts
    print(f">>> Lade TTS Modell: {TTS_MODEL_ID}")
    _tts = pipeline(
        task="text-to-speech",
        model=TTS_MODEL_ID,
    )
    return _tts


def butter_highpass_filter(data, cutoff=60, fs=16000, order=4):
    """Apply a zero-phase Butterworth high-pass filter to *data*.

    Removes low-frequency content below *cutoff* Hz. Filtering is done
    forward and backward with ``filtfilt``, so no phase shift is
    introduced.

    Args:
        data: 1-D array-like signal.
        cutoff: cutoff frequency in Hz.
        fs: sampling rate in Hz.
        order: filter order.

    Returns:
        The filtered signal as an ndarray.
    """
    normalized_cutoff = cutoff / (0.5 * fs)
    coeff_b, coeff_a = butter(order, normalized_cutoff, btype="high")
    return filtfilt(coeff_b, coeff_a, data)


def apply_fade(audio, sr, duration_ms=10):
    """Apply a short linear fade-in and fade-out to an audio signal.

    Suppresses click artifacts at the clip boundaries. Unlike the
    previous in-place version, this operates on a copy, so the caller's
    buffer is never mutated, and integer-dtype input no longer raises a
    casting error from ``*=`` with a float ramp.

    Args:
        audio: 1-D array-like of samples.
        sr: sample rate in Hz.
        duration_ms: fade length in milliseconds at each end.

    Returns:
        np.ndarray with fades applied; returned unmodified when the
        signal is too short to hold both fades (or sr/duration yields a
        zero-length fade).
    """
    audio = np.asarray(audio)
    fade_samples = int(sr * duration_ms / 1000)
    # Too short for both ramps (or degenerate fade length): nothing to do.
    if fade_samples <= 0 or fade_samples * 2 >= len(audio):
        return audio
    # Work on a float copy: keeps float input dtype, promotes int input.
    if np.issubdtype(audio.dtype, np.floating):
        audio = audio.copy()
    else:
        audio = audio.astype(np.float32)
    audio[:fade_samples] *= np.linspace(0.0, 1.0, fade_samples)
    audio[-fade_samples:] *= np.linspace(1.0, 0.0, fade_samples)
    return audio


def transcribe_audio(audio_path: str) -> str:
    """Transcribe a recorded audio file to text using the ASR pipeline.

    Reads the file from disk, keeps only the first channel of
    multi-channel audio, truncates to the first 30 seconds, and feeds
    the raw samples to Whisper.

    Args:
        audio_path: path to the audio file, or None.

    Returns:
        The recognized text (stripped), or "" when *audio_path* is None.
    """
    if audio_path is None:
        return ""
    samples, rate = sf.read(audio_path)
    if samples.ndim > 1:
        samples = samples[:, 0]  # keep first channel only
    sample_limit = rate * 30  # Whisper chunk window: cap at 30 s
    if len(samples) > sample_limit:
        samples = samples[:sample_limit]
    recognizer = get_asr_pipeline()
    print(">>> Transkribiere Audio...")
    result = recognizer({"array": samples, "sampling_rate": rate})
    text = result.get("text", "").strip()
    print("ASR:", text)
    return text


def synthesize_speech(text: str):
    """Synthesize speech for *text* with the TTS pipeline.

    The generated waveform is collapsed to mono, high-pass filtered
    (best effort), peak-normalized, faded in/out, and converted to
    16-bit PCM.

    Args:
        text: the text to speak.

    Returns:
        A ``(sample_rate, int16_samples)`` tuple, or None when *text*
        is empty or whitespace-only.
    """
    if not text or not text.strip():
        return None
    synthesizer = get_tts_pipeline()
    result = synthesizer(text)
    wave = np.array(result["audio"], dtype=np.float32)
    rate = result.get("sampling_rate", 16000)
    # Guard against missing or implausible sample rates from the model.
    if rate is None or rate <= 0 or rate > 65535:
        rate = 16000
    # Collapse to mono: squeeze singleton axes, then take channel 0.
    if wave.ndim > 1:
        wave = wave.squeeze()
    if wave.ndim > 1:
        wave = wave[:, 0]
    try:
        wave = butter_highpass_filter(wave, cutoff=60, fs=rate)
    except Exception:
        # Best effort: keep the unfiltered waveform if filtering fails
        # (e.g. signal too short for filtfilt's padding).
        pass
    peak = np.max(np.abs(wave))
    if peak > 0:
        wave = wave / peak  # peak-normalize to [-1, 1]
    wave = apply_fade(wave, rate)
    pcm = np.clip(wave * 32767, -32768, 32767).astype(np.int16)
    return (rate, pcm)