File size: 2,664 Bytes
de84956
 
b2fa85d
de84956
 
b2fa85d
 
de84956
 
 
 
 
b2fa85d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
de84956
 
 
 
 
 
b2fa85d
 
de84956
 
 
 
b2fa85d
 
 
de84956
 
 
 
b2fa85d
de84956
b2fa85d
 
 
de84956
 
 
b2fa85d
de84956
b2fa85d
 
 
 
 
de84956
 
 
b2fa85d
 
 
 
 
de84956
 
b2fa85d
 
 
 
 
 
 
 
 
 
de84956
 
b2fa85d
de84956
 
 
 
 
 
 
 
b2fa85d
 
de84956
b2fa85d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import numpy as np
import soundfile as sf
import librosa
from transformers import pipeline

# Hugging Face model ids for the two pipelines below.
ASR_MODEL_ID = "openai/whisper-small"   # multilingual
TTS_MODEL_ID = "facebook/mms-tts-deu"  # German-only voice; swap the id if you want multilingual TTS

# Lazily-initialized singleton pipelines (created on first use by the
# get_*_pipeline() helpers below so importing this module stays cheap).
_asr = None
_tts = None


# ============================================
# LOAD AUDIO – normalize to mono float32 at a target rate (16 kHz default)
# ============================================
def load_audio_16k(path, target_sr=16000):
    """Load an audio file as mono float32 at *target_sr* Hz.

    Parameters
    ----------
    path : str or file-like
        Anything ``soundfile.read`` accepts.
    target_sr : int, optional
        Desired sampling rate; defaults to 16000 (what Whisper expects).

    Returns
    -------
    (numpy.ndarray, int)
        1-D float32 waveform and its sampling rate (== target_sr after
        resampling, or the native rate when it already matches).
    """
    audio, sr = sf.read(path)

    # Collapse multi-channel (e.g. stereo) to mono by averaging channels.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    # Cast before resampling: librosa.resample expects floating-point
    # input, and working in float32 avoids a float64 round-trip.
    audio = audio.astype(np.float32)

    # Resample only when the native rate differs from the target.
    if sr != target_sr:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
        sr = target_sr

    return audio, sr


# ============================================
# LOAD WHISPER PIPELINE (multilingual)
# ============================================
def get_asr_pipeline():
    """Return the shared Whisper ASR pipeline, building it on first use."""
    global _asr
    if _asr is not None:
        return _asr
    _asr = pipeline(
        task="automatic-speech-recognition",
        model=ASR_MODEL_ID,
        return_timestamps=False,
        chunk_length_s=30,
    )
    return _asr


# ============================================
# MULTILINGUAL STT
# ============================================
def transcribe_audio(audio_path: str) -> str:
    """Transcribe an audio file, keeping whatever language is spoken.

    Returns an empty string for a missing path, for clips shorter than
    0.4 s, or when the model emits a known degenerate output.
    """
    if audio_path is None:
        return ""

    samples, rate = load_audio_16k(audio_path)

    # Whisper produces garbage tokens on very short clips — skip them.
    min_len = rate * 0.4
    if len(samples) < min_len:
        return ""

    recognizer = get_asr_pipeline()

    # No explicit language setting: Whisper auto-detects it.
    output = recognizer(
        {"array": samples, "sampling_rate": rate},
        generate_kwargs={
            "task": "transcribe",     # do not translate — keep the source language
            "temperature": 0.0        # greedy decoding curbs hallucinations like "ვვვ..."
        }
    )

    text = output.get("text", "").strip()

    # Known hallucination pattern: output composed only of "ვ" and spaces.
    if set(text) <= {"ვ", " "}:
        return ""

    return text


# ============================================
# TEXT → SPEECH (not multilingual yet)
# ============================================
def get_tts_pipeline():
    """Return the shared TTS pipeline, instantiating it lazily."""
    global _tts
    if _tts is not None:
        return _tts
    _tts = pipeline(task="text-to-speech", model=TTS_MODEL_ID)
    return _tts


def synthesize_speech(text: str):
    """Synthesize speech for *text*.

    Parameters
    ----------
    text : str
        Text to speak; blank/whitespace-only input yields ``None``.

    Returns
    -------
    tuple[int, numpy.ndarray] | None
        ``(sampling_rate, int16 waveform)`` peak-normalized to full
        scale, or ``None`` when there is nothing to synthesize.
    """
    if not text.strip():
        return None

    tts = get_tts_pipeline()
    out = tts(text)

    # NOTE(review): some HF TTS pipelines return audio shaped (1, n) —
    # ravel() guarantees callers always receive a 1-D waveform. Confirm
    # against the actual pipeline output if shape matters downstream.
    audio = np.asarray(out["audio"], dtype=np.float32).ravel()
    sr = out.get("sampling_rate", 16000)

    # Explicit guards replace the old `np.max(...) or 1.0` trick, which
    # raised ValueError on an empty array and relied on NumPy-scalar
    # truthiness for the silent case.
    if audio.size == 0:
        return sr, audio.astype(np.int16)

    peak = float(np.max(np.abs(audio)))
    if peak == 0.0:
        peak = 1.0  # all-zero (silent) output: avoid division by zero
    audio = audio / peak

    # Scale to the int16 range expected by audio sinks (e.g. Gradio).
    return sr, (audio * 32767).astype(np.int16)