|
|
import numpy as np |
|
|
import soundfile as sf |
|
|
import librosa |
|
|
from transformers import pipeline |
|
|
|
|
|
ASR_MODEL_ID = "openai/whisper-small" |
|
|
TTS_MODEL_ID = "facebook/mms-tts-deu" |
|
|
|
|
|
_asr = None |
|
|
_tts = None |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_audio_16k(path, target_sr: int = 16000):
    """Load an audio file as mono float32 at a fixed sampling rate.

    Args:
        path: Path to any audio file readable by soundfile.
        target_sr: Desired sampling rate in Hz. Defaults to 16000, the
            rate Whisper-family ASR models expect.

    Returns:
        Tuple ``(audio, sr)``: a 1-D float32 numpy array and the
        sampling rate (always equal to ``target_sr``).
    """
    audio, sr = sf.read(path)

    # soundfile returns (frames, channels) for multi-channel files;
    # average across channels to downmix to mono.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    # Resample only when needed; librosa works on the float array directly.
    if sr != target_sr:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
        sr = target_sr

    return audio.astype(np.float32), sr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_asr_pipeline():
    """Return the module-wide ASR pipeline, constructing it on first use."""
    global _asr
    if _asr is not None:
        return _asr
    # First call: build the Whisper pipeline once and cache it at module level.
    _asr = pipeline(
        task="automatic-speech-recognition",
        model=ASR_MODEL_ID,
        return_timestamps=False,
        chunk_length_s=30,
    )
    return _asr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def transcribe_audio(audio_path: str) -> str:
    """Transcribe a recorded audio file to text with the Whisper ASR pipeline.

    Args:
        audio_path: Path to the audio file. A falsy value (``None`` or
            ``""``) yields an empty transcript without touching the
            filesystem.

    Returns:
        The stripped transcript, or ``""`` for missing, too-short, or
        filtered-out audio.
    """
    # Guard against both None and "" — previously only None was handled,
    # so an empty path string crashed inside the audio loader.
    if not audio_path:
        return ""

    audio, sr = load_audio_16k(audio_path)

    # Skip clips shorter than ~0.4 s: too little signal to transcribe
    # reliably, and Whisper tends to hallucinate on near-empty input.
    if len(audio) < sr * 0.4:
        return ""

    asr = get_asr_pipeline()

    # Deterministic decoding (temperature 0) in plain transcription mode.
    result = asr(
        {"array": audio, "sampling_rate": sr},
        generate_kwargs={
            "task": "transcribe",
            "temperature": 0.0
        }
    )

    text = result.get("text", "").strip()

    # Drop transcripts consisting solely of this character and spaces —
    # presumably a known hallucination artifact being filtered out.
    # NOTE(review): the character is Georgian "ვ", which looks out of
    # place for a Whisper/German setup — confirm it is the intended
    # filter target and not a mis-encoded literal.
    if set(text) <= {"ვ", " "}:
        return ""

    return text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_tts_pipeline():
    """Return the module-wide TTS pipeline, constructing it on first use."""
    global _tts
    if _tts is not None:
        return _tts
    # First call: build the text-to-speech pipeline once and cache it.
    _tts = pipeline(task="text-to-speech", model=TTS_MODEL_ID)
    return _tts
|
|
|
|
|
|
|
|
def synthesize_speech(text: str):
    """Synthesize speech for *text* with the TTS pipeline.

    Args:
        text: Text to speak. Blank or whitespace-only input returns None.

    Returns:
        ``None`` for blank input, otherwise a ``(sampling_rate, samples)``
        tuple where ``samples`` is a 1-D int16 numpy array peak-normalized
        to full scale.
    """
    if not text.strip():
        return None

    tts = get_tts_pipeline()
    out = tts(text)

    # TTS pipelines commonly return the waveform with a leading batch
    # axis (shape (1, n)); squeeze to 1-D so (sr, samples) consumers
    # (e.g. Gradio audio widgets) play it correctly. Already-1-D audio
    # is unaffected.
    audio = np.asarray(out["audio"], dtype=np.float32).squeeze()

    sr = out.get("sampling_rate", 16000)

    # Peak-normalize. Explicit comparison instead of the `or 1.0` idiom:
    # it avoids relying on numpy scalar truthiness and no longer raises
    # on an empty waveform (np.max of an empty array is an error).
    peak = float(np.max(np.abs(audio))) if audio.size else 0.0
    if peak > 0.0:
        audio = audio / peak

    return sr, (audio * 32767).astype(np.int16)
|
|
|