Spaces:
Running
Running
File size: 1,007 Bytes
0845b4d | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 | import os
import torch
import soundfile as sf
import librosa
from src.model import load_model
# -----------------------------
# CONFIGURATION
# -----------------------------
# Select the GPU when torch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# load_model() (from src.model) supplies the (model, processor) pair used by
# transcribe(); it runs once, at import time.
model, processor = load_model()
model.to(device)
def transcribe(audio_path: str) -> str:
    """Transcribe the audio file at *audio_path* and return the decoded text.

    The file is read with soundfile, averaged over axis 1 when the sample
    array has more than one dimension, resampled to 16 kHz if it was stored
    at a different rate, and cast to float32. The module-level ``processor``
    turns the samples into input features, ``model.generate`` produces token
    ids (at most 400 new tokens), and the first decoded sequence is returned.

    Args:
        audio_path: Path to an audio file readable by ``soundfile.read``.

    Returns:
        The first entry of the processor's batch-decoded output, with
        special tokens skipped.
    """
    target_rate = 16000

    waveform, sample_rate = sf.read(audio_path)

    # Multi-channel files come back as a 2-D array; average across axis 1
    # to obtain a single mono signal.
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)

    # Only resample when the file's rate differs from the target rate.
    if sample_rate != target_rate:
        waveform = librosa.resample(
            waveform, orig_sr=sample_rate, target_sr=target_rate
        )

    waveform = waveform.astype("float32")

    # Feature extraction, then move the tensor to the same device as the model.
    encoded = processor(waveform, sampling_rate=target_rate, return_tensors="pt")
    features = encoded.input_features.to(device)

    # Inference only: gradients are not needed for generation.
    with torch.no_grad():
        token_ids = model.generate(
            features,
            suppress_tokens=None,
            max_new_tokens=400,
        )

    # A single input was submitted, so take the first decoded string.
    return processor.batch_decode(token_ids, skip_special_tokens=True)[0]
|