from fastapi import FastAPI, UploadFile, File, Form
import os
import shutil
import tempfile

import numpy as np
import soundfile as sf
import torch
import uvicorn
import whisperx
from speechbrain.pretrained import EncoderClassifier
app = FastAPI()
# Run everything on CPU; change to "cuda" manually when a GPU is available —
# both the ASR model and the speaker encoder honor this setting.
device = "cpu"
# Load models once at import time (light): Whisper "small" checkpoint for ASR,
# ECAPA-TDNN encoder for the speaker embeddings used by /transcribe.
asr_model = whisperx.load_model("small", device)
speaker_model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    run_opts={"device": device}
)
# Cosine-similarity threshold: a segment whose embedding scores at least this
# against an existing speaker centroid is attributed to that speaker; otherwise
# a new SPEAKER_n label is created. 0.6 is a reasonable default for ECAPA
# embeddings — tune against real data.
_SPEAKER_SIM_THRESHOLD = 0.6


def _assign_speaker(emb, speaker_embeddings, speaker_labels,
                    threshold=_SPEAKER_SIM_THRESHOLD):
    """Return a speaker label for *emb*, registering a new speaker if no
    known embedding is similar enough.

    Mutates *speaker_embeddings* / *speaker_labels* in place when a new
    speaker is created.
    """
    best_sim = -1.0
    best_idx = -1
    emb_norm = np.linalg.norm(emb)
    for i, known in enumerate(speaker_embeddings):
        denom = emb_norm * np.linalg.norm(known)
        # Guard the degenerate zero-norm case instead of dividing by zero.
        sim = float(np.dot(emb, known) / denom) if denom else 0.0
        if sim > best_sim:
            best_sim = sim
            best_idx = i
    if best_idx >= 0 and best_sim >= threshold:
        return speaker_labels[best_idx]
    # No sufficiently similar speaker seen yet: mint a new label. Unlike the
    # previous hard cap of two speakers, this supports any number of voices
    # and no longer forces the first two segments onto different speakers.
    speaker_id = f"SPEAKER_{len(speaker_labels) + 1}"
    speaker_embeddings.append(emb)
    speaker_labels.append(speaker_id)
    return speaker_id


@app.post("/transcribe")
async def transcribe(audio: UploadFile = File(...), lang: str = Form("en")):
    """Transcribe an uploaded audio file and label each segment by speaker.

    Parameters:
        audio: uploaded audio file (persisted to a temp .wav for decoding).
        lang: language code forwarded to the ASR model (default "en").

    Returns:
        {"segments": [{"speaker", "start", "end", "text"}, ...]} — segments
        shorter than 0.5 s are transcribed but skipped for output since they
        are too short for a reliable speaker embedding.
    """
    # whisperx/soundfile read from a path, so spool the upload to disk.
    temp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    try:
        with temp as buffer:
            shutil.copyfileobj(audio.file, buffer)
        audio_path = temp.name

        audio_data = whisperx.load_audio(audio_path)
        result = asr_model.transcribe(audio_data, language=lang)
        segments = result["segments"]

        y, sr = sf.read(audio_path)
        # Down-mix multi-channel audio: otherwise y[start:end] slices 2-D
        # frames and the encoder receives a wrongly shaped tensor.
        if y.ndim > 1:
            y = y.mean(axis=1)

        speaker_embeddings = []
        speaker_labels = []
        final_segments = []
        for seg in segments:
            start = int(seg["start"] * sr)
            end = int(seg["end"] * sr)
            chunk = y[start:end]
            if len(chunk) < sr * 0.5:  # skip very short segments
                continue
            # The encoder expects a float32 batch of shape (1, samples);
            # sf.read yields float64 by default.
            chunk_tensor = torch.tensor(chunk, dtype=torch.float32).unsqueeze(0)
            emb = speaker_model.encode_batch(chunk_tensor)
            emb = emb.squeeze().detach().cpu().numpy()

            speaker_id = _assign_speaker(emb, speaker_embeddings, speaker_labels)

            final_segments.append({
                "speaker": speaker_id,
                "start": round(seg["start"], 2),
                "end": round(seg["end"], 2),
                "text": seg["text"],
            })
        return {"segments": final_segments}
    finally:
        # delete=False leaves the file behind; clean it up even on error so
        # repeated requests don't leak temp files.
        try:
            os.unlink(temp.name)
        except OSError:
            pass
if __name__ == "__main__":
    # Dev entry point: serve on all interfaces at port 7860.
    uvicorn.run(app, host="0.0.0.0", port=7860)