# Roop / app.py
# (Hugging Face Spaces page residue preserved as comments:
#  uploader "don0726" — "Update app.py" — commit 6eb3f02, verified)
import os
import shutil
import tempfile

import numpy as np
import soundfile as sf
import torch
import uvicorn
import whisperx
from fastapi import FastAPI, UploadFile, File, Form
from speechbrain.pretrained import EncoderClassifier
app = FastAPI()
device = "cpu"
# Load models (light)
asr_model = whisperx.load_model("small", device)
speaker_model = EncoderClassifier.from_hparams(
source="speechbrain/spkrec-ecapa-voxceleb",
run_opts={"device": device}
)
@app.post("/transcribe")
async def transcribe(audio: UploadFile = File(...), lang: str = Form("en")):
    """Transcribe an uploaded audio file and tag each segment with a speaker.

    Runs whisperx ASR, then a naive up-to-two-speaker diarization: the first
    two usable segments seed SPEAKER_1 / SPEAKER_2 embeddings, and every later
    segment is assigned to whichever seed is most cosine-similar.

    Args:
        audio: uploaded audio file (any format whisperx/soundfile can decode).
        lang: language code forwarded to the ASR model (default "en").

    Returns:
        {"segments": [{"speaker", "start", "end", "text"}, ...]}
    """
    # Persist the upload to a named temp file so both whisperx and soundfile
    # can open it by path. delete=False means WE must remove it (see finally;
    # the original version leaked one file per request).
    temp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    with temp as buffer:
        shutil.copyfileobj(audio.file, buffer)
    audio_path = temp.name

    try:
        audio_data = whisperx.load_audio(audio_path)
        result = asr_model.transcribe(audio_data, language=lang)
        segments = result["segments"]

        y, sr = sf.read(audio_path)
        # Stereo/multichannel files come back as (frames, channels); the
        # embedding model expects a mono waveform, so downmix by averaging.
        if y.ndim > 1:
            y = y.mean(axis=1)

        speaker_embeddings = []
        speaker_labels = []
        final_segments = []
        for seg in segments:
            start = int(seg["start"] * sr)
            end = int(seg["end"] * sr)
            chunk = y[start:end]
            if len(chunk) < sr * 0.5:  # skip very short segments (< 0.5 s)
                continue

            # sf.read returns float64 by default; speechbrain models expect
            # float32, so convert explicitly when building the batch of 1.
            chunk_tensor = torch.tensor(chunk, dtype=torch.float32).unsqueeze(0)
            emb = speaker_model.encode_batch(chunk_tensor)
            emb = emb.squeeze().detach().cpu().numpy()

            # Naive assignment: the first two segments define the speakers;
            # subsequent segments pick the nearest seed by cosine similarity.
            if len(speaker_embeddings) < 2:
                speaker_id = f"SPEAKER_{len(speaker_embeddings)+1}"
                speaker_embeddings.append(emb)
                speaker_labels.append(speaker_id)
            else:
                sims = [
                    np.dot(emb, e) / (np.linalg.norm(emb) * np.linalg.norm(e))
                    for e in speaker_embeddings
                ]
                speaker_id = speaker_labels[int(np.argmax(sims))]

            final_segments.append({
                "speaker": speaker_id,
                "start": round(seg["start"], 2),
                "end": round(seg["end"], 2),
                "text": seg["text"],
            })

        return {"segments": final_segments}
    finally:
        # Always remove the temp file, even when ASR/embedding raises.
        os.unlink(audio_path)
if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=7860)