File size: 2,225 Bytes
6eb3f02
 
 
 
 
 
c5a6dda
6eb3f02
 
c5a6dda
6eb3f02
c2229c2
6eb3f02
a855cb1
6eb3f02
 
6f1a080
6eb3f02
 
 
 
6f1a080
6eb3f02
 
c2229c2
6eb3f02
 
 
c5a6dda
6eb3f02
c5a6dda
6eb3f02
 
c5a6dda
6eb3f02
 
 
c5a6dda
6eb3f02
c5a6dda
6f1a080
 
c5a6dda
6eb3f02
 
 
 
 
 
c5a6dda
7757a4a
c5a6dda
6eb3f02
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6f1a080
6eb3f02
 
 
 
 
 
 
 
 
 
 
 
 
c5a6dda
6eb3f02
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import os
import shutil
import tempfile

import numpy as np
import soundfile as sf
import torch
import uvicorn
import whisperx
from fastapi import FastAPI, UploadFile, File, Form
from speechbrain.pretrained import EncoderClassifier

# FastAPI application instance; routes are registered against it below.
app = FastAPI()

# Inference device for both models. CPU is hard-coded here; change to
# "cuda" only if a GPU build of torch/whisperx is available.
device = "cpu"

# Load models (light)
# ASR model: whisperx "small" checkpoint, loaded once at import time so
# every request reuses it.
asr_model = whisperx.load_model("small", device)

# Speaker-embedding model (ECAPA-TDNN trained on VoxCeleb), used to derive
# a per-segment embedding for the naive diarization in /transcribe.
# NOTE(review): speechbrain.pretrained is the legacy import path in newer
# SpeechBrain releases (moved to speechbrain.inference) — verify the
# installed version still supports it.
speaker_model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    run_opts={"device": device}
)

@app.post("/transcribe")
async def transcribe(audio: UploadFile = File(...), lang: str = Form("en")):
    """Transcribe an uploaded audio file and label each segment with a speaker.

    The upload is written to a temporary WAV file, transcribed with whisperx,
    and each resulting segment is assigned a speaker label by comparing its
    ECAPA embedding (cosine similarity) against previously seen speakers.

    Args:
        audio: Uploaded audio file (must be readable by whisperx/soundfile).
        lang: Language code forwarded to the ASR model (default "en").

    Returns:
        {"segments": [{"speaker", "start", "end", "text"}, ...]}
    """
    # Persist the upload to disk so whisperx and soundfile can read it by path.
    temp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    try:
        with temp as buffer:
            shutil.copyfileobj(audio.file, buffer)
        audio_path = temp.name

        # ASR pass: whisperx yields time-stamped segments.
        audio_data = whisperx.load_audio(audio_path)
        result = asr_model.transcribe(audio_data, language=lang)
        segments = result["segments"]

        y, sr = sf.read(audio_path)
        if y.ndim > 1:
            # sf.read returns (frames, channels) for multi-channel audio;
            # mix down to mono so slicing/embedding sees a 1-D signal.
            y = y.mean(axis=1)

        # Minimum cosine similarity to an existing speaker before we reuse
        # that label instead of registering a new speaker.
        similarity_threshold = 0.6

        speaker_embeddings = []
        speaker_labels = []
        final_segments = []

        for seg in segments:
            start = int(seg["start"] * sr)
            end = int(seg["end"] * sr)
            chunk = y[start:end]

            # Skip segments shorter than 0.5 s: too little audio for a
            # reliable speaker embedding.
            if len(chunk) < sr * 0.5:
                continue

            chunk_tensor = torch.tensor(chunk, dtype=torch.float32).unsqueeze(0)
            emb = speaker_model.encode_batch(chunk_tensor)
            emb = emb.squeeze().detach().cpu().numpy()

            # Assign a speaker: reuse the closest known embedding if it is
            # similar enough, otherwise register a new speaker. (The previous
            # logic hard-coded exactly two speakers and assumed the first two
            # segments always came from different people.)
            speaker_id = None
            if speaker_embeddings:
                emb_norm = np.linalg.norm(emb)
                sims = []
                for known in speaker_embeddings:
                    denom = emb_norm * np.linalg.norm(known)
                    # Guard against a zero-norm embedding (division by zero).
                    sims.append(float(np.dot(emb, known) / denom) if denom else 0.0)
                best = int(np.argmax(sims))
                if sims[best] >= similarity_threshold:
                    speaker_id = speaker_labels[best]
            if speaker_id is None:
                speaker_id = f"SPEAKER_{len(speaker_embeddings) + 1}"
                speaker_embeddings.append(emb)
                speaker_labels.append(speaker_id)

            final_segments.append({
                "speaker": speaker_id,
                "start": round(seg["start"], 2),
                "end": round(seg["end"], 2),
                "text": seg["text"]
            })

        return {"segments": final_segments}
    finally:
        # delete=False means we own cleanup; the original leaked one temp
        # file per request. Remove it even when transcription fails.
        try:
            os.unlink(temp.name)
        except OSError:
            pass


if __name__ == "__main__":
    # Run the API with uvicorn, listening on all interfaces.
    server_host = "0.0.0.0"
    server_port = 7860
    uvicorn.run(app, host=server_host, port=server_port)