File size: 1,448 Bytes
968aad0
86d5d76
968aad0
 
86d5d76
af83e32
968aad0
af83e32
86d5d76
 
 
 
 
968aad0
 
 
 
 
 
 
af83e32
86d5d76
 
 
 
 
 
 
968aad0
86d5d76
968aad0
86d5d76
968aad0
 
 
 
86d5d76
 
 
 
 
 
968aad0
 
86d5d76
 
 
968aad0
86d5d76
968aad0
 
 
 
86d5d76
 
968aad0
 
 
 
 
af83e32
968aad0
 
86d5d76
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import gradio as gr
from faster_whisper import WhisperModel
import tempfile
import os
import numpy as np
import wave

# Whisper "small" model on CPU with int8 compute (CPU, free tier safe).
# Created once at import time; transcribe() reuses this single instance.
model = WhisperModel("small", device="cpu", compute_type="int8")

def transcribe(audio):
    if audio is None:
        return {"error": "no audio"}

    sample_rate, data = audio

    # Save temp WAV
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        with wave.open(f.name, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(sample_rate)
            wf.writeframes((data * 32767).astype(np.int16).tobytes())
        path = f.name

    segments, info = model.transcribe(
        path,
        word_timestamps=True
    )

    os.remove(path)

    out_segments = []
    for seg in segments:
        out_segments.append({
            "start": round(seg.start, 2),
            "end": round(seg.end, 2),
            "text": seg.text.strip(),
            "words": [
                {
                    "word": w.word,
                    "start": round(w.start, 2),
                    "end": round(w.end, 2)
                }
                for w in (seg.words or [])
            ]
        })

    return {
        "language": info.language,
        "segments": out_segments
    }

# Gradio UI: a single audio input passed to transcribe() in "numpy" form,
# with the returned dict shown through the "json" output.
iface = gr.Interface(fn=transcribe, inputs=gr.Audio(type="numpy"), outputs="json")

# Start the web app.
iface.launch()