Spaces:

Invescoz
/

whisper.cpp

Sleeping

File size: 1,448 Bytes

968aad0
86d5d76
968aad0
 
86d5d76
af83e32
968aad0
af83e32
86d5d76
 
 
 
 
968aad0
 
 
 
 
 
 
af83e32
86d5d76
 
 
 
 
 
 
968aad0
86d5d76
968aad0
86d5d76
968aad0
 
 
 
86d5d76
 
 
 
 
 
968aad0
 
86d5d76
 
 
968aad0
86d5d76
968aad0
 
 
 
86d5d76
 
968aad0
 
 
 
 
af83e32
968aad0
 
86d5d76

import gradio as gr
from faster_whisper import WhisperModel
import tempfile
import os
import numpy as np
import wave

# Shared Whisper instance, created once at import time and reused by every
# request: the "small" checkpoint running on CPU with int8 compute
# (chosen to stay within the free hosting tier).
model = WhisperModel("small", device="cpu", compute_type="int8")

def _to_int16_mono(data):
    """Return *data* as a 1-D ``int16`` array suitable for a 16-bit mono WAV.

    Integer input is treated as PCM and rescaled from its own full-scale
    range (``int16`` passes through untouched); anything else is treated as
    float audio in ``[-1.0, 1.0]`` and is clipped before scaling so that
    out-of-range samples saturate instead of wrapping around.
    """
    samples = np.asarray(data)
    src_dtype = samples.dtype
    samples = samples.astype(np.float64)

    # Down-mix multi-channel audio to mono by averaging the channels.
    # NOTE(review): assumes a (samples, channels) layout - confirm against
    # the Gradio version in use.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    if np.issubdtype(src_dtype, np.integer):
        if src_dtype != np.int16:
            # Map e.g. int32 full scale onto int16 full scale.
            # NOTE(review): assumes signed PCM.
            samples *= 32768.0 / (float(np.iinfo(src_dtype).max) + 1.0)
    else:
        samples = np.clip(samples, -1.0, 1.0) * 32767.0

    return np.clip(np.rint(samples), -32768, 32767).astype(np.int16)


def transcribe(audio):
    """Transcribe a Gradio audio input and return word-level timestamps.

    Args:
        audio: ``None`` or a ``(sample_rate, samples)`` tuple as produced by
            ``gr.Audio(type="numpy")``.

    Returns:
        ``{"error": "no audio"}`` when *audio* is ``None``; otherwise a dict
        with the detected ``"language"`` and a ``"segments"`` list. Each
        segment holds ``"start"``, ``"end"`` (seconds, rounded to 2
        decimals), the stripped ``"text"`` and a ``"words"`` list of
        ``{"word", "start", "end"}`` dicts.
    """
    if audio is None:
        return {"error": "no audio"}

    sample_rate, data = audio

    # Multiplying already-int16 samples by 32767 overflows and corrupts the
    # audio, so normalise dtype and channel count explicitly first.
    pcm = _to_int16_mono(data)

    # mkstemp + close gives a path that can be reopened by name on every
    # platform (an open NamedTemporaryFile cannot be reopened on Windows).
    fd, path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        with wave.open(path, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(sample_rate)
            wf.writeframes(pcm.tobytes())

        segments, info = model.transcribe(path, word_timestamps=True)

        # `segments` is consumed here, while the file still exists, rather
        # than after deletion.
        out_segments = [
            {
                "start": round(seg.start, 2),
                "end": round(seg.end, 2),
                "text": seg.text.strip(),
                "words": [
                    {
                        "word": w.word,
                        "start": round(w.start, 2),
                        "end": round(w.end, 2),
                    }
                    for w in (seg.words or [])
                ],
            }
            for seg in segments
        ]
    finally:
        # Always clean up, including when writing or transcription fails.
        os.remove(path)

    return {
        "language": info.language,
        "segments": out_segments,
    }

# Gradio front end: a single audio input (handed to `transcribe` as a numpy
# tuple) whose result dict is rendered as JSON.
audio_input = gr.Audio(type="numpy")
iface = gr.Interface(fn=transcribe, inputs=audio_input, outputs="json")

# Start serving as soon as the module is executed.
iface.launch()