"""Gradio app that transcribes recorded or uploaded audio with faster-whisper.

The UI hands audio to :func:`transcribe` as a ``(sample_rate, samples)`` tuple;
the function returns the detected language plus segment- and word-level
timestamps as a JSON-serialisable dict.
"""

import os
import tempfile
import wave

import gradio as gr
import numpy as np
from faster_whisper import WhisperModel

# Load Whisper model (CPU, free tier safe)
model = WhisperModel(
    "small",
    device="cpu",
    compute_type="int8",
)

# Largest magnitude representable in signed 16-bit PCM.
_INT16_MAX = 32767


def _to_mono_int16(data: np.ndarray) -> np.ndarray:
    """Convert raw audio samples to a 1-D signed 16-bit PCM array.

    Args:
        data: Samples shaped ``(n,)`` or ``(n, channels)``. Integer dtypes are
            treated as signed PCM at that dtype's full scale; floating dtypes
            are assumed to be normalised to ``[-1.0, 1.0]``.

    Returns:
        A 1-D ``np.int16`` array with all channels averaged into one.
    """
    samples = np.asarray(data)

    # Normalise to float in [-1, 1] first so every input dtype goes through
    # the same downmix/scale path. Multiplying integer PCM by 32767 directly
    # (as if it were already normalised float) would overflow int16.
    if np.issubdtype(samples.dtype, np.integer):
        full_scale = float(np.iinfo(samples.dtype).max)
        normalised = samples.astype(np.float64) / full_scale
    else:
        normalised = samples.astype(np.float64)

    # Multi-channel audio is laid out as (samples, channels); average the
    # channels because the WAV written below is declared as mono.
    if normalised.ndim > 1:
        normalised = normalised.mean(axis=1)

    # Clip so out-of-range floats (or the int minimum, e.g. -32768) cannot
    # wrap around when cast to int16.
    normalised = np.clip(normalised, -1.0, 1.0)
    return np.rint(normalised * _INT16_MAX).astype(np.int16)


def transcribe(audio):
    """Transcribe one audio clip and return segments with word timestamps.

    Args:
        audio: ``None`` when the user supplied nothing, otherwise a
            ``(sample_rate, samples)`` tuple as produced by
            ``gr.Audio(type="numpy")``.

    Returns:
        ``{"error": "no audio"}`` when there is nothing to transcribe,
        otherwise ``{"language": str, "segments": [...]}`` where each segment
        has ``start``, ``end``, ``text`` and a ``words`` list of
        ``{"word", "start", "end"}`` dicts (times rounded to 2 decimals).
    """
    if audio is None:
        return {"error": "no audio"}

    sample_rate, data = audio
    pcm = _to_mono_int16(data)
    if pcm.size == 0:
        return {"error": "no audio"}

    # Reserve a unique path, then close the handle right away: reopening a
    # still-open NamedTemporaryFile by name fails on Windows.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    path = tmp.name
    tmp.close()

    try:
        # Save temp WAV (mono, 16-bit) for the model to read.
        with wave.open(path, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(int(sample_rate))
            wf.writeframes(pcm.tobytes())

        segments, info = model.transcribe(
            path,
            word_timestamps=True,
        )

        # `segments` is a lazy generator; consume it while the temp file
        # still exists so cleanup never depends on when the library reads it.
        out_segments = [
            {
                "start": round(seg.start, 2),
                "end": round(seg.end, 2),
                "text": seg.text.strip(),
                "words": [
                    {
                        "word": w.word,
                        "start": round(w.start, 2),
                        "end": round(w.end, 2),
                    }
                    for w in (seg.words or [])
                ],
            }
            for seg in segments
        ]
    finally:
        # Always remove the temp file, even if writing or transcription fails.
        os.remove(path)

    return {
        "language": info.language,
        "segments": out_segments,
    }


iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="numpy"),
    outputs="json",
)

if __name__ == "__main__":
    iface.launch()