# vosk / app.py — multi-language speech-to-text demo (Gradio UI + FastAPI backend)
import json
import os
import tempfile
import wave

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import uvicorn
from fastapi import FastAPI, UploadFile, Form
from fastapi.responses import JSONResponse
from vosk import Model, KaldiRecognizer
# Recognition models are loaded once at import time so individual requests
# never pay the (slow) model-initialization cost.
_MODEL_DIRS = {
    "English (US)": "models/vosk-model-small-en-us-0.15",
    "English (Indian)": "models/vosk-model-small-en-in-0.4",
    "Hindi": "models/vosk-model-small-hi-0.22",
    "Telugu": "models/vosk-model-small-te-0.42",
}
models = {label: Model(path) for label, path in _MODEL_DIRS.items()}
def preprocess_audio(audio_file):
    """Convert *audio_file* to a mono, 16 kHz, PCM16 WAV that Vosk can read.

    Parameters
    ----------
    audio_file : str
        Path to the input audio (any format soundfile can decode).

    Returns
    -------
    str
        Path to a newly created temporary WAV file; the caller should
        delete it when finished.
    """
    data, samplerate = sf.read(audio_file)

    # Downmix stereo/multi-channel input to mono.
    if len(data.shape) > 1:
        data = np.mean(data, axis=1)

    # Vosk expects 16 kHz input; resample anything else.
    if samplerate != 16000:
        data = librosa.resample(data, orig_sr=samplerate, target_sr=16000)

    # Write to a unique temp file rather than a fixed "processed.wav" in the
    # working directory, so concurrent requests cannot clobber each other.
    fd, processed_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # sf.write reopens by path; release the raw descriptor
    sf.write(processed_path, data, 16000, subtype="PCM_16")
    return processed_path
def run_transcription(audio_file, language):
    """Transcribe *audio_file* with the Vosk model selected by *language*.

    Core routine shared by the Gradio UI and the FastAPI endpoint.

    Parameters
    ----------
    audio_file : str | None
        Path to the recorded/uploaded audio, or None if nothing was given.
    language : str
        A key of the module-level ``models`` dict.

    Returns
    -------
    str
        The recognized text (possibly empty), or a user-facing prompt when
        no audio was supplied.
    """
    if audio_file is None:
        return "Please record or upload an audio file."

    processed_file = preprocess_audio(audio_file)
    try:
        wf = wave.open(processed_file, "rb")
        try:
            rec = KaldiRecognizer(models[language], wf.getframerate())
            full_text = []
            while True:
                data = wf.readframes(4000)
                if len(data) == 0:
                    break
                # AcceptWaveform returns True at utterance boundaries;
                # collect each finalized segment as it completes.
                if rec.AcceptWaveform(data):
                    result = json.loads(rec.Result())
                    if "text" in result:
                        full_text.append(result["text"])
            # Flush whatever the recognizer is still buffering.
            final_result = json.loads(rec.FinalResult())
            if "text" in final_result:
                full_text.append(final_result["text"])
        finally:
            wf.close()  # was leaked in the original; close even on errors
    finally:
        # Remove the intermediate WAV so repeated requests don't pile up files.
        try:
            os.remove(processed_file)
        except OSError:
            pass
    return " ".join(t for t in full_text if t.strip())
# Gradio front-end
def gradio_ui():
    """Build and return the Gradio Blocks interface for transcription."""
    with gr.Blocks() as demo:
        gr.Markdown("## ๐ŸŽ™๏ธ Multi-Language Speech-to-Text with Vosk")

        with gr.Row():
            mic_or_file = gr.Audio(sources=["microphone", "upload"], type="filepath")
            language_choice = gr.Dropdown(
                choices=list(models.keys()),
                value="English (US)",
                label="Language",
            )

        transcript_box = gr.Textbox(label="Transcription")
        transcribe_btn = gr.Button("Transcribe")
        transcribe_btn.click(
            fn=run_transcription,
            inputs=[mic_or_file, language_choice],
            outputs=transcript_box,
        )
    return demo
# FastAPI backend: the JSON API lives here; the Gradio UI is mounted on it below.
app = FastAPI()
@app.post("/transcribe")
async def transcribe_api(language: str = Form(...), file: UploadFile = None):
    """Transcribe an uploaded audio file.

    Form fields:
        language: a key of the ``models`` dict (e.g. "Hindi").
        file: the audio upload.

    Returns JSON ``{"transcription": ...}`` on success, or an
    ``{"error": ...}`` payload with a 4xx/5xx status code.
    """
    if file is None:
        return JSONResponse({"error": "No audio file provided"}, status_code=400)
    if language not in models:
        # Catch a bad language up front as a client error instead of letting
        # the KeyError surface as a 500.
        return JSONResponse({"error": f"Unknown language: {language}"}, status_code=400)

    # Never build the path from file.filename: the client controls it, so a
    # name like "../../x" could escape /tmp, and concurrent uploads with the
    # same name would collide. mkstemp gives a unique, safe path.
    fd, temp_path = tempfile.mkstemp(suffix=os.path.splitext(file.filename or "")[1])
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(await file.read())
        result_text = run_transcription(temp_path, language)
        return {"transcription": result_text}
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
    finally:
        # Always clean up the upload copy, even on failure.
        try:
            os.remove(temp_path)
        except OSError:
            pass
# Mount the Gradio UI at the web root of the FastAPI app, so "/" serves the
# interactive demo while "/transcribe" remains a plain JSON endpoint.
demo = gradio_ui()
gradio_app = gr.mount_gradio_app(app, demo, path="/")
# Run the combined app when executed directly (`python app.py`); 7860 is the
# conventional Hugging Face Spaces port.
if __name__ == "__main__":
    uvicorn.run(
        gradio_app,
        host="0.0.0.0",
        port=7860
    )