# vosk / app.py — multi-language speech-to-text demo (Gradio UI + FastAPI backend)
import json
import os
import tempfile
import wave

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import uvicorn
from fastapi import FastAPI, UploadFile, Form
from fastapi.responses import JSONResponse
from vosk import Model, KaldiRecognizer
# Recognition models are loaded once at import time so individual requests
# never pay the (slow) model-initialization cost.
_MODEL_DIRS = {
    "English (US)": "models/vosk-model-small-en-us-0.15",
    "English (Indian)": "models/vosk-model-small-en-in-0.4",
    "Hindi": "models/vosk-model-small-hi-0.22",
    "Telugu": "models/vosk-model-small-te-0.42",
}
models = {label: Model(path) for label, path in _MODEL_DIRS.items()}
def preprocess_audio(audio_file):
    """Convert *audio_file* to a mono, 16 kHz, PCM16 WAV that Vosk can read.

    Parameters
    ----------
    audio_file : str
        Path to the input audio (any format soundfile can decode).

    Returns
    -------
    str
        Path to a newly created temporary WAV file; the caller should
        delete it when finished.
    """
    data, samplerate = sf.read(audio_file)

    # Downmix stereo/multi-channel input to mono.
    if len(data.shape) > 1:
        data = np.mean(data, axis=1)

    # Vosk expects 16 kHz input; resample anything else.
    if samplerate != 16000:
        data = librosa.resample(data, orig_sr=samplerate, target_sr=16000)

    # Write to a unique temp file rather than a fixed "processed.wav" in the
    # working directory, so concurrent requests cannot clobber each other.
    fd, processed_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # sf.write reopens by path; release the raw descriptor
    sf.write(processed_path, data, 16000, subtype="PCM_16")
    return processed_path
def run_transcription(audio_file, language):
    """Transcribe *audio_file* with the Vosk model selected by *language*.

    Core routine shared by the Gradio UI and the FastAPI endpoint.

    Parameters
    ----------
    audio_file : str | None
        Path to the recorded/uploaded audio, or None if nothing was given.
    language : str
        A key of the module-level ``models`` dict.

    Returns
    -------
    str
        The recognized text (possibly empty), or a user-facing prompt when
        no audio was supplied.
    """
    if audio_file is None:
        return "Please record or upload an audio file."

    processed_file = preprocess_audio(audio_file)
    try:
        wf = wave.open(processed_file, "rb")
        try:
            rec = KaldiRecognizer(models[language], wf.getframerate())
            full_text = []
            while True:
                data = wf.readframes(4000)
                if len(data) == 0:
                    break
                # AcceptWaveform returns True at utterance boundaries;
                # collect each finalized segment as it completes.
                if rec.AcceptWaveform(data):
                    result = json.loads(rec.Result())
                    if "text" in result:
                        full_text.append(result["text"])
            # Flush whatever the recognizer is still buffering.
            final_result = json.loads(rec.FinalResult())
            if "text" in final_result:
                full_text.append(final_result["text"])
        finally:
            wf.close()  # was leaked in the original; close even on errors
    finally:
        # Remove the intermediate WAV so repeated requests don't pile up files.
        try:
            os.remove(processed_file)
        except OSError:
            pass
    return " ".join(t for t in full_text if t.strip())
# Gradio front-end
def gradio_ui():
    """Build and return the Gradio Blocks interface for transcription."""
    with gr.Blocks() as demo:
        gr.Markdown("## ๐ŸŽ™๏ธ Multi-Language Speech-to-Text with Vosk")

        with gr.Row():
            mic_or_file = gr.Audio(sources=["microphone", "upload"], type="filepath")
            language_choice = gr.Dropdown(
                choices=list(models.keys()),
                value="English (US)",
                label="Language",
            )

        transcript_box = gr.Textbox(label="Transcription")
        transcribe_btn = gr.Button("Transcribe")
        transcribe_btn.click(
            fn=run_transcription,
            inputs=[mic_or_file, language_choice],
            outputs=transcript_box,
        )
    return demo
# FastAPI backend: the JSON API lives here; the Gradio UI is mounted on it below.
app = FastAPI()
@app.post("/transcribe")
async def transcribe_api(language: str = Form(...), file: UploadFile = None):
    """Transcribe an uploaded audio file.

    Form fields:
        language: a key of the ``models`` dict (e.g. "Hindi").
        file: the audio upload.

    Returns JSON ``{"transcription": ...}`` on success, or an
    ``{"error": ...}`` payload with a 4xx/5xx status code.
    """
    if file is None:
        return JSONResponse({"error": "No audio file provided"}, status_code=400)
    if language not in models:
        # Catch a bad language up front as a client error instead of letting
        # the KeyError surface as a 500.
        return JSONResponse({"error": f"Unknown language: {language}"}, status_code=400)

    # Never build the path from file.filename: the client controls it, so a
    # name like "../../x" could escape /tmp, and concurrent uploads with the
    # same name would collide. mkstemp gives a unique, safe path.
    fd, temp_path = tempfile.mkstemp(suffix=os.path.splitext(file.filename or "")[1])
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(await file.read())
        result_text = run_transcription(temp_path, language)
        return {"transcription": result_text}
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
    finally:
        # Always clean up the upload copy, even on failure.
        try:
            os.remove(temp_path)
        except OSError:
            pass
# Mount the Gradio UI at the web root of the FastAPI app, so "/" serves the
# interactive demo while "/transcribe" remains a plain JSON endpoint.
demo = gradio_ui()
gradio_app = gr.mount_gradio_app(app, demo, path="/")
# Run the combined app when executed directly (`python app.py`); 7860 is the
# conventional Hugging Face Spaces port.
if __name__ == "__main__":
    uvicorn.run(
        gradio_app,
        host="0.0.0.0",
        port=7860
    )