# Hugging Face Space: multi-language speech-to-text (Vosk + Gradio + FastAPI).
import json
import os
import tempfile
import wave

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import uvicorn
from fastapi import FastAPI, Form, UploadFile
from fastapi.responses import JSONResponse
from vosk import KaldiRecognizer, Model
# 🔹 Load every Vosk model once at startup so requests never pay load latency.
_MODEL_DIRS = {
    "English (US)": "models/vosk-model-small-en-us-0.15",
    "English (Indian)": "models/vosk-model-small-en-in-0.4",
    "Hindi": "models/vosk-model-small-hi-0.22",
    "Telugu": "models/vosk-model-small-te-0.42",
}
models = {label: Model(path) for label, path in _MODEL_DIRS.items()}
def preprocess_audio(audio_file):
    """Convert *audio_file* to a mono, 16 kHz, PCM16 WAV.

    Vosk recognizers expect single-channel 16-bit PCM audio; this function
    normalizes any format soundfile can read into that shape.

    Args:
        audio_file: Path to the input audio file.

    Returns:
        Path to a newly created temporary WAV file; the caller is
        responsible for deleting it when done.
    """
    data, samplerate = sf.read(audio_file)
    # Downmix stereo/multi-channel to mono by averaging the channels.
    if data.ndim > 1:
        data = np.mean(data, axis=1)
    # Resample to the 16 kHz rate the Vosk models expect.
    if samplerate != 16000:
        data = librosa.resample(data, orig_sr=samplerate, target_sr=16000)
    # Write to a unique temp file instead of a fixed "processed.wav":
    # the app serves Gradio and FastAPI concurrently, and a shared path
    # would let simultaneous requests clobber each other's audio.
    fd, processed_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # sf.write reopens by path; release the raw descriptor.
    sf.write(processed_path, data, 16000, subtype="PCM_16")
    return processed_path
def run_transcription(audio_file, language):
    """Transcribe *audio_file* using the Vosk model for *language*.

    Core transcription routine shared by the Gradio UI and the FastAPI
    endpoint.

    Args:
        audio_file: Path to the recorded/uploaded audio, or None.
        language: Key into the module-level ``models`` dict.

    Returns:
        The joined transcript text, or a user-facing prompt string when
        no audio file was supplied.
    """
    if audio_file is None:
        return "Please record or upload an audio file."
    processed_file = preprocess_audio(audio_file)
    full_text = []
    try:
        # Context manager closes the WAV handle even if recognition
        # raises (the original leaked the open file object).
        with wave.open(processed_file, "rb") as wf:
            rec = KaldiRecognizer(models[language], wf.getframerate())
            # Feed 4000-frame chunks (~0.25 s at 16 kHz); collect each
            # finalized utterance as it completes.
            while True:
                data = wf.readframes(4000)
                if len(data) == 0:
                    break
                if rec.AcceptWaveform(data):
                    result = json.loads(rec.Result())
                    if "text" in result:
                        full_text.append(result["text"])
            # Flush whatever partial utterance remains at end of stream.
            final_result = json.loads(rec.FinalResult())
            if "text" in final_result:
                full_text.append(final_result["text"])
    finally:
        # Remove the intermediate WAV produced by preprocess_audio so
        # repeated requests don't accumulate files on disk.
        if os.path.exists(processed_file):
            os.remove(processed_file)
    return " ".join(t for t in full_text if t.strip())
| # ๐น Gradio UI | |
def gradio_ui():
    """Assemble and return the Gradio Blocks front-end for transcription."""
    with gr.Blocks() as blocks:
        gr.Markdown("## ๐๏ธ Multi-Language Speech-to-Text with Vosk")
        with gr.Row():
            audio_in = gr.Audio(sources=["microphone", "upload"], type="filepath")
            language_choice = gr.Dropdown(
                choices=list(models.keys()),
                value="English (US)",
                label="Language",
            )
        transcript_box = gr.Textbox(label="Transcription")
        transcribe_btn = gr.Button("Transcribe")
        transcribe_btn.click(
            fn=run_transcription,
            inputs=[audio_in, language_choice],
            outputs=transcript_box,
        )
    return blocks
| # ๐น FastAPI backend | |
app = FastAPI()


# The original handler had no route decorator, so FastAPI never registered
# it and the endpoint was unreachable; wire it up explicitly.
@app.post("/transcribe")
async def transcribe_api(language: str = Form(...), file: UploadFile = None):
    """HTTP transcription endpoint.

    Args:
        language: Form field naming the model, one of ``models``' keys.
        file: Uploaded audio file.

    Returns:
        ``{"transcription": ...}`` on success; a JSON error payload with
        status 400 (no file) or 500 (transcription failure) otherwise.
    """
    if file is None:
        return JSONResponse({"error": "No audio file provided"}, status_code=400)
    # Never build a filesystem path from the client-supplied filename
    # (path traversal / collisions); keep only its extension and write
    # the upload to a unique temp file.
    suffix = os.path.splitext(os.path.basename(file.filename or ""))[1]
    fd, temp_path = tempfile.mkstemp(suffix=suffix)
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(await file.read())
        result_text = run_transcription(temp_path, language)
        return {"transcription": result_text}
    except Exception as e:
        # Boundary handler: surface the failure as a JSON 500 rather than
        # letting the exception propagate to the ASGI server.
        return JSONResponse({"error": str(e)}, status_code=500)
    finally:
        # Clean up the uploaded copy regardless of outcome.
        if os.path.exists(temp_path):
            os.remove(temp_path)
# 🔹 Mount the Gradio UI at the web root of the FastAPI application.
demo = gradio_ui()
gradio_app = gr.mount_gradio_app(app, demo, path="/")

# 🔹 Entry point: serve the combined app on all interfaces, port 7860.
if __name__ == "__main__":
    uvicorn.run(gradio_app, host="0.0.0.0", port=7860)