"""
Speech-to-text note taker Gradio app for Hugging Face Spaces

Supports two backends: Vosk (offline) and OpenAI Whisper (local model).

How to use:
 1. Create a new Hugging Face Space (Gradio runtime) and upload this file as `app.py`.
 2. Add the models you want to use for Vosk under a `models/vosk/` directory
    (e.g. `models/vosk/vosk-model-small-en-us-0.15`) and set the VOSK_MODEL_PATH field in the UI.
 3. Space requirements (put in `requirements.txt`):
    gradio
    pydub
    soundfile
    vosk
    whisper
    numpy

Notes:
 - Whisper model sizes can be large; choose `small` or `base` for Spaces with limited resources.
 - Vosk requires pre-downloaded models and works offline.
 - This app converts incoming audio to 16kHz mono WAV before transcribing.
"""
import json
import os
import tempfile
from pathlib import Path
from typing import Optional

import gradio as gr
import numpy as np
import soundfile as sf
from pydub import AudioSegment
| # Optional imports (lazy load) | |
| _whisper_model_cache = {} | |
| _vosk_model_cache = {} | |
def ensure_wav_16k_mono(input_audio_path: str) -> str:
    """Convert an arbitrary audio file to a 16 kHz mono 16-bit WAV.

    Args:
        input_audio_path: Path to any audio file pydub/ffmpeg can decode.

    Returns:
        Path to a new temporary WAV file. The caller is responsible for
        deleting it (transcribe_audio does so after transcription).
    """
    audio = AudioSegment.from_file(input_audio_path)
    # 16 kHz, mono, 16-bit samples: the format both backends expect.
    audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)
    # BUGFIX: the previous NamedTemporaryFile(delete=False) handle was never
    # closed — a file-descriptor leak, and on Windows pydub cannot reopen a
    # path that is still held open. mkstemp + close avoids both problems.
    fd, out_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    audio.export(out_path, format="wav")
    return out_path
def transcribe_with_whisper(wav_path: str, model_size: str = "small") -> str:
    """Transcribe a WAV file with OpenAI's whisper package (local model).

    Errors are returned as human-readable strings (not raised) so the Gradio
    UI can display them directly in the transcript box.
    """
    try:
        import whisper
    except Exception as e:
        return f"Whisper import error: {e}. Make sure 'whisper' is installed in requirements.txt."

    # Load the requested model once per process and reuse it afterwards.
    model = _whisper_model_cache.get(model_size)
    if model is None:
        try:
            model = whisper.load_model(model_size)
        except Exception as e:
            return f"Failed to load Whisper model '{model_size}': {e}"
        _whisper_model_cache[model_size] = model

    try:
        return model.transcribe(wav_path).get("text", "")
    except Exception as e:
        return f"Whisper transcription error: {e}"
def transcribe_with_vosk(wav_path: str, vosk_model_path: str) -> str:
    """Transcribe a 16 kHz mono WAV file with a local Vosk model.

    Args:
        wav_path: Path to a 16 kHz mono WAV file (see ensure_wav_16k_mono).
        vosk_model_path: Directory containing a downloaded Vosk model.

    Returns:
        The transcript, or a human-readable error string — errors are
        returned (not raised) so the Gradio UI can display them directly.
    """
    try:
        from vosk import Model, KaldiRecognizer
    except Exception as e:
        return f"Vosk import error: {e}. Make sure 'vosk' is installed in requirements.txt."
    if not vosk_model_path or not os.path.isdir(vosk_model_path):
        return "Vosk model path is invalid or missing. Please provide a valid Vosk model directory."

    # Load each model directory once per process and cache it.
    if vosk_model_path not in _vosk_model_cache:
        try:
            _vosk_model_cache[vosk_model_path] = Model(vosk_model_path)
        except Exception as e:
            return f"Failed to load Vosk model at '{vosk_model_path}': {e}"
    model = _vosk_model_cache[vosk_model_path]

    import wave

    final_text_parts = []
    # BUGFIX: the wave file was previously never closed (leaked on every
    # call, including the early return on a format mismatch). The context
    # manager guarantees it is closed on all paths.
    with wave.open(wav_path, "rb") as wf:
        if wf.getnchannels() != 1 or wf.getframerate() != 16000:
            return "Vosk expects 16kHz mono WAV. Conversion failed or wrong format."
        rec = KaldiRecognizer(model, wf.getframerate())
        rec.SetWords(True)
        # Feed the recognizer in ~4000-frame chunks; collect each finalized
        # segment as it is produced.
        while True:
            data = wf.readframes(4000)
            if not data:
                break
            if rec.AcceptWaveform(data):
                res = json.loads(rec.Result())
                if "text" in res:
                    final_text_parts.append(res["text"])
    # Flush any remaining partial result.
    res = json.loads(rec.FinalResult())
    if "text" in res:
        final_text_parts.append(res["text"])
    return " ".join(final_text_parts)
def transcribe_audio(audio, backend: str, vosk_model_path: str, whisper_size: str) -> str:
    """Main handler called by Gradio; `audio` can come from mic or upload.

    Args:
        audio: File path string (Gradio `type="filepath"`) or a dict with a
            "name" key; None when no audio was provided.
        backend: "whisper" or "vosk".
        vosk_model_path: Vosk model directory (used only for the vosk backend).
        whisper_size: Whisper model size (used only for the whisper backend).

    Returns:
        The transcript, or a human-readable error message.
    """
    if audio is None:
        return "No audio provided. Use the microphone or upload an audio file."
    # Gradio returns a file path string
    input_path = audio if isinstance(audio, str) else audio.get("name", None)
    if not input_path:
        return "Invalid audio input."
    # Convert to 16kHz mono WAV
    try:
        wav_path = ensure_wav_16k_mono(input_path)
    except Exception as e:
        return f"Audio conversion error: {e}"
    # BUGFIX: the temporary WAV was only deleted on the straight-line path;
    # if a backend raised, the file leaked. try/finally cleans up always.
    try:
        if backend == "whisper":
            return transcribe_with_whisper(wav_path, model_size=whisper_size)
        if backend == "vosk":
            return transcribe_with_vosk(wav_path, vosk_model_path=vosk_model_path)
        return "Unknown backend chosen."
    finally:
        try:
            os.unlink(wav_path)
        except OSError:
            # Best-effort cleanup; a missing temp file is not an error.
            pass
# Build Gradio UI
with gr.Blocks(title="Speech-to-Text Note Taker") as demo:
    gr.Markdown(
        "# 🎙️ Speech-to-Text Note Taker\nChoose a backend (Vosk or Whisper), record or upload audio, and get a transcript you can edit or download."
    )

    # Backend selector plus the per-backend settings side by side.
    with gr.Row():
        backend = gr.Radio(choices=["whisper", "vosk"], value="whisper", label="Backend")
        whisper_size = gr.Dropdown(
            choices=["tiny", "base", "small", "medium", "large"],
            value="small",
            label="Whisper model size (if using Whisper)",
        )
        vosk_model_path = gr.Textbox(
            value="models/vosk/vosk-model-small-en-us-0.15",
            label="Vosk model path (if using Vosk)",
        )

    # Two input widgets: live microphone recording or a file upload.
    with gr.Row():
        mic = gr.Audio(
            sources=["microphone"],
            label="Record (microphone)",
            type="filepath",
            format="wav",
        )
        upload = gr.Audio(
            sources=["upload"],
            label="Or upload an audio file",
            type="filepath",
            format="wav",
        )

    transcribe_btn = gr.Button("Transcribe")
    output = gr.Textbox(label="Transcript", lines=8)

    def run(chosen_backend, mic_file, uploaded_file, model_dir, size):
        """Prefer the microphone recording; fall back to the uploaded file."""
        audio_input = mic_file or uploaded_file or None
        return transcribe_audio(audio_input, chosen_backend, model_dir, size)

    transcribe_btn.click(
        run,
        inputs=[backend, mic, upload, vosk_model_path, whisper_size],
        outputs=[output],
    )

    gr.Markdown(
        "---\n**Tips:**\n- If using Vosk, download a small English model and enter the path in the Vosk model path field.\n- If using Whisper, choose a smaller model for faster transcriptions on CPU.\n"
    )

if __name__ == "__main__":
    demo.launch()