""" Speech-to-text note taker Gradio app for Hugging Face Spaces Supports two backends: Vosk (offline) and OpenAI Whisper (local model). How to use: 1. Create a new Hugging Face Space (Gradio runtime) and upload this file as `app.py`. 2. Add the models you want to use for Vosk under a `models/vosk/` directory (e.g. `models/vosk/vosk-model-small-en-us-0.15`) and set the VOSK_MODEL_PATH field in the UI. 3. Space requirements (put in `requirements.txt`): gradio pydub soundfile vosk whisper numpy Notes: - Whisper model sizes can be large; choose `small` or `base` for Spaces with limited resources. - Vosk requires pre-downloaded models and works offline. - This app converts incoming audio to 16kHz mono WAV before transcribing. """ import os import tempfile import json from pathlib import Path from typing import Optional import gradio as gr from pydub import AudioSegment import soundfile as sf import numpy as np # Optional imports (lazy load) _whisper_model_cache = {} _vosk_model_cache = {} def ensure_wav_16k_mono(input_audio_path: str) -> str: """Convert arbitrary audio file to a 16kHz mono WAV and return the path.""" audio = AudioSegment.from_file(input_audio_path) audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2) out = tempfile.NamedTemporaryFile(delete=False, suffix=".wav") audio.export(out.name, format="wav") return out.name def transcribe_with_whisper(wav_path: str, model_size: str = "small") -> str: """Transcribe using OpenAI's whisper package (local model).""" try: import whisper except Exception as e: return f"Whisper import error: {e}. Make sure 'whisper' is installed in requirements.txt." global _whisper_model_cache if model_size not in _whisper_model_cache: try: _whisper_model_cache[model_size] = whisper.load_model(model_size) except Exception as e: return f"Failed to load Whisper model '{model_size}': {e}" model = _whisper_model_cache[model_size] try: result = model.transcribe(wav_path) return result.get("text", "") except Exception as e: return f"Whisper transcription error: {e}" def transcribe_with_vosk(wav_path: str, vosk_model_path: str) -> str: """Transcribe using Vosk local model. Expects a path to a downloaded Vosk model directory.""" try: from vosk import Model, KaldiRecognizer except Exception as e: return f"Vosk import error: {e}. Make sure 'vosk' is installed in requirements.txt." if not vosk_model_path or not os.path.isdir(vosk_model_path): return "Vosk model path is invalid or missing. Please provide a valid Vosk model directory." global _vosk_model_cache if vosk_model_path not in _vosk_model_cache: try: _vosk_model_cache[vosk_model_path] = Model(vosk_model_path) except Exception as e: return f"Failed to load Vosk model at '{vosk_model_path}': {e}" model = _vosk_model_cache[vosk_model_path] # Read audio frames import wave wf = wave.open(wav_path, "rb") if wf.getnchannels() != 1 or wf.getframerate() != 16000: return "Vosk expects 16kHz mono WAV. Conversion failed or wrong format." rec = KaldiRecognizer(model, wf.getframerate()) rec.SetWords(True) final_text_parts = [] while True: data = wf.readframes(4000) if len(data) == 0: break if rec.AcceptWaveform(data): res = json.loads(rec.Result()) if "text" in res: final_text_parts.append(res["text"]) res = json.loads(rec.FinalResult()) if "text" in res: final_text_parts.append(res["text"]) return " ".join(final_text_parts) def transcribe_audio(audio, backend: str, vosk_model_path: str, whisper_size: str): """Main handler called by Gradio. audio can be from mic or upload.""" if audio is None: return "No audio provided. Use the microphone or upload an audio file." # Gradio returns a file path string input_path = audio if isinstance(audio, str) else audio.get("name", None) if not input_path: return "Invalid audio input." # Convert to 16kHz mono WAV try: wav_path = ensure_wav_16k_mono(input_path) except Exception as e: return f"Audio conversion error: {e}" if backend == "whisper": text = transcribe_with_whisper(wav_path, model_size=whisper_size) elif backend == "vosk": text = transcribe_with_vosk(wav_path, vosk_model_path=vosk_model_path) else: text = "Unknown backend chosen." try: os.unlink(wav_path) except Exception: pass return text # Build Gradio UI with gr.Blocks(title="Speech-to-Text Note Taker") as demo: gr.Markdown( "# 🎙️ Speech-to-Text Note Taker\nChoose a backend (Vosk or Whisper), record or upload audio, and get a transcript you can edit or download." ) with gr.Row(): backend = gr.Radio( choices=["whisper", "vosk"], value="whisper", label="Backend" ) whisper_size = gr.Dropdown( choices=["tiny", "base", "small", "medium", "large"], value="small", label="Whisper model size (if using Whisper)", ) vosk_model_path = gr.Textbox( value="models/vosk/vosk-model-small-en-us-0.15", label="Vosk model path (if using Vosk)", ) with gr.Row(): mic = gr.Audio( sources=["microphone"], label="Record (microphone)", type="filepath", format="wav", ) upload = gr.Audio( sources=["upload"], label="Or upload an audio file", type="filepath", format="wav", ) transcribe_btn = gr.Button("Transcribe") output = gr.Textbox(label="Transcript", lines=8) def run(b, mfile, ufile, vpath, wsize): # prefer mic if data exists otherwise uploaded file audio_input = None if mfile: audio_input = mfile elif ufile: audio_input = ufile return transcribe_audio(audio_input, b, vpath, wsize) transcribe_btn.click(run, inputs=[backend, mic, upload, vosk_model_path, whisper_size], outputs=[output]) gr.Markdown( "---\n**Tips:**\n- If using Vosk, download a small English model and enter the path in the Vosk model path field.\n- If using Whisper, choose a smaller model for faster transcriptions on CPU.\n" ) if __name__ == "__main__": demo.launch()