# note_taker / app.py
# Author: muhammadharis222 — "Update app.py" (commit 5ad3916, verified)
"""
Speech-to-text note taker Gradio app for Hugging Face Spaces
Supports two backends: Vosk (offline) and OpenAI Whisper (local model).
How to use:
1. Create a new Hugging Face Space (Gradio runtime) and upload this file as `app.py`.
2. Add the models you want to use for Vosk under a `models/vosk/` directory
(e.g. `models/vosk/vosk-model-small-en-us-0.15`) and set the VOSK_MODEL_PATH field in the UI.
3. Space requirements (put in `requirements.txt`):
gradio
pydub
soundfile
vosk
whisper
numpy
Notes:
- Whisper model sizes can be large; choose `small` or `base` for Spaces with limited resources.
- Vosk requires pre-downloaded models and works offline.
- This app converts incoming audio to 16kHz mono WAV before transcribing.
"""
import os
import tempfile
import json
from pathlib import Path
from typing import Optional
import gradio as gr
from pydub import AudioSegment
import soundfile as sf
import numpy as np
# Optional imports (lazy load)
_whisper_model_cache = {}
_vosk_model_cache = {}
def ensure_wav_16k_mono(input_audio_path: str) -> str:
    """Convert an arbitrary audio file to a 16 kHz mono 16-bit WAV.

    Parameters
    ----------
    input_audio_path : str
        Path to any audio format pydub/ffmpeg can decode.

    Returns
    -------
    str
        Path to a newly created temporary WAV file. The caller is
        responsible for deleting it (transcribe_audio does so).
    """
    audio = AudioSegment.from_file(input_audio_path)
    # Normalize to 16 kHz / mono / 16-bit PCM: Vosk requires exactly this
    # format, and Whisper accepts it as well.
    audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)
    # mkstemp + close instead of NamedTemporaryFile(delete=False): the
    # original kept the fd open while pydub wrote to the same path, which
    # leaks the descriptor and fails on Windows (open handle blocks a
    # second writer on that file).
    fd, out_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    audio.export(out_path, format="wav")
    return out_path
def transcribe_with_whisper(wav_path: str, model_size: str = "small") -> str:
    """Run a local OpenAI Whisper model over *wav_path* and return the text.

    Models are loaded lazily and memoized per size in the module-level
    cache, so repeated calls reuse the loaded weights. All failures are
    reported as human-readable strings rather than raised, so the Gradio
    UI can display them directly.
    """
    try:
        import whisper
    except Exception as e:
        return f"Whisper import error: {e}. Make sure 'whisper' is installed in requirements.txt."
    global _whisper_model_cache
    model = _whisper_model_cache.get(model_size)
    if model is None:
        try:
            model = whisper.load_model(model_size)
        except Exception as e:
            return f"Failed to load Whisper model '{model_size}': {e}"
        _whisper_model_cache[model_size] = model
    try:
        return model.transcribe(wav_path).get("text", "")
    except Exception as e:
        return f"Whisper transcription error: {e}"
def transcribe_with_vosk(wav_path: str, vosk_model_path: str) -> str:
    """Transcribe *wav_path* with a local Vosk model.

    Parameters
    ----------
    wav_path : str
        Path to a 16 kHz mono WAV file (ensure_wav_16k_mono produces one).
    vosk_model_path : str
        Directory containing a downloaded Vosk model.

    Returns
    -------
    str
        The transcript, or a human-readable error message (errors are
        returned as strings, not raised, so the UI can show them).
    """
    try:
        from vosk import Model, KaldiRecognizer
    except Exception as e:
        return f"Vosk import error: {e}. Make sure 'vosk' is installed in requirements.txt."
    if not vosk_model_path or not os.path.isdir(vosk_model_path):
        return "Vosk model path is invalid or missing. Please provide a valid Vosk model directory."
    global _vosk_model_cache
    # Loading a Vosk model is slow; memoize per directory path.
    if vosk_model_path not in _vosk_model_cache:
        try:
            _vosk_model_cache[vosk_model_path] = Model(vosk_model_path)
        except Exception as e:
            return f"Failed to load Vosk model at '{vosk_model_path}': {e}"
    model = _vosk_model_cache[vosk_model_path]
    import wave
    # Context manager guarantees the WAV handle is closed on every exit
    # path; the original leaked it, including on the early format-check
    # return below.
    with wave.open(wav_path, "rb") as wf:
        if wf.getnchannels() != 1 or wf.getframerate() != 16000:
            return "Vosk expects 16kHz mono WAV. Conversion failed or wrong format."
        rec = KaldiRecognizer(model, wf.getframerate())
        rec.SetWords(True)
        final_text_parts = []
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                res = json.loads(rec.Result())
                # Skip empty segments so the final join has no stray spaces
                # (the original appended "" whenever the key was present).
                if res.get("text"):
                    final_text_parts.append(res["text"])
        res = json.loads(rec.FinalResult())
        if res.get("text"):
            final_text_parts.append(res["text"])
    return " ".join(final_text_parts)
def transcribe_audio(audio, backend: str, vosk_model_path: str, whisper_size: str):
    """Main handler called by Gradio. *audio* comes from the mic or an upload.

    Parameters
    ----------
    audio : str | dict | None
        File path string (Gradio ``type="filepath"``), or a dict from
        older/newer Gradio versions, or None when nothing was recorded.
    backend : str
        "whisper" or "vosk".
    vosk_model_path : str
        Vosk model directory (used only for the vosk backend).
    whisper_size : str
        Whisper model size (used only for the whisper backend).

    Returns
    -------
    str
        Transcript text or a human-readable error message.
    """
    if audio is None:
        return "No audio provided. Use the microphone or upload an audio file."
    # Gradio with type="filepath" passes a plain path string; dict payloads
    # use "name" (legacy) or "path" (Gradio 4.x FileData) — accept both.
    if isinstance(audio, str):
        input_path = audio
    else:
        input_path = audio.get("path") or audio.get("name")
    if not input_path:
        return "Invalid audio input."
    # Normalize to 16 kHz mono WAV so both backends accept it.
    try:
        wav_path = ensure_wav_16k_mono(input_path)
    except Exception as e:
        return f"Audio conversion error: {e}"
    try:
        if backend == "whisper":
            return transcribe_with_whisper(wav_path, model_size=whisper_size)
        if backend == "vosk":
            return transcribe_with_vosk(wav_path, vosk_model_path=vosk_model_path)
        return "Unknown backend chosen."
    finally:
        # Always remove the temp WAV, even if a backend raises
        # (the original skipped cleanup on an exception).
        try:
            os.unlink(wav_path)
        except OSError:
            pass
# Build Gradio UI
# Assemble the Gradio interface: backend selector, two audio inputs
# (microphone and upload), and a transcript box wired to a single button.
with gr.Blocks(title="Speech-to-Text Note Taker") as demo:
    gr.Markdown(
        "# 🎙️ Speech-to-Text Note Taker\nChoose a backend (Vosk or Whisper), record or upload audio, and get a transcript you can edit or download."
    )

    with gr.Row():
        backend = gr.Radio(choices=["whisper", "vosk"], value="whisper", label="Backend")
        whisper_size = gr.Dropdown(
            choices=["tiny", "base", "small", "medium", "large"],
            value="small",
            label="Whisper model size (if using Whisper)",
        )
        vosk_model_path = gr.Textbox(
            value="models/vosk/vosk-model-small-en-us-0.15",
            label="Vosk model path (if using Vosk)",
        )

    with gr.Row():
        mic = gr.Audio(
            sources=["microphone"], label="Record (microphone)", type="filepath", format="wav"
        )
        upload = gr.Audio(
            sources=["upload"], label="Or upload an audio file", type="filepath", format="wav"
        )

    transcribe_btn = gr.Button("Transcribe")
    output = gr.Textbox(label="Transcript", lines=8)

    def run(chosen_backend, mic_file, uploaded_file, model_dir, size):
        """Dispatch to transcribe_audio, preferring the mic recording over an upload."""
        source = mic_file or uploaded_file or None
        return transcribe_audio(source, chosen_backend, model_dir, size)

    transcribe_btn.click(run, inputs=[backend, mic, upload, vosk_model_path, whisper_size], outputs=[output])

    gr.Markdown(
        "---\n**Tips:**\n- If using Vosk, download a small English model and enter the path in the Vosk model path field.\n- If using Whisper, choose a smaller model for faster transcriptions on CPU.\n"
    )
# Start the Gradio server when this file is executed directly.
if __name__ == "__main__":
    demo.launch()