"""
Speech-to-text note taker Gradio app for Hugging Face Spaces

Supports two backends: Vosk (offline) and OpenAI Whisper (local model).

How to use:
 1. Create a new Hugging Face Space (Gradio runtime) and upload this file as `app.py`.
 2. Add the models you want to use for Vosk under a `models/vosk/` directory
    (e.g. `models/vosk/vosk-model-small-en-us-0.15`) and set the VOSK_MODEL_PATH field in the UI.
 3. Space requirements (put in `requirements.txt`):
    gradio
    pydub
    soundfile
    vosk
    whisper
    numpy

Notes:
 - Whisper model sizes can be large; choose `small` or `base` for Spaces with limited resources.
 - Vosk requires pre-downloaded models and works offline.
 - This app converts incoming audio to 16kHz mono WAV before transcribing.
"""
import json
import os
import tempfile
from pathlib import Path
from typing import Optional

import gradio as gr
import numpy as np
import soundfile as sf
from pydub import AudioSegment
| # Optional imports (lazy load) | |
| _whisper_model_cache = {} | |
| _vosk_model_cache = {} | |
def ensure_wav_16k_mono(input_audio_path: str) -> str:
    """Convert an arbitrary audio file to a 16 kHz mono 16-bit WAV.

    Args:
        input_audio_path: Path to any audio file pydub/ffmpeg can decode.

    Returns:
        Path to a new temporary WAV file. The caller is responsible for
        deleting it (transcribe_audio does so after transcription).
    """
    audio = AudioSegment.from_file(input_audio_path)
    # 16 kHz, mono, 16-bit samples: the format both backends expect.
    audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)
    # BUGFIX: the previous NamedTemporaryFile(delete=False) handle was never
    # closed — a file-descriptor leak, and on Windows pydub cannot reopen a
    # path that is still held open. mkstemp + close avoids both problems.
    fd, out_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    audio.export(out_path, format="wav")
    return out_path
def transcribe_with_whisper(wav_path: str, model_size: str = "small") -> str:
    """Transcribe a WAV file with OpenAI's whisper package (local model).

    Errors are returned as human-readable strings (not raised) so the Gradio
    UI can display them directly in the transcript box.
    """
    try:
        import whisper
    except Exception as e:
        return f"Whisper import error: {e}. Make sure 'whisper' is installed in requirements.txt."

    # Load the requested model once per process and reuse it afterwards.
    model = _whisper_model_cache.get(model_size)
    if model is None:
        try:
            model = whisper.load_model(model_size)
        except Exception as e:
            return f"Failed to load Whisper model '{model_size}': {e}"
        _whisper_model_cache[model_size] = model

    try:
        return model.transcribe(wav_path).get("text", "")
    except Exception as e:
        return f"Whisper transcription error: {e}"
def transcribe_with_vosk(wav_path: str, vosk_model_path: str) -> str:
    """Transcribe a 16 kHz mono WAV file with a local Vosk model.

    Args:
        wav_path: Path to a 16 kHz mono WAV file (see ensure_wav_16k_mono).
        vosk_model_path: Directory containing a downloaded Vosk model.

    Returns:
        The transcript, or a human-readable error string — errors are
        returned (not raised) so the Gradio UI can display them directly.
    """
    try:
        from vosk import Model, KaldiRecognizer
    except Exception as e:
        return f"Vosk import error: {e}. Make sure 'vosk' is installed in requirements.txt."
    if not vosk_model_path or not os.path.isdir(vosk_model_path):
        return "Vosk model path is invalid or missing. Please provide a valid Vosk model directory."

    # Load each model directory once per process and cache it.
    if vosk_model_path not in _vosk_model_cache:
        try:
            _vosk_model_cache[vosk_model_path] = Model(vosk_model_path)
        except Exception as e:
            return f"Failed to load Vosk model at '{vosk_model_path}': {e}"
    model = _vosk_model_cache[vosk_model_path]

    import wave

    final_text_parts = []
    # BUGFIX: the wave file was previously never closed (leaked on every
    # call, including the early return on a format mismatch). The context
    # manager guarantees it is closed on all paths.
    with wave.open(wav_path, "rb") as wf:
        if wf.getnchannels() != 1 or wf.getframerate() != 16000:
            return "Vosk expects 16kHz mono WAV. Conversion failed or wrong format."
        rec = KaldiRecognizer(model, wf.getframerate())
        rec.SetWords(True)
        # Feed the recognizer in ~4000-frame chunks; collect each finalized
        # segment as it is produced.
        while True:
            data = wf.readframes(4000)
            if not data:
                break
            if rec.AcceptWaveform(data):
                res = json.loads(rec.Result())
                if "text" in res:
                    final_text_parts.append(res["text"])
    # Flush any remaining partial result.
    res = json.loads(rec.FinalResult())
    if "text" in res:
        final_text_parts.append(res["text"])
    return " ".join(final_text_parts)
def transcribe_audio(audio, backend: str, vosk_model_path: str, whisper_size: str) -> str:
    """Main handler called by Gradio; `audio` can come from mic or upload.

    Args:
        audio: File path string (Gradio `type="filepath"`) or a dict with a
            "name" key; None when no audio was provided.
        backend: "whisper" or "vosk".
        vosk_model_path: Vosk model directory (used only for the vosk backend).
        whisper_size: Whisper model size (used only for the whisper backend).

    Returns:
        The transcript, or a human-readable error message.
    """
    if audio is None:
        return "No audio provided. Use the microphone or upload an audio file."
    # Gradio returns a file path string
    input_path = audio if isinstance(audio, str) else audio.get("name", None)
    if not input_path:
        return "Invalid audio input."
    # Convert to 16kHz mono WAV
    try:
        wav_path = ensure_wav_16k_mono(input_path)
    except Exception as e:
        return f"Audio conversion error: {e}"
    # BUGFIX: the temporary WAV was only deleted on the straight-line path;
    # if a backend raised, the file leaked. try/finally cleans up always.
    try:
        if backend == "whisper":
            return transcribe_with_whisper(wav_path, model_size=whisper_size)
        if backend == "vosk":
            return transcribe_with_vosk(wav_path, vosk_model_path=vosk_model_path)
        return "Unknown backend chosen."
    finally:
        try:
            os.unlink(wav_path)
        except OSError:
            # Best-effort cleanup; a missing temp file is not an error.
            pass
# Build Gradio UI
with gr.Blocks(title="Speech-to-Text Note Taker") as demo:
    gr.Markdown(
        "# 🎙️ Speech-to-Text Note Taker\nChoose a backend (Vosk or Whisper), record or upload audio, and get a transcript you can edit or download."
    )

    # Backend selector plus the per-backend settings side by side.
    with gr.Row():
        backend = gr.Radio(choices=["whisper", "vosk"], value="whisper", label="Backend")
        whisper_size = gr.Dropdown(
            choices=["tiny", "base", "small", "medium", "large"],
            value="small",
            label="Whisper model size (if using Whisper)",
        )
        vosk_model_path = gr.Textbox(
            value="models/vosk/vosk-model-small-en-us-0.15",
            label="Vosk model path (if using Vosk)",
        )

    # Two input widgets: live microphone recording or a file upload.
    with gr.Row():
        mic = gr.Audio(
            sources=["microphone"],
            label="Record (microphone)",
            type="filepath",
            format="wav",
        )
        upload = gr.Audio(
            sources=["upload"],
            label="Or upload an audio file",
            type="filepath",
            format="wav",
        )

    transcribe_btn = gr.Button("Transcribe")
    output = gr.Textbox(label="Transcript", lines=8)

    def run(chosen_backend, mic_file, uploaded_file, model_dir, size):
        """Prefer the microphone recording; fall back to the uploaded file."""
        audio_input = mic_file or uploaded_file or None
        return transcribe_audio(audio_input, chosen_backend, model_dir, size)

    transcribe_btn.click(
        run,
        inputs=[backend, mic, upload, vosk_model_path, whisper_size],
        outputs=[output],
    )

    gr.Markdown(
        "---\n**Tips:**\n- If using Vosk, download a small English model and enter the path in the Vosk model path field.\n- If using Whisper, choose a smaller model for faster transcriptions on CPU.\n"
    )

if __name__ == "__main__":
    demo.launch()