Update app.py
app.py CHANGED
```diff
@@ -4,7 +4,8 @@ Supports two backends: Vosk (offline) and OpenAI Whisper (local model).
 
 How to use:
 1. Create a new Hugging Face Space (Gradio runtime) and upload this file as `app.py`.
-2. Add the models you want to use for Vosk under a `models/vosk/` directory
+2. Add the models you want to use for Vosk under a `models/vosk/` directory
+   (e.g. `models/vosk/vosk-model-small-en-us-0.15`) and set the VOSK_MODEL_PATH field in the UI.
 3. Space requirements (put in `requirements.txt`):
    gradio
    pydub
```
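The hunk cuts the `requirements.txt` list off after `pydub`. Judging from the imports and backends visible elsewhere in this diff (`soundfile`, `numpy`, Vosk, Whisper), the full list is presumably along these lines; the exact package set beyond the two shown is an assumption:

```text
gradio
pydub
soundfile
numpy
vosk
openai-whisper
```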
```diff
@@ -17,7 +18,6 @@ Notes:
 - Whisper model sizes can be large; choose `small` or `base` for Spaces with limited resources.
 - Vosk requires pre-downloaded models and works offline.
 - This app converts incoming audio to 16kHz mono WAV before transcribing.
-
 """
 
 import os
```
```diff
@@ -31,7 +31,7 @@ from pydub import AudioSegment
 import soundfile as sf
 import numpy as np
 
-# Optional imports (
+# Optional imports (lazy load)
 _whisper_model_cache = {}
 _vosk_model_cache = {}
 
```
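The `# Optional imports (lazy load)` comment plus the two module-level caches imply a load-on-first-use pattern. The loader functions themselves sit outside this hunk, so the sketch below is an assumption about their shape, not the file's actual code:

```python
# Hypothetical loader illustrating the lazy-load pattern the caches suggest;
# the real function in app.py is not shown in this diff.
def get_whisper_model(size: str):
    if size not in _whisper_model_cache:
        import whisper  # deferred so the app can start without the package
        _whisper_model_cache[size] = whisper.load_model(size)
    return _whisper_model_cache[size]
```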
```diff
@@ -112,17 +112,16 @@ def transcribe_with_vosk(wav_path: str, vosk_model_path: str) -> str:
 
 
 def transcribe_audio(audio, backend: str, vosk_model_path: str, whisper_size: str):
-    """Main handler called by Gradio. audio can be
+    """Main handler called by Gradio. audio can be from mic or upload."""
     if audio is None:
         return "No audio provided. Use the microphone or upload an audio file."
 
-    # Gradio
-    if isinstance(audio,
-
-
-    input_path = audio
+    # Gradio returns a file path string
+    input_path = audio if isinstance(audio, str) else audio.get("name", None)
+    if not input_path:
+        return "Invalid audio input."
 
-    # Convert
+    # Convert to 16kHz mono WAV
     try:
         wav_path = ensure_wav_16k_mono(input_path)
     except Exception as e:
```
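`ensure_wav_16k_mono` is called here but defined outside the changed hunks. Given the note that the app "converts incoming audio to 16kHz mono WAV" and the `pydub` import at the top of the file, a minimal sketch of what it presumably does:

```python
import tempfile

from pydub import AudioSegment  # already imported at the top of app.py


def ensure_wav_16k_mono(input_path: str) -> str:
    """Convert any audio file to a temporary 16kHz mono WAV; return its path."""
    audio = AudioSegment.from_file(input_path)
    audio = audio.set_frame_rate(16000).set_channels(1)
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    audio.export(tmp.name, format="wav")
    return tmp.name
```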
```diff
@@ -135,7 +134,6 @@ def transcribe_audio(audio, backend: str, vosk_model_path: str, whisper_size: str):
     else:
         text = "Unknown backend chosen."
 
-    # Clean up temporary WAV file
     try:
         os.unlink(wav_path)
     except Exception:
```
```diff
@@ -144,19 +142,40 @@ def transcribe_audio(audio, backend: str, vosk_model_path: str, whisper_size: str):
     return text
 
 
-# Build
+# Build Gradio UI
 with gr.Blocks(title="Speech-to-Text Note Taker") as demo:
-    gr.Markdown(
+    gr.Markdown(
+        "# 🎙️ Speech-to-Text Note Taker\nChoose a backend (Vosk or Whisper), record or upload audio, and get a transcript you can edit or download."
+    )
 
     with gr.Row():
-        backend = gr.Radio(
-
-
-
+        backend = gr.Radio(
+            choices=["whisper", "vosk"], value="whisper", label="Backend"
+        )
+        whisper_size = gr.Dropdown(
+            choices=["tiny", "base", "small", "medium", "large"],
+            value="small",
+            label="Whisper model size (if using Whisper)",
+        )
+
+        vosk_model_path = gr.Textbox(
+            value="models/vosk/vosk-model-small-en-us-0.15",
+            label="Vosk model path (if using Vosk)",
+        )
 
     with gr.Row():
-        mic = gr.Audio(
-
+        mic = gr.Audio(
+            sources=["microphone"],
+            label="Record (microphone)",
+            type="filepath",
+            format="wav",
+        )
+        upload = gr.Audio(
+            sources=["upload"],
+            label="Or upload an audio file",
+            type="filepath",
+            format="wav",
+        )
 
     transcribe_btn = gr.Button("Transcribe")
     output = gr.Textbox(label="Transcript", lines=8)
```
```diff
@@ -172,7 +191,9 @@ with gr.Blocks(title="Speech-to-Text Note Taker") as demo:
 
     transcribe_btn.click(run, inputs=[backend, mic, upload, vosk_model_path, whisper_size], outputs=[output])
 
-    gr.Markdown(
+    gr.Markdown(
+        "---\n**Tips:**\n- If using Vosk, download a small English model and enter the path in the Vosk model path field.\n- If using Whisper, choose a smaller model for faster transcriptions on CPU.\n"
+    )
 
 if __name__ == "__main__":
     demo.launch()
```
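The `run` callback wired to `transcribe_btn.click` is not part of the changed lines. A plausible sketch matching the order of the `inputs` list, with the mic-over-upload preference being an assumption:

```python
# Hypothetical glue function; app.py's actual run() is outside this diff.
def run(backend, mic, upload, vosk_model_path, whisper_size):
    audio = mic if mic is not None else upload  # prefer the live recording
    return transcribe_audio(audio, backend, vosk_model_path, whisper_size)
```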