muhammadharis222 committed on
Commit
5ad3916
·
verified ·
1 Parent(s): 46b4574

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -20
app.py CHANGED
@@ -4,7 +4,8 @@ Supports two backends: Vosk (offline) and OpenAI Whisper (local model).
4
 
5
  How to use:
6
  1. Create a new Hugging Face Space (Gradio runtime) and upload this file as `app.py`.
7
- 2. Add the models you want to use for Vosk under a `models/vosk/` directory (e.g. `models/vosk/vosk-model-small-en-us-0.15`) and set the VOSK_MODEL_PATH field in the UI.
 
8
  3. Space requirements (put in `requirements.txt`):
9
  gradio
10
  pydub
@@ -17,7 +18,6 @@ Notes:
17
  - Whisper model sizes can be large; choose `small` or `base` for Spaces with limited resources.
18
  - Vosk requires pre-downloaded models and works offline.
19
  - This app converts incoming audio to 16kHz mono WAV before transcribing.
20
-
21
  """
22
 
23
  import os
@@ -31,7 +31,7 @@ from pydub import AudioSegment
31
  import soundfile as sf
32
  import numpy as np
33
 
34
- # Optional imports (we import lazily inside functions to avoid heavy startup)
35
  _whisper_model_cache = {}
36
  _vosk_model_cache = {}
37
 
@@ -112,17 +112,16 @@ def transcribe_with_vosk(wav_path: str, vosk_model_path: str) -> str:
112
 
113
 
114
  def transcribe_audio(audio, backend: str, vosk_model_path: str, whisper_size: str):
115
- """Main handler called by Gradio. audio can be either None, an uploaded file, or a dict from the mic component."""
116
  if audio is None:
117
  return "No audio provided. Use the microphone or upload an audio file."
118
 
119
- # Gradio will give either a path string or a dict with 'name' depending on input; handle both.
120
- if isinstance(audio, dict) and "name" in audio:
121
- input_path = audio["name"]
122
- else:
123
- input_path = audio
124
 
125
- # Convert and normalize to 16k mono WAV
126
  try:
127
  wav_path = ensure_wav_16k_mono(input_path)
128
  except Exception as e:
@@ -135,7 +134,6 @@ def transcribe_audio(audio, backend: str, vosk_model_path: str, whisper_size: st
135
  else:
136
  text = "Unknown backend chosen."
137
 
138
- # Clean up temporary WAV file
139
  try:
140
  os.unlink(wav_path)
141
  except Exception:
@@ -144,19 +142,40 @@ def transcribe_audio(audio, backend: str, vosk_model_path: str, whisper_size: st
144
  return text
145
 
146
 
147
- # Build the Gradio UI
148
  with gr.Blocks(title="Speech-to-Text Note Taker") as demo:
149
- gr.Markdown("# Speech-to-Text Note Taker\nChoose a backend (Vosk or Whisper), record or upload audio, and get a transcript you can edit or download.")
 
 
150
 
151
  with gr.Row():
152
- backend = gr.Radio(choices=["whisper", "vosk"], value="whisper", label="Backend")
153
- whisper_size = gr.Dropdown(choices=["tiny", "base", "small", "medium", "large"], value="small", label="Whisper model size (if using Whisper)")
154
-
155
- vosk_model_path = gr.Textbox(value="models/vosk/vosk-model-small-en-us-0.15", label="Vosk model path (if using Vosk)")
 
 
 
 
 
 
 
 
 
156
 
157
  with gr.Row():
158
- mic = gr.Audio(label="Record (microphone)", type="filepath", format="wav", interactive=True, show_download_button=False)
159
- upload = gr.Audio(label="Or upload an audio file", type="filepath", format="wav", interactive=True)
 
 
 
 
 
 
 
 
 
 
160
 
161
  transcribe_btn = gr.Button("Transcribe")
162
  output = gr.Textbox(label="Transcript", lines=8)
@@ -172,7 +191,9 @@ with gr.Blocks(title="Speech-to-Text Note Taker") as demo:
172
 
173
  transcribe_btn.click(run, inputs=[backend, mic, upload, vosk_model_path, whisper_size], outputs=[output])
174
 
175
- gr.Markdown("---\n**Tips:**\n- If using Vosk, download a small English model and enter the path in the Vosk model path field.\n- If using Whisper, choose a smaller model for faster transcriptions on CPU.\n")
 
 
176
 
177
  if __name__ == "__main__":
178
  demo.launch()
 
4
 
5
  How to use:
6
  1. Create a new Hugging Face Space (Gradio runtime) and upload this file as `app.py`.
7
+ 2. Add the models you want to use for Vosk under a `models/vosk/` directory
8
+ (e.g. `models/vosk/vosk-model-small-en-us-0.15`) and set the VOSK_MODEL_PATH field in the UI.
9
  3. Space requirements (put in `requirements.txt`):
10
  gradio
11
  pydub
 
18
  - Whisper model sizes can be large; choose `small` or `base` for Spaces with limited resources.
19
  - Vosk requires pre-downloaded models and works offline.
20
  - This app converts incoming audio to 16kHz mono WAV before transcribing.
 
21
  """
22
 
23
  import os
 
31
  import soundfile as sf
32
  import numpy as np
33
 
34
+ # Optional imports (lazy load)
35
  _whisper_model_cache = {}
36
  _vosk_model_cache = {}
37
 
 
112
 
113
 
114
  def transcribe_audio(audio, backend: str, vosk_model_path: str, whisper_size: str):
115
+ """Main handler called by Gradio. audio can be from mic or upload."""
116
  if audio is None:
117
  return "No audio provided. Use the microphone or upload an audio file."
118
 
119
+ # Gradio returns a file path string
120
+ input_path = audio if isinstance(audio, str) else audio.get("name", None)
121
+ if not input_path:
122
+ return "Invalid audio input."
 
123
 
124
+ # Convert to 16kHz mono WAV
125
  try:
126
  wav_path = ensure_wav_16k_mono(input_path)
127
  except Exception as e:
 
134
  else:
135
  text = "Unknown backend chosen."
136
 
 
137
  try:
138
  os.unlink(wav_path)
139
  except Exception:
 
142
  return text
143
 
144
 
145
+ # Build Gradio UI
146
  with gr.Blocks(title="Speech-to-Text Note Taker") as demo:
147
+ gr.Markdown(
148
+ "# 🎙️ Speech-to-Text Note Taker\nChoose a backend (Vosk or Whisper), record or upload audio, and get a transcript you can edit or download."
149
+ )
150
 
151
  with gr.Row():
152
+ backend = gr.Radio(
153
+ choices=["whisper", "vosk"], value="whisper", label="Backend"
154
+ )
155
+ whisper_size = gr.Dropdown(
156
+ choices=["tiny", "base", "small", "medium", "large"],
157
+ value="small",
158
+ label="Whisper model size (if using Whisper)",
159
+ )
160
+
161
+ vosk_model_path = gr.Textbox(
162
+ value="models/vosk/vosk-model-small-en-us-0.15",
163
+ label="Vosk model path (if using Vosk)",
164
+ )
165
 
166
  with gr.Row():
167
+ mic = gr.Audio(
168
+ sources=["microphone"],
169
+ label="Record (microphone)",
170
+ type="filepath",
171
+ format="wav",
172
+ )
173
+ upload = gr.Audio(
174
+ sources=["upload"],
175
+ label="Or upload an audio file",
176
+ type="filepath",
177
+ format="wav",
178
+ )
179
 
180
  transcribe_btn = gr.Button("Transcribe")
181
  output = gr.Textbox(label="Transcript", lines=8)
 
191
 
192
  transcribe_btn.click(run, inputs=[backend, mic, upload, vosk_model_path, whisper_size], outputs=[output])
193
 
194
+ gr.Markdown(
195
+ "---\n**Tips:**\n- If using Vosk, download a small English model and enter the path in the Vosk model path field.\n- If using Whisper, choose a smaller model for faster transcriptions on CPU.\n"
196
+ )
197
 
198
  if __name__ == "__main__":
199
  demo.launch()