oddadmix committed on
Commit
06bbda0
·
verified ·
1 Parent(s): 1cf51f9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -166
app.py CHANGED
@@ -3,7 +3,6 @@ import gradio as gr
3
  from transformers import AutoProcessor, Gemma3nForConditionalGeneration
4
  import torch
5
  import os
6
- import numpy as np
7
 
8
  # Global variables for model and processor
9
  model = None
@@ -16,6 +15,7 @@ def load_model():
16
  print("Loading model...")
17
  model_id = "oddadmix/egyptian-code-switching-b4-g2-merged"
18
 
 
19
  model = Gemma3nForConditionalGeneration.from_pretrained(
20
  model_id,
21
  device_map="auto",
@@ -81,85 +81,6 @@ def transcribe_audio(audio_path, max_tokens=128):
81
  except Exception as e:
82
  return f"Error during transcription: {str(e)}"
83
 
84
- @spaces.GPU
85
- def live_transcribe(audio_stream, max_tokens=128):
86
- """Transcribe audio stream in real-time"""
87
- if model is None or processor is None:
88
- yield "Error: Model not loaded"
89
- return
90
-
91
- if audio_stream is None:
92
- yield "Waiting for audio input..."
93
- return
94
-
95
- try:
96
- # Extract sample rate and audio data
97
- sample_rate, audio_data = audio_stream
98
-
99
- # Check if we have enough audio data (at least 1 second)
100
- if len(audio_data) < sample_rate:
101
- yield "Recording... (speak now)"
102
- return
103
-
104
- # Save temporary audio file
105
- import tempfile
106
- import soundfile as sf
107
-
108
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
109
- tmp_path = tmp_file.name
110
- sf.write(tmp_path, audio_data, sample_rate)
111
-
112
- try:
113
- messages = [
114
- {
115
- "role": "system",
116
- "content": [
117
- {
118
- "type": "text",
119
- "text": "You are an assistant that transcribes speech accurately.",
120
- }
121
- ],
122
- },
123
- {
124
- "role": "user",
125
- "content": [
126
- {"type": "audio", "url": tmp_path},
127
- {"type": "text", "text": "Please transcribe this audio."}
128
- ]
129
- }
130
- ]
131
-
132
- inputs = processor.apply_chat_template(
133
- messages,
134
- add_generation_prompt=True,
135
- tokenize=True,
136
- return_dict=True,
137
- return_tensors="pt",
138
- ).to(model.device)
139
-
140
- input_len = inputs["input_ids"].shape[-1]
141
-
142
- # Generate transcription
143
- with torch.inference_mode():
144
- generation = model.generate(
145
- **inputs,
146
- max_new_tokens=max_tokens,
147
- do_sample=False
148
- )
149
- generation = generation[0][input_len:]
150
-
151
- response = processor.decode(generation, skip_special_tokens=True)
152
-
153
- yield response
154
-
155
- finally:
156
- # Clean up temporary file
157
- if os.path.exists(tmp_path):
158
- os.unlink(tmp_path)
159
-
160
- except Exception as e:
161
- yield f"Error during transcription: {str(e)}"
162
-
163
  # Load model at startup
164
  load_model()
165
 
@@ -169,97 +90,33 @@ with gr.Blocks(title="Egyptian Code Switching Audio Transcription") as demo:
169
  """
170
  # 🎙️ Egyptian Code Switching Audio Transcription
171
 
172
- Choose between live transcription or file upload for automatic transcription.
173
  Specialized for Egyptian Arabic with English code-switching.
174
  """
175
  )
176
 
177
- with gr.Tabs():
178
- # Live Transcription Tab
179
- with gr.Tab("Live Transcription"):
180
- gr.Markdown(
181
- """
182
- ### 🔴 Live Transcription Mode
183
- Click the microphone button below and start speaking. The transcription will update in real-time.
184
- """
185
  )
186
-
187
- with gr.Row():
188
- with gr.Column():
189
- live_audio = gr.Audio(
190
- sources=["microphone"],
191
- type="numpy",
192
- label="Live Audio Input",
193
- streaming=True
194
- )
195
- live_max_tokens = gr.Slider(
196
- minimum=32,
197
- maximum=512,
198
- value=128,
199
- step=32,
200
- label="Max Output Tokens"
201
- )
202
-
203
- with gr.Column():
204
- live_output = gr.Textbox(
205
- label="Live Transcription",
206
- placeholder="Start speaking and transcription will appear here...",
207
- lines=10,
208
- rtl=True
209
- )
210
-
211
- # Set up live transcription
212
- live_audio.stream(
213
- fn=live_transcribe,
214
- inputs=[live_audio, live_max_tokens],
215
- outputs=live_output
216
- )
217
-
218
- # File Upload Tab
219
- with gr.Tab("File Upload"):
220
- gr.Markdown(
221
- """
222
- ### 📁 File Upload Mode
223
- Upload an audio file or record your voice to get a transcription.
224
- """
225
  )
 
226
 
227
- with gr.Row():
228
- with gr.Column():
229
- audio_input = gr.Audio(
230
- sources=["upload", "microphone"],
231
- type="filepath",
232
- label="Audio Input"
233
- )
234
- max_tokens_slider = gr.Slider(
235
- minimum=32,
236
- maximum=512,
237
- value=128,
238
- step=32,
239
- label="Max Output Tokens"
240
- )
241
- transcribe_btn = gr.Button("Transcribe", variant="primary")
242
-
243
- with gr.Column():
244
- output_text = gr.Textbox(
245
- label="Transcription",
246
- placeholder="Your transcription will appear here...",
247
- lines=10,
248
- rtl=True
249
- )
250
-
251
- # Set up the transcription action
252
- transcribe_btn.click(
253
- fn=transcribe_audio,
254
- inputs=[audio_input, max_tokens_slider],
255
- outputs=output_text
256
- )
257
-
258
- # Also allow transcription on audio upload/record
259
- audio_input.change(
260
- fn=transcribe_audio,
261
- inputs=[audio_input, max_tokens_slider],
262
- outputs=output_text
263
  )
264
 
265
  gr.Markdown(
@@ -267,10 +124,23 @@ with gr.Blocks(title="Egyptian Code Switching Audio Transcription") as demo:
267
  ### Tips:
268
  - For best results, use clear audio with minimal background noise
269
  - The model specializes in Egyptian Arabic with English code-switching
270
- - Live mode: Speak in short segments for better results
271
- - File mode: Recording length should be reasonable (under 30 seconds recommended)
272
  """
273
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
 
275
  # Launch the app
276
  if __name__ == "__main__":
 
3
  from transformers import AutoProcessor, Gemma3nForConditionalGeneration
4
  import torch
5
  import os
 
6
 
7
  # Global variables for model and processor
8
  model = None
 
15
  print("Loading model...")
16
  model_id = "oddadmix/egyptian-code-switching-b4-g2-merged"
17
 
18
+
19
  model = Gemma3nForConditionalGeneration.from_pretrained(
20
  model_id,
21
  device_map="auto",
 
81
  except Exception as e:
82
  return f"Error during transcription: {str(e)}"
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  # Load model at startup
85
  load_model()
86
 
 
90
  """
91
  # 🎙️ Egyptian Code Switching Audio Transcription
92
 
93
+ Upload an audio file or record your voice to get an automatic transcription.
94
  Specialized for Egyptian Arabic with English code-switching.
95
  """
96
  )
97
 
98
+ with gr.Row():
99
+ with gr.Column():
100
+ audio_input = gr.Audio(
101
+ sources=["upload", "microphone"],
102
+ type="filepath",
103
+ label="Audio Input"
 
 
104
  )
105
+ max_tokens_slider = gr.Slider(
106
+ minimum=32,
107
+ maximum=512,
108
+ value=128,
109
+ step=32,
110
+ label="Max Output Tokens"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  )
112
+ transcribe_btn = gr.Button("Transcribe", variant="primary")
113
 
114
+ with gr.Column():
115
+ output_text = gr.Textbox(
116
+ label="Transcription",
117
+ placeholder="Your transcription will appear here...",
118
+ lines=10,
119
+ rtl=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  )
121
 
122
  gr.Markdown(
 
124
  ### Tips:
125
  - For best results, use clear audio with minimal background noise
126
  - The model specializes in Egyptian Arabic with English code-switching
127
+ - Recording length should be reasonable (under 30 seconds recommended)
 
128
  """
129
  )
130
+
131
+ # Set up the transcription action
132
+ transcribe_btn.click(
133
+ fn=transcribe_audio,
134
+ inputs=[audio_input, max_tokens_slider],
135
+ outputs=output_text
136
+ )
137
+
138
+ # Also allow transcription on audio upload/record
139
+ audio_input.change(
140
+ fn=transcribe_audio,
141
+ inputs=[audio_input, max_tokens_slider],
142
+ outputs=output_text
143
+ )
144
 
145
  # Launch the app
146
  if __name__ == "__main__":