oddadmix committed on
Commit
1cf51f9
·
verified ·
1 Parent(s): 20554b7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +167 -37
app.py CHANGED
@@ -3,6 +3,7 @@ import gradio as gr
3
  from transformers import AutoProcessor, Gemma3nForConditionalGeneration
4
  import torch
5
  import os
 
6
 
7
  # Global variables for model and processor
8
  model = None
@@ -13,8 +14,7 @@ def load_model():
13
  global model, processor
14
 
15
  print("Loading model...")
16
- model_id = "oddadmix/gemma-4b-egyptian-code-switching-b4-g2-merged"
17
-
18
 
19
  model = Gemma3nForConditionalGeneration.from_pretrained(
20
  model_id,
@@ -81,6 +81,85 @@ def transcribe_audio(audio_path, max_tokens=128):
81
  except Exception as e:
82
  return f"Error during transcription: {str(e)}"
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  # Load model at startup
85
  load_model()
86
 
@@ -90,33 +169,97 @@ with gr.Blocks(title="Egyptian Code Switching Audio Transcription") as demo:
90
  """
91
  # 🎙️ Egyptian Code Switching Audio Transcription
92
 
93
- Upload an audio file or record your voice to get an automatic transcription.
94
  Specialized for Egyptian Arabic with English code-switching.
95
  """
96
  )
97
 
98
- with gr.Row():
99
- with gr.Column():
100
- audio_input = gr.Audio(
101
- sources=["upload", "microphone"],
102
- type="filepath",
103
- label="Audio Input"
 
 
104
  )
105
- max_tokens_slider = gr.Slider(
106
- minimum=32,
107
- maximum=512,
108
- value=128,
109
- step=32,
110
- label="Max Output Tokens"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  )
112
- transcribe_btn = gr.Button("Transcribe", variant="primary")
113
 
114
- with gr.Column():
115
- output_text = gr.Textbox(
116
- label="Transcription",
117
- placeholder="Your transcription will appear here...",
118
- lines=10,
119
- rtl=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  )
121
 
122
  gr.Markdown(
@@ -124,23 +267,10 @@ with gr.Blocks(title="Egyptian Code Switching Audio Transcription") as demo:
124
  ### Tips:
125
  - For best results, use clear audio with minimal background noise
126
  - The model specializes in Egyptian Arabic with English code-switching
127
- - Recording length should be reasonable (under 30 seconds recommended)
 
128
  """
129
  )
130
-
131
- # Set up the transcription action
132
- transcribe_btn.click(
133
- fn=transcribe_audio,
134
- inputs=[audio_input, max_tokens_slider],
135
- outputs=output_text
136
- )
137
-
138
- # Also allow transcription on audio upload/record
139
- audio_input.change(
140
- fn=transcribe_audio,
141
- inputs=[audio_input, max_tokens_slider],
142
- outputs=output_text
143
- )
144
 
145
  # Launch the app
146
  if __name__ == "__main__":
 
3
  from transformers import AutoProcessor, Gemma3nForConditionalGeneration
4
  import torch
5
  import os
6
+ import numpy as np
7
 
8
  # Global variables for model and processor
9
  model = None
 
14
  global model, processor
15
 
16
  print("Loading model...")
17
+ model_id = "oddadmix/egyptian-code-switching-b4-g2-merged"
 
18
 
19
  model = Gemma3nForConditionalGeneration.from_pretrained(
20
  model_id,
 
81
  except Exception as e:
82
  return f"Error during transcription: {str(e)}"
83
 
84
+ @spaces.GPU
85
+ def live_transcribe(audio_stream, max_tokens=128):
86
+ """Transcribe audio stream in real-time"""
87
+ if model is None or processor is None:
88
+ yield "Error: Model not loaded"
89
+ return
90
+
91
+ if audio_stream is None:
92
+ yield "Waiting for audio input..."
93
+ return
94
+
95
+ try:
96
+ # Extract sample rate and audio data
97
+ sample_rate, audio_data = audio_stream
98
+
99
+ # Check if we have enough audio data (at least 1 second)
100
+ if len(audio_data) < sample_rate:
101
+ yield "Recording... (speak now)"
102
+ return
103
+
104
+ # Save temporary audio file
105
+ import tempfile
106
+ import soundfile as sf
107
+
108
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
109
+ tmp_path = tmp_file.name
110
+ sf.write(tmp_path, audio_data, sample_rate)
111
+
112
+ try:
113
+ messages = [
114
+ {
115
+ "role": "system",
116
+ "content": [
117
+ {
118
+ "type": "text",
119
+ "text": "You are an assistant that transcribes speech accurately.",
120
+ }
121
+ ],
122
+ },
123
+ {
124
+ "role": "user",
125
+ "content": [
126
+ {"type": "audio", "url": tmp_path},
127
+ {"type": "text", "text": "Please transcribe this audio."}
128
+ ]
129
+ }
130
+ ]
131
+
132
+ inputs = processor.apply_chat_template(
133
+ messages,
134
+ add_generation_prompt=True,
135
+ tokenize=True,
136
+ return_dict=True,
137
+ return_tensors="pt",
138
+ ).to(model.device)
139
+
140
+ input_len = inputs["input_ids"].shape[-1]
141
+
142
+ # Generate transcription
143
+ with torch.inference_mode():
144
+ generation = model.generate(
145
+ **inputs,
146
+ max_new_tokens=max_tokens,
147
+ do_sample=False
148
+ )
149
+ generation = generation[0][input_len:]
150
+
151
+ response = processor.decode(generation, skip_special_tokens=True)
152
+
153
+ yield response
154
+
155
+ finally:
156
+ # Clean up temporary file
157
+ if os.path.exists(tmp_path):
158
+ os.unlink(tmp_path)
159
+
160
+ except Exception as e:
161
+ yield f"Error during transcription: {str(e)}"
162
+
163
  # Load model at startup
164
  load_model()
165
 
 
169
  """
170
  # 🎙️ Egyptian Code Switching Audio Transcription
171
 
172
+ Choose between live transcription or file upload for automatic transcription.
173
  Specialized for Egyptian Arabic with English code-switching.
174
  """
175
  )
176
 
177
+ with gr.Tabs():
178
+ # Live Transcription Tab
179
+ with gr.Tab("Live Transcription"):
180
+ gr.Markdown(
181
+ """
182
+ ### 🔴 Live Transcription Mode
183
+ Click the microphone button below and start speaking. The transcription will update in real-time.
184
+ """
185
  )
186
+
187
+ with gr.Row():
188
+ with gr.Column():
189
+ live_audio = gr.Audio(
190
+ sources=["microphone"],
191
+ type="numpy",
192
+ label="Live Audio Input",
193
+ streaming=True
194
+ )
195
+ live_max_tokens = gr.Slider(
196
+ minimum=32,
197
+ maximum=512,
198
+ value=128,
199
+ step=32,
200
+ label="Max Output Tokens"
201
+ )
202
+
203
+ with gr.Column():
204
+ live_output = gr.Textbox(
205
+ label="Live Transcription",
206
+ placeholder="Start speaking and transcription will appear here...",
207
+ lines=10,
208
+ rtl=True
209
+ )
210
+
211
+ # Set up live transcription
212
+ live_audio.stream(
213
+ fn=live_transcribe,
214
+ inputs=[live_audio, live_max_tokens],
215
+ outputs=live_output
216
+ )
217
+
218
+ # File Upload Tab
219
+ with gr.Tab("File Upload"):
220
+ gr.Markdown(
221
+ """
222
+ ### 📁 File Upload Mode
223
+ Upload an audio file or record your voice to get a transcription.
224
+ """
225
  )
 
226
 
227
+ with gr.Row():
228
+ with gr.Column():
229
+ audio_input = gr.Audio(
230
+ sources=["upload", "microphone"],
231
+ type="filepath",
232
+ label="Audio Input"
233
+ )
234
+ max_tokens_slider = gr.Slider(
235
+ minimum=32,
236
+ maximum=512,
237
+ value=128,
238
+ step=32,
239
+ label="Max Output Tokens"
240
+ )
241
+ transcribe_btn = gr.Button("Transcribe", variant="primary")
242
+
243
+ with gr.Column():
244
+ output_text = gr.Textbox(
245
+ label="Transcription",
246
+ placeholder="Your transcription will appear here...",
247
+ lines=10,
248
+ rtl=True
249
+ )
250
+
251
+ # Set up the transcription action
252
+ transcribe_btn.click(
253
+ fn=transcribe_audio,
254
+ inputs=[audio_input, max_tokens_slider],
255
+ outputs=output_text
256
+ )
257
+
258
+ # Also allow transcription on audio upload/record
259
+ audio_input.change(
260
+ fn=transcribe_audio,
261
+ inputs=[audio_input, max_tokens_slider],
262
+ outputs=output_text
263
  )
264
 
265
  gr.Markdown(
 
267
  ### Tips:
268
  - For best results, use clear audio with minimal background noise
269
  - The model specializes in Egyptian Arabic with English code-switching
270
+ - Live mode: Speak in short segments for better results
271
+ - File mode: Recording length should be reasonable (under 30 seconds recommended)
272
  """
273
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
 
275
  # Launch the app
276
  if __name__ == "__main__":