SherinMohamed commited on
Commit
8115984
·
verified ·
1 Parent(s): 60358b5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -223
app.py CHANGED
@@ -1,107 +1,47 @@
1
  import re
2
- from pathlib import Path
3
-
4
  import gradio as gr
5
  import spaces
6
  import torch
 
7
 
8
- from transformers import (
9
- pipeline,
10
- AutoTokenizer,
11
- AutoModelForCausalLM,
12
- )
13
-
14
- # ======= EGTTS imports (Coqui XTTS) =======
15
- from TTS.tts.configs.xtts_config import XttsConfig
16
- from TTS.tts.models.xtts import Xtts
17
-
18
-
19
- # =========================================================
20
- # 0) CONFIG
21
- # =========================================================
22
- # Translator model (MSA <-> Egyptian)
23
  TRANSLATOR_MODEL = "oddadmix/Masrawy-BiLingual-v1"
24
-
25
- # ASR model (Audio -> text)
26
  ASR_MODEL = "openai/whisper-small"
27
-
28
- # LLM model (Qwen 3B)
29
  LLM_MODEL = "Qwen/Qwen2.5-3B-Instruct"
30
 
31
- # EGTTS (Egyptian TTS) model files hosted on HF (from your provided code)
32
- CONFIG_URL = "https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/config.json"
33
- VOCAB_URL = "https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/vocab.json"
34
- MODEL_URL = "https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/model.pth"
35
- SPEAKER_AUDIO_URL = "https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/speaker_reference.wav"
36
-
37
  USE_GPU = torch.cuda.is_available()
38
- DEVICE_PIPELINE = 0 if USE_GPU else -1
39
- DEVICE_TORCH = "cuda" if USE_GPU else "cpu"
40
-
41
-
42
- # =========================================================
43
- # 1) DOWNLOAD EGTTS FILES (once)
44
- # =========================================================
45
- base_path = Path(__file__).parent
46
- config_path = base_path / "config.json"
47
- vocab_path = base_path / "vocab.json"
48
- model_path = base_path / "model.pth"
49
- default_speaker_path = base_path / "speaker_reference.wav"
50
-
51
- def _download_if_missing(url: str, dst: Path):
52
- if not dst.exists():
53
- torch.hub.download_url_to_file(url, str(dst))
54
 
55
- _download_if_missing(CONFIG_URL, config_path)
56
- _download_if_missing(VOCAB_URL, vocab_path)
57
- _download_if_missing(MODEL_URL, model_path)
58
- _download_if_missing(SPEAKER_AUDIO_URL, default_speaker_path)
59
 
 
 
 
 
 
60
 
61
- # =========================================================
62
- # 2) LOAD MODELS (once)
63
- # =========================================================
64
-
65
- # --- Translator pipeline
66
- translator = pipeline("translation", model=TRANSLATOR_MODEL, device=DEVICE_PIPELINE)
67
-
68
- # --- ASR pipeline
69
- asr = pipeline("automatic-speech-recognition", model=ASR_MODEL, device=DEVICE_PIPELINE)
70
-
71
- # --- Qwen LLM
72
  tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL, trust_remote_code=True)
73
- model_llm = AutoModelForCausalLM.from_pretrained(
74
  LLM_MODEL,
75
  torch_dtype="auto",
76
  device_map="auto" if USE_GPU else None,
77
  trust_remote_code=True
78
  )
79
  if not USE_GPU:
80
- model_llm = model_llm.to("cpu")
81
-
82
- # --- EGTTS model
83
- tts_config = XttsConfig()
84
- tts_config.load_json(str(config_path))
85
-
86
- print("Loading EGTTS model...")
87
- tts_model = Xtts.init_from_config(tts_config)
88
- tts_model.load_checkpoint(
89
- tts_config,
90
- checkpoint_path=str(model_path),
91
- use_deepspeed=False,
92
- vocab_path=str(vocab_path),
93
- eval=True
94
- )
95
- tts_model.to(DEVICE_TORCH)
96
- print("EGTTS loaded on:", DEVICE_TORCH)
97
-
98
 
99
- # =========================================================
100
- # 3) TRANSLATION HELPERS (explicit directions)
101
- # =========================================================
102
  def to_msa(text: str) -> str:
103
  """
104
- Convert ANY Arabic (Egyptian/MSA/mix) -> MSA using <ar>
 
105
  """
106
  text = (text or "").strip()
107
  if not text:
@@ -110,46 +50,57 @@ def to_msa(text: str) -> str:
110
 
111
  def to_egyptian(text: str) -> str:
112
  """
113
- Convert MSA -> Egyptian using <arz>
 
114
  """
115
  text = (text or "").strip()
116
  if not text:
117
  return ""
118
  return translator(text + " <arz>")[0]["translation_text"]
119
 
120
-
121
- # =========================================================
122
- # 4) STYLE CLEANUP (remove defensive/meta behavior)
123
- # =========================================================
124
  _BANNED_PHRASES = [
125
- "كمساعد", "كمساعد ذكي", "معلش", "آسف", "اعتذر", "مش عارف",
126
- "لا أستطيع", "غير قادر", "لا يمكنني", "لا أقدر", "لا أملك معلومات",
127
- "قد لا يكون", "ربما", "عادةً", "بشكل عام"
128
  ]
129
 
130
  def clean_egyptian(text: str) -> str:
 
 
 
 
131
  t = (text or "").strip()
 
 
132
  for p in _BANNED_PHRASES:
133
  t = t.replace(p, "")
 
 
134
  t = re.sub(r"\s+", " ", t).strip()
 
 
135
  t = re.sub(r"[.،]{3,}", "…", t).strip()
 
 
136
  if not t:
137
- t = "تمام—قولّي تحب تعمل إيه النهارده: شغل، مذاكرة، ولا راحة؟"
138
- return t
139
 
 
140
 
141
- # =========================================================
142
- # 5) QWEN GENERATION (stable behavior: respond in simple MSA)
143
- # =========================================================
144
  def qwen_generate_msa(msa_prompt: str, max_new_tokens: int, temperature: float, top_p: float) -> str:
145
  msa_prompt = (msa_prompt or "").strip()
146
  if not msa_prompt:
147
  return ""
148
 
149
- # Behavior-first system prompt (important)
150
  system_msg = (
151
  "أنت مساعد شخصي عملي. "
152
- "إذا كان سؤال المستخدم عامًا أو مفتوحًا، اقترح خطة أو خطوات عملية فورًا "
153
  "بدون اعتذار وبدون تبرير لحدودك. "
154
  "اجعل الرد قصيرًا ومباشرًا ومفيدًا. "
155
  "اكتب باللغة العربية الفصحى البسيطة فقط."
@@ -165,11 +116,12 @@ def qwen_generate_msa(msa_prompt: str, max_new_tokens: int, temperature: float,
165
  add_generation_prompt=True,
166
  return_tensors="pt"
167
  )
 
168
  if USE_GPU:
169
- input_ids = input_ids.to(model_llm.device)
170
 
171
  with torch.no_grad():
172
- output_ids = model_llm.generate(
173
  input_ids,
174
  max_new_tokens=max_new_tokens,
175
  do_sample=True,
@@ -179,184 +131,110 @@ def qwen_generate_msa(msa_prompt: str, max_new_tokens: int, temperature: float,
179
  )
180
 
181
  gen_ids = output_ids[0][input_ids.shape[-1]:]
182
- return tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
183
-
184
 
185
- # =========================================================
186
- # 6) EGTTS INFERENCE
187
- # =========================================================
188
- def egtss_speak(text_egy: str, speaker_audio_fp: str, tts_temperature: float):
189
  """
190
- text_egy: Egyptian Arabic text (we pass language 'ar' as in your code)
191
- speaker_audio_fp: path to reference audio (4-5 sec)
192
- returns (sr, wav_np)
193
- """
194
- text_egy = (text_egy or "").strip()
195
- if not text_egy:
196
- # empty audio
197
- return None
198
-
199
- ref_path = speaker_audio_fp or str(default_speaker_path)
200
-
201
- # compute speaker latents
202
- gpt_cond_latent, speaker_embedding = tts_model.get_conditioning_latents(audio_path=[ref_path])
203
-
204
- # inference
205
- out = tts_model.inference(
206
- text_egy,
207
- "ar",
208
- gpt_cond_latent,
209
- speaker_embedding,
210
- temperature=tts_temperature
211
- )
212
- return 24000, out["wav"]
213
-
214
-
215
- # =========================================================
216
- # 7) CORE PIPELINE (Text/Audio -> Egyptian text -> TTS audio)
217
- # =========================================================
218
- def _pipeline_text_to_egy_and_audio(
219
- user_text: str,
220
- max_new_tokens: int,
221
- temperature: float,
222
- top_p: float,
223
- speaker_ref: str,
224
- tts_temperature: float
225
- ):
226
- """
227
- Returns:
228
- msa_in, llm_msa, final_egy, audio_tuple(sr, wav)
229
  """
230
  user_text = (user_text or "").strip()
231
  if not user_text:
232
- return "", "", "", None
233
 
234
- # 1) Normalize input to MSA
235
  msa_in = to_msa(user_text)
236
 
237
- # 2) LLM in MSA
238
  llm_msa = qwen_generate_msa(msa_in, max_new_tokens, temperature, top_p)
239
 
240
- # 3) Convert to Egyptian + clean
241
  final_egy = clean_egyptian(to_egyptian(llm_msa))
242
-
243
- # 4) TTS
244
- audio = egtss_speak(final_egy, speaker_ref, tts_temperature)
245
-
246
- return msa_in, llm_msa, final_egy, audio
247
-
248
 
249
  @spaces.GPU
250
- def generate_from_text(
251
- user_text: str,
252
- max_new_tokens: int,
253
- temperature: float,
254
- top_p: float,
255
- speaker_ref: str,
256
- tts_temperature: float,
257
- show_debug: bool
258
- ):
259
- msa_in, llm_msa, final_egy, audio = _pipeline_text_to_egy_and_audio(
260
- user_text, max_new_tokens, temperature, top_p, speaker_ref, tts_temperature
261
- )
262
 
263
  if show_debug:
264
- return msa_in, llm_msa, final_egy, audio
265
 
266
  # hide debug outputs
267
- return "", "", final_egy, audio
268
-
269
 
270
  @spaces.GPU
271
- def generate_from_audio(
272
- audio_path: str,
273
- max_new_tokens: int,
274
- temperature: float,
275
- top_p: float,
276
- speaker_ref: str,
277
- tts_temperature: float,
278
- show_debug: bool
279
- ):
280
  if not audio_path:
281
  if show_debug:
282
- return "", "", "", "", None
283
- return "", "", "", "", None
284
 
285
- # 1) ASR
286
  asr_out = asr(audio_path)
287
  asr_text = (asr_out.get("text", "") if isinstance(asr_out, dict) else str(asr_out)).strip()
 
288
  if not asr_text:
289
  if show_debug:
290
- return "", "", "", "", None
291
- return "", "", "", "", None
292
 
293
- # 2) Full pipeline
294
- msa_in, llm_msa, final_egy, audio = _pipeline_text_to_egy_and_audio(
295
- asr_text, max_new_tokens, temperature, top_p, speaker_ref, tts_temperature
296
- )
297
 
298
  if show_debug:
299
- return asr_text, msa_in, llm_msa, final_egy, audio
300
 
301
- # hide debug except ASR + final + audio
302
- return asr_text, "", "", final_egy, audio
303
 
304
-
305
- # =========================================================
306
- # 8) GRADIO UI
307
- # =========================================================
308
- with gr.Blocks(title="Egyptian Arabic Assistant (Chatbot + TTS)") as demo:
309
  gr.Markdown(
310
- "## Egyptian Arabic Assistant (Chatbot + TTS)\n"
311
- "**Pipeline:** Input → (to MSA) → Qwen (MSA) → (to Egyptian) → **EGTTS صوت**\n\n"
312
- "ملاحظة: تقدر ترفع Speaker Reference (4–5 ثواني) أو تسيبه الافتراضي."
 
313
  )
314
 
315
  with gr.Row():
316
  max_new_tokens = gr.Slider(64, 512, value=256, step=16, label="Max new tokens")
317
- temp = gr.Slider(0.1, 1.2, value=0.7, step=0.05, label="Temperature")
318
  top_p = gr.Slider(0.5, 1.0, value=0.9, step=0.05, label="Top-p")
319
-
320
- with gr.Row():
321
- speaker_ref = gr.Audio(
322
- label="Speaker reference (optional)",
323
- value=str(default_speaker_path),
324
- type="filepath"
325
- )
326
- tts_temp = gr.Slider(0.1, 1.0, value=0.75, step=0.05, label="TTS Temperature")
327
-
328
- show_debug = gr.Checkbox(value=False, label="Show debug outputs")
329
 
330
  with gr.Tabs():
331
  with gr.TabItem("Text Input"):
332
- txt_in = gr.Textbox(lines=4, placeholder="اكتب هنا (مصري/فصحى)", label="Input Text")
333
- btn = gr.Button("Generate (Text → Reply + Voice)", variant="primary")
334
 
335
  dbg_msa_in = gr.Textbox(lines=2, label="(Debug) Input after to_msa")
336
  dbg_llm_msa = gr.Textbox(lines=3, label="(Debug) Qwen output (MSA)")
337
- out_egy = gr.Textbox(lines=4, label="Final Output (Egyptian)")
338
- out_audio = gr.Audio(label="Synthesized audio (EGTTS)")
339
 
340
- btn.click(
341
- generate_from_text,
342
- inputs=[txt_in, max_new_tokens, temp, top_p, speaker_ref, tts_temp, show_debug],
343
- outputs=[dbg_msa_in, dbg_llm_msa, out_egy, out_audio]
344
  )
345
 
346
  with gr.TabItem("Audio Input"):
347
  aud_in = gr.Audio(type="filepath", label="Upload Audio (WAV/MP3)")
348
- btn_a = gr.Button("Generate (Audio → Reply + Voice)", variant="primary")
349
 
350
  asr_txt = gr.Textbox(lines=2, label="ASR Text")
351
  dbg_msa_in_a = gr.Textbox(lines=2, label="(Debug) ASR after to_msa")
352
  dbg_llm_msa_a = gr.Textbox(lines=3, label="(Debug) Qwen output (MSA)")
353
- out_egy_a = gr.Textbox(lines=4, label="Final Output (Egyptian)")
354
- out_audio_a = gr.Audio(label="Synthesized audio (EGTTS)")
355
 
356
- btn_a.click(
357
- generate_from_audio,
358
- inputs=[aud_in, max_new_tokens, temp, top_p, speaker_ref, tts_temp, show_debug],
359
- outputs=[asr_txt, dbg_msa_in_a, dbg_llm_msa_a, out_egy_a, out_audio_a]
360
  )
361
 
362
  demo.launch()
 
1
  import re
 
 
2
  import gradio as gr
3
  import spaces
4
  import torch
5
+ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
6
 
7
# =========================
# 0) Config
# =========================

# HF model id for the MSA <-> Egyptian Arabic translation model.
TRANSLATOR_MODEL = "oddadmix/Masrawy-BiLingual-v1"
# HF model id for speech-to-text.
ASR_MODEL = "openai/whisper-small"
# HF model id for the chat LLM.
LLM_MODEL = "Qwen/Qwen2.5-3B-Instruct"

# transformers pipelines expect 0 for the first CUDA device and -1 for CPU.
USE_GPU = torch.cuda.is_available()
DEVICE = 0 if USE_GPU else -1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
# =========================
# 1) Load models (once)
# =========================
# Loaded at import time so every Gradio handler reuses the same instances.
translator = pipeline("translation", model=TRANSLATOR_MODEL, device=DEVICE)

asr = pipeline(
    "automatic-speech-recognition",
    model=ASR_MODEL,
    device=DEVICE
)

tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL,
    torch_dtype="auto",
    # On GPU let accelerate place the weights; on CPU leave placement implicit
    # and pin it explicitly below.
    device_map="auto" if USE_GPU else None,
    trust_remote_code=True
)
if not USE_GPU:
    model = model.to("cpu")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
+ # =========================
39
+ # 2) Translator helpers (explicit direction, non-ambiguous)
40
+ # =========================
41
  def to_msa(text: str) -> str:
42
  """
43
+ Convert ANY Arabic (Egyptian/MSA/mix) -> MSA.
44
+ Uses tag <ar> (model behavior in your translator code).
45
  """
46
  text = (text or "").strip()
47
  if not text:
 
50
 
51
def to_egyptian(text: str) -> str:
    """Translate MSA text into Egyptian Arabic using the <arz> direction tag."""
    stripped = (text or "").strip()
    if not stripped:
        return ""
    tagged = stripped + " <arz>"
    outputs = translator(tagged)
    return outputs[0]["translation_text"]
60
 
61
# =========================
# 3) Output cleaning (Detox / style shaping)
# =========================

# Meta/defensive phrases stripped from the final Egyptian reply.
# NOTE: "كمساعد ذكي" restored — the previous revision contained mojibake
# ("كمساع��") from a bad encoding round-trip.
_BANNED_PHRASES = [
    "كمساعد", "كمساعد ذكي", "معلش", "آسف", "اعتذر", "مش عارف", "لا أستطيع", "غير قادر",
    "لا يمكنني", "لا أقدر", "لا أملك معلومات", "قد لا يكون", "ربما", "عادةً", "بشكل عام"
]

def clean_egyptian(text: str) -> str:
    """
    Lightweight cleanup to remove annoying meta/defensive phrasing.
    Not meant to be perfect; keeps it simple and safe.

    Returns the cleaned text, or a friendly default prompt when cleaning
    leaves nothing behind (or the input was empty).
    """
    t = (text or "").strip()

    # Remove banned phrases (simple substring replace).
    for p in _BANNED_PHRASES:
        t = t.replace(p, "")

    # Collapse extra spaces left behind by the removals.
    t = re.sub(r"\s+", " ", t).strip()

    # Squash runs of repeated punctuation into a single ellipsis.
    t = re.sub(r"[.،]{3,}", "…", t).strip()

    # If it becomes empty, fall back to a helpful default.
    if not t:
        t = "تمام—قولي انت فاضي ولا عندك شغل/مذاكرة النهارده؟"

    return t
91
 
92
+ # =========================
93
+ # 4) Qwen generation (in MSA for stability)
94
+ # =========================
95
  def qwen_generate_msa(msa_prompt: str, max_new_tokens: int, temperature: float, top_p: float) -> str:
96
  msa_prompt = (msa_prompt or "").strip()
97
  if not msa_prompt:
98
  return ""
99
 
100
+ # Behavior-first system message (MOST IMPORTANT CHANGE)
101
  system_msg = (
102
  "أنت مساعد شخصي عملي. "
103
+ "إذا كان سؤال المستخدم عامًا أو مفتوحًا، اقترح خطة أو خطوات عملية من نفسك فورًا "
104
  "بدون اعتذار وبدون تبرير لحدودك. "
105
  "اجعل الرد قصيرًا ومباشرًا ومفيدًا. "
106
  "اكتب باللغة العربية الفصحى البسيطة فقط."
 
116
  add_generation_prompt=True,
117
  return_tensors="pt"
118
  )
119
+
120
  if USE_GPU:
121
+ input_ids = input_ids.to(model.device)
122
 
123
  with torch.no_grad():
124
+ output_ids = model.generate(
125
  input_ids,
126
  max_new_tokens=max_new_tokens,
127
  do_sample=True,
 
131
  )
132
 
133
  gen_ids = output_ids[0][input_ids.shape[-1]:]
134
+ text = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
135
+ return text
136
 
137
# =========================
# 5) Core pipeline (stable + non-ambiguous)
# =========================
def _pipeline_from_text(user_text: str, max_new_tokens: int, temperature: float, top_p: float):
    """
    Input -> (to MSA) -> Qwen (MSA) -> (to Egyptian) -> clean.

    Returns a (msa_in, llm_msa, final_egy) tuple; all three are empty
    strings when the input is blank.
    """
    raw = (user_text or "").strip()
    if not raw:
        return "", "", ""

    # Normalize the input to MSA so the LLM sees stable phrasing.
    normalized = to_msa(raw)

    # The LLM answers in MSA (behavior pinned by its system prompt).
    reply_msa = qwen_generate_msa(normalized, max_new_tokens, temperature, top_p)

    # Translate the answer to Egyptian and strip meta/defensive phrasing.
    reply_egy = clean_egyptian(to_egyptian(reply_msa))
    return normalized, reply_msa, reply_egy
 
 
 
 
 
158
 
159
@spaces.GPU
def process_text(user_text: str, max_new_tokens: int, temperature: float, top_p: float, show_debug: bool):
    """Text-tab handler: run the full pipeline; optionally blank debug fields."""
    msa_in, llm_msa, final_egy = _pipeline_from_text(
        user_text, max_new_tokens, temperature, top_p
    )
    if not show_debug:
        # Hide the intermediate (debug) textboxes.
        msa_in, llm_msa = "", ""
    return msa_in, llm_msa, final_egy
 
168
 
169
@spaces.GPU
def process_audio(audio_path: str, max_new_tokens: int, temperature: float, top_p: float, show_debug: bool):
    """Audio-tab handler: transcribe, run the text pipeline, return 4 outputs."""
    empty = ("", "", "", "")
    if not audio_path:
        return empty

    # Whisper may return a dict ({"text": ...}) or a plain string.
    raw_out = asr(audio_path)
    transcript = (raw_out.get("text", "") if isinstance(raw_out, dict) else str(raw_out)).strip()
    if not transcript:
        return empty

    msa_in, llm_msa, final_egy = _pipeline_from_text(
        transcript, max_new_tokens, temperature, top_p
    )
    if not show_debug:
        # Keep only the ASR text and the final Egyptian reply visible.
        msa_in, llm_msa = "", ""
    return transcript, msa_in, llm_msa, final_egy
192
 
193
# =========================
# 6) Gradio UI
# =========================
with gr.Blocks(title="Egyptian Arabic Assistant") as demo:
    gr.Markdown(
        "## Egyptian Arabic Assistant\n"
        "منطق ثابت وواضح:\n"
        # Arrows restored here — the previous revision had them mangled by
        # an encoding issue (only the final "→" survived).
        "**Input → (to MSA) → Qwen (MSA) → (to Egyptian) → Output**\n\n"
        "السلوك: رد عملي ومباشر، بدون اعتذار وبدون كلام Meta."
    )

    # Generation controls shared by both tabs.
    with gr.Row():
        max_new_tokens = gr.Slider(64, 512, value=256, step=16, label="Max new tokens")
        temperature = gr.Slider(0.1, 1.2, value=0.7, step=0.05, label="Temperature")
        top_p = gr.Slider(0.5, 1.0, value=0.9, step=0.05, label="Top-p")
        show_debug = gr.Checkbox(value=False, label="Show debug outputs")

    with gr.Tabs():
        with gr.TabItem("Text Input"):
            txt_in = gr.Textbox(lines=4, placeholder="اكتب هنا (مصري/فصحى)", label="Input")
            txt_btn = gr.Button("Generate")

            dbg_msa_in = gr.Textbox(lines=2, label="(Debug) Input after to_msa")
            dbg_llm_msa = gr.Textbox(lines=3, label="(Debug) Qwen output (MSA)")
            out_egy = gr.Textbox(lines=5, label="Final Output (Egyptian)")

            txt_btn.click(
                process_text,
                inputs=[txt_in, max_new_tokens, temperature, top_p, show_debug],
                outputs=[dbg_msa_in, dbg_llm_msa, out_egy],
            )

        with gr.TabItem("Audio Input"):
            aud_in = gr.Audio(type="filepath", label="Upload Audio (WAV/MP3)")
            aud_btn = gr.Button("Transcribe + Generate")

            asr_txt = gr.Textbox(lines=2, label="ASR Text")
            dbg_msa_in_a = gr.Textbox(lines=2, label="(Debug) ASR after to_msa")
            dbg_llm_msa_a = gr.Textbox(lines=3, label="(Debug) Qwen output (MSA)")
            out_egy_a = gr.Textbox(lines=5, label="Final Output (Egyptian)")

            aud_btn.click(
                process_audio,
                inputs=[aud_in, max_new_tokens, temperature, top_p, show_debug],
                outputs=[asr_txt, dbg_msa_in_a, dbg_llm_msa_a, out_egy_a],
            )

demo.launch()