MuhammadHijazii committed on
Commit
8489329
·
verified ·
1 Parent(s): 8fff02a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -60
app.py CHANGED
@@ -12,15 +12,28 @@ from transformers import AutoTokenizer, AutoModel
12
  import soundfile as sf
13
 
14
  # =========================
15
- # Device & global config
16
  # =========================
17
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
18
- CPU_MODE = (DEVICE != "cuda")
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
- # أمان الذاكرة على CPU
21
- DEFAULT_WHISPER_CPU = "large-v3"
22
- DEFAULT_COMPUTE_CPU = "int8"
23
- DEFAULT_USE_MARBERT_CPU = True
 
24
 
25
  # =========================
26
  # Lazy models
@@ -33,28 +46,28 @@ _WHISPER = None
33
  def load_models(
34
  sbert_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
35
  marbert_name="UBC-NLP/MARBERT",
36
- whisper_name="small",
37
- whisper_compute="int8"
 
38
  ):
39
- """Load models only once."""
40
  global _SBERT, _MARBERT_TOK, _MARBERT, _WHISPER
41
 
42
- # حماية على CPU: اجبار نماذج أخف
43
- if CPU_MODE:
44
- whisper_name = DEFAULT_WHISPER_CPU
45
- whisper_compute = DEFAULT_COMPUTE_CPU
46
-
47
  if _SBERT is None:
48
- _SBERT = SentenceTransformer(sbert_name, device=DEVICE)
 
49
 
50
- # حمّل MARBERT فقط عند الحاجة (قد يستهلك RAM)
51
- if _MARBERT is None and (not CPU_MODE):
52
  _MARBERT_TOK = AutoTokenizer.from_pretrained(marbert_name)
53
- _MARBERT = AutoModel.from_pretrained(marbert_name).to(DEVICE)
54
  _MARBERT.eval()
 
55
 
56
  if _WHISPER is None:
57
- _WHISPER = WhisperModel(whisper_name, device=DEVICE, compute_type=whisper_compute)
 
 
58
 
59
  # =========================
60
  # Normalization / Tokenization / Alignment
@@ -68,7 +81,6 @@ def normalize_ar_orth(text: str) -> str:
68
  return text
69
 
70
  def simple_tokenize(text: str):
71
- """يحاول punkt؛ وإن فشل يستخدم تجزئة بسيطة بالمسافات."""
72
  t = normalize_ar_orth(text)
73
  try:
74
  import nltk
@@ -108,7 +120,8 @@ def arabic_soundex(word):
108
  for ch in w:
109
  for rep, chars in groups.items():
110
  if ch in chars:
111
- code.append(rep); break
 
112
  return "".join(code)
113
 
114
  def phonetic_similarity(w1, w2):
@@ -168,8 +181,8 @@ def marbert_cls_similarity(a: str, b: str) -> float:
168
  if _MARBERT is None:
169
  return 0.0
170
  with torch.no_grad():
171
- ta = _MARBERT_TOK(a, return_tensors='pt', truncation=True, padding=True).to(DEVICE)
172
- tb = _MARBERT_TOK(b, return_tensors='pt', truncation=True, padding=True).to(DEVICE)
173
  ea = _MARBERT(**ta).last_hidden_state[:,0,:]
174
  eb = _MARBERT(**tb).last_hidden_state[:,0,:]
175
  sim = util.cos_sim(ea, eb).item()
@@ -390,7 +403,7 @@ def literal_similarity(original, recited):
390
  return {"levenshtein": round(lev,3), "word_overlap": round(word_overlap,3),
391
  "bleu1": round(bleu1,3), "literal_score": round(final_score,3)}
392
 
393
- def semantic_similarity(original, recited, use_marbert=True):
394
  sbert_sim = float(util.pytorch_cos_sim(_SBERT.encode(original, convert_to_tensor=True),
395
  _SBERT.encode(recited, convert_to_tensor=True)))
396
  marbert_sim = marbert_cls_similarity(original, recited) if use_marbert else 0.0
@@ -398,10 +411,9 @@ def semantic_similarity(original, recited, use_marbert=True):
398
  "semantic_score": round(max(sbert_sim, marbert_sim),3)}
399
 
400
  # =========================
401
- # Audio input helper
402
  # =========================
403
  def ensure_audio_path(audio):
404
- """Accepts filepath (str) OR (numpy_array, sr). Returns a valid filepath."""
405
  if isinstance(audio, str):
406
  if not os.path.exists(audio):
407
  raise FileNotFoundError(f"Audio path not found: {audio}")
@@ -415,7 +427,7 @@ def ensure_audio_path(audio):
415
  raise ValueError("Unsupported audio input format")
416
 
417
  # =========================
418
- # Pipeline (with robust error reporting)
419
  # =========================
420
  def transcribe_and_evaluate(audio, original_text, whisper_size=None,
421
  compute_type=None, vad=True, use_marbert=True):
@@ -423,29 +435,29 @@ def transcribe_and_evaluate(audio, original_text, whisper_size=None,
423
  if not original_text or not original_text.strip():
424
  raise ValueError("Original text is empty.")
425
 
426
- # Defaults per device
427
- if CPU_MODE:
428
- whisper_size = DEFAULT_WHISPER_CPU
429
- compute_type = DEFAULT_COMPUTE_CPU
430
- use_marbert = DEFAULT_USE_MARBERT_CPU
431
- else:
432
- whisper_size = whisper_size or "large-v3"
433
- compute_type = compute_type or "float16"
434
 
435
- load_models(whisper_name=whisper_size, whisper_compute=compute_type)
436
 
437
  audio_path = ensure_audio_path(audio)
438
- segments, info = _WHISPER.transcribe(
439
- audio_path, word_timestamps=True,
440
- vad_filter=vad, vad_parameters={"min_silence_duration_ms": 200}
441
- )
442
  segments = list(segments)
 
443
 
 
444
  words = []
445
  for seg in segments:
446
  for w in (seg.words or []):
447
  tok = clean_ar_token(w.word)
448
- if tok: words.append(tok)
 
449
  asr_text = " ".join(words)
450
 
451
  ref_tokens = simple_tokenize(original_text)
@@ -454,6 +466,7 @@ def transcribe_and_evaluate(audio, original_text, whisper_size=None,
454
 
455
  df_words = extract_word_conf_table(segments)
456
  asr_token_conf, low_t, high_t = build_asr_token_conf(df_words, hyp_tokens)
 
457
 
458
  results, corrected_text = classify_alignment_optimized(
459
  aligned, ref_tokens, hyp_tokens,
@@ -462,33 +475,31 @@ def transcribe_and_evaluate(audio, original_text, whisper_size=None,
462
  )
463
 
464
  lit = literal_similarity(original_text, corrected_text)
465
- sem = semantic_similarity(original_text, corrected_text, use_marbert=(use_marbert and not CPU_MODE))
466
 
467
  df = pd.DataFrame(results)
468
 
469
  report = {
470
- "whisper_model": whisper_size,
471
- "compute_type": compute_type,
472
  "original_text": original_text,
473
  "asr_text": asr_text,
474
  "corrected_text": corrected_text,
475
  "literal": lit,
476
  "semantic": sem,
477
- "low_t": low_t, "high_t": high_t,
478
  }
479
  return corrected_text, asr_text, json.dumps(report, ensure_ascii=False, indent=2), df
480
 
481
  except Exception as e:
482
  tb = traceback.format_exc()
483
  print("ERROR in transcribe_and_evaluate:\n", tb, flush=True)
484
- # نرجع JSON بالخطأ بدل ما نفجّر الواجهة
485
  empty_df = pd.DataFrame([{"ASR_word":"","GT_word":"","status":"ERROR","reason":str(e),"used":""}])
486
  err_json = json.dumps({"error": str(e), "traceback": tb}, ensure_ascii=False, indent=2)
487
  gr.Warning(str(e))
488
  return "", "", err_json, empty_df
489
 
490
  def api_predict(audio, original_text, whisper_size=None, compute_type=None, vad=True, use_marbert=True):
491
- # نفس الدالة لكن ترجع JSON فقط
492
  corrected_text, asr_text, report_json, df = transcribe_and_evaluate(
493
  audio, original_text, whisper_size, compute_type, vad, use_marbert
494
  )
@@ -505,23 +516,15 @@ def build_ui():
505
  gr.Markdown("## Samaali — ASR Post-Processing (Whisper + Alignment + Confidence + Semantics)")
506
 
507
  with gr.Row():
508
- # filepath أسلم للـ Spaces
509
  audio = gr.Audio(sources=["microphone","upload"], type="filepath", label="Audio")
510
  original = gr.Textbox(lines=8, label="Original Text (Ground Truth)")
511
 
512
  with gr.Row():
513
- whisper_size = gr.Dropdown(
514
- choices=["tiny","base","small","medium","large-v3"],
515
- value=("large-v3" if not CPU_MODE else DEFAULT_WHISPER_CPU),
516
- label="Whisper model size"
517
- )
518
- compute_type = gr.Dropdown(
519
- choices=["int8", "int8_float16", "float16", "float32"],
520
- value=("float16" if not CPU_MODE else DEFAULT_COMPUTE_CPU),
521
- label="compute_type"
522
- )
523
  vad = gr.Checkbox(value=True, label="VAD filter")
524
- use_marbert = gr.Checkbox(value=(not CPU_MODE), label="Use MARBERT (semantic)")
525
 
526
  btn = gr.Button("Transcribe & Evaluate", variant="primary")
527
 
 
12
  import soundfile as sf
13
 
14
  # =========================
15
+ # Global config (forced per your request)
16
  # =========================
17
+ # Pin the required settings on CPU
18
+ FORCE_WHISPER_NAME = "large-v3"
19
+ FORCE_COMPUTE_TYPE = "int8"
20
+ FORCE_USE_MARBERT = True
21
+
22
+ # Fixed transcription options, to minimize differences from the notebook
23
+ ASR_OPTS = dict(
24
+ word_timestamps=True,
25
+ vad_filter=True,
26
+ vad_parameters={"min_silence_duration_ms": 200},
27
+ beam_size=5,
28
+ best_of=5,
29
+ temperature=0.0, # جعل فك التشفير حتمي قدر الإمكان
30
+ )
31
 
32
+ # =========================
33
+ # Device
34
+ # =========================
35
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
36
+ print(f"[INIT] DEVICE={DEVICE}", flush=True)
37
 
38
  # =========================
39
  # Lazy models
 
46
  def load_models(
47
  sbert_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
48
  marbert_name="UBC-NLP/MARBERT",
49
+ whisper_name=FORCE_WHISPER_NAME,
50
+ whisper_compute=FORCE_COMPUTE_TYPE,
51
+ use_marbert=FORCE_USE_MARBERT
52
  ):
53
+ """Load models once; forced config respected even on CPU."""
54
  global _SBERT, _MARBERT_TOK, _MARBERT, _WHISPER
55
 
 
 
 
 
 
56
  if _SBERT is None:
57
+ _SBERT = SentenceTransformer(sbert_name, device=("cuda" if DEVICE=="cuda" else "cpu"))
58
+ print(f"[LOAD] SBERT: {sbert_name}", flush=True)
59
 
60
+ # مفعّل على CPU حسب رغبتك
61
+ if _MARBERT is None and use_marbert:
62
  _MARBERT_TOK = AutoTokenizer.from_pretrained(marbert_name)
63
+ _MARBERT = AutoModel.from_pretrained(marbert_name).to(("cuda" if DEVICE=="cuda" else "cpu"))
64
  _MARBERT.eval()
65
+ print(f"[LOAD] MARBERT: {marbert_name} (device={DEVICE})", flush=True)
66
 
67
  if _WHISPER is None:
68
+ _WHISPER = WhisperModel(whisper_name, device=("cuda" if DEVICE=="cuda" else "cpu"),
69
+ compute_type=whisper_compute)
70
+ print(f"[LOAD] Whisper: {whisper_name} (compute={whisper_compute})", flush=True)
71
 
72
  # =========================
73
  # Normalization / Tokenization / Alignment
 
81
  return text
82
 
83
  def simple_tokenize(text: str):
 
84
  t = normalize_ar_orth(text)
85
  try:
86
  import nltk
 
120
  for ch in w:
121
  for rep, chars in groups.items():
122
  if ch in chars:
123
+ code.append(rep)
124
+ break
125
  return "".join(code)
126
 
127
  def phonetic_similarity(w1, w2):
 
181
  if _MARBERT is None:
182
  return 0.0
183
  with torch.no_grad():
184
+ ta = _MARBERT_TOK(a, return_tensors='pt', truncation=True, padding=True).to(("cuda" if DEVICE=="cuda" else "cpu"))
185
+ tb = _MARBERT_TOK(b, return_tensors='pt', truncation=True, padding=True).to(("cuda" if DEVICE=="cuda" else "cpu"))
186
  ea = _MARBERT(**ta).last_hidden_state[:,0,:]
187
  eb = _MARBERT(**tb).last_hidden_state[:,0,:]
188
  sim = util.cos_sim(ea, eb).item()
 
403
  return {"levenshtein": round(lev,3), "word_overlap": round(word_overlap,3),
404
  "bleu1": round(bleu1,3), "literal_score": round(final_score,3)}
405
 
406
+ def semantic_similarity(original, recited, use_marbert=FORCE_USE_MARBERT):
407
  sbert_sim = float(util.pytorch_cos_sim(_SBERT.encode(original, convert_to_tensor=True),
408
  _SBERT.encode(recited, convert_to_tensor=True)))
409
  marbert_sim = marbert_cls_similarity(original, recited) if use_marbert else 0.0
 
411
  "semantic_score": round(max(sbert_sim, marbert_sim),3)}
412
 
413
  # =========================
414
+ # Audio helper
415
  # =========================
416
  def ensure_audio_path(audio):
 
417
  if isinstance(audio, str):
418
  if not os.path.exists(audio):
419
  raise FileNotFoundError(f"Audio path not found: {audio}")
 
427
  raise ValueError("Unsupported audio input format")
428
 
429
  # =========================
430
+ # Pipeline (robust errors + logs)
431
  # =========================
432
  def transcribe_and_evaluate(audio, original_text, whisper_size=None,
433
  compute_type=None, vad=True, use_marbert=True):
 
435
  if not original_text or not original_text.strip():
436
  raise ValueError("Original text is empty.")
437
 
438
+ # نُهمل اختيارات الواجهة ونفرض إعداداتك
439
+ whisper_size = FORCE_WHISPER_NAME
440
+ compute_type = FORCE_COMPUTE_TYPE
441
+ use_marbert = FORCE_USE_MARBERT
442
+
443
+ print(f"[RUN] whisper={whisper_size}, compute={compute_type}, marbert={use_marbert}", flush=True)
 
 
444
 
445
+ load_models(whisper_name=whisper_size, whisper_compute=compute_type, use_marbert=use_marbert)
446
 
447
  audio_path = ensure_audio_path(audio)
448
+ print(f"[AUDIO] path={audio_path}", flush=True)
449
+
450
+ segments, info = _WHISPER.transcribe(audio_path, **ASR_OPTS)
 
451
  segments = list(segments)
452
+ print(f"[ASR] segments={len(segments)}", flush=True)
453
 
454
+ # Build ASR text from words (more control)
455
  words = []
456
  for seg in segments:
457
  for w in (seg.words or []):
458
  tok = clean_ar_token(w.word)
459
+ if tok:
460
+ words.append(tok)
461
  asr_text = " ".join(words)
462
 
463
  ref_tokens = simple_tokenize(original_text)
 
466
 
467
  df_words = extract_word_conf_table(segments)
468
  asr_token_conf, low_t, high_t = build_asr_token_conf(df_words, hyp_tokens)
469
+ print(f"[CONF] low_t={low_t:.3f}, high_t={high_t:.3f}", flush=True)
470
 
471
  results, corrected_text = classify_alignment_optimized(
472
  aligned, ref_tokens, hyp_tokens,
 
475
  )
476
 
477
  lit = literal_similarity(original_text, corrected_text)
478
+ sem = semantic_similarity(original_text, corrected_text, use_marbert=use_marbert)
479
 
480
  df = pd.DataFrame(results)
481
 
482
  report = {
483
+ "requested": {"whisper_model": whisper_size, "compute_type": compute_type, "use_marbert": use_marbert},
484
+ "effective": {"whisper_model": whisper_size, "compute_type": compute_type, "use_marbert": use_marbert},
485
  "original_text": original_text,
486
  "asr_text": asr_text,
487
  "corrected_text": corrected_text,
488
  "literal": lit,
489
  "semantic": sem,
490
+ "low_t": float(low_t), "high_t": float(high_t),
491
  }
492
  return corrected_text, asr_text, json.dumps(report, ensure_ascii=False, indent=2), df
493
 
494
  except Exception as e:
495
  tb = traceback.format_exc()
496
  print("ERROR in transcribe_and_evaluate:\n", tb, flush=True)
 
497
  empty_df = pd.DataFrame([{"ASR_word":"","GT_word":"","status":"ERROR","reason":str(e),"used":""}])
498
  err_json = json.dumps({"error": str(e), "traceback": tb}, ensure_ascii=False, indent=2)
499
  gr.Warning(str(e))
500
  return "", "", err_json, empty_df
501
 
502
  def api_predict(audio, original_text, whisper_size=None, compute_type=None, vad=True, use_marbert=True):
 
503
  corrected_text, asr_text, report_json, df = transcribe_and_evaluate(
504
  audio, original_text, whisper_size, compute_type, vad, use_marbert
505
  )
 
516
  gr.Markdown("## Samaali — ASR Post-Processing (Whisper + Alignment + Confidence + Semantics)")
517
 
518
  with gr.Row():
 
519
  audio = gr.Audio(sources=["microphone","upload"], type="filepath", label="Audio")
520
  original = gr.Textbox(lines=8, label="Original Text (Ground Truth)")
521
 
522
  with gr.Row():
523
+ # واجهة ثابتة حسب طلبك (تُهمل في الدالة لكن نعرضها)
524
+ whisper_size = gr.Dropdown(choices=["large-v3"], value="large-v3", label="Whisper model size (forced)")
525
+ compute_type = gr.Dropdown(choices=["int8"], value="int8", label="compute_type (forced)")
 
 
 
 
 
 
 
526
  vad = gr.Checkbox(value=True, label="VAD filter")
527
+ use_marbert = gr.Checkbox(value=True, label="Use MARBERT (forced)")
528
 
529
  btn = gr.Button("Transcribe & Evaluate", variant="primary")
530