Update app.py
Browse files
app.py
CHANGED
|
@@ -286,9 +286,21 @@ def classify_pair(ref_w, hyp_w, bert_scores, phon_sim, lev1, short_word,
|
|
| 286 |
return 'ASR error (semantic/phonetic)'
|
| 287 |
return 'Memorization error'
|
| 288 |
|
| 289 |
-
def classify_alignment_optimized(
|
| 290 |
-
|
| 291 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
if low_high is None:
|
| 293 |
if asr_token_conf:
|
| 294 |
probs = [v["prob"] for v in asr_token_conf.values() if v["prob"] is not None]
|
|
@@ -303,29 +315,33 @@ def classify_alignment_optimized(aligned, ref_tokens, hyp_tokens,
|
|
| 303 |
low_t, high_t = low_high
|
| 304 |
|
| 305 |
results, corrected_words = [], []
|
|
|
|
| 306 |
|
| 307 |
for entry in aligned:
|
| 308 |
tag = entry['type']
|
| 309 |
-
i1, i2 = entry.get('ref_idx', (None,None))
|
| 310 |
-
j1, j2 = entry.get('hyp_idx', (None,None))
|
| 311 |
|
| 312 |
if tag == 'equal':
|
| 313 |
for ref_w, hyp_w in zip(entry['ref'], entry['hyp']):
|
| 314 |
results.append({'ASR_word': hyp_w, 'GT_word': ref_w, 'status': 'Correct', 'reason': ''})
|
| 315 |
corrected_words.append(hyp_w)
|
|
|
|
| 316 |
elif tag in ['replace', 'delete', 'insert']:
|
| 317 |
max_len = max(len(entry['ref']), len(entry['hyp']))
|
| 318 |
for k in range(max_len):
|
| 319 |
ref_w = entry['ref'][k] if k < len(entry['ref']) else ''
|
| 320 |
hyp_w = entry['hyp'][k] if k < len(entry['hyp']) else ''
|
| 321 |
-
if not ref_w and not hyp_w:
|
| 322 |
continue
|
| 323 |
|
|
|
|
| 324 |
phon_sim = phonetic_similarity(ref_w, hyp_w) if ref_w and hyp_w else False
|
| 325 |
lev1 = is_levenshtein_1(ref_w, hyp_w) if ref_w and hyp_w else False
|
| 326 |
bert_scores = multi_bert_similarity(ref_w, hyp_w) if ref_w and hyp_w else {"sbert":0,"marbert":0,"max":0,"avg":0}
|
| 327 |
short_word = bool(ref_w and hyp_w and max(len(ref_w), len(hyp_w)) <= 6)
|
| 328 |
|
|
|
|
| 329 |
if ref_w and hyp_w:
|
| 330 |
base_status = classify_pair(ref_w, hyp_w, bert_scores, phon_sim, lev1, short_word,
|
| 331 |
bert_thresh, max_bert)
|
|
@@ -336,6 +352,7 @@ def classify_alignment_optimized(aligned, ref_tokens, hyp_tokens,
|
|
| 336 |
else:
|
| 337 |
base_status = 'Undefined Case'
|
| 338 |
|
|
|
|
| 339 |
word_prob = None; word_dur = None
|
| 340 |
if (j1 is not None) and (j2 is not None):
|
| 341 |
hyp_abs_idx = j1 + k
|
|
@@ -353,14 +370,30 @@ def classify_alignment_optimized(aligned, ref_tokens, hyp_tokens,
|
|
| 353 |
low_t=low_t, high_t=high_t, sbert_lo=0.60
|
| 354 |
)
|
| 355 |
|
|
|
|
| 356 |
used = hyp_w
|
|
|
|
| 357 |
if ref_w and hyp_w:
|
| 358 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 359 |
elif hyp_w == '':
|
| 360 |
-
used = ''
|
| 361 |
elif ref_w == '':
|
| 362 |
-
used = hyp_w
|
| 363 |
|
|
|
|
| 364 |
reason = (f'Phonetic={phon_sim}, Lev1={lev1}, '
|
| 365 |
f'SBERT={bert_scores["sbert"]:.2f}, '
|
| 366 |
f'MARBERT={bert_scores["marbert"]:.2f}, '
|
|
@@ -370,8 +403,15 @@ def classify_alignment_optimized(aligned, ref_tokens, hyp_tokens,
|
|
| 370 |
f'dur_ms={None if word_dur is None else int(word_dur)}, '
|
| 371 |
f'low_t={round(low_t,2)}, high_t={round(high_t,2)}')
|
| 372 |
|
| 373 |
-
|
| 374 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 375 |
if used:
|
| 376 |
corrected_words.append(used)
|
| 377 |
|
|
@@ -424,9 +464,105 @@ def ensure_audio_path(audio):
|
|
| 424 |
return tmp.name
|
| 425 |
raise ValueError("Unsupported audio input format")
|
| 426 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 427 |
# =========================
|
| 428 |
# Pipeline (robust errors + logs)
|
| 429 |
# =========================
|
|
|
|
| 430 |
def transcribe_and_evaluate(audio, original_text, whisper_size=None,
|
| 431 |
compute_type=None, vad=True, use_marbert=True):
|
| 432 |
try:
|
|
@@ -462,6 +598,12 @@ def transcribe_and_evaluate(audio, original_text, whisper_size=None,
|
|
| 462 |
hyp_tokens = simple_tokenize(asr_text)
|
| 463 |
aligned = align_texts(ref_tokens, hyp_tokens)
|
| 464 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
df_words = extract_word_conf_table(segments)
|
| 466 |
asr_token_conf, low_t, high_t = build_asr_token_conf(df_words, hyp_tokens)
|
| 467 |
print(f"[CONF] low_t={low_t:.3f}, high_t={high_t:.3f}", flush=True)
|
|
@@ -469,7 +611,9 @@ def transcribe_and_evaluate(audio, original_text, whisper_size=None,
|
|
| 469 |
results, corrected_text = classify_alignment_optimized(
|
| 470 |
aligned, ref_tokens, hyp_tokens,
|
| 471 |
bert_thresh=0.75, max_bert=0.85,
|
| 472 |
-
asr_token_conf=asr_token_conf, low_high=(low_t, high_t)
|
|
|
|
|
|
|
| 473 |
)
|
| 474 |
|
| 475 |
lit = literal_similarity(original_text, corrected_text)
|
|
@@ -480,6 +624,7 @@ def transcribe_and_evaluate(audio, original_text, whisper_size=None,
|
|
| 480 |
report = {
|
| 481 |
"requested": {"whisper_model": whisper_size, "compute_type": compute_type, "use_marbert": use_marbert},
|
| 482 |
"effective": {"whisper_model": whisper_size, "compute_type": compute_type, "use_marbert": use_marbert},
|
|
|
|
| 483 |
"original_text": original_text,
|
| 484 |
"asr_text": asr_text,
|
| 485 |
"corrected_text": corrected_text,
|
|
|
|
| 286 |
return 'ASR error (semantic/phonetic)'
|
| 287 |
return 'Memorization error'
|
| 288 |
|
| 289 |
+
def classify_alignment_optimized(
|
| 290 |
+
aligned, ref_tokens, hyp_tokens,
|
| 291 |
+
bert_thresh=0.75, max_bert=0.85,
|
| 292 |
+
asr_token_conf=None, low_high=None,
|
| 293 |
+
replace_budget_tokens=None, # NEW: سقف الاستبدال (int أو None)
|
| 294 |
+
guard_note=None # NEW: وسم حر (مثلاً: "off-topic" أو "ok")
|
| 295 |
+
):
|
| 296 |
+
"""
|
| 297 |
+
مصنّف المحاذاة مع دعم 'سقف الاستبدال'.
|
| 298 |
+
- إذا replace_budget_tokens=None → لا يوجد سقف.
|
| 299 |
+
- إذا replace_budget_tokens=0 → لا يتم أي استبدال حتى لو كانت الحالة ASR error.
|
| 300 |
+
- عند بلوغ السقف نحتفظ بكلمة الطالب ونضيف "[guard: budget reached]" على الحالة.
|
| 301 |
+
- guard_note (اختياري) يُضاف للـ reason لتوثيق قرار الحارس العالمي.
|
| 302 |
+
"""
|
| 303 |
+
# --- thresholds من احتمالات الكلمات ---
|
| 304 |
if low_high is None:
|
| 305 |
if asr_token_conf:
|
| 306 |
probs = [v["prob"] for v in asr_token_conf.values() if v["prob"] is not None]
|
|
|
|
| 315 |
low_t, high_t = low_high
|
| 316 |
|
| 317 |
results, corrected_words = [], []
|
| 318 |
+
replaced_count = 0 # NEW: عدّاد الاستبدالات الفعلية
|
| 319 |
|
| 320 |
for entry in aligned:
|
| 321 |
tag = entry['type']
|
| 322 |
+
i1, i2 = entry.get('ref_idx', (None, None))
|
| 323 |
+
j1, j2 = entry.get('hyp_idx', (None, None))
|
| 324 |
|
| 325 |
if tag == 'equal':
|
| 326 |
for ref_w, hyp_w in zip(entry['ref'], entry['hyp']):
|
| 327 |
results.append({'ASR_word': hyp_w, 'GT_word': ref_w, 'status': 'Correct', 'reason': ''})
|
| 328 |
corrected_words.append(hyp_w)
|
| 329 |
+
|
| 330 |
elif tag in ['replace', 'delete', 'insert']:
|
| 331 |
max_len = max(len(entry['ref']), len(entry['hyp']))
|
| 332 |
for k in range(max_len):
|
| 333 |
ref_w = entry['ref'][k] if k < len(entry['ref']) else ''
|
| 334 |
hyp_w = entry['hyp'][k] if k < len(entry['hyp']) else ''
|
| 335 |
+
if not ref_w and not hyp_w:
|
| 336 |
continue
|
| 337 |
|
| 338 |
+
# --- similarities ---
|
| 339 |
phon_sim = phonetic_similarity(ref_w, hyp_w) if ref_w and hyp_w else False
|
| 340 |
lev1 = is_levenshtein_1(ref_w, hyp_w) if ref_w and hyp_w else False
|
| 341 |
bert_scores = multi_bert_similarity(ref_w, hyp_w) if ref_w and hyp_w else {"sbert":0,"marbert":0,"max":0,"avg":0}
|
| 342 |
short_word = bool(ref_w and hyp_w and max(len(ref_w), len(hyp_w)) <= 6)
|
| 343 |
|
| 344 |
+
# --- base status ---
|
| 345 |
if ref_w and hyp_w:
|
| 346 |
base_status = classify_pair(ref_w, hyp_w, bert_scores, phon_sim, lev1, short_word,
|
| 347 |
bert_thresh, max_bert)
|
|
|
|
| 352 |
else:
|
| 353 |
base_status = 'Undefined Case'
|
| 354 |
|
| 355 |
+
# --- word-level confidence gate ---
|
| 356 |
word_prob = None; word_dur = None
|
| 357 |
if (j1 is not None) and (j2 is not None):
|
| 358 |
hyp_abs_idx = j1 + k
|
|
|
|
| 370 |
low_t=low_t, high_t=high_t, sbert_lo=0.60
|
| 371 |
)
|
| 372 |
|
| 373 |
+
# --- choose token to use (with budget) ---
|
| 374 |
used = hyp_w
|
| 375 |
+
budget_info = ""
|
| 376 |
if ref_w and hyp_w:
|
| 377 |
+
if final_status.startswith("ASR error"):
|
| 378 |
+
# نتحقق من السقف
|
| 379 |
+
if (replace_budget_tokens is None) or (replaced_count < replace_budget_tokens):
|
| 380 |
+
used = ref_w
|
| 381 |
+
replaced_count += 1
|
| 382 |
+
if replace_budget_tokens is not None:
|
| 383 |
+
budget_info = f", budget={replaced_count}/{replace_budget_tokens}"
|
| 384 |
+
else:
|
| 385 |
+
# تجاوز السقف → لا نستبدل
|
| 386 |
+
used = hyp_w
|
| 387 |
+
final_status += " [guard: budget reached]"
|
| 388 |
+
budget_info = f", budget={replaced_count}/{replace_budget_tokens}"
|
| 389 |
+
else:
|
| 390 |
+
used = hyp_w
|
| 391 |
elif hyp_w == '':
|
| 392 |
+
used = '' # حذف
|
| 393 |
elif ref_w == '':
|
| 394 |
+
used = hyp_w # إدراج
|
| 395 |
|
| 396 |
+
# --- reason string ---
|
| 397 |
reason = (f'Phonetic={phon_sim}, Lev1={lev1}, '
|
| 398 |
f'SBERT={bert_scores["sbert"]:.2f}, '
|
| 399 |
f'MARBERT={bert_scores["marbert"]:.2f}, '
|
|
|
|
| 403 |
f'dur_ms={None if word_dur is None else int(word_dur)}, '
|
| 404 |
f'low_t={round(low_t,2)}, high_t={round(high_t,2)}')
|
| 405 |
|
| 406 |
+
if guard_note:
|
| 407 |
+
reason += f", guard='{guard_note}'"
|
| 408 |
+
if budget_info:
|
| 409 |
+
reason += budget_info
|
| 410 |
+
|
| 411 |
+
results.append({
|
| 412 |
+
'ASR_word': hyp_w, 'GT_word': ref_w,
|
| 413 |
+
'status': final_status, 'reason': reason, 'used': used
|
| 414 |
+
})
|
| 415 |
if used:
|
| 416 |
corrected_words.append(used)
|
| 417 |
|
|
|
|
| 464 |
return tmp.name
|
| 465 |
raise ValueError("Unsupported audio input format")
|
| 466 |
|
| 467 |
+
|
| 468 |
+
# =========================
|
| 469 |
+
#
|
| 470 |
+
# =========================
|
| 471 |
+
|
| 472 |
+
|
| 473 |
+
def lcs_len(a, b):
    """Return the length of the Longest Common Subsequence at token level.

    Args:
        a: first sequence of tokens.
        b: second sequence of tokens.

    Returns:
        int: LCS length (0 when either sequence is empty).
    """
    # Classic LCS dynamic program, but keep only two rows instead of the
    # full (m+1) x (n+1) table: same O(m*n) time, O(n) memory.
    m, n = len(a), len(b)
    prev = [0] * (n + 1)
    for i in range(1, m + 1):
        curr = [0] * (n + 1)
        ai = a[i - 1]
        for j in range(1, n + 1):
            if ai == b[j - 1]:
                curr[j] = prev[j - 1] + 1
            else:
                # Best of dropping one token from either side.
                curr[j] = prev[j] if prev[j] >= curr[j - 1] else curr[j - 1]
        prev = curr
    return prev[n]
|
| 485 |
+
|
| 486 |
+
def rouge_l_f1_tokens(ref_tokens, hyp_tokens, beta=1.2):
    """Token-level approximation of ROUGE-L.

    Args:
        ref_tokens: reference token sequence.
        hyp_tokens: hypothesis token sequence.
        beta: recall weight in the F-measure (default 1.2).

    Returns:
        (f1, precision, recall) floats; all zeros when either side is
        empty or there is no common subsequence.
    """
    if not ref_tokens or not hyp_tokens:
        return 0.0, 0.0, 0.0
    overlap = lcs_len(ref_tokens, hyp_tokens)
    precision = overlap / len(hyp_tokens)
    recall = overlap / len(ref_tokens)
    if precision == 0 and recall == 0:
        return 0.0, 0.0, 0.0
    b2 = beta ** 2
    # Weighted harmonic mean; tiny epsilon guards the denominator.
    f1 = ((1 + b2) * precision * recall) / (recall + b2 * precision + 1e-12)
    return float(f1), float(precision), float(recall)
|
| 497 |
+
|
| 498 |
+
def compute_wer_like(aligned, ref_tokens_len):
    """Simplified WER derived from alignment opcodes: (S + D + I) / N.

    Args:
        aligned: list of opcode dicts with 'type', 'ref', 'hyp' entries.
        ref_tokens_len: number of reference tokens (N); clamped to >= 1.

    Returns:
        float: error rate (may exceed 1.0 with many insertions).
    """
    subs = dels = ins = 0
    for op in aligned:
        kind = op['type']
        if kind == 'replace':
            # Count the wider side of the replaced span as substitutions.
            subs += max(len(op['ref']), len(op['hyp']))
        elif kind == 'delete':
            dels += len(op['ref'])
        elif kind == 'insert':
            ins += len(op['hyp'])
    return (subs + dels + ins) / max(ref_tokens_len, 1)
|
| 510 |
+
|
| 511 |
+
def global_offtopic_guard(original_text, asr_text, ref_tokens, hyp_tokens, aligned, sbert_model):
    """Global off-topic guard for the correction pipeline.

    Compares the reference text with the ASR hypothesis via full-text SBERT
    similarity, a token-level ROUGE-L approximation, the exact-match ratio
    from the alignment, and a simplified WER, then decides whether the
    utterance is off-topic and how many token replacements are allowed.

    Returns:
        dict with keys:
            off_topic (bool): True when the hypothesis looks unrelated.
            budget_tokens (int): max number of GT replacements allowed.
            metrics (dict): rounded metrics for the report.
    """
    # Sentence-level SBERT cosine similarity over the full texts.
    ref_emb = sbert_model.encode(original_text, convert_to_tensor=True)
    hyp_emb = sbert_model.encode(asr_text, convert_to_tensor=True)
    sim_full = float(util.pytorch_cos_sim(ref_emb, hyp_emb))

    # Token-level ROUGE-L (F1 / precision / recall).
    f1, prec, rec = rouge_l_f1_tokens(ref_tokens, hyp_tokens)

    # Share of reference tokens matched exactly by the alignment.
    matched = sum(len(op['ref']) for op in aligned if op['type'] == 'equal')
    match_ratio = matched / max(len(ref_tokens), 1)

    # Simplified WER from the same alignment.
    wer_val = compute_wer_like(aligned, len(ref_tokens))

    # Conservative off-topic rule: declare off-topic only when SBERT,
    # ROUGE-L F1 and exact-match ratio are all low, or WER is very high.
    is_off = (sim_full < 0.70 and f1 < 0.45 and match_ratio < 0.25) or (wer_val > 0.65)

    # Replacement budget (max hypothesis tokens that may be swapped for GT):
    # off-topic -> 0, moderate similarity -> 15% of |hyp|, otherwise 40%.
    hyp_len = len(hyp_tokens)
    if is_off:
        cap = 0
    elif sim_full < 0.80 or f1 < 0.55:
        cap = int(0.15 * hyp_len)
    else:
        cap = int(0.40 * hyp_len)

    stats = {
        "sbert_sim_text": round(sim_full, 3),
        "rougeL_f1": round(f1, 3),
        "rougeL_prec": round(prec, 3),
        "rougeL_rec": round(rec, 3),
        "equal_ratio": round(match_ratio, 3),
        "wer_like": round(wer_val, 3),
    }
    print(f"[GUARD] off_topic={is_off}, budget={cap}, metrics={stats}", flush=True)
    return {"off_topic": is_off, "budget_tokens": cap, "metrics": stats}
|
| 560 |
+
|
| 561 |
+
|
| 562 |
# =========================
|
| 563 |
# Pipeline (robust errors + logs)
|
| 564 |
# =========================
|
| 565 |
+
|
| 566 |
def transcribe_and_evaluate(audio, original_text, whisper_size=None,
|
| 567 |
compute_type=None, vad=True, use_marbert=True):
|
| 568 |
try:
|
|
|
|
| 598 |
hyp_tokens = simple_tokenize(asr_text)
|
| 599 |
aligned = align_texts(ref_tokens, hyp_tokens)
|
| 600 |
|
| 601 |
+
# --- Global guard ---
|
| 602 |
+
guard = global_offtopic_guard(original_text, asr_text, ref_tokens, hyp_tokens, aligned, _SBERT)
|
| 603 |
+
off_topic = guard["off_topic"]
|
| 604 |
+
budget_tokens = guard["budget_tokens"]
|
| 605 |
+
guard_metrics = guard["metrics"]
|
| 606 |
+
|
| 607 |
df_words = extract_word_conf_table(segments)
|
| 608 |
asr_token_conf, low_t, high_t = build_asr_token_conf(df_words, hyp_tokens)
|
| 609 |
print(f"[CONF] low_t={low_t:.3f}, high_t={high_t:.3f}", flush=True)
|
|
|
|
| 611 |
results, corrected_text = classify_alignment_optimized(
|
| 612 |
aligned, ref_tokens, hyp_tokens,
|
| 613 |
bert_thresh=0.75, max_bert=0.85,
|
| 614 |
+
asr_token_conf=asr_token_conf, low_high=(low_t, high_t),
|
| 615 |
+
replace_budget_tokens=budget_tokens, # ← عدد استبدالات أقصى
|
| 616 |
+
guard_note=("off-topic" if off_topic else "ok")
|
| 617 |
)
|
| 618 |
|
| 619 |
lit = literal_similarity(original_text, corrected_text)
|
|
|
|
| 624 |
report = {
|
| 625 |
"requested": {"whisper_model": whisper_size, "compute_type": compute_type, "use_marbert": use_marbert},
|
| 626 |
"effective": {"whisper_model": whisper_size, "compute_type": compute_type, "use_marbert": use_marbert},
|
| 627 |
+
"guard": {"off_topic": off_topic,"budget_tokens": int(budget_tokens),**guard_metrics},
|
| 628 |
"original_text": original_text,
|
| 629 |
"asr_text": asr_text,
|
| 630 |
"corrected_text": corrected_text,
|