MuhammadHijazii committed on
Commit b9b7670 · verified · 1 Parent(s): dcbe586

Upload app.py

Files changed (1)
  1. app.py +569 -0
app.py ADDED
@@ -0,0 +1,569 @@
+ # -*- coding: utf-8 -*-
+ """faster_whisper_large_v3_post_process_with_advanced.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+ https://colab.research.google.com/drive/1-rZQ9JZaDiAZfrH4yoan2Cwv6pdhoOF1
+
+ # Import libs and models
+ """
+
+ # Commented out IPython magic to ensure Python compatibility.
+ # %%capture
+ # !pip install transformers datasets soundfile torch
+ # !pip install pyxDamerauLevenshtein
+ # !pip install evaluate torchaudio soundfile
+ # !pip install jiwer
+ # !pip install textdistance
+ # !pip install editdistance
+ # !pip install -q bert-score sentence-transformers
+ # !pip install faster-whisper
+
+ import torch
+ import soundfile as sf
+ from datasets import load_dataset
+ import evaluate
+ import torchaudio
+ import editdistance
+ from difflib import SequenceMatcher
+ from IPython.display import display, HTML
+ import numpy as np
+ import pandas as pd
+ import re
+ import nltk
+ nltk.download('punkt')
+ nltk.download('punkt_tab')
+ from collections import Counter
+ from bert_score import score
+ import textdistance
+ from sentence_transformers import SentenceTransformer, util
+ from faster_whisper import WhisperModel
+
+ sbert_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
+
+ """# Whisper"""
+
+ dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ model = WhisperModel("large-v3", device=str(dev), compute_type="int8")
+
+ segments, info = model.transcribe(
+     "Test1.m4a",
+     word_timestamps=True,   # very important: needed for per-word probabilities
+     vad_filter=True,        # optional: filter out silence/noise
+     vad_parameters={"min_silence_duration_ms": 200},
+ )
+ segments = list(segments)
+
+ def clean_ar_token(t: str) -> str:
+     t = t.strip()
+     # strip non-letter characters from both ends
+     t = re.sub(r'^[^\w\u0600-\u06FF]+|[^\w\u0600-\u06FF]+$', '', t)
+     return t
+
+ # collect the words in order
+ words = []
+ for seg in segments:
+     if seg.words:
+         for w in seg.words:
+             tok = clean_ar_token(w.word)
+             if tok:
+                 words.append(tok)
+
+ transcript = " ".join(words)
+
+ # final cleanup
+ transcript = re.sub(r"\s+", " ", transcript).strip()
+ transcript = re.sub(r"\s+([،,\.!?؟])", r"\1", transcript)
+
+ print(transcript)
+
+ reference_text = "التصلب اللويحي المتعدد: يظهر المرض بين سن (30 - 40) وهو تنكس عصبي، سببه: فقدان خلايا الدبق قليلة الاستطالات، وتفككها إلى صفائح متصلّبة نتيجة مرض مناعي ذاتي كما في الشكل المجاور، تنتج الأعراض من زوال غمد النخاعين في مناطق متعددة من المادة البيضاء للجهاز العصبي المركزي. فيحسّ المريض بصدمة كهربائية عند تحريك العنق."  # Replace with your reference text
+ predicted_text = transcript
+ wer_metric = evaluate.load("wer")
+ cer_metric = evaluate.load("cer")
+
+ wer_score = wer_metric.compute(predictions=[predicted_text], references=[reference_text])
+ cer_score = cer_metric.compute(predictions=[predicted_text], references=[reference_text])
+
+ edit_distance = editdistance.eval(predicted_text, reference_text)
+
+ print("WER - word error rate: {:.2%}".format(wer_score))
+ print("CER - character error rate: {:.2%}".format(cer_score))
+ print(f"Edit Distance - number of edits required: {edit_distance}")
+
+ """# Post-process
+
+ ## Text analysis
+
+ ### Normalize and tokenization
+ """
+
+ def normalize_arabic(text):
+     # remove diacritics and punctuation, and unify some letter forms
+     text = re.sub(r"[ًٌٍَُِّْـ]", "", text)
+     text = re.sub(r"[“”\"',:؛؟.!()\[\]{}،\-–—_]", " ", text)
+     text = re.sub(r"[إأٱآا]", "ا", text)
+     text = text.replace("ة", "ه").replace("ى", "ي")
+     text = re.sub(r"\s+", " ", text).strip()
+     return text
+
+ def simple_tokenize(text):
+     return nltk.word_tokenize(normalize_arabic(text))
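+
+ # Illustrative check on a hypothetical input (not pipeline data):
+ # normalize_arabic("المَدْرَسَةُ، الكُبْرى!")  -> "المدرسه الكبري"
+ # simple_tokenize("المَدْرَسَةُ، الكُبْرى!")   -> ["المدرسه", "الكبري"]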
+
+ def align_texts(ref_tokens, hyp_tokens):
+     import difflib
+     sm = difflib.SequenceMatcher(None, ref_tokens, hyp_tokens)
+     aligned = []
+     for tag, i1, i2, j1, j2 in sm.get_opcodes():
+         aligned.append({'type': tag, 'ref': ref_tokens[i1:i2], 'hyp': hyp_tokens[j1:j2],
+                         'ref_idx': (i1, i2), 'hyp_idx': (j1, j2)})
+     return aligned
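+
+ # Illustrative output on toy token lists (hypothetical, not pipeline data):
+ # align_texts(["a", "b", "c"], ["a", "x", "c"]) yields entries like
+ # {'type': 'equal',   'ref': ['a'], 'hyp': ['a'], 'ref_idx': (0, 1), 'hyp_idx': (0, 1)}
+ # {'type': 'replace', 'ref': ['b'], 'hyp': ['x'], 'ref_idx': (1, 2), 'hyp_idx': (1, 2)}
+ # {'type': 'equal',   'ref': ['c'], 'hyp': ['c'], 'ref_idx': (2, 3), 'hyp_idx': (2, 3)}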
+
+ """### Soundex and Levenshtein similarity"""
+
+ def arabic_soundex(word):
+     word = normalize_arabic(word)
+     replacements = {
+         'بف': 'b', 'جشص': 'j', 'دض': 'd', 'طت': 't', 'قغ': 'q', 'كخ': 'k', 'سصز': 's',
+         'ثذظ': 'z', 'ح': 'h', 'ع': 'a', 'م': 'm', 'ن': 'n', 'ل': 'l', 'ر': 'r',
+         'ه': 'h', 'و': 'w', 'ي': 'y'
+     }
+     result = ""
+     for c in word:
+         for group, rep in replacements.items():
+             if c in group:
+                 result += rep
+                 break
+     return result
+
+ def phonetic_similarity(w1, w2):
+     return arabic_soundex(w1) == arabic_soundex(w2)
+
+ def is_levenshtein_1(w1, w2):
+     return textdistance.levenshtein(w1, w2) == 1
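+
+ # Quick sanity checks on hypothetical word pairs (not from the test audio):
+ # 'ق' and 'غ' share the code 'q', so both words collapse to the same key "ql":
+ # phonetic_similarity("قال", "غال") -> True
+ # One appended character is a single edit:
+ # is_levenshtein_1("كتب", "كتبت") -> True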
+
+ """### Number study"""
+
+ AR_DIGITS = str.maketrans("٠١٢٣٤٥٦٧٨٩", "0123456789")
+
+ # convert Arabic-Indic digits, e.g. "١٢٣" -> "123"
+ def normalize_digits(s: str) -> str:
+     return s.translate(AR_DIGITS)
+
+ # simplified dictionaries for units, tens, hundreds and scales
+ # (extend them gradually to match your source text)
+ UNITS = {"صفر":0,"واحد":1,"واحدة":1,"اثنان":2,"اثنين":2,"اثنتان":2,"اثنتين":2,
+          "ثلاث":3,"ثلاثة":3,"أربع":4,"اربعة":4,"أربعة":4,"خمس":5,"خمسة":5,
+          "ست":6,"ستة":6,"سبع":7,"سبعة":7,"ثمان":8,"ثماني":8,"ثمانية":8,
+          "تسع":9,"تسعة":9}
+ TENS = {"عشر":10,"عشرة":10,"عشرون":20,"عشرين":20,"ثلاثون":30,"ثلاثين":30,"الثلاثين":30,
+         "أربعون":40,"اربعون":40,
+         "الأربعين":40,  # fixed: was mapped to 30
+         "خمسون":50,"ستون":60,"سبعون":70,"ثمانون":80,"تسعون":90}
+ HUND = {"مئة":100,"مائه":100,"مائة":100,"مئه":100,"مئ":100}
+ SCALE = {"ألف":1000,"الف":1000,"ألاف":1000,"آلاف":1000,"مليون":10**6,"مليار":10**9}
+
+ def normalize_ar_orth(text: str) -> str:
+     text = re.sub(r"[ًٌٍَُِّْـ]", "", text)
+     text = re.sub(r"[“”\",:؛؟.!()\[\]{}،\-–—_]", " ", text)
+     # hamza variants and simple normalization
+     text = re.sub("[إأٱآا]", "ا", text)
+     text = text.replace("ة","ه").replace("ى","ي")
+     text = re.sub(r"\s+", " ", text).strip()
+     return text
+
+ # Bug fix: normalize the dictionary keys as well, so lookups match the
+ # normalized tokens (e.g. "ثلاثة" -> "ثلاثه", "أربعون" -> "اربعون");
+ # otherwise keys containing ة/أ/آ could never match a normalized token.
+ UNITS = {normalize_ar_orth(k): v for k, v in UNITS.items()}
+ TENS = {normalize_ar_orth(k): v for k, v in TENS.items()}
+ HUND = {normalize_ar_orth(k): v for k, v in HUND.items()}
+ SCALE = {normalize_ar_orth(k): v for k, v in SCALE.items()}
+
+ def words_to_number(tokens):
+     """Simplified converter: handles compounds such as 'مئه و ثلاثه و عشرون'
+     (one hundred and twenty-three); extend it to match your texts."""
+     total = 0; current = 0
+     for w in tokens:
+         if w in UNITS: current += UNITS[w]
+         elif w in TENS: current += TENS[w]
+         elif w in HUND: current += HUND[w]
+         elif w in SCALE:
+             current = max(1, current) * SCALE[w]
+             total += current; current = 0
+         elif w == "و":
+             continue
+         else:
+             # not a number word; close the current group
+             total += current; current = 0
+     total += current
+     return total if total != 0 else None
+
+ def to_numeric_value(token: str):
+     """Try to convert a token to a numeric value (digits or number words)."""
+     t = normalize_ar_orth(token)
+     d = normalize_digits(t)
+     if re.fullmatch(r"\d+", d):  # plain digits
+         return int(d)
+     # convert number word(s)
+     toks = t.split()
+     val = words_to_number(toks)
+     return val
+
+ def is_number_token(w):
+     return to_numeric_value(w) is not None
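+
+ # Worked examples on hypothetical inputs (with the normalized dictionaries):
+ # to_numeric_value("٢٣")                   -> 23   (digit path)
+ # to_numeric_value("مئه و ثلاثه و عشرون")  -> 123  (100 + 3 + 20)
+ # to_numeric_value("كتاب")                 -> None (not a number)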
+
+ """### SBERT and MARBERT-CLS"""
+
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+ from sentence_transformers import util
+
+ _mar_name = "UBC-NLP/MARBERT"
+ _mar_tok = AutoTokenizer.from_pretrained(_mar_name)
+ _mar_model = AutoModel.from_pretrained(_mar_name)
+
+ def marbert_cls_similarity(a: str, b: str) -> float:
+     if not a or not b: return 0.0
+     with torch.no_grad():
+         ta = _mar_tok(a, return_tensors='pt', truncation=True, padding=True)
+         tb = _mar_tok(b, return_tensors='pt', truncation=True, padding=True)
+         ea = _mar_model(**ta).last_hidden_state[:, 0, :]
+         eb = _mar_model(**tb).last_hidden_state[:, 0, :]
+         sim = util.cos_sim(ea, eb).item()
+     # optionally map from [-1..1] to [0..1]:
+     return (sim + 1) / 2
+
+ def multi_bert_similarity(a: str, b: str):
+     if not a or not b:
+         return {"sbert": 0.0, "marbert": 0.0, "max": 0.0, "avg": 0.0}
+     sbert_sim = float(util.pytorch_cos_sim(sbert_model.encode(a, convert_to_tensor=True),
+                                            sbert_model.encode(b, convert_to_tensor=True)))
+     marbert_sim = marbert_cls_similarity(a, b)
+     vals = [sbert_sim, marbert_sim]
+     return {
+         "sbert": sbert_sim,
+         "marbert": marbert_sim,
+         "max": max(vals),
+         "avg": sum(vals) / len(vals)
+     }
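+
+ # Illustrative call (hypothetical word pair; exact scores depend on the models):
+ # multi_bert_similarity("مدرسه", "مدرسة") returns a dict such as
+ # {"sbert": <raw cosine in [-1, 1]>, "marbert": <mapped to [0, 1]>, "max": ..., "avg": ...}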
+
+ """### Whisper predictions and trust"""
+
+ def clean_ar_token(t: str) -> str:
+     t = t.strip()
+     t = re.sub(r'^[^\w\u0600-\u06FF]+|[^\w\u0600-\u06FF]+$', '', t)
+     t = normalize_ar_orth(t)
+     return t
+
+ def extract_word_conf_table(segments):
+     rows = []
+     for seg in segments:
+         for w in (seg.words or []):
+             rows.append({
+                 "seg_start": float(seg.start),
+                 "seg_end": float(seg.end),
+                 "word_start": float(w.start),
+                 "word_end": float(w.end),
+                 "word": clean_ar_token(w.word),
+                 "prob": float(w.probability),
+             })
+     return pd.DataFrame(rows)
+
+ def build_asr_token_conf(df_words: pd.DataFrame, hyp_tokens: list):
+     """
+     Turn the ASR words (with their probabilities) into per-token probability and
+     duration lists aligned with hyp_tokens.
+     Aggregation policy: if a word splits into several tokens we copy the same
+     prob; if several words merge into one token we take the *lowest* prob
+     (conservative).
+     """
+     toks_probs = []
+     toks_durs = []
+     idx = 0
+     for _, row in df_words.iterrows():
+         w = row["word"]
+         prob = row["prob"]
+         dur = (row["word_end"] - row["word_start"]) * 1000.0  # ms
+         # we assume our tokenizer rarely splits an Arabic word,
+         # but to stay compatible:
+         sub_toks = [w]  # could be replaced with simple_tokenize(w) if desired
+         for _ in sub_toks:
+             toks_probs.append(prob)
+             toks_durs.append(dur)
+             idx += 1
+
+     # match the length of hyp_tokens
+     L = len(hyp_tokens)
+     if len(toks_probs) >= L:
+         toks_probs = toks_probs[:L]
+         toks_durs = toks_durs[:L]
+     else:
+         pad = L - len(toks_probs)
+         toks_probs += [None] * pad
+         toks_durs += [None] * pad
+
+     # dynamic thresholds per recording
+     arr = np.array([p for p in toks_probs if p is not None])
+     if arr.size:
+         low_t = float(np.quantile(arr, 0.15))
+         high_t = float(np.quantile(arr, 0.70))
+     else:
+         low_t, high_t = 0.5, 0.85
+
+     asr_token_conf = {
+         i: {"prob": toks_probs[i], "duration_ms": toks_durs[i]}
+         for i in range(L)
+     }
+     return asr_token_conf, low_t, high_t
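+
+ # Quantile intuition on synthetic probabilities (illustrative only): for
+ # probs = [0.3, 0.6, 0.7, 0.8, 0.9, 0.95], np.quantile(probs, 0.15) = 0.525
+ # becomes low_t and np.quantile(probs, 0.70) = 0.85 becomes high_t, so the
+ # "low"/"high" confidence bands adapt to each recording's own distribution.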
+
+ """### Decision gate"""
+
+ def gate_by_word_conf(base_decision: str, prob: float, sbert_sim: float,
+                       is_short: bool, lev1: bool, duration_ms: float = None,
+                       low_t: float = 0.6, high_t: float = 0.9, sbert_lo=0.60):
+
+     band = "mid"
+     if prob is not None:
+         if prob <= low_t: band = "low"
+         elif prob >= high_t: band = "high"
+
+     very_short = (duration_ms is not None and duration_ms < 120)  # 120 ms, tunable
+
+     if band == "low":
+         if is_short and lev1:
+             return 'ASR error (low p + short+lev1)'
+         if very_short:
+             return 'ASR error (low p + very short)'
+         if sbert_sim >= sbert_lo:
+             return 'ASR error (low p + semantic)'
+         return 'ASR error (low p)'
+
+     if band == "high":
+         # do not blame the ASR lightly: keep the original decision (with a clear
+         # semantic difference it usually leans toward a memorization error)
+         return base_decision
+
+     return base_decision
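+
+ # Gate behaviour on made-up inputs (illustrative only):
+ # gate_by_word_conf('Memorization error', prob=0.2, sbert_sim=0.1,
+ #                   is_short=False, lev1=False, duration_ms=500,
+ #                   low_t=0.5, high_t=0.85) -> 'ASR error (low p)'
+ # gate_by_word_conf('Memorization error', prob=0.95, sbert_sim=0.9,
+ #                   is_short=True, lev1=True, duration_ms=500,
+ #                   low_t=0.5, high_t=0.85) -> 'Memorization error' (high band keeps it)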
+
+ """### Classify pairs (numbers, short/long words, semantic)"""
+
+ def classify_pair(ref_w, hyp_w, bert_scores, phon_sim, lev1, short_word,
+                   bert_thresh=0.75, max_bert=0.85):
+     # 1) number branch
+     ref_num = to_numeric_value(ref_w)
+     hyp_num = to_numeric_value(hyp_w)
+     if ref_num is not None or hyp_num is not None:
+         if (ref_num is not None) and (hyp_num is not None):
+             if ref_num == hyp_num:
+                 return 'ASR error (numbers equal)'
+         # if one side is a number and the other is not, it is usually not an
+         # ASR slip; fall through to the remaining signals
+
+     # 2) short words + Lev=1
+     if short_word and lev1:
+         return 'ASR error (short+lev1)'
+
+     # 3) semantic decision (SBERT as the main signal, MARBERT as a complement)
+     avg_ok = bert_scores["avg"] >= bert_thresh
+     max_ok = bert_scores["max"] > max_bert
+     if ((phon_sim or lev1) and avg_ok) or max_ok:  # parentheses make the precedence explicit
+         return 'ASR error (semantic/phonetic)'
+
+     return 'Memorization error'
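+
+ # Decision sketch on synthetic signals (illustrative only):
+ # classify_pair("ثلاثه", "٣", {"avg": 0.2, "max": 0.3}, phon_sim=False,
+ #               lev1=False, short_word=True) -> 'ASR error (numbers equal)'
+ # classify_pair("كتب", "كتبت", {"avg": 0.5, "max": 0.6}, phon_sim=False,
+ #               lev1=True, short_word=True) -> 'ASR error (short+lev1)'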
+
+ """## Judge function"""
+
+ def classify_alignment_optimized(aligned, ref_tokens, hyp_tokens,
+                                  bert_thresh=0.75, max_bert=0.85,
+                                  asr_token_conf=None, low_high=None):
+     """
+     - aligned: output of align_texts (with ref_idx/hyp_idx)
+     - ref_tokens, hyp_tokens: token lists after normalization
+     - asr_token_conf: dict mapping an ASR token index -> {"prob": .., "duration_ms": ..}
+     - low_high: (low_t, high_t) precomputed dynamic thresholds; if not passed,
+       they are derived from asr_token_conf.
+     """
+     # derive dynamic thresholds when confidences are available
+     if low_high is None:
+         if asr_token_conf:
+             probs = [v["prob"] for v in asr_token_conf.values() if v["prob"] is not None]
+             if probs:
+                 low_t = float(np.quantile(probs, 0.15))
+                 high_t = float(np.quantile(probs, 0.70))
+             else:
+                 low_t, high_t = 0.5, 0.85
+         else:
+             low_t, high_t = 0.5, 0.85
+     else:
+         low_t, high_t = low_high
+
+     results = []
+     for entry in aligned:
+         tag = entry['type']
+         i1, i2 = entry.get('ref_idx', (None, None))
+         j1, j2 = entry.get('hyp_idx', (None, None))
+
+         if tag == 'equal':
+             for ref_w, hyp_w in zip(entry['ref'], entry['hyp']):
+                 results.append({'ASR_word': hyp_w, 'GT_word': ref_w,
+                                 'status': 'Correct', 'reason': ''})
+         elif tag in ['replace', 'delete', 'insert']:
+             max_len = max(len(entry['ref']), len(entry['hyp']))
+             for k in range(max_len):
+                 ref_w = entry['ref'][k] if k < len(entry['ref']) else ''
+                 hyp_w = entry['hyp'][k] if k < len(entry['hyp']) else ''
+
+                 if not ref_w and not hyp_w:
+                     continue
+
+                 # similarities
+                 phon_sim = phonetic_similarity(ref_w, hyp_w) if ref_w and hyp_w else False
+                 lev1 = is_levenshtein_1(ref_w, hyp_w) if ref_w and hyp_w else False
+                 bert_scores = multi_bert_similarity(ref_w, hyp_w) if ref_w and hyp_w else {"sbert": 0, "marbert": 0, "max": 0, "avg": 0}
+                 short_word = bool(ref_w and hyp_w and max(len(ref_w), len(hyp_w)) <= 6)
+
+                 # base decision
+                 if ref_w and hyp_w:
+                     base_status = classify_pair(ref_w, hyp_w, bert_scores, phon_sim, lev1, short_word,
+                                                 bert_thresh, max_bert)
+                 elif hyp_w == '':
+                     base_status = 'Missing (possible omission)'
+                 elif ref_w == '':
+                     base_status = 'Extra (possible ASR insertion)'
+                 else:
+                     base_status = 'Undefined Case'
+
+                 # merge in the ASR word confidence (if available)
+                 word_prob = None
+                 word_dur = None
+                 hyp_abs_idx = None
+                 if (j1 is not None) and (j2 is not None):
+                     hyp_abs_idx = j1 + k
+                     if asr_token_conf and hyp_abs_idx in asr_token_conf:
+                         word_prob = asr_token_conf[hyp_abs_idx].get("prob")
+                         word_dur = asr_token_conf[hyp_abs_idx].get("duration_ms")
+
+                 final_status = base_status
+                 if ref_w and hyp_w:
+                     final_status = gate_by_word_conf(
+                         base_decision=base_status,
+                         prob=word_prob,
+                         sbert_sim=bert_scores["sbert"],
+                         is_short=short_word,
+                         lev1=lev1,
+                         duration_ms=word_dur,
+                         low_t=low_t,
+                         high_t=high_t,
+                         sbert_lo=0.60
+                     )
+
+                 reason = (f'Phonetic={phon_sim}, Lev1={lev1}, '
+                           f'SBERT={bert_scores["sbert"]:.2f}, '
+                           f'MARBERT={bert_scores["marbert"]:.2f}, '
+                           f'MAX={bert_scores["max"]:.2f}, '
+                           f'AVG={bert_scores["avg"]:.2f}, short={short_word}, '
+                           f'prob={None if word_prob is None else round(word_prob, 2)}, '
+                           f'dur_ms={None if word_dur is None else int(word_dur)}, '
+                           f'low_t={round(low_t, 2)}, high_t={round(high_t, 2)}')
+
+                 results.append({'ASR_word': hyp_w, 'GT_word': ref_w,
+                                 'status': final_status, 'reason': reason})
+     return results
+
+ """# Results
+
+ ## First result, without token probabilities from faster-whisper
+ """
+
+ # Pipeline execution:
+ ref_tokens = simple_tokenize(reference_text)
+ hyp_tokens = simple_tokenize(predicted_text)
+ aligned = align_texts(ref_tokens, hyp_tokens)
+ results1 = classify_alignment_optimized(aligned, ref_tokens, hyp_tokens)
+
+ df = pd.DataFrame(results1)
+ pd.set_option('display.max_colwidth', 200)
+ display(df)
+
+ """## Map each token to its word probability from Whisper"""
+
+ # 3) (optional) word-confidence map from faster-whisper
+ df_words = extract_word_conf_table(segments)  # segments from faster_whisper with word_timestamps=True
+ asr_token_conf, low_t, high_t = build_asr_token_conf(df_words, hyp_tokens)
+
+ df_words
+
+ """## Second results, with the token map"""
+
+ results2 = classify_alignment_optimized(aligned, ref_tokens, hyp_tokens,
+                                         bert_thresh=0.75, max_bert=0.85,
+                                         asr_token_conf=asr_token_conf, low_high=(low_t, high_t))
+
+ df = pd.DataFrame(results2)
+ pd.set_option('display.max_colwidth', 200)
+ display(df)
+
+ """## Apply the corrections (missing and wrong words)"""
+
+ corrected_hyp_tokens = hyp_tokens.copy()
+
+ for entry in results2:
+     asr_word = entry['ASR_word']
+     gt_word = entry['GT_word']
+     status = entry['status']
+
+     # Find the index of the asr_word in the original hyp_tokens
+     try:
+         # We need to find the specific occurrence if there are duplicates.
+         # This simple approach assumes unique words or works for the first occurrence.
+         # A more robust approach might track indices during the alignment.
+         if asr_word in corrected_hyp_tokens:
+             idx_in_hyp = corrected_hyp_tokens.index(asr_word)
+             if status.startswith('ASR error'):
+                 corrected_hyp_tokens[idx_in_hyp] = gt_word
+             elif status == 'Memorization error':
+                 # the word the speaker got wrong (i.e. a memorization error)
+                 corrected_hyp_tokens[idx_in_hyp] = f"({asr_word})"  # mark memorization errors
+             # else: word was deleted/inserted, no direct replacement needed in this loop
+     except Exception as e:
+         print(f"Error processing {asr_word}: {e}")
+
+ corrected_text = ' '.join(corrected_hyp_tokens)
+
+ print("\nThe result:")
+ print(corrected_text)
+
+ def highlight_word_diff(ref_word, pred_word):
+     s = SequenceMatcher(None, ref_word, pred_word)
+     ref_colored = ""
+     pred_colored = ""
+
+     for tag, i1, i2, j1, j2 in s.get_opcodes():
+         ref_part = ref_word[i1:i2]
+         pred_part = pred_word[j1:j2]
+         if tag == 'equal':
+             ref_colored += ref_part
+             pred_colored += pred_part
+         elif tag == 'replace':
+             ref_colored += ''.join([f'<span style="background: #ffeaea; color: #b30000; border-radius:2px;">{c}</span>' for c in ref_part])
+             pred_colored += ''.join([f'<span style="background: #ffeaea; color: #b30000; border-radius:2px;">{c}</span>' for c in pred_part])
+         elif tag == 'delete':
+             ref_colored += ''.join([f'<span style="background: #fff3cd; color:#ff9800; border-radius:2px;">{c}</span>' for c in ref_part])
+         elif tag == 'insert':
+             pred_colored += ''.join([f'<span style="background: #e3f1fc; color:#0069c0; border-radius:2px;">{c}</span>' for c in pred_part])
+     return ref_colored, pred_colored
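+
+ # Example of the per-character diff on a hypothetical pair: for "كتاب" vs
+ # "كتب", the shared prefix "كت" stays plain, the "ا" is marked as deleted
+ # (orange) in the reference, and the final "ب" matches again.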
+
+ def diff_colored_chars(ref, pred):
+     ref_words = ref.split()
+     pred_words = pred.split()
+     s = SequenceMatcher(None, ref_words, pred_words)
+     ref_html = ""
+     pred_html = ""
+
+     for tag, i1, i2, j1, j2 in s.get_opcodes():
+         if tag == 'equal':
+             ref_html += ' '.join([w for w in ref_words[i1:i2]]) + " "
+             pred_html += ' '.join([w for w in pred_words[j1:j2]]) + " "
+         elif tag == 'replace':
+             for rw, pw in zip(ref_words[i1:i2], pred_words[j1:j2]):
+                 rw_c, pw_c = highlight_word_diff(rw, pw)
+                 ref_html += f'<span style="padding:2px 4px; margin:1px; border-radius:2px;">{rw_c}</span> '
+                 pred_html += f'<span style="padding:2px 4px; margin:1px; border-radius:2px;">{pw_c}</span> '
+         elif tag == 'insert':
+             pred_html += ' '.join([f'<span style="background: #e3f1fc; color:#0069c0; border-radius:2px;">{w}</span>' for w in pred_words[j1:j2]]) + " "
+         elif tag == 'delete':
+             ref_html += ' '.join([f'<span style="background: #fff3cd; color:#ff9800; border-radius:2px;">{w}</span>' for w in ref_words[i1:i2]]) + " "
+
+     display(HTML(f"""<div style="font-family: 'Cairo', Tahoma, Arial; font-size:20px; direction:rtl;">
+     <b>النص المرجعي:</b><br>{ref_html}<br><br>
+     <b>النص الناتج:</b><br>{pred_html}
+     </div>"""))
+
+ reference_text = normalize_arabic(reference_text)
+ diff_colored_chars(reference_text, corrected_text)