Upload app.py
app.py
ADDED
@@ -0,0 +1,569 @@
# -*- coding: utf-8 -*-
"""faster_whisper_large_v3_post_process_with_advanced.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1-rZQ9JZaDiAZfrH4yoan2Cwv6pdhoOF1

# import lib and models
"""

# Commented out IPython magic to ensure Python compatibility.
# %%capture
# !pip install transformers datasets soundfile torch
# !pip install pyxDamerauLevenshtein
# !pip install evaluate torchaudio soundfile
# !pip install jiwer
# !pip install textdistance
# !pip install editdistance
# !pip install -q bert-score sentence-transformers
# !pip install faster-whisper

import torch
import soundfile as sf
from datasets import load_dataset
import evaluate
import torchaudio
import editdistance
from difflib import SequenceMatcher
from IPython.display import display, HTML
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from collections import Counter
from bert_score import score
import textdistance
from sentence_transformers import SentenceTransformer, util
from faster_whisper import WhisperModel
sbert_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

"""# Whisper"""

dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = WhisperModel("large-v3", device=str(dev), compute_type="int8")

segments, info = model.transcribe(
    "Test1.m4a",
    word_timestamps=True,  # ← essential: per-word timestamps (and probabilities) used below
    vad_filter=True,  # optional: filters out silence/noise
    vad_parameters={"min_silence_duration_ms": 200},
)
segments = list(segments)

def clean_ar_token(t: str) -> str:
    t = t.strip()
    # strip non-letter characters from both ends
    t = re.sub(r'^[^\w\u0600-\u06FF]+|[^\w\u0600-\u06FF]+$', '', t)
    return t

# collect the words in order
words = []
for seg in segments:
    if seg.words:
        for w in seg.words:
            tok = clean_ar_token(w.word)
            if tok:
                words.append(tok)

transcript = " ".join(words)

# final cleanup
transcript = re.sub(r"\s+", " ", transcript).strip()
transcript = re.sub(r"\s+([،,\.!?؟])", r"\1", transcript)

print(transcript)

reference_text = "التصلب اللويحي المتعدد: يظهر المرض بين سن (30 - 40) وهو تنكس عصبي، سببه: فقدان خلايا الدبق قليلة الاستطالات، وتفككها إلى صفائح متصلّبة نتيجة مرض مناعي ذاتي كما في الشكل المجاور، تنتج الأعراض من زوال غمد النخاعين في مناطق متعددة من المادة البيضاء للجهاز العصبي المركزي. فيحسّ المريض بصدمة كهربائية عند تحريك العنق."  # Replace with your reference text
predicted_text = transcript
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

wer_score = wer_metric.compute(predictions=[predicted_text], references=[reference_text])
cer_score = cer_metric.compute(predictions=[predicted_text], references=[reference_text])

edit_distance = editdistance.eval(predicted_text, reference_text)

print(" WER - word error rate: {:.2%}".format(wer_score))
print(" CER - character error rate: {:.2%}".format(cer_score))
print(f" Edit Distance - number of edits required: {edit_distance}")
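
# Note (illustrative): these scores are computed on the raw strings, so WER/CER
# also penalize punctuation and hamza-variant mismatches that the post-processing
# below deliberately normalizes away before alignment.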

"""# post Process

## text analysis

### normalize and tokenization
"""

def normalize_arabic(text):
    # remove diacritics and punctuation, and unify some letter forms
    text = re.sub(r"[ًٌٍَُِّْـ]", "", text)
    text = re.sub(r"[“”\"',:؛؟.!()\[\]{}،\-–—_]", " ", text)
    text = re.sub(r"[إأٱآا]", "ا", text)
    text = text.replace("ة", "ه").replace("ى", "ي")
    text = re.sub(r"\s+", " ", text).strip()
    return text

def simple_tokenize(text):
    return nltk.word_tokenize(normalize_arabic(text))
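
# Illustrative check (editorial example, not from the original notebook):
#   normalize_arabic("التصلّب اللويحيّ،")  ->  "التصلب اللويحي"
#   simple_tokenize("سن الثلاثين")  ->  ["سن", "الثلاثين"]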

def align_texts(ref_tokens, hyp_tokens):
    import difflib
    sm = difflib.SequenceMatcher(None, ref_tokens, hyp_tokens)
    aligned = []
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        aligned.append({'type': tag, 'ref': ref_tokens[i1:i2], 'hyp': hyp_tokens[j1:j2], 'ref_idx': (i1, i2), 'hyp_idx': (j1, j2)})
    return aligned
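
# Example (illustrative): align_texts(["ا", "ب", "ج"], ["ا", "س", "ج"]) yields an
# 'equal' entry for ["ا"], a 'replace' pairing ["ب"] with ["س"], then 'equal' for
# ["ج"]; the ref_idx/hyp_idx spans are what the confidence lookup uses later.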

"""### soundex and Levenshtein similarity"""

def arabic_soundex(word):
    word = normalize_arabic(word)
    replacements = {
        'بف': 'b', 'جشص': 'j', 'دض': 'd', 'طت': 't', 'قغ': 'q', 'كخ': 'k', 'سصز': 's',
        'ثذظ': 'z', 'ح': 'h', 'ع': 'a', 'م': 'm', 'ن': 'n', 'ل': 'l', 'ر': 'r',
        'ه': 'h', 'و': 'w', 'ي': 'y'
    }
    result = ""
    for c in word:
        for group, rep in replacements.items():
            if c in group:
                result += rep
                break
    return result

def phonetic_similarity(w1, w2):
    return arabic_soundex(w1) == arabic_soundex(w2)

def is_levenshtein_1(w1, w2):
    return textdistance.levenshtein(w1, w2) == 1
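
# Illustrative example: arabic_soundex drops vowels and unmapped letters, so
# arabic_soundex("قال") == arabic_soundex("غال") == "ql" and the pair counts as
# phonetically similar; is_levenshtein_1("سن", "سنه") is True (one insertion).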

"""### number study"""

AR_DIGITS = str.maketrans("٠١٢٣٤٥٦٧٨٩", "0123456789")

# convert Arabic-Indic digits, e.g. "١٢٣" -> "123"
def normalize_digits(s: str) -> str:
    return s.translate(AR_DIGITS)

# simplified dictionaries for units, tens and hundreds (extend them gradually to match your texts)
UNITS = {"صفر":0,"واحد":1,"واحدة":1,"اثنان":2,"اثنين":2,"اثنتان":2,"اثنتين":2,
         "ثلاث":3,"ثلاثة":3,"أربع":4,"اربعة":4,"أربعة":4,"خمس":5,"خمسة":5,
         "ست":6,"ستة":6,"سبع":7,"سبعة":7,"ثمان":8,"ثماني":8,"ثمانية":8,
         "تسع":9,"تسعة":9}
TENS = {"عشر":10,"عشرة":10,"عشرون":20,"عشرين":20,"ثلاثون":30,"ثلاثين":30, "الثلاثين":30,
        "أربعون":40,"اربعون":40,
        "الأربعين":40,
        "خمسون":50,"ستون":60,"سبعون":70,"ثمانون":80,"تسعون":90}
HUND = {"مئة":100,"مائه":100,"مائة":100,"مئه":100,"مئ":100}
SCALE = {"ألف":1000,"الف":1000,"ألاف":1000,"آلاف":1000,"مليون":10**6,"مليار":10**9}

def normalize_ar_orth(text: str) -> str:
    text = re.sub(r"[ًٌٍَُِّْـ]", "", text)
    text = re.sub(r"[“”\",:؛؟.!()\[\]{}،\-–—_]", " ", text)
    # hamza unification and light normalization
    text = re.sub("[إأٱآا]", "ا", text)
    text = text.replace("ة","ه").replace("ى","ي")
    text = re.sub(r"\s+", " ", text).strip()
    return text
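
# Normalize the dictionary keys with the same orthography used by
# to_numeric_value (e.g. "ثلاثة" -> "ثلاثه", "أربعون" -> "اربعون"); without this,
# those spellings can never match a normalized token.
UNITS = {normalize_ar_orth(k): v for k, v in UNITS.items()}
TENS = {normalize_ar_orth(k): v for k, v in TENS.items()}
HUND = {normalize_ar_orth(k): v for k, v in HUND.items()}
SCALE = {normalize_ar_orth(k): v for k, v in SCALE.items()}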

def words_to_number(tokens):
    """Simplified converter: supports compounds like 'مئه و ثلاثه و عشرون'; extend it to match your texts."""
    total = 0; current = 0
    for w in tokens:
        if w in UNITS: current += UNITS[w]
        elif w in TENS: current += TENS[w]
        elif w in HUND: current += HUND[w]
        elif w in SCALE:
            current = max(1, current) * SCALE[w]
            total += current; current = 0
        elif w == "و":
            continue
        else:
            # non-numeric word; flush the current accumulation
            total += current; current = 0
    total += current
    return total if total != 0 else None

def to_numeric_value(token: str):
    """Try to convert a token to a numeric value (digits or number words)."""
    t = normalize_ar_orth(token)
    d = normalize_digits(t)
    if re.fullmatch(r"\d+", d):  # plain digits
        return int(d)
    # otherwise parse number words
    toks = t.split()
    val = words_to_number(toks)
    return val

def is_number_token(w):
    return to_numeric_value(w) is not None
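
# Worked example (illustrative): "مئه و ثلاثه و عشرون" -> 100 + 3 + 20 = 123;
# to_numeric_value("٣٠") -> 30 and to_numeric_value("الثلاثين") -> 30, so a
# spelled-out number and its digit form compare as equal in classify_pair below.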

"""### SBERT and MARBERT-CLS"""

from transformers import AutoTokenizer, AutoModel
import torch
from sentence_transformers import util

_mar_name = "UBC-NLP/MARBERT"
_mar_tok = AutoTokenizer.from_pretrained(_mar_name)
_mar_model = AutoModel.from_pretrained(_mar_name)

def marbert_cls_similarity(a: str, b: str) -> float:
    if not a or not b: return 0.0
    with torch.no_grad():
        ta = _mar_tok(a, return_tensors='pt', truncation=True, padding=True)
        tb = _mar_tok(b, return_tensors='pt', truncation=True, padding=True)
        ea = _mar_model(**ta).last_hidden_state[:,0,:]
        eb = _mar_model(**tb).last_hidden_state[:,0,:]
        sim = util.cos_sim(ea, eb).item()
    # optional rescaling from [-1..1] to [0..1]:
    return (sim + 1) / 2

def multi_bert_similarity(a: str, b: str):
    if not a or not b:
        return {"sbert":0.0, "marbert":0.0, "max":0.0, "avg":0.0}
    sbert_sim = float(util.pytorch_cos_sim(sbert_model.encode(a, convert_to_tensor=True),
                                           sbert_model.encode(b, convert_to_tensor=True)))
    marbert_sim = marbert_cls_similarity(a, b)
    vals = [sbert_sim, marbert_sim]
    return {
        "sbert": sbert_sim,
        "marbert": marbert_sim,
        "max": max(vals),
        "avg": sum(vals)/len(vals)
    }
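
# Usage sketch (illustrative): multi_bert_similarity("اللويحي", "اللوحي") returns
# the two cosine similarities plus their max/avg. Note the scales differ slightly:
# SBERT is a raw cosine while the MARBERT score is rescaled to [0..1].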

"""### Whisper predictions and trust"""

# redefined here to also apply orthographic normalization
def clean_ar_token(t: str) -> str:
    t = t.strip()
    t = re.sub(r'^[^\w\u0600-\u06FF]+|[^\w\u0600-\u06FF]+$', '', t)
    t = normalize_ar_orth(t)
    return t

def extract_word_conf_table(segments):
    rows = []
    for seg in segments:
        for w in (seg.words or []):
            rows.append({
                "seg_start": float(seg.start),
                "seg_end": float(seg.end),
                "word_start": float(w.start),
                "word_end": float(w.end),
                "word": clean_ar_token(w.word),
                "prob": float(w.probability),
            })
    return pd.DataFrame(rows)

def build_asr_token_conf(df_words: pd.DataFrame, hyp_tokens: list):
    """
    Maps ASR words (with their probabilities) onto per-token probability/duration lists aligned with hyp_tokens.
    Aggregation policy: if a word splits into several tokens, the same prob is copied; if several words merge into one token, the lowest prob is taken (conservative).
    """
    toks_probs = []
    toks_durs = []
    for _, row in df_words.iterrows():
        w = row["word"]
        prob = row["prob"]
        dur = (row["word_end"] - row["word_start"]) * 1000.0  # ms
        # we assume our tokenizer rarely splits an Arabic word,
        # but keep the sub-token loop for compatibility:
        sub_toks = [w]  # could be replaced with simple_tokenize(w) if needed
        for _ in sub_toks:
            toks_probs.append(prob)
            toks_durs.append(dur)

    # match the length of hyp_tokens
    L = len(hyp_tokens)
    if len(toks_probs) >= L:
        toks_probs = toks_probs[:L]
        toks_durs = toks_durs[:L]
    else:
        pad = L - len(toks_probs)
        toks_probs += [None]*pad
        toks_durs += [None]*pad

    # dynamic thresholds per recording
    arr = np.array([p for p in toks_probs if p is not None])
    if arr.size:
        low_t = float(np.quantile(arr, 0.15))
        high_t = float(np.quantile(arr, 0.70))
    else:
        low_t, high_t = 0.5, 0.85

    asr_token_conf = {
        i: {"prob": toks_probs[i], "duration_ms": toks_durs[i]}
        for i in range(L)
    }
    return asr_token_conf, low_t, high_t
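
# Note (illustrative): with the quantile thresholds above, roughly the lowest 15%
# of word probabilities in a recording fall in the "low" band and the top 30% in
# the "high" band, so the gate below adapts to each recording's overall confidence.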

"""### decision gate"""

def gate_by_word_conf(base_decision: str, prob: float, sbert_sim: float,
                      is_short: bool, lev1: bool, duration_ms: float = None,
                      low_t: float = 0.6, high_t: float = 0.9, sbert_lo=0.60):

    band = "mid"
    if prob is not None:
        if prob <= low_t: band = "low"
        elif prob >= high_t: band = "high"

    very_short = (duration_ms is not None and duration_ms < 120)  # 120 ms, tunable

    if band == "low":
        if is_short and lev1:
            return 'ASR error (low p + short+lev1)'
        if very_short:
            return 'ASR error (low p + very short)'
        if sbert_sim >= sbert_lo:
            return 'ASR error (low p + semantic)'
        return 'ASR error (low p)'

    if band == "high":
        # do not blame the ASR too readily: keep the base decision (which usually
        # leans toward a memorization error when the semantic gap is clear)
        return base_decision

    return base_decision
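
# Worked example (illustrative): a base decision of 'Memorization error' with
# prob=0.30 <= low_t on a short word with lev1=True is overturned to
# 'ASR error (low p + short+lev1)'; in the mid and high bands the base decision
# is kept unchanged.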

"""### classify pairs (numbers, short/long words, semantic)"""

def classify_pair(ref_w, hyp_w, bert_scores, phon_sim, lev1, short_word,
                  bert_thresh=0.75, max_bert=0.85):
    # 1) number branch
    ref_num = to_numeric_value(ref_w)
    hyp_num = to_numeric_value(hyp_w)
    if ref_num is not None or hyp_num is not None:
        if (ref_num is not None) and (hyp_num is not None):
            if ref_num == hyp_num:
                return 'ASR error (numbers equal)'
        # if only one side is a number, it is usually not an ASR error; fall through to the other signals

    # 2) short words + Levenshtein distance 1
    if short_word and lev1:
        return 'ASR error (short+lev1)'

    # 3) semantic decision (SBERT primary, MARBERT complementary)
    avg_ok = bert_scores["avg"] >= bert_thresh
    max_ok = bert_scores["max"] > max_bert
    if ((phon_sim or lev1) and avg_ok) or max_ok:
        return 'ASR error (semantic/phonetic)'

    return 'Memorization error'
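
# Worked example (illustrative): ref "ثلاثين" vs hyp "٣٠" both parse to 30, so
# the pair is tagged 'ASR error (numbers equal)' before any phonetic or semantic
# check runs.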

"""## Judge function"""

def classify_alignment_optimized(aligned, ref_tokens, hyp_tokens,
                                 bert_thresh=0.75, max_bert=0.85,
                                 asr_token_conf=None, low_high=None):
    """
    - aligned: output of align_texts (with ref_idx/hyp_idx)
    - ref_tokens, hyp_tokens: token lists after normalization
    - asr_token_conf: dict mapping an ASR token index -> {"prob":.., "duration_ms":..}
    - low_high: (low_t, high_t) precomputed dynamic thresholds; if not passed, they are derived from asr_token_conf.
    """
    # derive dynamic thresholds when available
    if low_high is None:
        if asr_token_conf:
            probs = [v["prob"] for v in asr_token_conf.values() if v["prob"] is not None]
            if probs:
                low_t = float(np.quantile(probs, 0.15))
                high_t = float(np.quantile(probs, 0.70))
            else:
                low_t, high_t = 0.5, 0.85
        else:
            low_t, high_t = 0.5, 0.85
    else:
        low_t, high_t = low_high

    results = []
    for entry in aligned:
        tag = entry['type']
        i1, i2 = entry.get('ref_idx', (None,None))
        j1, j2 = entry.get('hyp_idx', (None,None))

        if tag == 'equal':
            for ref_w, hyp_w in zip(entry['ref'], entry['hyp']):
                results.append({'ASR_word': hyp_w, 'GT_word': ref_w,
                                'status': 'Correct', 'reason': ''})
        elif tag in ['replace', 'delete', 'insert']:
            max_len = max(len(entry['ref']), len(entry['hyp']))
            for k in range(max_len):
                ref_w = entry['ref'][k] if k < len(entry['ref']) else ''
                hyp_w = entry['hyp'][k] if k < len(entry['hyp']) else ''

                if not ref_w and not hyp_w:
                    continue

                # similarity signals
                phon_sim = phonetic_similarity(ref_w, hyp_w) if ref_w and hyp_w else False
                lev1 = is_levenshtein_1(ref_w, hyp_w) if ref_w and hyp_w else False
                bert_scores = multi_bert_similarity(ref_w, hyp_w) if ref_w and hyp_w else {"sbert":0,"marbert":0,"max":0,"avg":0}
                short_word = bool(ref_w and hyp_w and max(len(ref_w), len(hyp_w)) <= 6)

                # base decision
                if ref_w and hyp_w:
                    base_status = classify_pair(ref_w, hyp_w, bert_scores, phon_sim, lev1, short_word,
                                                bert_thresh, max_bert)
                elif hyp_w == '':
                    base_status = 'Missing (possible omission)'
                elif ref_w == '':
                    base_status = 'Extra (possible ASR insertion)'
                else:
                    base_status = 'Undefined Case'

                # merge in the ASR word confidence (if available)
                word_prob = None
                word_dur = None
                hyp_abs_idx = None
                if (j1 is not None) and (j2 is not None):
                    hyp_abs_idx = j1 + k
                    if asr_token_conf and hyp_abs_idx in asr_token_conf:
                        word_prob = asr_token_conf[hyp_abs_idx].get("prob")
                        word_dur = asr_token_conf[hyp_abs_idx].get("duration_ms")

                final_status = base_status
                if ref_w and hyp_w:
                    final_status = gate_by_word_conf(
                        base_decision=base_status,
                        prob=word_prob,
                        sbert_sim=bert_scores["sbert"],
                        is_short=short_word,
                        lev1=lev1,
                        duration_ms=word_dur,
                        low_t=low_t,
                        high_t=high_t,
                        sbert_lo=0.60
                    )

                reason = (f'Phonetic={phon_sim}, Lev1={lev1}, '
                          f'SBERT={bert_scores["sbert"]:.2f}, '
                          f'MARBERT={bert_scores["marbert"]:.2f}, '
                          f'MAX={bert_scores["max"]:.2f}, '
                          f'AVG={bert_scores["avg"]:.2f}, short={short_word}, '
                          f'prob={None if word_prob is None else round(word_prob,2)}, '
                          f'dur_ms={None if word_dur is None else int(word_dur)}, '
                          f'low_t={round(low_t,2)}, high_t={round(high_t,2)}')

                results.append({'ASR_word': hyp_w, 'GT_word': ref_w,
                                'status': final_status, 'reason': reason})
    return results

"""# results

## first result without token probability from faster whisper
"""

# Pipeline execution:
ref_tokens = simple_tokenize(reference_text)
hyp_tokens = simple_tokenize(predicted_text)
aligned = align_texts(ref_tokens, hyp_tokens)
results1 = classify_alignment_optimized(aligned, ref_tokens, hyp_tokens)

df = pd.DataFrame(results1)
pd.set_option('display.max_colwidth', 200)
display(df)

"""## map each hyp token to its word probability from whisper"""

# 3) (optional) word-confidence map from faster-whisper
df_words = extract_word_conf_table(segments)  # segments from faster_whisper with word_timestamps=True
asr_token_conf, low_t, high_t = build_asr_token_conf(df_words, hyp_tokens)

df_words

"""## second results with the token-probability map"""

results2 = classify_alignment_optimized(aligned, ref_tokens, hyp_tokens,
                                        bert_thresh=0.75, max_bert=0.85,
                                        asr_token_conf=asr_token_conf, low_high=(low_t, high_t))

df = pd.DataFrame(results2)
pd.set_option('display.max_colwidth', 200)
display(df)

"""## apply corrections for missing and wrong words"""

corrected_hyp_tokens = hyp_tokens.copy()

for entry in results2:
    asr_word = entry['ASR_word']
    gt_word = entry['GT_word']
    status = entry['status']

    # Find the index of the asr_word in the original hyp_tokens
    try:
        # We need to find the specific occurrence if there are duplicates.
        # This simple approach assumes unique words or works for the first occurrence.
        # A more robust approach might track indices during the alignment.
        if asr_word in corrected_hyp_tokens:
            idx_in_hyp = corrected_hyp_tokens.index(asr_word)
            if status.startswith('ASR error'):
                corrected_hyp_tokens[idx_in_hyp] = gt_word
            elif status == 'Memorization error':
                # the word the speaker actually got wrong (a memorization error)
                corrected_hyp_tokens[idx_in_hyp] = f"({asr_word})"  # Mark memorization errors
            # else: word was deleted/inserted, no direct replacement needed in this loop
    except Exception as e:
        print(f" Error processing {asr_word}: {e}")

corrected_text = ' '.join(corrected_hyp_tokens)

print("\nthe result:")
print(corrected_text)

def highlight_word_diff(ref_word, pred_word):
    s = SequenceMatcher(None, ref_word, pred_word)
    ref_colored = ""
    pred_colored = ""

    for tag, i1, i2, j1, j2 in s.get_opcodes():
        ref_part = ref_word[i1:i2]
        pred_part = pred_word[j1:j2]
        if tag == 'equal':
            ref_colored += ref_part
            pred_colored += pred_part
        elif tag == 'replace':
            ref_colored += ''.join([f'<span style="background: #ffeaea; color: #b30000; border-radius:2px;">{c}</span>' for c in ref_part])
            pred_colored += ''.join([f'<span style="background: #ffeaea; color: #b30000; border-radius:2px;">{c}</span>' for c in pred_part])
        elif tag == 'delete':
            ref_colored += ''.join([f'<span style="background: #fff3cd; color:#ff9800; border-radius:2px;">{c}</span>' for c in ref_part])
        elif tag == 'insert':
            pred_colored += ''.join([f'<span style="background: #e3f1fc; color:#0069c0; border-radius:2px;">{c}</span>' for c in pred_part])
    return ref_colored, pred_colored

def diff_colored_chars(ref, pred):
    ref_words = ref.split()
    pred_words = pred.split()
    s = SequenceMatcher(None, ref_words, pred_words)
    ref_html = ""
    pred_html = ""

    for tag, i1, i2, j1, j2 in s.get_opcodes():
        if tag == 'equal':
            ref_html += ' '.join([w for w in ref_words[i1:i2]]) + " "
            pred_html += ' '.join([w for w in pred_words[j1:j2]]) + " "
        elif tag == 'replace':
            for rw, pw in zip(ref_words[i1:i2], pred_words[j1:j2]):
                rw_c, pw_c = highlight_word_diff(rw, pw)
                ref_html += f'<span style="padding:2px 4px; margin:1px; border-radius:2px;">{rw_c}</span> '
                pred_html += f'<span style="padding:2px 4px; margin:1px; border-radius:2px;">{pw_c}</span> '
        elif tag == 'insert':
            pred_html += ' '.join([f'<span style="background: #e3f1fc; color:#0069c0; border-radius:2px;">{w}</span>' for w in pred_words[j1:j2]]) + " "
        elif tag == 'delete':
            ref_html += ' '.join([f'<span style="background: #fff3cd; color:#ff9800; border-radius:2px;">{w}</span>' for w in ref_words[i1:i2]]) + " "

    display(HTML(f"""<div style="font-family: 'Cairo', Tahoma, Arial; font-size:20px; direction:rtl;">
    <b>Reference text:</b><br>{ref_html}<br><br>
    <b>Output text:</b><br>{pred_html}
    </div>"""))

reference_text = normalize_arabic(reference_text)
diff_colored_chars(reference_text, corrected_text)