Update app.py
app.py
CHANGED
@@ -12,15 +12,28 @@ from transformers import AutoTokenizer, AutoModel
 import soundfile as sf
 
 # =========================
-#
+# Global config (forced per your request)
 # =========================
-
-
+# Pin the requested settings on CPU
+FORCE_WHISPER_NAME = "large-v3"
+FORCE_COMPUTE_TYPE = "int8"
+FORCE_USE_MARBERT = True
+
+# Fixed transcription options to reduce divergence from the notebook
+ASR_OPTS = dict(
+    word_timestamps=True,
+    vad_filter=True,
+    vad_parameters={"min_silence_duration_ms": 200},
+    beam_size=5,
+    best_of=5,
+    temperature=0.0,  # make decoding as deterministic as possible
+)
 
-#
-
-
-
+# =========================
+# Device
+# =========================
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"[INIT] DEVICE={DEVICE}", flush=True)
 
 # =========================
 # Lazy models
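For context, these pinned options unpack straight into faster-whisper's transcribe(). A minimal sketch of the intended call pattern (the audio file name is a placeholder):

from faster_whisper import WhisperModel

# Mirrors the forced config above: int8 keeps large-v3 usable on CPU.
model = WhisperModel("large-v3", device="cpu", compute_type="int8")

asr_opts = dict(word_timestamps=True, vad_filter=True,
                vad_parameters={"min_silence_duration_ms": 200},
                beam_size=5, best_of=5, temperature=0.0)

# temperature=0.0 disables the sampling fallback, so repeated runs of the
# same file produce (near-)identical transcripts.
segments, info = model.transcribe("sample.wav", **asr_opts)
for seg in segments:
    for w in (seg.words or []):  # populated because word_timestamps=True
        print(w.word, w.start, w.end, w.probability)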
@@ -33,28 +46,28 @@ _WHISPER = None
 def load_models(
     sbert_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
     marbert_name="UBC-NLP/MARBERT",
-    whisper_name=
-    whisper_compute=
+    whisper_name=FORCE_WHISPER_NAME,
+    whisper_compute=FORCE_COMPUTE_TYPE,
+    use_marbert=FORCE_USE_MARBERT
 ):
-    """Load models
+    """Load models once; forced config respected even on CPU."""
     global _SBERT, _MARBERT_TOK, _MARBERT, _WHISPER
 
-    # CPU safeguard: force lighter models
-    if CPU_MODE:
-        whisper_name = DEFAULT_WHISPER_CPU
-        whisper_compute = DEFAULT_COMPUTE_CPU
-
     if _SBERT is None:
-        _SBERT = SentenceTransformer(sbert_name, device=DEVICE)
+        _SBERT = SentenceTransformer(sbert_name, device=("cuda" if DEVICE=="cuda" else "cpu"))
+        print(f"[LOAD] SBERT: {sbert_name}", flush=True)
 
-    #
-    if _MARBERT is None and
+    # Enabled on CPU, as you requested
+    if _MARBERT is None and use_marbert:
         _MARBERT_TOK = AutoTokenizer.from_pretrained(marbert_name)
-        _MARBERT = AutoModel.from_pretrained(marbert_name).to(DEVICE)
+        _MARBERT = AutoModel.from_pretrained(marbert_name).to(("cuda" if DEVICE=="cuda" else "cpu"))
         _MARBERT.eval()
+        print(f"[LOAD] MARBERT: {marbert_name} (device={DEVICE})", flush=True)
 
     if _WHISPER is None:
-        _WHISPER = WhisperModel(whisper_name, device=DEVICE
+        _WHISPER = WhisperModel(whisper_name, device=("cuda" if DEVICE=="cuda" else "cpu"),
+                                compute_type=whisper_compute)
+        print(f"[LOAD] Whisper: {whisper_name} (compute={whisper_compute})", flush=True)
 
 # =========================
 # Normalization / Tokenization / Alignment
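The None-guards above implement a module-level lazy singleton: each model is constructed on first use and reused on every later call. The pattern in miniature:

from typing import Optional

_MODEL: Optional[object] = None  # module-level cache, like _SBERT/_MARBERT/_WHISPER

def get_model():
    """Construct on first call, return the cached instance afterwards."""
    global _MODEL
    if _MODEL is None:
        _MODEL = object()  # stand-in for an expensive model constructor
    return _MODEL

assert get_model() is get_model()  # the second call is a no-op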
@@ -68,7 +81,6 @@ def normalize_ar_orth(text: str) -> str:
     return text
 
 def simple_tokenize(text: str):
-    """Tries punkt; falls back to simple whitespace splitting on failure."""
     t = normalize_ar_orth(text)
     try:
         import nltk
@@ -108,7 +120,8 @@ def arabic_soundex(word):
     for ch in w:
         for rep, chars in groups.items():
             if ch in chars:
-                code.append(rep)
+                code.append(rep)
+                break
     return "".join(code)
 
 def phonetic_similarity(w1, w2):
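The added break matters whenever a character could match more than one phonetic group: without it, the inner loop keeps scanning and can append several codes for a single character. A toy illustration (the two-group mapping is made up, not the real table):

groups = {"1": "bp", "2": "bf"}  # hypothetical overlapping groups

def encode(word):
    code = []
    for ch in word:
        for rep, chars in groups.items():
            if ch in chars:
                code.append(rep)
                break  # stop at the first matching group
    return "".join(code)

print(encode("b"))  # "1" with the break; "12" without it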
@@ -168,8 +181,8 @@ def marbert_cls_similarity(a: str, b: str) -> float:
     if _MARBERT is None:
         return 0.0
     with torch.no_grad():
-        ta = _MARBERT_TOK(a, return_tensors='pt', truncation=True, padding=True).to(DEVICE)
-        tb = _MARBERT_TOK(b, return_tensors='pt', truncation=True, padding=True).to(DEVICE)
+        ta = _MARBERT_TOK(a, return_tensors='pt', truncation=True, padding=True).to(("cuda" if DEVICE=="cuda" else "cpu"))
+        tb = _MARBERT_TOK(b, return_tensors='pt', truncation=True, padding=True).to(("cuda" if DEVICE=="cuda" else "cpu"))
         ea = _MARBERT(**ta).last_hidden_state[:,0,:]
         eb = _MARBERT(**tb).last_hidden_state[:,0,:]
         sim = util.cos_sim(ea, eb).item()
@@ -390,7 +403,7 @@ def literal_similarity(original, recited):
     return {"levenshtein": round(lev,3), "word_overlap": round(word_overlap,3),
             "bleu1": round(bleu1,3), "literal_score": round(final_score,3)}
 
-def semantic_similarity(original, recited, use_marbert=True):
+def semantic_similarity(original, recited, use_marbert=FORCE_USE_MARBERT):
     sbert_sim = float(util.pytorch_cos_sim(_SBERT.encode(original, convert_to_tensor=True),
                                            _SBERT.encode(recited, convert_to_tensor=True)))
     marbert_sim = marbert_cls_similarity(original, recited) if use_marbert else 0.0
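Since the final score (first line of the next hunk) is the max of the two similarities, enabling MARBERT can only raise the semantic score, never lower it. A worked micro-example with made-up values:

sbert_sim, marbert_sim = 0.82, 0.74   # hypothetical similarities
semantic_score = round(max(sbert_sim, marbert_sim), 3)
print(semantic_score)                 # 0.82 - the stronger signal wins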
@@ -398,10 +411,9 @@ def semantic_similarity(original, recited, use_marbert=True):
             "semantic_score": round(max(sbert_sim, marbert_sim),3)}
 
 # =========================
-# Audio
+# Audio helper
 # =========================
 def ensure_audio_path(audio):
-    """Accepts filepath (str) OR (numpy_array, sr). Returns a valid filepath."""
     if isinstance(audio, str):
         if not os.path.exists(audio):
             raise FileNotFoundError(f"Audio path not found: {audio}")
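The removed docstring said ensure_audio_path also accepts a (numpy_array, sr) tuple; that branch lies outside this hunk. A hedged sketch of what it plausibly does, following the docstring's argument order (the helper name is hypothetical):

import tempfile

import numpy as np
import soundfile as sf

def _tuple_to_path(audio):
    data, sr = audio  # (numpy_array, sr), per the removed docstring
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    sf.write(tmp.name, np.asarray(data), int(sr))  # write a temp wav, return its path
    return tmp.name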
@@ -415,7 +427,7 @@ def ensure_audio_path(audio):
     raise ValueError("Unsupported audio input format")
 
 # =========================
-# Pipeline (
+# Pipeline (robust errors + logs)
 # =========================
 def transcribe_and_evaluate(audio, original_text, whisper_size=None,
                             compute_type=None, vad=True, use_marbert=True):
@@ -423,29 +435,29 @@ def transcribe_and_evaluate(audio, original_text, whisper_size=None,
         if not original_text or not original_text.strip():
             raise ValueError("Original text is empty.")
 
-        #
-
-
-
-
-
-        whisper_size = whisper_size or "large-v3"
-        compute_type = compute_type or "float16"
+        # Ignore the UI choices and force your settings
+        whisper_size = FORCE_WHISPER_NAME
+        compute_type = FORCE_COMPUTE_TYPE
+        use_marbert = FORCE_USE_MARBERT
+
+        print(f"[RUN] whisper={whisper_size}, compute={compute_type}, marbert={use_marbert}", flush=True)
 
-        load_models(whisper_name=whisper_size, whisper_compute=compute_type)
+        load_models(whisper_name=whisper_size, whisper_compute=compute_type, use_marbert=use_marbert)
 
         audio_path = ensure_audio_path(audio)
-
-
-
-        )
+        print(f"[AUDIO] path={audio_path}", flush=True)
+
+        segments, info = _WHISPER.transcribe(audio_path, **ASR_OPTS)
         segments = list(segments)
+        print(f"[ASR] segments={len(segments)}", flush=True)
 
+        # Build ASR text from words (more control)
         words = []
         for seg in segments:
             for w in (seg.words or []):
                 tok = clean_ar_token(w.word)
-                if tok:
+                if tok:
+                    words.append(tok)
         asr_text = " ".join(words)
 
         ref_tokens = simple_tokenize(original_text)
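One subtlety in this hunk: faster-whisper's transcribe() returns a lazy generator, and the pipeline iterates the segments twice (word collection here, the confidence table in the next hunk), hence the list(segments) snapshot. The failure mode it avoids:

def fake_segments():
    yield from ("seg1", "seg2")

segs = fake_segments()
print(list(segs))   # ['seg1', 'seg2']
print(list(segs))   # [] - exhausted; a second pass over the raw generator sees nothing

segs = list(fake_segments())  # materialize once...
print(segs, segs)             # ...then reuse as often as needed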
@@ -454,6 +466,7 @@ def transcribe_and_evaluate(audio, original_text, whisper_size=None,
 
         df_words = extract_word_conf_table(segments)
         asr_token_conf, low_t, high_t = build_asr_token_conf(df_words, hyp_tokens)
+        print(f"[CONF] low_t={low_t:.3f}, high_t={high_t:.3f}", flush=True)
 
         results, corrected_text = classify_alignment_optimized(
             aligned, ref_tokens, hyp_tokens,
@@ -462,33 +475,31 @@ def transcribe_and_evaluate(audio, original_text, whisper_size=None,
         )
 
         lit = literal_similarity(original_text, corrected_text)
-        sem = semantic_similarity(original_text, corrected_text, use_marbert=
+        sem = semantic_similarity(original_text, corrected_text, use_marbert=use_marbert)
 
         df = pd.DataFrame(results)
 
         report = {
-            "whisper_model": whisper_size,
-            "compute_type": compute_type,
+            "requested": {"whisper_model": whisper_size, "compute_type": compute_type, "use_marbert": use_marbert},
+            "effective": {"whisper_model": whisper_size, "compute_type": compute_type, "use_marbert": use_marbert},
             "original_text": original_text,
             "asr_text": asr_text,
             "corrected_text": corrected_text,
             "literal": lit,
             "semantic": sem,
-            "low_t": low_t, "high_t": high_t,
+            "low_t": float(low_t), "high_t": float(high_t),
         }
         return corrected_text, asr_text, json.dumps(report, ensure_ascii=False, indent=2), df
 
     except Exception as e:
         tb = traceback.format_exc()
         print("ERROR in transcribe_and_evaluate:\n", tb, flush=True)
-        # Return the error as JSON instead of blowing up the UI
         empty_df = pd.DataFrame([{"ASR_word":"","GT_word":"","status":"ERROR","reason":str(e),"used":""}])
         err_json = json.dumps({"error": str(e), "traceback": tb}, ensure_ascii=False, indent=2)
         gr.Warning(str(e))
         return "", "", err_json, empty_df
 
 def api_predict(audio, original_text, whisper_size=None, compute_type=None, vad=True, use_marbert=True):
-    # Same function, but returns JSON only
     corrected_text, asr_text, report_json, df = transcribe_and_evaluate(
         audio, original_text, whisper_size, compute_type, vad, use_marbert
     )
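The new float() casts on low_t/high_t guard json.dumps: if the thresholds come back as numpy scalars (float32 in particular), serialization fails. A demonstration, assuming numpy-typed thresholds:

import json

import numpy as np

low_t = np.float32(0.42)          # assumption: thresholds arrive as numpy scalars
try:
    json.dumps({"low_t": low_t})
except TypeError as e:
    print(e)                      # Object of type float32 is not JSON serializable

print(json.dumps({"low_t": float(low_t)}))  # serializes fine after the cast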
@@ -505,23 +516,15 @@ def build_ui():
     gr.Markdown("## Samaali — ASR Post-Processing (Whisper + Alignment + Confidence + Semantics)")
 
     with gr.Row():
-        # filepath is safest for Spaces
         audio = gr.Audio(sources=["microphone","upload"], type="filepath", label="Audio")
         original = gr.Textbox(lines=8, label="Original Text (Ground Truth)")
 
     with gr.Row():
-
-
-
-            label="Whisper model size"
-        )
-        compute_type = gr.Dropdown(
-            choices=["int8", "int8_float16", "float16", "float32"],
-            value=("float16" if not CPU_MODE else DEFAULT_COMPUTE_CPU),
-            label="compute_type"
-        )
+        # Fixed UI per your request (ignored in the function, but displayed)
+        whisper_size = gr.Dropdown(choices=["large-v3"], value="large-v3", label="Whisper model size (forced)")
+        compute_type = gr.Dropdown(choices=["int8"], value="int8", label="compute_type (forced)")
         vad = gr.Checkbox(value=True, label="VAD filter")
-        use_marbert = gr.Checkbox(value=
+        use_marbert = gr.Checkbox(value=True, label="Use MARBERT (forced)")
 
     btn = gr.Button("Transcribe & Evaluate", variant="primary")
 
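The diff ends before the button wiring, so for orientation only: a hypothetical continuation showing how these components would feed transcribe_and_evaluate's four return values (the output component names are assumptions, not part of this commit):

corrected = gr.Textbox(label="Corrected Text")
asr_out = gr.Textbox(label="Raw ASR Text")
report = gr.Textbox(label="Report (JSON)")
table = gr.Dataframe(label="Word-level alignment")

btn.click(
    transcribe_and_evaluate,
    inputs=[audio, original, whisper_size, compute_type, vad, use_marbert],
    outputs=[corrected, asr_out, report, table],
)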