alaatiger989 committed on Nov 18, 2025

Commit

b5e57ee

verified ·

1 Parent(s): d6bee05

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +6 -0
Extracting_tokenizer_dir_from_Nemo_model.py +126 -0
StartingServer.txt +2 -0
UploadingtoGitlab.txt +4 -0
WER_CER_eval.py +123 -0
WER_calc.py +64 -0
app_api_2.py +345 -0
continue_finetuning_nemo.py +199 -0
converting_dataset_to_8khz.py +95 -0
data_tts/gcloud_tts_sample_001.wav +0 -0
data_tts/gcloud_tts_sample_002.wav +0 -0
data_tts/gcloud_tts_sample_053.wav +0 -0
data_tts/gcloud_tts_sample_060.wav +0 -0
data_tts/gcloud_tts_sample_1065.wav +3 -0
data_tts/gcloud_tts_sample_1067.wav +3 -0
data_tts/gcloud_tts_sample_107.wav +3 -0
data_tts/gcloud_tts_sample_1078.wav +3 -0
data_tts/gcloud_tts_sample_1080.wav +3 -0
data_tts/gcloud_tts_sample_1082.wav +3 -0
data_tts/gcloud_tts_sample_1189.wav +0 -0
data_tts/gcloud_tts_sample_1190.wav +0 -0
data_tts/gcloud_tts_sample_1191.wav +0 -0
data_tts/gcloud_tts_sample_1192.wav +0 -0
data_tts/gcloud_tts_sample_1193.wav +0 -0
data_tts/gcloud_tts_sample_1221.wav +0 -0
data_tts/gcloud_tts_sample_1222.wav +0 -0
data_tts/gcloud_tts_sample_1236.wav +0 -0
data_tts/gcloud_tts_sample_1241.wav +0 -0
data_tts/gcloud_tts_sample_1277.wav +0 -0
data_tts/gcloud_tts_sample_1278.wav +0 -0
data_tts/gcloud_tts_sample_1279.wav +0 -0
data_tts/gcloud_tts_sample_1280.wav +0 -0
data_tts/gcloud_tts_sample_1286.wav +0 -0
data_tts/gcloud_tts_sample_1287.wav +0 -0
data_tts/gcloud_tts_sample_1295.wav +0 -0
data_tts/gcloud_tts_sample_1296.wav +0 -0
data_tts/gcloud_tts_sample_1297.wav +0 -0
data_tts/gcloud_tts_sample_1304.wav +0 -0
data_tts/gcloud_tts_sample_1305.wav +0 -0
data_tts/gcloud_tts_sample_1306.wav +0 -0
data_tts/gcloud_tts_sample_1313.wav +0 -0
data_tts/gcloud_tts_sample_1314.wav +0 -0
data_tts/gcloud_tts_sample_1322.wav +0 -0
eval_manifest.jsonl +163 -0
evaluation_results.csv +164 -0
finetune_asr.py +711 -0
testing_main.py +192 -0
testing_main_v2.py +473 -0
train_manifest.jsonl +0 -0
train_split.jsonl +0 -0

.gitattributes CHANGED Viewed

@@ -968,3 +968,9 @@ data_tts/gcloud_tts_sample_1073.wav filter=lfs diff=lfs merge=lfs -text
 data_tts/gcloud_tts_sample_1075.wav filter=lfs diff=lfs merge=lfs -text
 data_tts/gcloud_tts_sample_1072.wav filter=lfs diff=lfs merge=lfs -text
 data_tts/gcloud_tts_sample_1079.wav filter=lfs diff=lfs merge=lfs -text

 data_tts/gcloud_tts_sample_1075.wav filter=lfs diff=lfs merge=lfs -text
 data_tts/gcloud_tts_sample_1072.wav filter=lfs diff=lfs merge=lfs -text
 data_tts/gcloud_tts_sample_1079.wav filter=lfs diff=lfs merge=lfs -text
+data_tts/gcloud_tts_sample_107.wav filter=lfs diff=lfs merge=lfs -text
+data_tts/gcloud_tts_sample_1065.wav filter=lfs diff=lfs merge=lfs -text
+data_tts/gcloud_tts_sample_1078.wav filter=lfs diff=lfs merge=lfs -text
+data_tts/gcloud_tts_sample_1067.wav filter=lfs diff=lfs merge=lfs -text
+data_tts/gcloud_tts_sample_1082.wav filter=lfs diff=lfs merge=lfs -text
+data_tts/gcloud_tts_sample_1080.wav filter=lfs diff=lfs merge=lfs -text

Extracting_tokenizer_dir_from_Nemo_model.py ADDED Viewed

	@@ -0,0 +1,126 @@

+"""
+Run this script FIRST to extract the tokenizer from the .nemo file
+This creates the tokenizer folder that the training script needs
+"""
+import os
+import tarfile
+import zipfile
+import shutil
+MODEL_PATH = "stt_ar_fastconformer_hybrid_large_pcd_v1.0.nemo"
+OUTPUT_DIR = "tokenizer"
+print("🔹 Detecting .nemo file format...")
+def try_extract_tokenizer():
+    """Try different methods to extract tokenizer"""
+    # Method 1: Try as regular tar (no compression)
+    try:
+        print("Trying: Regular tar format...")
+        with tarfile.open(MODEL_PATH, 'r:') as tar:
+            return extract_from_tar(tar)
+    except Exception as e:
+        print(f"  ✗ Not a regular tar: {e}")
+    # Method 2: Try as gzipped tar
+    try:
+        print("Trying: Gzipped tar format...")
+        with tarfile.open(MODEL_PATH, 'r:gz') as tar:
+            return extract_from_tar(tar)
+    except Exception as e:
+        print(f"  ✗ Not gzipped tar: {e}")
+    # Method 3: Try as zip file
+    try:
+        print("Trying: ZIP format...")
+        with zipfile.ZipFile(MODEL_PATH, 'r') as zf:
+            return extract_from_zip(zf)
+    except Exception as e:
+        print(f"  ✗ Not a ZIP file: {e}")
+    # Method 4: Try auto-detect
+    try:
+        print("Trying: Auto-detect format...")
+        with tarfile.open(MODEL_PATH, 'r:*') as tar:
+            return extract_from_tar(tar)
+    except Exception as e:
+        print(f"  ✗ Auto-detect failed: {e}")
+    return False
+def extract_from_tar(tar):
+    """Extract tokenizer files from tar archive"""
+    tokenizer_files = [m for m in tar.getmembers() if 'tokenizer' in m.name.lower()]
+    if not tokenizer_files:
+        print("\n📋 Available files in archive:")
+        for member in tar.getmembers()[:20]:  # Show first 20
+            print(f"  - {member.name}")
+        if len(tar.getmembers()) > 20:
+            print(f"  ... and {len(tar.getmembers()) - 20} more files")
+        return False
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+    for member in tokenizer_files:
+        # Extract to temp directory
+        tar.extract(member, path="temp_extract")
+        # Move to tokenizer directory
+        src = os.path.join("temp_extract", member.name)
+        if os.path.isfile(src):
+            dst = os.path.join(OUTPUT_DIR, os.path.basename(member.name))
+            shutil.copy2(src, dst)
+            print(f"✅ Extracted: {os.path.basename(member.name)}")
+    # Cleanup
+    if os.path.exists("temp_extract"):
+        shutil.rmtree("temp_extract")
+    return True
+def extract_from_zip(zf):
+    """Extract tokenizer files from zip archive"""
+    tokenizer_files = [n for n in zf.namelist() if 'tokenizer' in n.lower()]
+    if not tokenizer_files:
+        print("\n📋 Available files in archive:")
+        for name in zf.namelist()[:20]:
+            print(f"  - {name}")
+        if len(zf.namelist()) > 20:
+            print(f"  ... and {len(zf.namelist()) - 20} more files")
+        return False
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+    for name in tokenizer_files:
+        # Extract file
+        zf.extract(name, path="temp_extract")
+        # Move to tokenizer directory
+        src = os.path.join("temp_extract", name)
+        if os.path.isfile(src):
+            dst = os.path.join(OUTPUT_DIR, os.path.basename(name))
+            shutil.copy2(src, dst)
+            print(f"✅ Extracted: {os.path.basename(name)}")
+    # Cleanup
+    if os.path.exists("temp_extract"):
+        shutil.rmtree("temp_extract")
+    return True
+# Try extraction
+success = try_extract_tokenizer()
+if success:
+    print(f"\n✅ Tokenizer extracted to: {OUTPUT_DIR}")
+    print("\n📁 Tokenizer files:")
+    for file in os.listdir(OUTPUT_DIR):
+        print(f"  - {file}")
+    print("\n✅ Now you can run the training script!")
+else:
+    print("\n❌ Could not extract tokenizer from .nemo file")
+    print("\n🔧 Alternative solution: The training script will use the embedded tokenizer")
+    print("   No action needed - proceed with training!")

StartingServer.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ python -m streamlit run app.py
2	+ python -m uvicorn app_api:app --host 0.0.0.0 --port 8070 --reload

UploadingtoGitlab.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+cd existing_repo
+git remote add origin https://gitlab.expertflow.com/bot/ai/contextual_asr.git
+git branch -M main
+git push -uf origin main

WER_CER_eval.py ADDED Viewed

	@@ -0,0 +1,123 @@

+import os
+import json
+import torch
+import pandas as pd
+import Levenshtein as lev
+from nemo.collections.asr.models import EncDecHybridRNNTCTCBPEModel
+from nemo.collections.asr.metrics.wer import word_error_rate  # ✅ Keep this
+# ==========================
+# CONFIGURATION
+# ==========================
+MODEL_PATH = "output_finetuned/finetuned_model_best.nemo"
+EVAL_MANIFEST = "eval_manifest.jsonl"
+# ==========================
+# LOAD MODEL
+# ==========================
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Loading model on: {device}")
+try:
+    model = EncDecHybridRNNTCTCBPEModel.restore_from(restore_path=MODEL_PATH, map_location=device)
+    model = model.to(device)
+    model.eval()
+    print("✅ Model loaded successfully.")
+except Exception as e:
+    print(f"❌ Failed to load model: {e}")
+    exit()
+# ==========================
+# LOAD MANIFEST
+# ==========================
+def load_manifest(manifest_path):
+    """Load audio paths and text from a JSONL manifest file."""
+    data = []
+    with open(manifest_path, "r", encoding="utf-8") as f:
+        for line in f:
+            try:
+                item = json.loads(line.strip())
+                audio_path = item["audio_filepath"]
+                text = item.get("text", "").strip()
+                if os.path.exists(audio_path) and text:
+                    data.append((audio_path, text))
+                else:
+                    print(f"⚠️ Skipping invalid entry: {audio_path}")
+            except json.JSONDecodeError as e:
+                print(f"❌ Invalid JSON line: {e}")
+    print(f"\n📁 Loaded {len(data)} valid samples from manifest.")
+    return data
+# ==========================
+# CER FUNCTION
+# ==========================
+def calculate_cer(reference, hypothesis):
+    """Compute Character Error Rate (CER)."""
+    reference = reference.replace(" ", "")
+    hypothesis = hypothesis.replace(" ", "")
+    if len(reference) == 0:
+        return 0.0
+    return lev.distance(reference, hypothesis) / len(reference)
+# ==========================
+# EVALUATION FUNCTION
+# ==========================
+def evaluate_model(model, dataset):
+    total_wer, total_cer = 0.0, 0.0
+    results = []
+    for i, (audio_path, expected_text) in enumerate(dataset, 1):
+        print(f"\n🔍 [{i}/{len(dataset)}] Evaluating: {audio_path}")
+        with torch.no_grad():
+            output = model.transcribe([audio_path])
+            if isinstance(output, tuple):
+                prediction_list = output[0]
+            else:
+                prediction_list = output
+            prediction = prediction_list[0] if isinstance(prediction_list, list) else prediction_list
+        # Compute WER & CER
+        wer = word_error_rate([expected_text], [prediction])
+        cer = calculate_cer(expected_text, prediction)
+        print(f"Expected : {expected_text}")
+        print(f"Predicted: {prediction}")
+        print(f"WER={wer:.3f}, CER={cer:.3f}")
+        results.append({
+            "audio": os.path.basename(audio_path),
+            "expected": expected_text,
+            "predicted": prediction,
+            "WER": wer,
+            "CER": cer
+        })
+        total_wer += wer
+        total_cer += cer
+    avg_wer = total_wer / len(dataset)
+    avg_cer = total_cer / len(dataset)
+    accuracy = (1 - avg_wer) * 100  # ✅ Calculate accuracy percentage
+    print("\n==============================")
+    print(f"📊 Average WER: {avg_wer:.3f}")
+    print(f"🎯 Accuracy: {accuracy:.2f}%")  # ✅ Added this line
+    print(f"📊 Average CER: {avg_cer:.3f}")
+    print("==============================")
+    return results, avg_wer, avg_cer
+# ==========================
+# RUN EVALUATION
+# ==========================
+if __name__ == "__main__":
+    dataset = load_manifest(EVAL_MANIFEST)
+    if not dataset:
+        print("❌ No valid data found in manifest.")
+        exit()
+    results, avg_wer, avg_cer = evaluate_model(model, dataset)
+    df = pd.DataFrame(results)
+    df.to_csv("evaluation_results.csv", index=False, encoding="utf-8-sig")
+    print("\n💾 Results saved to: evaluation_results.csv")

WER_calc.py ADDED Viewed

	@@ -0,0 +1,64 @@

+import os
+import torch
+from nemo.collections.asr.models import EncDecHybridRNNTCTCBPEModel
+from nemo.collections.asr.metrics.wer import word_error_rate
+# ==========================
+# CONFIGURATION
+# ==========================
+MODEL_PATH = "output_finetuned/finetuned_model_best.nemo"
+SAMPLE_AUDIO = "arabic_recording.wav"
+EXPECTED_TEXT = "زيرو واحد واحد واحد واحد واحد واحد اتنين اربعة ستة"
+# ==========================
+# LOAD MODEL
+# ==========================
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Loading model on: {device}")
+try:
+    model = EncDecHybridRNNTCTCBPEModel.restore_from(restore_path=MODEL_PATH, map_location=device)
+    model.eval()
+    print("✅ Model loaded successfully.")
+except Exception as e:
+    print(f"❌ Failed to load model: {e}")
+    exit()
+# ==========================
+# TEST FUNCTION
+# ==========================
+def test_model(model, sample_audio, expected_text):
+    if not os.path.exists(sample_audio):
+        print(f"❌ Audio file not found: {sample_audio}")
+        return
+    print(f"\n🔍 Testing on: {sample_audio}")
+    # Transcribe
+    with torch.no_grad():
+        output = model.transcribe([sample_audio])
+        # Handle different return types
+        if isinstance(output, tuple):
+            # Sometimes returns (predictions, tokens)
+            prediction_list = output[0]
+        else:
+            prediction_list = output
+        # Ensure it's a single string
+        prediction = prediction_list[0] if isinstance(prediction_list, list) else prediction_list
+    # Display results
+    print(f"\nPredicted: {prediction}")
+    print(f"Expected : {expected_text}")
+    # Compute WER
+    wer = word_error_rate([expected_text], [prediction])
+    print(f"\n📊 Word Error Rate (WER): {wer:.3f}")
+    return prediction, wer
+# ==========================
+# RUN TEST
+# ==========================
+if __name__ == "__main__":
+    prediction, wer = test_model(model, SAMPLE_AUDIO, EXPECTED_TEXT)

app_api_2.py ADDED Viewed

	@@ -0,0 +1,345 @@

+# from fastapi import FastAPI, File, UploadFile
+# from fastapi.responses import JSONResponse
+# import uvicorn
+# import tempfile
+# import nemo.collections.asr as nemo_asr
+# import re
+# import os
+# import librosa
+# import soundfile as sf
+# # ===== Arabic number mapping (expanded) =====
+# arabic_numbers = {
+#     "صفر": "0", "زيرو": "0", "٠": "0", "زيو": "0", "زير": "0",
+#     "واحد": "1", "واحدة": "1", "١": "1",
+#     "اتنين": "2", "اثنين": "2", "اثنان": "2", "٢": "2",
+#     "تلاتة": "3", "ثلاثة": "3", "٣": "3","ثلاث": "3","تلات": "3",
+#     "اربعة": "4", "أربعة": "4", "٤": "4",
+#     "خمسة": "5", "٥": "5","خمسه": "5",
+#     "ستة": "6", "٦": "6",
+#     "سبعة": "7", "٧": "7","سبعه": "7",
+#     "تمانية": "8", "ثمانية": "8", "٨": "8",
+#     "تسعة": "9", "٩": "9",
+#     "عشرة": "10", "١٠": "10","عشره": "10",
+#     "حداشر": "11", "احد عشر": "11", "احداشر": "11",
+#     "اتناشر": "12", "اثنا عشر": "12",
+#     "تلتاشر": "13", "ثلاثة عشر": "13",
+#     "اربعتاشر": "14", "أربعة عشر": "14",
+#     "خمستاشر": "15", "خمسة عشر": "15",
+#     "ستاشر": "16", "ستة عشر": "16",
+#     "سبعتاشر": "17", "سبعة عشر": "17",
+#     "طمنتاشر": "18", "ثمانية عشر": "18",
+#     "تسعتاشر": "19", "تسعة عشر": "19",
+#     "عشرين": "20", "٢٠": "20",
+#     "تلاتين": "30", "ثلاثين": "30", "٣٠": "30",
+#     "اربعين": "40", "أربعين": "40", "٤٠": "40",
+#     "خمسين": "50", "٥٠": "50",
+#     "ستين": "60", "٦٠": "60",
+#     "سبعين": "70", "٧٠": "70",
+#     "تمانين": "80", "ثمانين": "80", "٨٠": "80",
+#     "تسعين": "90", "٩٠": "90",
+#     "مية": "100", "مائة": "100", "مئة": "100", "١٠٠": "100",
+#     "ميتين": "200", "مائتين": "200",
+#     "تلاتمية": "300", "ثلاثمائة": "300",
+#     "اربعمية": "400", "أربعمائة": "400",
+#     "خمسمية": "500", "خمسمائة": "500",
+#     "ستمية": "600", "ستمائة": "600",
+#     "سبعمية": "700", "سبعمائة": "700",
+#     "تمانمية": "800", "ثمانمائة": "800",
+#     "تسعمية": "900", "تسعمائة": "900",
+#     "ألف": "1000", "الف": "1000", "١٠٠٠": "1000",
+#     "ألفين": "2000", "الفين": "2000",
+#     "تلات تلاف": "3000", "ثلاثة آلاف": "3000",
+#     "اربعة آلاف": "4000", "أربعة آلاف": "4000",
+#     "خمسة آلاف": "5000",
+#     "ستة آلاف": "6000",
+#     "سبعة آلاف": "7000",
+#     "تمانية آلاف": "8000", "ثمانية آلاف": "8000",
+#     "تسعة آلاف": "9000",
+#     "عشرة آلاف": "10000",
+#     "مية ألف": "100000", "مائة ألف": "100000",
+#     "مليون": "1000000", "ملايين": "1000000",
+#     "مليار": "1000000000"
+# }
+# # ===== Helpers =====
+# def normalize_arabic(text: str) -> str:
+#     diacritics = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
+#     text = re.sub(diacritics, '', text)
+#     text = re.sub(r'[إأآا]', 'ا', text)
+#     text = re.sub(r'ى', 'ي', text)
+#     text = re.sub(r'ؤ', 'و', text)
+#     text = re.sub(r'ئ', 'ي', text)
+#     text = re.sub(r'ة', 'ه', text)
+#     return text
+# def replace_arabic_numbers(text: str) -> str:
+#     for word, digit in arabic_numbers.items():
+#         text = re.sub(fr"(?:^|\s){word}(?:$|\s)", f" {digit} ", text)
+#     return " ".join(text.split())
+# def join_digit_sequences(text: str) -> str:
+#     tokens = text.split()
+#     out, buffer = [], []
+#     for tok in tokens:
+#         if tok.isdigit() and len(tok) == 1:
+#             buffer.append(tok)
+#         else:
+#             if buffer:
+#                 out.append("".join(buffer))
+#                 buffer = []
+#             out.append(tok)
+#     if buffer:
+#         out.append("".join(buffer))
+#     return " ".join(out)
+# def ensure_16k_wav(input_path, output_path):
+#     y, sr = librosa.load(input_path, sr=16000, mono=True)
+#     sf.write(output_path, y, 16000)
+# # ===== FastAPI app =====
+# app = FastAPI(title="Arabic ASR API", description="ASR API with NeMo and Arabic digit conversion")
+# @app.on_event("startup")
+# def load_model():
+#     global asr_model
+#     model_path = "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Peter_Projects/NP_Detection_Nvidia_conformer/asr-egyptian-nemo-v2.0.nemo"
+#     asr_model = nemo_asr.models.EncDecCTCModel.restore_from(model_path)
+# @app.post("/transcribe")
+# async def transcribe_audio(file: UploadFile = File(...)):
+#     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
+#         tmp.write(await file.read())
+#         tmp_path = tmp.name
+#     # Resample to 16kHz
+#     resampled_path = tmp_path.replace(".wav", "_16k.wav")
+#     ensure_16k_wav(tmp_path, resampled_path)
+#     try:
+#         result = asr_model.transcribe([resampled_path])
+#         raw_text = result[0].text
+#         raw_text = normalize_arabic(raw_text)
+#         cleaned_text = replace_arabic_numbers(raw_text)
+#         cleaned_text = join_digit_sequences(cleaned_text)
+#         return JSONResponse(content={"transcription": cleaned_text})
+#     finally:
+#         os.remove(tmp_path)
+#         if os.path.exists(resampled_path):
+#             os.remove(resampled_path)
+# @app.post("/transcribe-bytes")
+# async def transcribe_audio_bytes(audio_bytes: bytes = File(...)):
+#     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
+#         tmp.write(audio_bytes)
+#         tmp_path = tmp.name
+#     resampled_path = tmp_path.replace(".wav", "_16k.wav")
+#     ensure_16k_wav(tmp_path, resampled_path)
+#     try:
+#         result = asr_model.transcribe([resampled_path])
+#         raw_text = result[0].text
+#         raw_text = normalize_arabic(raw_text)
+#         cleaned_text = replace_arabic_numbers(raw_text)
+#         cleaned_text = join_digit_sequences(cleaned_text)
+#         return JSONResponse(content={"transcription": cleaned_text})
+#     finally:
+#         os.remove(tmp_path)
+#         if os.path.exists(resampled_path):
+#             os.remove(resampled_path)
+# if __name__ == "__main__":
+#     uvicorn.run(app, host="0.0.0.0", port=8000, reload=True)
+from fastapi import FastAPI, File, UploadFile
+from fastapi.responses import JSONResponse
+import uvicorn
+import tempfile
+import nemo.collections.asr as nemo_asr
+import re
+import os
+import librosa
+import soundfile as sf
+from omegaconf import OmegaConf
+# ===== Arabic + English number mapping (expanded) =====
+arabic_numbers = {
+    "صفر": "0", "زيرو": "0", "زيو": "0", "زير": "0", "٠": "0",
+    "واحد": "1", "واحدة": "1", "واحده": "1", "١": "1",
+    "اثنين": "2", "اثنان": "2", "اتنين": "2", "٢": "2",
+    "ثلاثة": "3", "ثلاث": "3", "تلاتة": "3", "تلات": "3", "ثلاثه": "3", "تلاته": "3",
+    "أربعة": "4", "اربعة": "4", "٤": "4","أربعه": "4","اربعه": "4",
+    "خمسة": "5", "خمسه": "5", "٥": "5",
+    "ستة": "6", "ست": "6", "٦": "6","سته": "6",
+    "سبعة": "7", "سبعه": "7", "٧": "7",
+    "ثمانية": "8", "تمانية": "8", "تمنية": "8", "ثمان": "8", "٨": "8","تمانيه": "8",
+    "تسعة": "9", "تسعه": "9", "٩": "9"
+}
+# ===== Helpers =====
+def normalize_arabic(text: str) -> str:
+    diacritics = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
+    text = re.sub(diacritics, '', text)
+    text = re.sub(r'[إأآا]', 'ا', text)
+    text = re.sub(r'ى', 'ي', text)
+    text = re.sub(r'ؤ', 'و', text)
+    text = re.sub(r'ئ', 'ي', text)
+    text = re.sub(r'ة', 'ه', text)
+    return text
+def replace_arabic_numbers(text: str) -> str:
+    # Replace Arabic words 0-9 with digits
+    for word, digit in arabic_numbers.items():
+        text = re.sub(rf'\b{re.escape(word)}\b', digit, text)
+    return text
+def join_digit_sequences(text: str) -> str:
+    # Merge consecutive digits into single numbers
+    tokens = text.split()
+    out, buffer = [], []
+    for tok in tokens:
+        if tok.isdigit():
+            buffer.append(tok)
+        else:
+            if buffer:
+                out.append("".join(buffer))
+                buffer = []
+            out.append(tok)
+    if buffer:
+        out.append("".join(buffer))
+    return " ".join(out)
+def ensure_16k_wav(input_path, output_path):
+    y, sr = librosa.load(input_path, sr=16000, mono=True)
+    sf.write(output_path, y, 16000)
+# ===== FastAPI app =====
+app = FastAPI(title="Arabic ASR API", description="ASR API with NeMo and Arabic/English digit conversion")
+@app.on_event("startup")
+def load_model():
+    global asr_model
+    model_path = "output_finetuned/finetuned_model_best.nemo"
+    asr_model = nemo_asr.models.EncDecHybridRNNTCTCModel.restore_from(model_path)
+    # Add this right after loading the model to see what's actually available:
+    print("Available greedy parameters:")
+    print(OmegaConf.to_yaml(asr_model.cfg.decoding.greedy))
+    # ===== STEP 3: Configure for LITERAL transcription =====
+    print("🔍 Configuring greedy decoding for literal output...")
+    # Set struct mode to False temporarily to allow modifications
+    OmegaConf.set_struct(asr_model.cfg.decoding, False)
+    OmegaConf.set_struct(asr_model.cfg.decoding.greedy, False)
+    decoding_cfg = asr_model.cfg.decoding
+    decoding_cfg.strategy = "maes"
+    # Now try setting the parameters
+    try:
+        decoding_cfg.greedy.max_symbols_per_step = 300
+        print(f"✓ max_symbols_per_step: {decoding_cfg.greedy.max_symbols_per_step}")
+    except:
+        print("⚠ Could not set max_symbols_per_step")
+    decoding_cfg.greedy.max_symbols = 500
+    decoding_cfg.greedy.loop_labels = True
+    decoding_cfg.greedy.preserve_alignments = True
+    decoding_cfg.preserve_alignments = True
+    decoding_cfg.compute_timestamps = True
+    decoding_cfg.temperature = 1.3
+    decoding_cfg.beam.beam_size = 64
+    decoding_cfg.beam.softmax_temperature = 1.3
+    decoding_cfg.beam.search_type = "beam"
+    print(f"✓ max_symbols: {decoding_cfg.greedy.max_symbols}")
+    print(f"✓ loop_labels: {decoding_cfg.greedy.loop_labels}")
+    print(f"✓ temperature: {decoding_cfg.temperature}")
+    # Re-enable struct mode
+    OmegaConf.set_struct(asr_model.cfg.decoding, True)
+    OmegaConf.set_struct(asr_model.cfg.decoding.greedy, True)
+    # Apply configuration
+    asr_model.change_decoding_strategy(decoding_cfg)
+@app.post("/transcribe")
+async def transcribe_audio(file: UploadFile = File(...)):
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
+        tmp.write(await file.read())
+        tmp_path = tmp.name
+    resampled_path = tmp_path.replace(".wav", "_16k.wav")
+    ensure_16k_wav(tmp_path, resampled_path)
+    try:
+        result = asr_model.transcribe([resampled_path])
+        # Handle NeMo tuple/list structure robustly
+        if isinstance(result, tuple):
+            result = result[0]  # take first element if tuple
+        if isinstance(result, list):
+            raw_text = result[0]
+        else:
+            raw_text = str(result)
+        # Normalize and replace Arabic numerals
+        raw_text = normalize_arabic(raw_text)
+        cleaned_text = replace_arabic_numbers(raw_text)
+        cleaned_text = join_digit_sequences(cleaned_text)
+        print("📝 Cleaned Transcription:", cleaned_text)  # for debug
+        return JSONResponse(content={"transcription": cleaned_text})
+    finally:
+        os.remove(tmp_path)
+        if os.path.exists(resampled_path):
+            os.remove(resampled_path)
+@app.post("/transcribe-bytes")
+async def transcribe_audio_bytes(audio_bytes: bytes = File(...)):
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
+        tmp.write(audio_bytes)
+        tmp_path = tmp.name
+    resampled_path = tmp_path.replace(".wav", "_16k.wav")
+    ensure_16k_wav(tmp_path, resampled_path)
+    try:
+        result = asr_model.transcribe([resampled_path])
+        # Robust extraction
+        if isinstance(result, list):
+            # if first element is also a list, flatten it
+            first = result[0]
+            if isinstance(first, list):
+                raw_text = first[0]
+            elif isinstance(first, str):
+                raw_text = first
+            elif hasattr(first, "text"):  # sometimes result contains objects with 'text'
+                raw_text = first.text
+            else:
+                raw_text = str(first)  # fallback to string
+        else:
+            raw_text = str(result)
+        #print("Raw text:", raw_text)
+        raw_text = normalize_arabic(raw_text)
+        cleaned_text = replace_arabic_numbers(raw_text)
+        cleaned_text = join_digit_sequences(cleaned_text)
+        return JSONResponse(content={"transcription": cleaned_text})
+    finally:
+        os.remove(tmp_path)
+        if os.path.exists(resampled_path):
+            os.remove(resampled_path)
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8000, reload=True)

continue_finetuning_nemo.py ADDED Viewed

	@@ -0,0 +1,199 @@

+import os
+import io
+import json
+import torch
+from pytorch_lightning import Trainer
+from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
+from nemo.collections.asr.models import EncDecHybridRNNTCTCBPEModel
+from omegaconf import open_dict , DictConfig
+# ============================================================
+# Environment Fixes (Windows / CUDA)
+# ============================================================
+# These variables are set before the model is restored and the trainer is built further down.
+os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+os.environ["NUMBA_CUDA_USE_NVIDIA_BINDING"] = "1"
+os.environ["NUMBA_DISABLE_JIT"] = "0"
+os.environ["NUMBA_CUDA_DRIVER"] = "cuda"
+# Expose only CUDA device 0 to this process (this line is active; comment it out to lift the restriction)
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+# ============================================================
+# UTF-8 Fix for Manifest
+# ============================================================
+manifest_path = "train_manifest.jsonl"
+# NOTE: errors='ignore' silently drops any bytes that are not valid UTF-8,
+# and the manifest is then overwritten in place with the cleaned text.
+with io.open(manifest_path, 'r', encoding='utf-8', errors='ignore') as f:
+    content = f.read()
+with io.open(manifest_path, 'w', encoding='utf-8') as f:
+    f.write(content)
+print("✅ train_manifest.jsonl converted to UTF-8")
+# Patch builtins.open for UTF-8
+import builtins
+_old_open = open
+def open_utf8(file, *args, **kwargs):
+    if isinstance(file, str) and file.endswith('.jsonl') and 'encoding' not in kwargs:
+        kwargs['encoding'] = 'utf-8'
+    return _old_open(file, *args, **kwargs)
+builtins.open = open_utf8
+# ============================================================
+# Validate Manifest
+# ============================================================
+def validate_manifest(manifest_path):
+    count = 0
+    with open(manifest_path, "r", encoding="utf-8") as f:
+        for i, line in enumerate(f, 1):
+            try:
+                item = json.loads(line.strip())
+                assert os.path.exists(item["audio_filepath"]), f"Missing: {item['audio_filepath']}"
+                assert "text" in item and item["text"].strip(), "Empty text"
+                count += 1
+            except Exception as e:
+                print(f"❌ Line {i} error: {e}")
+                print(f"   Content: {line[:100]}")
+    print(f"✅ Valid entries: {count}")
+    return count
+# Abort early when the manifest has no usable line at all.
+valid_count = validate_manifest(manifest_path)
+if valid_count == 0:
+    raise ValueError("No valid training samples found!")
+# ============================================================
+# Paths and Hyperparameters
+# ============================================================
+BASE_MODEL_PATH = "stt_ar_fastconformer_hybrid_large_pcd_v1.0.nemo"
+SAVE_DIR = "output_finetuned"
+# Checkpoint written by ModelCheckpoint(save_last=True); its presence decides whether training resumes.
+LAST_CKPT = os.path.join(SAVE_DIR, "last.ckpt")
+BATCH_SIZE = 4
+ADDITIONAL_EPOCHS = 50
+LEARNING_RATE = 1e-5
+WARMUP_STEPS = 500
+WEIGHT_DECAY = 0.00001
+os.makedirs(SAVE_DIR, exist_ok=True)
+# ============================================================
+# Load Model
+# ============================================================
+# NOTE: despite the message, this always restores BASE_MODEL_PATH; resuming
+# from LAST_CKPT happens later through trainer.fit(ckpt_path=...).
+print("🔹 Loading pretrained or last fine-tuned model...")
+model = EncDecHybridRNNTCTCBPEModel.restore_from(BASE_MODEL_PATH)
+# ============================================================
+# Tokenizer Fix
+# ============================================================
+with open_dict(model.cfg):
+    # BASE_MODEL_PATH is a bare file name, so dirname() is empty and this
+    # resolves to a "tokenizer" directory relative to the working directory.
+    tokenizer_dir = os.path.join(os.path.dirname(BASE_MODEL_PATH), "tokenizer")
+    os.makedirs(tokenizer_dir, exist_ok=True)
+    model.cfg.tokenizer.dir = tokenizer_dir
+    model.cfg.tokenizer.type = "bpe"
+    # Clear the validation/test manifests stored in the restored config.
+    if 'validation_ds' in model.cfg:
+        model.cfg.validation_ds.manifest_filepath = None
+    if 'test_ds' in model.cfg:
+        model.cfg.test_ds.manifest_filepath = None
+# ============================================================
+# Setup Training Data
+# ============================================================
+train_ds_config = {
+    "manifest_filepath": manifest_path,
+    "batch_size": BATCH_SIZE,
+    "shuffle": True,
+    "num_workers": 0,
+    "pin_memory": False,
+    "sample_rate": 16000,
+    "max_duration": 20.0,
+    "min_duration": 0.5,
+    "trim_silence": True,
+    "use_start_end_token": True,
+    "normalize_transcripts": True,
+    "parser": "ar",
+}
+model.setup_training_data(train_ds_config)
+# ============================================================
+# Optimizer & Scheduler
+# ============================================================
+with open_dict(model.cfg):
+    model.cfg.optim.name = "adamw"
+    model.cfg.optim.lr = LEARNING_RATE
+    model.cfg.optim.betas = [0.9, 0.98]
+    model.cfg.optim.weight_decay = WEIGHT_DECAY
+    model.cfg.optim.eps = 1e-8
+    model.cfg.optim.sched = {
+        "name": "CosineAnnealing",
+        "warmup_steps": WARMUP_STEPS,
+        "min_lr": 1e-7,
+        "last_epoch": -1,
+    }
+# ============================================================
+# Callbacks
+# ============================================================
+# No validation set is configured in this script, so checkpointing and early
+# stopping both monitor the training loss.
+checkpoint_callback = ModelCheckpoint(
+    dirpath=SAVE_DIR,
+    filename='continued-{epoch:02d}-{train_loss:.4f}',
+    save_top_k=3,
+    monitor='train_loss',
+    mode='min',
+    save_last=True,
+)
+early_stop_callback = EarlyStopping(
+    monitor='train_loss',
+    patience=20,
+    mode='min',
+    verbose=True,
+)
+lr_monitor = LearningRateMonitor(logging_interval='step')
+# ============================================================
+# Determine Max Epochs Based on Last Checkpoint
+# ============================================================
+# ============================================================
+# Allow loading full NeMo checkpoint (trusted source)
+# ============================================================
+torch.serialization.add_safe_globals([DictConfig])
+if os.path.exists(LAST_CKPT):
+    # SECURITY: weights_only=False unpickles arbitrary objects; only ever
+    # point this at checkpoints produced by your own training runs.
+    ckpt_data = torch.load(LAST_CKPT, map_location="cpu", weights_only=False)
+    # The epoch stored in the checkpoint becomes the base for the extended budget.
+    last_epoch = ckpt_data.get("epoch", 0)
+    new_max_epochs = last_epoch + ADDITIONAL_EPOCHS
+    print(f"🧩 Last checkpoint epoch: {last_epoch} → continuing up to {new_max_epochs} epochs total.")
+else:
+    new_max_epochs = ADDITIONAL_EPOCHS
+# ============================================================
+# Trainer
+# ============================================================
+trainer = Trainer(
+    accelerator="gpu" if torch.cuda.is_available() else "cpu",
+    devices=1,
+    max_epochs=new_max_epochs,
+    log_every_n_steps=1,
+    enable_checkpointing=True,
+    default_root_dir=SAVE_DIR,
+    callbacks=[checkpoint_callback, early_stop_callback, lr_monitor],
+    gradient_clip_val=1.0,
+    accumulate_grad_batches=4,
+)
+# ============================================================
+# Continue Training
+# ============================================================
+if os.path.exists(LAST_CKPT):
+    print(f"🚀 Continuing training from checkpoint: {LAST_CKPT}")
+    trainer.fit(model, ckpt_path=LAST_CKPT)
+else:
+    print("⚠️ No checkpoint found, training from base model...")
+    trainer.fit(model)
+# ============================================================
+# Save Final Model
+# ============================================================
+final_model_path = os.path.join(SAVE_DIR, "finetuned_model_continued.nemo")
+model.save_to(final_model_path)
+print(f"\n✅ Continued fine-tuned model saved to: {final_model_path}")

converting_dataset_to_8khz.py ADDED Viewed

	@@ -0,0 +1,95 @@

+import os
+import json
+import subprocess
+import soundfile as sf
+# ==============================
+# CONFIGURATION
+# ==============================
+input_folder = "data_tts_evaluation"
+output_folder = "data_tts_eval_8k_ulaw"
+old_manifest = "eval_manifest.jsonl"  # Optional: use old text references
+new_manifest = "eval_manifest_8k_ulaw.jsonl"
+# Create output folder if it doesn’t exist
+os.makedirs(output_folder, exist_ok=True)
+# Supported audio formats
+valid_ext = (".wav", ".mp3", ".flac", ".ogg", ".m4a")
+# ==============================
+# Load Texts from Old Manifest (if exists)
+# ==============================
+text_map = {}
+if os.path.exists(old_manifest):
+    print(f"🔹 Loading existing manifest: {old_manifest}")
+    with open(old_manifest, "r", encoding="utf-8") as f:
+        for line in f:
+            try:
+                item = json.loads(line.strip())
+                # Extract filename without extension for mapping
+                key = os.path.splitext(os.path.basename(item["audio_filepath"]))[0]
+                text_map[key] = item.get("text", "")
+            except Exception as e:
+                print(f"⚠️ Error reading line: {e}")
+# ==============================
+# CONVERSION LOOP + MANIFEST CREATION
+# ==============================
+converted_entries = []
+for filename in os.listdir(input_folder):
+    if not filename.lower().endswith(valid_ext):
+        continue
+    input_path = os.path.join(input_folder, filename)
+    base_name = os.path.splitext(filename)[0]
+    output_name = base_name + "_8k_ulaw.wav"
+    output_path = os.path.join(output_folder, output_name)
+    # FFmpeg command: convert to mono 8kHz u-law
+    cmd = [
+        "ffmpeg",
+        "-y",  # overwrite
+        "-i", input_path,
+        "-ar", "8000",  # 8kHz
+        "-ac", "1",  # mono
+        "-c:a", "pcm_mulaw",
+        output_path
+    ]
+    try:
+        subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        # Get duration of new file
+        data, samplerate = sf.read(output_path)
+        duration = round(len(data) / samplerate, 2)
+        # Get text (if exists from old manifest)
+        text = text_map.get(base_name, "")
+        # Add entry to new manifest
+        converted_entries.append({
+            "audio_filepath": output_path.replace("\\", "/"),
+            "duration": duration,
+            "text": text
+        })
+        print(f"✅ Converted: {filename} → {output_name} ({duration}s)")
+    except subprocess.CalledProcessError as e:
+        print(f"❌ Error converting {filename}: {e.stderr.decode('utf-8', errors='ignore')}")
+# ==============================
+# SAVE NEW MANIFEST
+# ==============================
+if converted_entries:
+    with open(new_manifest, "w", encoding="utf-8") as f:
+        for entry in converted_entries:
+            f.write(json.dumps(entry, ensure_ascii=False) + "\n")
+    print(f"\n💾 Manifest saved to: {new_manifest}")
+    print(f"📊 Total entries: {len(converted_entries)}")
+else:
+    print("⚠️ No audio files converted or manifest empty!")
+print(f"\n🎯 Conversion complete! {len(converted_entries)} files saved in '{output_folder}'.")

data_tts/gcloud_tts_sample_001.wav ADDED Viewed

Binary file (94.6 kB). View file

data_tts/gcloud_tts_sample_002.wav ADDED Viewed

Binary file (82.5 kB). View file

data_tts/gcloud_tts_sample_053.wav ADDED Viewed

Binary file (98.5 kB). View file

data_tts/gcloud_tts_sample_060.wav ADDED Viewed

Binary file (99.7 kB). View file

data_tts/gcloud_tts_sample_1065.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3299bdc8d6906ab1152f326a4b2839966a842deab4762c837aa073b7c4b286dd
+size 183062

data_tts/gcloud_tts_sample_1067.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:211ce2911a62fc66fd143e6434b4f053f0ccde0ce5b007f0a99c98a5caeefa8f
+size 133318

data_tts/gcloud_tts_sample_107.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c840bba5d03b45f415219171e226055fc94735f43187c816f4d723a9eb162d7a
+size 158232

data_tts/gcloud_tts_sample_1078.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1774d6b6b212dac76b782be61c89a466a900e2ab9620b223e78df14c70f30d3d
+size 172362

data_tts/gcloud_tts_sample_1080.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fae59de9a6431baadb0b50aa9ff7e2adc12808f498827c3913e4519adcc51849
+size 179680

data_tts/gcloud_tts_sample_1082.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9dfddaf1051afae6e4fa8298fe948d6ffcfc4681ef35c883195969c8ba5a22c1
+size 162944

data_tts/gcloud_tts_sample_1189.wav ADDED Viewed

Binary file (77.5 kB). View file

data_tts/gcloud_tts_sample_1190.wav ADDED Viewed

Binary file (69.7 kB). View file

data_tts/gcloud_tts_sample_1191.wav ADDED Viewed

Binary file (77 kB). View file

data_tts/gcloud_tts_sample_1192.wav ADDED Viewed

Binary file (97.9 kB). View file

data_tts/gcloud_tts_sample_1193.wav ADDED Viewed

Binary file (74.6 kB). View file

data_tts/gcloud_tts_sample_1221.wav ADDED Viewed

Binary file (99.7 kB). View file

data_tts/gcloud_tts_sample_1222.wav ADDED Viewed

Binary file (95.8 kB). View file

data_tts/gcloud_tts_sample_1236.wav ADDED Viewed

Binary file (83.7 kB). View file

data_tts/gcloud_tts_sample_1241.wav ADDED Viewed

Binary file (99.7 kB). View file

data_tts/gcloud_tts_sample_1277.wav ADDED Viewed

Binary file (32.9 kB). View file

data_tts/gcloud_tts_sample_1278.wav ADDED Viewed

Binary file (48.7 kB). View file

data_tts/gcloud_tts_sample_1279.wav ADDED Viewed

Binary file (66.2 kB). View file

data_tts/gcloud_tts_sample_1280.wav ADDED Viewed

Binary file (83.7 kB). View file

data_tts/gcloud_tts_sample_1286.wav ADDED Viewed

Binary file (47.6 kB). View file

data_tts/gcloud_tts_sample_1287.wav ADDED Viewed

Binary file (75.2 kB). View file

data_tts/gcloud_tts_sample_1295.wav ADDED Viewed

Binary file (43 kB). View file

data_tts/gcloud_tts_sample_1296.wav ADDED Viewed

Binary file (75.1 kB). View file

data_tts/gcloud_tts_sample_1297.wav ADDED Viewed

Binary file (95.8 kB). View file

data_tts/gcloud_tts_sample_1304.wav ADDED Viewed

Binary file (41.8 kB). View file

data_tts/gcloud_tts_sample_1305.wav ADDED Viewed

Binary file (64.5 kB). View file

data_tts/gcloud_tts_sample_1306.wav ADDED Viewed

Binary file (88.5 kB). View file

data_tts/gcloud_tts_sample_1313.wav ADDED Viewed

Binary file (50 kB). View file

data_tts/gcloud_tts_sample_1314.wav ADDED Viewed

Binary file (81.7 kB). View file

data_tts/gcloud_tts_sample_1322.wav ADDED Viewed

Binary file (51.5 kB). View file

eval_manifest.jsonl ADDED Viewed

	@@ -0,0 +1,163 @@

+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_000.wav", "duration": 1.6, "text": "علاء سيد عبد الله"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_001.wav", "duration": 2.0, "text": "محمد أحمد عبد الرحمن"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_002.wav", "duration": 2.7, "text": "كريم محمود عبد الغفار"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_003.wav", "duration": 1.7, "text": "يوسف علي عبد الحليم"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_004.wav", "duration": 1.6, "text": "مصطفى طارق حسن"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_005.wav", "duration": 2.55, "text": "إبراهيم محمد عبد العزيز"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_006.wav", "duration": 2.85, "text": "خالد عمر عبد السميع"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_007.wav", "duration": 1.6, "text": "أحمد سامي حسين"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_008.wav", "duration": 2.0, "text": "محمود ناصر عبد اللطيف"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_009.wav", "duration": 1.9, "text": "عمر عبد الله محمد"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_010.wav", "duration": 1.65, "text": "مينا فادي نصيف"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_011.wav", "duration": 1.65, "text": "بيتر عادل صليب"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_012.wav", "duration": 1.65, "text": "جرجس سامح حكيم"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_013.wav", "duration": 1.75, "text": "رامي فوزي بشارة"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_014.wav", "duration": 1.7, "text": "فادي منير عوض"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_015.wav", "duration": 1.5, "text": "مريم يوسف فؤاد"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_016.wav", "duration": 2.0, "text": "نانسي شريف عياد"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_017.wav", "duration": 1.35, "text": "كيرلس ممدوح سمعان"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_018.wav", "duration": 1.65, "text": "هالة فؤاد حبيب"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_019.wav", "duration": 1.7, "text": "مارجريت جرجس فخري"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_020.wav", "duration": 1.8, "text": "ريم أحمد عبد الباري"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_021.wav", "duration": 1.9, "text": "شروق محمد عبد الرحيم"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_022.wav", "duration": 1.65, "text": "إيمان حسن مصطفى"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_023.wav", "duration": 2.5, "text": "فاطمة الزهراء عبد الله"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_024.wav", "duration": 2.7, "text": "سارة خالد عبد الباقي"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_025.wav", "duration": 1.8, "text": "ندى إبراهيم حسن"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_026.wav", "duration": 1.45, "text": "دينا محمود فوزي"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_027.wav", "duration": 2.15, "text": "لبنى عبد الرحمن السيد"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_028.wav", "duration": 1.7, "text": "آية طارق عبد الجليل"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_029.wav", "duration": 1.85, "text": "أسماء علي إبراهيم"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_030.wav", "duration": 1.9, "text": "أحمد عصام عبد الرحمن"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_031.wav", "duration": 1.75, "text": "نور هشام عبد الله"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_032.wav", "duration": 1.4, "text": "نجلاء سامي فؤاد"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_033.wav", "duration": 1.7, "text": "رنا علاء الدين أحمد"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_034.wav", "duration": 2.55, "text": "عادل فخري سمعان"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_035.wav", "duration": 1.4, "text": "بولا هاني رزق"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_036.wav", "duration": 1.45, "text": "مينا يوسف بشاي"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_037.wav", "duration": 1.75, "text": "أبانوب فادي كامل"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_038.wav", "duration": 2.25, "text": "مارينا جرجس جاد"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_039.wav", "duration": 2.1, "text": "كريستين فؤاد صموئيل"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_040.wav", "duration": 2.1, "text": "سليم أحمد عبد المقصود"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_041.wav", "duration": 1.95, "text": "عمار محمد عبد الرحيم"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_042.wav", "duration": 2.6, "text": "أنس عبد الله محمود"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_043.wav", "duration": 1.7, "text": "زياد عمرو ناصر"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_044.wav", "duration": 2.0, "text": "أمير يوسف عبد الغفار"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_045.wav", "duration": 2.4, "text": "خالد مصطفى عبد الحميد"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_046.wav", "duration": 1.75, "text": "جرجس عادل لبيب"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_047.wav", "duration": 1.4, "text": "بولا فخري بطرس"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_048.wav", "duration": 1.75, "text": "مارينا فادي صادق"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_049.wav", "duration": 2.3, "text": "جوليان جورج عزيز"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_050.wav", "duration": 1.45, "text": "نادر سامي رزق"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_051.wav", "duration": 1.75, "text": "عبد الرحمن أحمد عبد الله"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_052.wav", "duration": 1.5, "text": "محمد طه السيد"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_053.wav", "duration": 1.75, "text": "أحمد ياسر مصطفى"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_054.wav", "duration": 2.1, "text": "سيد عبد الفتاح عبد الغني"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_055.wav", "duration": 3.05, "text": "محمد رمضان عبد الحكيم"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_056.wav", "duration": 2.1, "text": "عبد الله حمدي عبد الفتاح"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_057.wav", "duration": 2.1, "text": "أيمن جمال عبد الناصر"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_058.wav", "duration": 2.45, "text": "أحمد عبد الرازق حسن"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_059.wav", "duration": 1.65, "text": "محمود خالد محمد"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_060.wav", "duration": 1.6, "text": "مروان عماد عبد الله"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_061.wav", "duration": 2.7, "text": "عبد الرحمن محمد شريف"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_062.wav", "duration": 1.95, "text": "أحمد محروس عبد اللطيف"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_063.wav", "duration": 2.4, "text": "مصطفى عبد القادر عبد السميع"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_064.wav", "duration": 1.9, "text": "عبد العزيز حسن عبد الله"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_065.wav", "duration": 2.25, "text": "مينا شنودة فخري"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_066.wav", "duration": 1.5, "text": "بولا يوسف بطرس"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_067.wav", "duration": 1.45, "text": "فادي عادل رسمي"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_068.wav", "duration": 2.2, "text": "جرجس فوزي منصور"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_069.wav", "duration": 1.95, "text": "كيرلس رأفت نجيب"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_070.wav", "duration": 1.5, "text": "مارينا جورج عادل"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_071.wav", "duration": 1.85, "text": "ديفيد ماهر منير"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_072.wav", "duration": 1.75, "text": "كارولين فادي شكر"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_073.wav", "duration": 1.9, "text": "مريم سامي فؤاد"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_074.wav", "duration": 1.6, "text": "أندرو فؤاد رزق"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_075.wav", "duration": 1.9, "text": "ريهام عبد الله محمد"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_076.wav", "duration": 2.7, "text": "سارة عماد حسن"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_077.wav", "duration": 1.9, "text": "ميادة عبد الحميد ناصر"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_078.wav", "duration": 1.7, "text": "آية أحمد عبد الله"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_079.wav", "duration": 1.95, "text": "نورهان عبد الفتاح علي"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_080.wav", "duration": 1.35, "text": "هدير خالد حسن"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_081.wav", "duration": 1.9, "text": "بسمة إبراهيم عبد الغني"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_082.wav", "duration": 2.3, "text": "أسماء طارق عبد الرحمن"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_083.wav", "duration": 1.75, "text": "يمنى محمد عبد الحليم"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_084.wav", "duration": 2.25, "text": "صفاء عبد الرحمن السيد"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_085.wav", "duration": 1.9, "text": "منال أحمد حسن"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_086.wav", "duration": 2.65, "text": "رحمة عبد الله محمود"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_087.wav", "duration": 2.35, "text": "ياسمين خالد عبد الرحمن"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_088.wav", "duration": 2.2, "text": "شيماء أحمد عبد الغفار"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_089.wav", "duration": 1.95, "text": "علا سامي عبد المقصود"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_090.wav", "duration": 1.95, "text": "رغدة علي عبد الباري"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_091.wav", "duration": 1.95, "text": "هايدي جرجس بطرس"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_092.wav", "duration": 1.5, "text": "نيرمين مينا فؤاد"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_093.wav", "duration": 1.7, "text": "جيسيكا بولا منصور"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_094.wav", "duration": 1.65, "text": "ماريان يوسف رسمي"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_095.wav", "duration": 1.5, "text": "كارين فادي شنودة"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_096.wav", "duration": 1.75, "text": "أميرة أحمد عبد الله"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_097.wav", "duration": 2.3, "text": "نورا إبراهيم حسن"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_098.wav", "duration": 1.6, "text": "هبة طارق عبد الرحمن"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_099.wav", "duration": 1.65, "text": "دعاء عبد الله السيد"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_100.wav", "duration": 1.9, "text": "عبير خالد عبد العزيز"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_101.wav", "duration": 1.7, "text": "خلود ناصر عبد الغفار"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_102.wav", "duration": 2.25, "text": "جيهان عبد الرحمن محمود"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_103.wav", "duration": 7.6, "text": "اثنين ثلاثة زيرو واحد واحد اثنين زيرو سبعة واحد زيرو زيرو واحد واحد اثنين"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_104.wav", "duration": 7.35, "text": "اثنين خمسة زيرو تسعة زيرو خمسة اثنين واحد واحد تسعة زيرو زيرو ثلاثة"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_105.wav", "duration": 7.15, "text": "ثلاثة زيرو واحد واحد اثنين ثلاثة زيرو سبعة واحد سبعة تسعة زيرو خمسة"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_106.wav", "duration": 6.75, "text": "اثنين تسعة زيرو سبعة واحد واحد زيرو اثنين واحد زيرو خمسة زيرو اثنين"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_107.wav", "duration": 8.2, "text": "ثلاثة واحد زيرو اثنين واحد اثنين زيرو تسعة واحد زيرو سبعة زيرو واحد"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_108.wav", "duration": 6.9, "text": "ثلاثة اثنين زيرو ثلاثة واحد واحد زيرو سبعة واحد تسعة زيرو زيرو أربعة"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_109.wav", "duration": 6.7, "text": "اثنين ثمانية زيرو تسعة واحد واحد زيرو خمسة واحد زيرو زيرو زيرو ستة"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_110.wav", "duration": 8.25, "text": "ثلاثة زيرو واحد واحد زيرو تسعة زيرو تسعة واحد ثمانية زيرو زيرو سبعة"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_111.wav", "duration": 5.25, "text": "اثنين خمسة زيرو اثنين زيرو اثنين زيرو سبعة واحد زيرو زيرو زيرو خمسة"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_112.wav", "duration": 7.95, "text": "ثلاثة واحد زيرو سبعة واحد زيرو زيرو خمسة واحد زيرو تسعة زيرو واحد"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_113.wav", "duration": 5.45, "text": "اثنين ستة زيرو واحد زيرو ثلاثة زيرو سبعة واحد زيرو زيرو زيرو تسعة"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_114.wav", "duration": 7.3, "text": "اثنين تسعة زيرو ثمانية واحد اثنين زيرو أربعة واحد زيرو زيرو زيرو اثنين"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_115.wav", "duration": 6.35, "text": "ثلاثة اثنين زيرو سبعة واحد واحد زيرو تسعة واحد زيرو زيرو زيرو خمسة"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_116.wav", "duration": 7.15, "text": "ثلاثة واحد زيرو واحد واحد اثنين زيرو خمسة واحد زيرو زيرو زيرو أربعة"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_117.wav", "duration": 7.3, "text": "اثنين ثمانية زيرو تسعة واحد زيرو زيرو سبعة واحد زيرو زيرو زيرو اثنين"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_118.wav", "duration": 7.65, "text": "ثلاثة واحد زيرو خمسة واحد اثنين زيرو تسعة واحد زيرو تسعة زيرو ثلاثة"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_119.wav", "duration": 7.45, "text": "اثنين تسعة زيرو اثنين واحد واحد زيرو ثمانية واحد زيرو زيرو زيرو خمسة"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_120.wav", "duration": 8.25, "text": "ثلاثة اثنين زيرو واحد واحد اثنين زيرو أربعة واحد زيرو زيرو زيرو تسعة"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_121.wav", "duration": 6.25, "text": "اثنين واحد زيرو ثلاثة واحد زيرو زيرو سبعة واحد زيرو زيرو زيرو أربعة"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_122.wav", "duration": 6.65, "text": "ثلاثة اثنين زيرو ثمانية واحد واحد زيرو خمسة واحد زيرو تسعة زيرو أربعة"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_123.wav", "duration": 6.7, "text": "زيرو واحد زيرو واحد اثنين ثلاثة أربعة خمسة ستة سبعة ثمانية"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_124.wav", "duration": 5.65, "text": "زيرو واحد واحد تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_125.wav", "duration": 5.15, "text": "زيرو واحد اثنين زيرو تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_126.wav", "duration": 5.15, "text": "زيرو واحد خمسة سبعة ثمانية تسعة أربعة ثلاثة اثنين واحد زيرو"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_127.wav", "duration": 5.6, "text": "زيرو واحد زيرو تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين واحد"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_128.wav", "duration": 7.25, "text": "زيرو واحد واحد زيرو أربعة خمسة ستة سبعة ثمانية تسعة زيرو اثنين"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_129.wav", "duration": 8.25, "text": "زيرو واحد اثنين واحد تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_130.wav", "duration": 8.9, "text": "زيرو واحد خمسة زيرو تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_131.wav", "duration": 7.3, "text": "زيرو واحد واحد خمسة ستة سبعة ثمانية تسعة أربعة ثلاثة اثنين زيرو"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_132.wav", "duration": 7.15, "text": "زيرو واحد اثنين أربعة خمسة ستة سبعة ثمانية تسعة زيرو واحد اثنين"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_133.wav", "duration": 2.1, "text": "في حادث عربية عند كوبري عباس"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_134.wav", "duration": 2.05, "text": "فيه حريق في عمارة في شارع فيصل"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_135.wav", "duration": 1.5, "text": "لقيت طفل تاه في المول"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_136.wav", "duration": 2.0, "text": "في خناقة كبيرة في ميدان الجيزة"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_137.wav", "duration": 2.75, "text": "عربية مقلوبة على الطريق الدائري"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_138.wav", "duration": 1.6, "text": "في صوت ضرب نار في الهرم"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_139.wav", "duration": 1.7, "text": "جارتي وقعت من البلكونة"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_140.wav", "duration": 2.95, "text": "حصلت سرقة في الشارع عند السوبرماركت"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_141.wav", "duration": 2.25, "text": "في واحد بيعتدي على بنت في الشارع"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_142.wav", "duration": 2.4, "text": "حصل انفجار صغير في محل الغاز"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_143.wav", "duration": 2.2, "text": "شفت عربية بتخبط موتوسيكل وهربت"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_144.wav", "duration": 2.35, "text": "طفل محبوس في الأسانسير"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_145.wav", "duration": 2.4, "text": "في شاب مصاب قدام محطة المترو"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_146.wav", "duration": 1.95, "text": "العربية عطلت في نص الطريق"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_147.wav", "duration": 1.9, "text": "في تسريب غاز في العمارة"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_148.wav", "duration": 2.9, "text": "واحد كبير في السن مغمى عليه في المسجد"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_149.wav", "duration": 3.0, "text": "حصلت مشاجرة بالسكاكين في السوق"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_150.wav", "duration": 2.55, "text": "عربية إسعاف اتأخرت على المكان"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_151.wav", "duration": 2.95, "text": "فيه كلب شرس بيهاجم الناس في الشارع"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_152.wav", "duration": 3.7, "text": "في بنت اتخطفِت من عند المدرسة"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_153.wav", "duration": 2.85, "text": "في حادث تصادم في محور 26 يوليو"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_154.wav", "duration": 4.15, "text": "واحد وقع من فوق سلم البيت"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_155.wav", "duration": 1.65, "text": "النور قاطع في الشارع كله"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_156.wav", "duration": 1.85, "text": "صوت انفجار جامد في المنطقة"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_157.wav", "duration": 3.05, "text": "العربية دخلت في محل في الهرم"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_158.wav", "duration": 1.75, "text": "طفلة ضايعة في المول"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_159.wav", "duration": 2.9, "text": "في تسريب مياه من الدور الرابع"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_160.wav", "duration": 2.15, "text": "خناقة بين الجيران فوق السطح"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_161.wav", "duration": 3.35, "text": "فيه عربية مركونة غلط قافلة الشارع"}
+{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_162.wav", "duration": 3.45, "text": "الغاز بيخرج من البوتاجاز ومفيش حد في الشقة"}

evaluation_results.csv ADDED Viewed

	@@ -0,0 +1,164 @@

+audio,expected,predicted,WER,CER
+openai_tts_sample_000.wav,علاء سيد عبد الله,علاء سيد عبد الله,0.0,0.0
+openai_tts_sample_001.wav,محمد أحمد عبد الرحمن,محمد أحمد عبد الرحمن,0.0,0.0
+openai_tts_sample_002.wav,كريم محمود عبد الغفار,كريم محمود عبد الغفار,0.0,0.0
+openai_tts_sample_003.wav,يوسف علي عبد الحليم,يوسف علي عبد الحليم,0.0,0.0
+openai_tts_sample_004.wav,مصطفى طارق حسن,مصطفى طارق حسن,0.0,0.0
+openai_tts_sample_005.wav,إبراهيم محمد عبد العزيز,إبراهيم محمد عبد العزيز,0.0,0.0
+openai_tts_sample_006.wav,خالد عمر عبد السميع,خالد عمر عبد السميع,0.0,0.0
+openai_tts_sample_007.wav,أحمد سامي حسين,أحمد سامي حسين,0.0,0.0
+openai_tts_sample_008.wav,محمود ناصر عبد اللطيف,محمود ناصر عبد اللطيف,0.0,0.0
+openai_tts_sample_009.wav,عمر عبد الله محمد,عمر عبد الله محمد,0.0,0.0
+openai_tts_sample_010.wav,مينا فادي نصيف,مينا فادي نصيف,0.0,0.0
+openai_tts_sample_011.wav,بيتر عادل صليب,بيتر عادل صليب,0.0,0.0
+openai_tts_sample_012.wav,جرجس سامح حكيم,جرجس سامح حكيم,0.0,0.0
+openai_tts_sample_013.wav,رامي فوزي بشارة,رامي فوزي بشارة,0.0,0.0
+openai_tts_sample_014.wav,فادي منير عوض,فادي منير عوض,0.0,0.0
+openai_tts_sample_015.wav,مريم يوسف فؤاد,مريم يوسف فؤاد,0.0,0.0
+openai_tts_sample_016.wav,نانسي شريف عياد,نانسي شريف عياد,0.0,0.0
+openai_tts_sample_017.wav,كيرلس ممدوح سمعان,كيرلس ممدوح سمعان,0.0,0.0
+openai_tts_sample_018.wav,هالة فؤاد حبيب,هانا فؤاد حبيب,0.3333333333333333,0.16666666666666666
+openai_tts_sample_019.wav,مارجريت جرجس فخري,ماربريت جرجس فخري,0.3333333333333333,0.06666666666666667
+openai_tts_sample_020.wav,ريم أحمد عبد الباري,ريم أحمد عبد الباري,0.0,0.0
+openai_tts_sample_021.wav,شروق محمد عبد الرحيم,شروق محمد عبد الرحيم,0.0,0.0
+openai_tts_sample_022.wav,إيمان حسن مصطفى,إيمان حسن مصطفى,0.0,0.0
+openai_tts_sample_023.wav,فاطمة الزهراء عبد الله,فاطمة زحراء عبد الله,0.25,0.15789473684210525
+openai_tts_sample_024.wav,سارة خالد عبد الباقي,سارة خالد عبد الباقي,0.0,0.0
+openai_tts_sample_025.wav,ندى إبراهيم حسن,ندى إبراهيم حسن,0.0,0.0
+openai_tts_sample_026.wav,دينا محمود فوزي,دينا محمود فوزي,0.0,0.0
+openai_tts_sample_027.wav,لبنى عبد الرحمن السيد,لبنى عبد الرحمن السيد,0.0,0.0
+openai_tts_sample_028.wav,آية طارق عبد الجليل,آية طارق عبد الجليل,0.0,0.0
+openai_tts_sample_029.wav,أسماء علي إبراهيم,أسماء علي إبراهيم,0.0,0.0
+openai_tts_sample_030.wav,أحمد عصام عبد الرحمن,أحمد عصام عبد الرحمن,0.0,0.0
+openai_tts_sample_031.wav,نور هشام عبد الله,نور هشام عبد الله,0.0,0.0
+openai_tts_sample_032.wav,نجلاء سامي فؤاد,نجلاء سامي فؤاد,0.0,0.0
+openai_tts_sample_033.wav,رنا علاء الدين أحمد,رنا علاء الدين أحمد,0.0,0.0
+openai_tts_sample_034.wav,عادل فخري سمعان,عادل فخري سمعان,0.0,0.0
+openai_tts_sample_035.wav,بولا هاني رزق,بولا هاني رزق,0.0,0.0
+openai_tts_sample_036.wav,مينا يوسف بشاي,مينا يوسف بيشاي,0.3333333333333333,0.08333333333333333
+openai_tts_sample_037.wav,أبانوب فادي كامل,أبانوب فادي كامل,0.0,0.0
+openai_tts_sample_038.wav,مارينا جرجس جاد,مارينا جرجس كاد,0.3333333333333333,0.07692307692307693
+openai_tts_sample_039.wav,كريستين فؤاد صموئيل,كريستين فؤاد صموئيل,0.0,0.0
+openai_tts_sample_040.wav,سليم أحمد عبد المقصود,سليم أحمد عبد المقصود,0.0,0.0
+openai_tts_sample_041.wav,عمار محمد عبد الرحيم,أنار محمد عبد الرحيم,0.25,0.11764705882352941
+openai_tts_sample_042.wav,أنس عبد الله محمود,أنس عبد الله محمود,0.0,0.0
+openai_tts_sample_043.wav,زياد عمرو ناصر,زياد عمرو ناصر,0.0,0.0
+openai_tts_sample_044.wav,أمير يوسف عبد الغفار,أمير يوسف عبد الغفار,0.0,0.0
+openai_tts_sample_045.wav,خالد مصطفى عبد الحميد,خالد مصطفى عبد الحميد,0.0,0.0
+openai_tts_sample_046.wav,جرجس عادل لبيب,جرجس عادل لبيب,0.0,0.0
+openai_tts_sample_047.wav,بولا فخري بطرس,ولا فخري بطرس,0.3333333333333333,0.08333333333333333
+openai_tts_sample_048.wav,مارينا فادي صادق,مارينا فادي صادق,0.0,0.0
+openai_tts_sample_049.wav,جوليان جورج عزيز,جوليان جورج عزيز,0.0,0.0
+openai_tts_sample_050.wav,نادر سامي رزق,نادر سامي رزق,0.0,0.0
+openai_tts_sample_051.wav,عبد الرحمن أحمد عبد الله,عبد الرحمن أحمد عبد الله,0.0,0.0
+openai_tts_sample_052.wav,محمد طه السيد,محمد طه السيد,0.0,0.0
+openai_tts_sample_053.wav,أحمد ياسر مصطفى,أحمد ياسر مصطفى,0.0,0.0
+openai_tts_sample_054.wav,سيد عبد الفتاح عبد الغني,سيد عبد الفتاح عبد الغني,0.0,0.0
+openai_tts_sample_055.wav,محمد رمضان عبد الحكيم,محمد رمضان عبد الحكيم,0.0,0.0
+openai_tts_sample_056.wav,عبد الله حمدي عبد الفتاح,عبد الله حمدي عبد الفتاح,0.0,0.0
+openai_tts_sample_057.wav,أيمن جمال عبد الناصر,أيمن جمال عبد الناصر,0.0,0.0
+openai_tts_sample_058.wav,أحمد عبد الرازق حسن,أحمد عبد الرازق حسن,0.0,0.0
+openai_tts_sample_059.wav,محمود خالد محمد,محمود خالد محمد,0.0,0.0
+openai_tts_sample_060.wav,مروان عماد عبد الله,مروان عماد عبد الله,0.0,0.0
+openai_tts_sample_061.wav,عبد الرحمن محمد شريف,عبد الرحمن محمد شريف,0.0,0.0
+openai_tts_sample_062.wav,أحمد محروس عبد اللطيف,أحمد محروس عبد اللطيف,0.0,0.0
+openai_tts_sample_063.wav,مصطفى عبد القادر عبد السميع,مصطفى عبد القادر عبد السميع,0.0,0.0
+openai_tts_sample_064.wav,عبد العزيز حسن عبد الله,عبد العزيز حسن عبد الله,0.0,0.0
+openai_tts_sample_065.wav,مينا شنودة فخري,مينا شنودة فخري,0.0,0.0
+openai_tts_sample_066.wav,بولا يوسف بطرس,بولا يوسف بطرس,0.0,0.0
+openai_tts_sample_067.wav,فادي عادل رسمي,فادي عادل لسمي,0.3333333333333333,0.08333333333333333
+openai_tts_sample_068.wav,جرجس فوزي منصور,جرجس فوزي منصور,0.0,0.0
+openai_tts_sample_069.wav,كيرلس رأفت نجيب,كيرلس رأفت نجيب,0.0,0.0
+openai_tts_sample_070.wav,مارينا جورج عادل,مارينا جورج عادل,0.0,0.0
+openai_tts_sample_071.wav,ديفيد ماهر منير,ديفيد ماهر منير,0.0,0.0
+openai_tts_sample_072.wav,كارولين فادي شكر,كارولين فادي شكر,0.0,0.0
+openai_tts_sample_073.wav,مريم سامي فؤاد,مريم سامي فؤاد,0.0,0.0
+openai_tts_sample_074.wav,أندرو فؤاد رزق,أندرو فؤاد رزق,0.0,0.0
+openai_tts_sample_075.wav,ريهام عبد الله محمد,ريهام عبد الله محمد,0.0,0.0
+openai_tts_sample_076.wav,سارة عماد حسن,سارة عماد حسن,0.0,0.0
+openai_tts_sample_077.wav,ميادة عبد الحميد ناصر,مادة عبد الحميد ناصر,0.25,0.05555555555555555
+openai_tts_sample_078.wav,آية أحمد عبد الله,آية أحمد عبد الله,0.0,0.0
+openai_tts_sample_079.wav,نورهان عبد الفتاح علي,نرهان عبد الفتاح علي,0.25,0.05555555555555555
+openai_tts_sample_080.wav,هدير خالد حسن,هدير خالد حسن,0.0,0.0
+openai_tts_sample_081.wav,بسمة إبراهيم عبد الغني,بسمة إبراهيم عبد الغني,0.0,0.0
+openai_tts_sample_082.wav,أسماء طارق عبد الرحمن,أسماء طارق عبد الرحمن,0.0,0.0
+openai_tts_sample_083.wav,يمنى محمد عبد الحليم,يمنى محمد عبد الحليم,0.0,0.0
+openai_tts_sample_084.wav,صفاء عبد الرحمن السيد,صفاء عبد الرحمن السيد,0.0,0.0
+openai_tts_sample_085.wav,منال أحمد حسن,منال أحمد حسن,0.0,0.0
+openai_tts_sample_086.wav,رحمة عبد الله محمود,رحمة عبد الله محمود,0.0,0.0
+openai_tts_sample_087.wav,ياسمين خالد عبد الرحمن,ياسمين خالد عبد الرحمن,0.0,0.0
+openai_tts_sample_088.wav,شيماء أحمد عبد الغفار,شيماء أحمد عبد الغفار,0.0,0.0
+openai_tts_sample_089.wav,علا سامي عبد المقصود,علا سامي عبد المقصود,0.0,0.0
+openai_tts_sample_090.wav,رغدة علي عبد الباري,رغدة علي عبد الباري,0.0,0.0
+openai_tts_sample_091.wav,هايدي جرجس بطرس,هايدي جرجس بطرس,0.0,0.0
+openai_tts_sample_092.wav,نيرمين مينا فؤاد,نرمين مينا فؤاد,0.3333333333333333,0.07142857142857142
+openai_tts_sample_093.wav,جيسيكا بولا منصور,كيسيك بولا منصور,0.3333333333333333,0.13333333333333333
+openai_tts_sample_094.wav,ماريان يوسف رسمي,ماريان يوسف رسمي,0.0,0.0
+openai_tts_sample_095.wav,كارين فادي شنودة,كريم فادي شنودة,0.3333333333333333,0.14285714285714285
+openai_tts_sample_096.wav,أميرة أحمد عبد الله,أميرة أحمد عبد الله,0.0,0.0
+openai_tts_sample_097.wav,نورا إبراهيم حسن,نورا إبراهيم حسن,0.0,0.0
+openai_tts_sample_098.wav,هبة طارق عبد الرحمن,هبة طارق عبد الرحمن,0.0,0.0
+openai_tts_sample_099.wav,دعاء عبد الله السيد,دعاء عبد الله السيد,0.0,0.0
+openai_tts_sample_100.wav,عبير خالد عبد العزيز,أمير خالد عبد العزيز,0.25,0.11764705882352941
+openai_tts_sample_101.wav,خلود ناصر عبد الغفار,ولود ناصر عبد الغفار,0.25,0.058823529411764705
+openai_tts_sample_102.wav,جيهان عبد الرحمن محمود,جيهان عبد الرحمن محمود,0.0,0.0
+openai_tts_sample_103.wav,اثنين ثلاثة زيرو واحد واحد اثنين زيرو سبعة واحد زيرو زيرو واحد واحد اثنين,اثنين ثلاثة زيرو واحد واحد اثنين زيرو سبعة واحد زيرو زيرو واحد واحد اثنين,0.0,0.0
+openai_tts_sample_104.wav,اثنين خمسة زيرو تسعة زيرو خمسة اثنين واحد واحد تسعة زيرو زيرو ثلاثة,اتنين خمسة زيرو تسعة زيرو خمسة اتنين واحد واحد تسعة زيرو زيرو تلاتة,0.23076923076923078,0.07272727272727272
+openai_tts_sample_105.wav,ثلاثة زيرو واحد واحد اثنين ثلاثة زيرو سبعة واحد سبعة تسعة زيرو خمسة,ثلاثة زيرو واحد واحد اثنين ثلاثة زيرو سبعة واحد سبعة تسعة زيرو خمسة,0.0,0.0
+openai_tts_sample_106.wav,اثنين تسعة زيرو سبعة واحد واحد زيرو اثنين واحد زيرو خمسة زيرو اثنين,اتنين تسعة زيرو سبعة واحد واحد زيرو اثنين واحد واحد زيرو خمسة زيرو اثنين,0.14285714285714285,0.09090909090909091
+openai_tts_sample_107.wav,ثلاثة واحد زيرو اثنين واحد اثنين زيرو تسعة واحد زيرو سبعة زيرو واحد,ثلاثة واحد زيرو اثنين واحد اثنين زيرو تسعة واحد زيرو سبعة زيرو واحد,0.0,0.0
+openai_tts_sample_108.wav,ثلاثة اثنين زيرو ثلاثة واحد واحد زيرو سبعة واحد تسعة زيرو زيرو أربعة,ثلاثة اتنين زيرو ثلاثة واحد زيرو سبعة واحد تسعة زيرو زيرو أربعة,0.16666666666666666,0.08928571428571429
+openai_tts_sample_109.wav,اثنين ثمانية زيرو تسعة واحد واحد زيرو خمسة واحد زيرو زيرو زيرو ستة,اتنين تمانية زيرو تسعة واحد زيرو خمسة واحد زيرو زيرو زيرو ستة,0.25,0.1111111111111111
+openai_tts_sample_110.wav,ثلاثة زيرو واحد واحد زيرو تسعة زيرو تسعة واحد ثمانية زيرو زيرو سبعة,ثلاثة زيرو واحد واحد زيرو تسعة زيرو تسعة واحد ثمانية زيرو سبعة,0.08333333333333333,0.07272727272727272
+openai_tts_sample_111.wav,اثنين خمسة زيرو اثنين زيرو اثنين زيرو سبعة واحد زيرو زيرو زيرو خمسة,اتنين خمسة زيرو اتنين زيرو اتنين زيرو سبعة واحد زيرو زيرو زيرو خمسة,0.23076923076923078,0.05454545454545454
+openai_tts_sample_112.wav,ثلاثة واحد زيرو سبعة واحد زيرو زيرو خمسة واحد زيرو تسعة زيرو واحد,ثلاثة واحد زيرو سبعة واحد زيرو زيرو خمسة واحد زيرو تسعة زيرو واحد,0.0,0.0
+openai_tts_sample_113.wav,اثنين ستة زيرو واحد زيرو ثلاثة زيرو سبعة واحد زيرو زيرو زيرو تسعة,اتنين ستة زيرو واحد زيرو ثلاثة زيرو سبعة واحد زيرو زيرو زيرو تسعة,0.07692307692307693,0.018867924528301886
+openai_tts_sample_114.wav,اثنين تسعة زيرو ثمانية واحد اثنين زيرو أربعة واحد زيرو زيرو زيرو اثنين,اتنين تسعة زيرو تمانية واحد اثنين زيرو اربعة واحد زيرو زيرو اتنين,0.4166666666666667,0.13793103448275862
+openai_tts_sample_115.wav,ثلاثة اثنين زيرو سبعة واحد واحد زيرو تسعة واحد زيرو زيرو زيرو خمسة,ثلاثة اثنين زيرو سبعة واحد زيرو زيرو زيرو خمسة,0.4444444444444444,0.2962962962962963
+openai_tts_sample_116.wav,ثلاثة واحد زيرو واحد واحد اثنين زيرو خمسة واحد زيرو زيرو زيرو أربعة,ثلاثة زيرو واحد واحد اثنين زيرو خمسة واحد زيرو زيرو زيرو أربعة,0.08333333333333333,0.07272727272727272
+openai_tts_sample_117.wav,اثنين ثمانية زيرو تسعة واحد زيرو زيرو سبعة واحد زيرو زيرو زيرو اثنين,اتنين تمانية زيرو تسعة واحد زيرو زيرو سبعة واحد زيرو زيرو زيرو اتنين,0.23076923076923078,0.05357142857142857
+openai_tts_sample_118.wav,ثلاثة واحد زيرو خمسة واحد اثنين زيرو تسعة واحد زيرو تسعة زيرو ثلاثة,تلاتة واحد زيرو خمسة واحد اتنين زيرو تسعة واحد زيرو تسعة,0.36363636363636365,0.21818181818181817
+openai_tts_sample_119.wav,اثنين تسعة زيرو اثنين واحد واحد زيرو ثمانية واحد زيرو زيرو زيرو خمسة,اتنين تسعة زيرو اثنين واحد واحد زيرو اثنين زيرو زيرو خمسة,0.36363636363636365,0.23214285714285715
+openai_tts_sample_120.wav,ثلاثة اثنين زيرو واحد واحد اثنين زيرو أربعة واحد زيرو زيرو زيرو تسعة,ثلاثة اثنين زيرو واحد واحد اثنين زيرو أربعة واحد زيرو زيرو زيرو تسعة,0.0,0.0
+openai_tts_sample_121.wav,اثنين واحد زيرو ثلاثة واحد زيرو زيرو سبعة واحد زيرو زيرو زيرو أربعة,اتنين واحد زيرو ثلاثة واحد زيرو زيرو سبعة واحد زيرو زيرو زيرو أربعة,0.07692307692307693,0.01818181818181818
+openai_tts_sample_122.wav,ثلاثة اثنين زيرو ثمانية واحد واحد زيرو خمسة واحد زيرو تسعة زيرو أربعة,ثلاثة اتنين زيرو تمانية واحد زيرو خمسة واحد زيرو تسعة زيرو أربعة,0.25,0.10526315789473684
+openai_tts_sample_123.wav,زيرو واحد زيرو واحد اثنين ثلاثة أربعة خمسة ستة سبعة ثمانية,زيرو واحد زيرو واحد اثنين تلاتة أربعة خمسة ستة سبعة تمانية,0.18181818181818182,0.0625
+openai_tts_sample_124.wav,زيرو واحد واحد تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين,زيرو واحد واحد تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين,0.0,0.0
+openai_tts_sample_125.wav,زيرو واحد اثنين زيرو تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة,زيرو واحد اتنين زيرو تسعة تمانية سبعة ستة خمسة أربعة تلاتة,0.2727272727272727,0.08333333333333333
+openai_tts_sample_126.wav,زيرو واحد خمسة سبعة ثمانية تسعة أربعة ثلاثة اثنين واحد زيرو,زيرو واحد خمسة سبعة اتنين تسعة أربعة تلاتة اتنين واحد زيرو,0.2727272727272727,0.14285714285714285
+openai_tts_sample_127.wav,زيرو واحد زيرو تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين واحد,زيرو واحد زيرو تسعة تمانية سبعة ستة خمسة أربعة تلاتة اتنين واحد,0.25,0.07692307692307693
+openai_tts_sample_128.wav,زيرو واحد واحد زيرو أربعة خمسة ستة سبعة ثمانية تسعة زيرو اثنين,زيرو واحد واحد زيرو أربعة خمسة ستة سبعة ثمانية تسعة زيرو اثنين,0.0,0.0
+openai_tts_sample_129.wav,زيرو واحد اثنين واحد تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين,زيرو واحد اثنين واحد تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين,0.0,0.0
+openai_tts_sample_130.wav,زيرو واحد خمسة زيرو تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين,زيرو واحد خمسة زيرو تسعة تمانية سبعة ستة خمسة أربعة تلاتة اتنين,0.25,0.07692307692307693
+openai_tts_sample_131.wav,زيرو واحد واحد خمسة ستة سبعة ثمانية تسعة أربعة ثلاثة اثنين زيرو,زيرو واحد واحد خمسة ستة سبعة ثمانية تسعة أربعة ثلاثة اثنين زيرو,0.0,0.0
+openai_tts_sample_132.wav,زيرو واحد اثنين أربعة خمسة ستة سبعة ثمانية تسعة زيرو واحد اثنين,زيرو واحد اتنين أربعة خمسة ستة سبعة ثمانية تسعة زيرو واحد اتنين,0.16666666666666666,0.038461538461538464
+openai_tts_sample_133.wav,في حادث عربية عند كوبري عباس,في حادث عربية عند كوبري عباس,0.0,0.0
+openai_tts_sample_134.wav,فيه حريق في عمارة في شارع فيصل,فيه حريق في عمارة في شارع فيصل.,0.14285714285714285,0.041666666666666664
+openai_tts_sample_135.wav,لقيت طفل تاه في المول,ماجيت طفل تايه في المول,0.4,0.23529411764705882
+openai_tts_sample_136.wav,في خناقة كبيرة في ميدان الجيزة,في خناقة كبيرة في ميدان الجيزة.,0.16666666666666666,0.04
+openai_tts_sample_137.wav,عربية مقلوبة على الطريق الدائري,عربية مقلوبة على الطريق الدائري.,0.2,0.037037037037037035
+openai_tts_sample_138.wav,في صوت ضرب نار في الهرم,في سود ضرب نار في الهرم.,0.3333333333333333,0.16666666666666666
+openai_tts_sample_139.wav,جارتي وقعت من البلكونة,جارتي وقعت من البلكونة,0.0,0.0
+openai_tts_sample_140.wav,حصلت سرقة في الشارع عند السوبرماركت,حصلت سرقة في الشارع عند السوبرماركت,0.0,0.0
+openai_tts_sample_141.wav,في واحد بيعتدي على بنت في الشارع,في واحد بيعتدي على بنت في الشارع,0.0,0.0
+openai_tts_sample_142.wav,حصل انفجار صغير في محل الغاز,حصل انفجار صغير في محل الغاز,0.0,0.0
+openai_tts_sample_143.wav,شفت عربية بتخبط موتوسيكل وهربت,شفت عربية بتخبط موتوسيكل وهربت.,0.2,0.038461538461538464
+openai_tts_sample_144.wav,طفل محبوس في الأسانسير,طفل محبوس في الأسانسير,0.0,0.0
+openai_tts_sample_145.wav,في شاب مصاب قدام محطة المترو,في شاب مصاب قدام محطة المترو.,0.16666666666666666,0.043478260869565216
+openai_tts_sample_146.wav,العربية عطلت في نص الطريق,العربية عطلت في نص الطريق.,0.2,0.047619047619047616
+openai_tts_sample_147.wav,في تسريب غاز في العمارة,في تسريب غاز في العمارة.,0.2,0.05263157894736842
+openai_tts_sample_148.wav,واحد كبير في السن مغمى عليه في المسجد,واحد كبير في السن مغمى عليه في المسجد,0.0,0.0
+openai_tts_sample_149.wav,حصلت مشاجرة بالسكاكين في السوق,حصلت مشاجرة بالسكاكين في السوق,0.0,0.0
+openai_tts_sample_150.wav,عربية إسعاف اتأخرت على المكان,عربية اسعاف اتأخرت على المكان.,0.4,0.08
+openai_tts_sample_151.wav,فيه كلب شرس بيهاجم الناس في الشارع,فيه كلب شرس بيهاجم الناس في الشارع,0.0,0.0
+openai_tts_sample_152.wav,في بنت اتخطفِت من عند المدرسة,في بنت اتخطفت من عند المدرسة,0.16666666666666666,0.041666666666666664
+openai_tts_sample_153.wav,في حادث تصادم في محور 26 يوليو,في حادث تصادم في محور  ⁇  يوليو.,0.2857142857142857,0.125
+openai_tts_sample_154.wav,واحد وقع من فوق سلم البيت,واحد وقع من فوق سلم البيت,0.0,0.0
+openai_tts_sample_155.wav,النور قاطع في الشارع كله,النور قاطع في الشارع كله,0.0,0.0
+openai_tts_sample_156.wav,صوت انفجار جامد في المنطقة,صوت انفجار جامد في المنطقة,0.0,0.0
+openai_tts_sample_157.wav,العربية دخلت في محل في الهرم,العربية دخلت في محل في الهرم.,0.16666666666666666,0.043478260869565216
+openai_tts_sample_158.wav,طفلة ضايعة في المول,طفلة ضايعة في المول,0.0,0.0
+openai_tts_sample_159.wav,في تسريب مياه من الدور الرابع,في تسريب مياه من الدور الرابع.,0.16666666666666666,0.041666666666666664
+openai_tts_sample_160.wav,خناقة بين الجيران فوق السطح,خناقة بين الجيران فوق السطح,0.0,0.0
+openai_tts_sample_161.wav,فيه عربية مركونة غلط قافلة الشارع,فيه عربية مركونة غلط قافل الشارع,0.16666666666666666,0.03571428571428571
+openai_tts_sample_162.wav,الغاز بيخرج من البوتاجاز ومفيش حد في الشقة,الغاز بيخرج من البوتاجاز ومفيش حد في الشقة,0.0,0.0

finetune_asr.py ADDED Viewed

	@@ -0,0 +1,711 @@

+import os
+import io
+import json
+# ============================================
+# CRITICAL: Windows CUDA/Numba Fix
+# ============================================
+# These variables are exported BEFORE torch / pytorch_lightning / NeMo are
+# imported. Numba reads its NUMBA_* settings when it is first imported (which
+# happens as a side effect of importing NeMo), so assigning them after the
+# imports would have no effect. Hiding the GPUs before torch is loaded also
+# guarantees that CUDA is never initialised in this process.
+os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+os.environ["NUMBA_CUDA_USE_NVIDIA_BINDING"] = "1"
+os.environ["NUMBA_DISABLE_JIT"] = "0"
+os.environ["NUMBA_CUDA_DRIVER"] = "cuda"
+# Force CPU for RNNT loss on Windows (prevents access violation)
+os.environ["CUDA_VISIBLE_DEVICES"] = ""
+# Heavy imports deliberately follow the environment setup above.
+import torch  # noqa: E402
+from pytorch_lightning import Trainer  # noqa: E402
+from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor  # noqa: E402
+from nemo.collections.asr.models import EncDecHybridRNNTCTCBPEModel  # noqa: E402
+from omegaconf import OmegaConf, open_dict  # noqa: E402
+# ============================================
+# UTF-8 Encoding Fix
+# ============================================
+# Make sure the training manifest is valid UTF-8 before anything else reads it.
+manifest_path = "train_manifest.jsonl"
+with io.open(manifest_path, 'rb') as f:
+    raw_manifest = f.read()
+# Decoding with errors='ignore' drops every undecodable byte, so the result
+# round-trips to the same bytes only when the file was already valid UTF-8.
+content = raw_manifest.decode('utf-8', errors='ignore')
+if content.encode('utf-8') != raw_manifest:
+    # The clean-up is lossy: keep an untouched copy of the original bytes
+    # (never overwriting an existing backup) before rewriting the manifest.
+    backup_path = manifest_path + ".bak"
+    if not os.path.exists(backup_path):
+        with io.open(backup_path, 'wb') as f:
+            f.write(raw_manifest)
+    print(f"⚠️ {manifest_path} contained invalid UTF-8 bytes; original kept at {backup_path}")
+    # newline='' writes the decoded text back without translating line endings
+    with io.open(manifest_path, 'w', encoding='utf-8', newline='') as f:
+        f.write(content)
+print("✅ train_manifest.jsonl converted to UTF-8")
+# Patch builtins.open so that text-mode opens of *.jsonl files default to UTF-8
+import builtins
+_old_open = open
+def open_utf8(file, *args, **kwargs):
+    """Drop-in replacement for ``open`` that defaults ``.jsonl`` files to UTF-8.
+
+    The encoding is injected only when all of the following hold:
+      * ``file`` is a ``str`` or ``os.PathLike`` whose path ends with ``.jsonl``;
+      * the file is opened in text mode (binary mode rejects ``encoding``);
+      * the caller did not already pass an encoding, by keyword or position.
+    Every other call is forwarded to the original ``open`` unchanged.
+
+    Args:
+        file: Path or file descriptor, as accepted by the built-in ``open``.
+        *args: Positional arguments of ``open`` (mode, buffering, encoding, ...).
+        **kwargs: Keyword arguments of ``open``.
+
+    Returns:
+        The file object returned by the original ``open``.
+    """
+    # open(file, mode, buffering, encoding, ...): mode is args[0], encoding is args[2]
+    mode = args[0] if args else kwargs.get('mode', 'r')
+    encoding_given = 'encoding' in kwargs or len(args) >= 3
+    text_mode = isinstance(mode, str) and 'b' not in mode
+    path = os.fspath(file) if isinstance(file, (str, os.PathLike)) else None
+    if isinstance(path, str) and path.endswith('.jsonl') and text_mode and not encoding_given:
+        kwargs['encoding'] = 'utf-8'
+    return _old_open(file, *args, **kwargs)
+builtins.open = open_utf8
+# ============================================
+# Validate Manifest
+# ============================================
+def validate_manifest(manifest_path):
+    """Count the usable entries of a JSON-lines training manifest.
+
+    An entry is usable when its line parses as JSON, its ``audio_filepath``
+    exists on disk and its ``text`` is a non-blank string. Every rejected line
+    is reported on stdout together with the reason; blank lines are skipped
+    silently because they are not entries at all.
+
+    Args:
+        manifest_path: Path of the ``.jsonl`` manifest to check.
+
+    Returns:
+        The number of valid entries found.
+    """
+    count = 0
+    with open(manifest_path, "r", encoding="utf-8") as f:
+        for i, line in enumerate(f, 1):
+            stripped = line.strip()
+            if not stripped:
+                continue
+            try:
+                item = json.loads(stripped)
+                # Explicit raises rather than `assert`: assertions are removed
+                # under `python -O`, which would silently disable validation.
+                if not os.path.exists(item["audio_filepath"]):
+                    raise ValueError(f"Missing: {item['audio_filepath']}")
+                if "text" not in item or not item["text"].strip():
+                    raise ValueError("Empty text")
+                count += 1
+            except Exception as e:
+                # Best-effort scan: report the bad line and keep going
+                print(f"❌ Line {i} error: {e}")
+                print(f"   Content: {line[:100]}")
+    print(f"✅ Valid entries: {count}")
+    return count
+# Validate the manifest up front and abort when it yields no usable entries
+valid_count = validate_manifest(manifest_path)
+if valid_count == 0:
+    raise ValueError("No valid training samples found!")
+# ============================================
+# Configuration (OPTIMIZED FOR ACCURACY)
+# ============================================
+# Pretrained .nemo checkpoint that is fine-tuned by this script
+BASE_MODEL_PATH = "stt_ar_fastconformer_hybrid_large_pcd_v1.0.nemo"
+# Alternative manifest, kept for reference:
+# TRAIN_MANIFEST = "train_manifest_hf_converted.jsonl"
+TRAIN_MANIFEST = "train_manifest.jsonl"
+# Output directory for checkpoints and the training summary (created below)
+SAVE_DIR = "output_finetuned"
+# OPTIMIZED HYPERPARAMETERS FOR LOWER WER
+BATCH_SIZE = 8  # Samples per step (presumably 4 in an earlier run)
+MAX_EPOCHS = 250  # Upper bound on the number of training epochs
+LEARNING_RATE = 5e-5  # Peak learning rate (presumably 3e-5 and 1e-5 were tried earlier)
+WARMUP_STEPS = 500  # Intended number of learning-rate warmup steps
+WEIGHT_DECAY = 0.00001  # Weight-decay coefficient handed to the optimizer
+ACCUMULATE_GRAD_BATCHES = 4  # Intended accumulation factor: effective batch = 8*4 = 32
+# NOTE(review): confirm that the scheduler and Trainer sections further down
+# actually consume WARMUP_STEPS and ACCUMULATE_GRAD_BATCHES rather than literals.
+os.makedirs(SAVE_DIR, exist_ok=True)
+# ============================================
+# Load Model and Fix Tokenizer Path
+# ============================================
+# Restore the pretrained model, then patch its config for fine-tuning.
+print("🔹 Loading pretrained model...")
+model = EncDecHybridRNNTCTCBPEModel.restore_from(BASE_MODEL_PATH)
+# CRITICAL FIX: point the tokenizer config at a "tokenizer" folder that sits
+# next to the model file (the folder is created if it does not exist yet)
+print("🔹 Fixing tokenizer configuration...")
+with open_dict(model.cfg):
+    model_dir = os.path.dirname(BASE_MODEL_PATH)
+    tokenizer_dir = os.path.join(model_dir, "tokenizer")
+    os.makedirs(tokenizer_dir, exist_ok=True)
+    if not hasattr(model, 'tokenizer'):
+        print("⚠️ No tokenizer object found in model — check model restoration path.")
+    else:
+        print(f"ℹ️ Using existing SentencePiece tokenizer at: {tokenizer_dir}")
+        model.cfg.tokenizer.dir = tokenizer_dir
+        model.cfg.tokenizer.type = "bpe"
+    # Clear the manifests of the validation and test dataset configs, if present
+    for ds_key in ('validation_ds', 'test_ds'):
+        if ds_key in model.cfg:
+            model.cfg[ds_key].manifest_filepath = None
+# ============================================
+# Setup Training Data (OPTIMIZED)
+# ============================================
+# Describe the training dataset and hand the description to the model.
+print("🔹 Setting up training data...")
+train_ds_config = dict(
+    manifest_filepath=TRAIN_MANIFEST,
+    batch_size=BATCH_SIZE,
+    shuffle=True,
+    num_workers=0,
+    pin_memory=False,
+    sample_rate=16000,
+    max_duration=20.0,  # Upper duration bound, to leave out very long utterances
+    min_duration=0.5,  # Lower duration bound, to leave out very short clips
+    trim_silence=True,
+    use_start_end_token=True,
+    # Transcript handling options (Arabic parser)
+    normalize_transcripts=True,
+    parser="ar",
+)
+model.setup_training_data(train_ds_config)
+# ============================================
+# Configure Optimizer (OPTIMIZED FOR CONVERGENCE)
+# ============================================
+# Write optimizer, scheduler and augmentation settings into the model config.
+print("🔹 Configuring optimizer...")
+with open_dict(model.cfg):
+    # AdamW with the fine-tuning learning rate defined in the configuration section
+    model.cfg.optim.name = "adamw"
+    model.cfg.optim.lr = LEARNING_RATE
+    model.cfg.optim.betas = [0.9, 0.98]
+    model.cfg.optim.weight_decay = WEIGHT_DECAY
+    model.cfg.optim.eps = 1e-8
+    # Cosine annealing with warmup. The warmup length comes from WARMUP_STEPS
+    # so the configured value is the one the scheduler really uses
+    # (it used to be a separate hard-coded literal).
+    model.cfg.optim.sched = {
+        "name": "CosineAnnealing",
+        "warmup_steps": WARMUP_STEPS,
+        "warmup_ratio": None,
+        "min_lr": 1e-7,
+        "last_epoch": -1,
+    }
+    # CRITICAL: Disable aggressive augmentation during fine-tuning
+    if 'spec_augment' in model.cfg:
+        # Zero masks switch SpecAugment masking off entirely; the widths are
+        # kept only so the config stays complete.
+        model.cfg.spec_augment.freq_masks = 0
+        model.cfg.spec_augment.time_masks = 0
+        model.cfg.spec_augment.freq_width = 15
+        model.cfg.spec_augment.time_width = 0.03
+        # The augmentation module was instantiated when the model was restored,
+        # so editing the config alone changes nothing: rebuild the module from
+        # the updated config to make the settings take effect.
+        model.spec_augmentation = model.from_config_dict(model.cfg.spec_augment)
+# ============================================
+# Configure Loss Weights for Hybrid Model (OPTIMIZED)
+# ============================================
+# Re-balance the CTC and RNNT losses of the hybrid model.
+print("🔹 Optimizing loss weights...")
+# Share of the CTC loss in the combined loss: 0.9 -> 90% CTC, 10% RNNT.
+# 0.5 would be balanced; adjust based on your data.
+CTC_LOSS_WEIGHT = 0.9
+if hasattr(model, 'loss_alpha'):
+    model.loss_alpha = CTC_LOSS_WEIGHT
+    print(f"   Loss alpha set to: {model.loss_alpha}")
+elif hasattr(model, 'ctc_loss_weight'):
+    # NeMo's hybrid RNNT-CTC models keep the weight in `ctc_loss_weight`
+    # (config key aux_ctc.ctc_loss_weight); without this branch the setting
+    # above would be silently skipped.
+    model.ctc_loss_weight = CTC_LOSS_WEIGHT
+    with open_dict(model.cfg):
+        if 'aux_ctc' in model.cfg:
+            # Mirror the value in the config so an exported model records it
+            model.cfg.aux_ctc.ctc_loss_weight = CTC_LOSS_WEIGHT
+    print(f"   CTC loss weight set to: {model.ctc_loss_weight}")
+else:
+    print("⚠️ Model exposes no CTC loss weight attribute; loss weighting left unchanged.")
+# ============================================
+# Callbacks for Best Model Selection
+# ============================================
+# Callbacks: checkpointing, early stopping and learning-rate logging.
+print("🔹 Setting up model checkpointing...")
+# Checkpoints are ranked by `train_loss` (lower is better): the best five are
+# kept in SAVE_DIR together with the most recent one.
+checkpoint_callback = ModelCheckpoint(
+    monitor='train_loss',
+    mode='min',
+    save_top_k=5,
+    save_last=True,
+    every_n_epochs=2,
+    dirpath=SAVE_DIR,
+    filename='best-model-{epoch:02d}-{train_loss:.4f}',
+)
+# Early stopping watches the same metric with a patience of 20
+early_stop_callback = EarlyStopping(
+    monitor='train_loss',
+    mode='min',
+    patience=20,
+    verbose=True,
+)
+# Log the learning rate at every step
+lr_monitor = LearningRateMonitor(logging_interval='step')
+# ============================================
+# Trainer Configuration (CPU Mode - OPTIMIZED)
+# ============================================
+# Build the Lightning trainer (single CPU device).
+print("🔹 Configuring trainer for CPU...")
+trainer = Trainer(
+    accelerator="cpu",
+    devices=1,
+    max_epochs=MAX_EPOCHS,
+    log_every_n_steps=1,
+    enable_checkpointing=True,
+    default_root_dir=SAVE_DIR,
+    callbacks=[checkpoint_callback, early_stop_callback, lr_monitor],
+    gradient_clip_val=1.0,  # Prevent gradient explosion
+    gradient_clip_algorithm="norm",
+    # Use the configured accumulation factor instead of a separate literal, so
+    # the effective batch size is BATCH_SIZE * ACCUMULATE_GRAD_BATCHES as the
+    # configuration section documents.
+    accumulate_grad_batches=ACCUMULATE_GRAD_BATCHES,
+    val_check_interval=1.0,  # Once per epoch (only relevant when validation data is set up)
+    enable_progress_bar=True,
+    enable_model_summary=True,
+)
+# ============================================
+# Start Training
+# ============================================
+# Start-up banner. Every figure is read from the live trainer / model objects
+# so the banner reports what will really run, not a separately kept literal.
+active_accumulation = trainer.accumulate_grad_batches
+sched_cfg = model.cfg.optim.get("sched") or {}
+active_warmup = sched_cfg.get("warmup_steps", WARMUP_STEPS)
+# The CTC weight lives in `loss_alpha` or `ctc_loss_weight`, depending on the model class
+active_ctc_weight = getattr(model, 'loss_alpha', getattr(model, 'ctc_loss_weight', 'N/A'))
+print("=" * 60)
+print("🚀 STARTING OPTIMIZED FINE-TUNING")
+print("=" * 60)
+print(f"   Model: {BASE_MODEL_PATH}")
+print(f"   Training samples: {valid_count}")
+print(f"   Max epochs: {MAX_EPOCHS}")
+print(f"   Batch size: {BATCH_SIZE} (effective: {BATCH_SIZE * active_accumulation})")
+print(f"   Learning rate: {LEARNING_RATE}")
+print(f"   Warmup steps: {active_warmup}")
+print(f"   Loss weighting: CTC={active_ctc_weight}")
+print(f"   Early stopping patience: {early_stop_callback.patience} epochs")
+print("=" * 60)
+print("⚠️  CPU training will be slow. For faster training, use Google Colab GPU.")
+print("=" * 60)
+# Run training, export the best checkpoint as .nemo and write a short summary.
+# Any failure is reported with a traceback and troubleshooting hints.
+try:
+    trainer.fit(model)
+    print("\n✅ Training completed successfully!")
+    # Checkpoint statistics. The score is None when no checkpoint was recorded
+    # (for example a run shorter than the checkpoint interval), so it is
+    # formatted defensively instead of applying ':.4f' to None.
+    best_model_path = checkpoint_callback.best_model_path
+    best_model_score = checkpoint_callback.best_model_score
+    best_loss_text = f"{best_model_score:.4f}" if best_model_score is not None else "N/A"
+    effective_batch = BATCH_SIZE * trainer.accumulate_grad_batches
+    if best_model_path:
+        print(f"📊 Best model checkpoint: {best_model_path}")
+        print(f"   Best loss: {best_loss_text}")
+        # Allow-list the OmegaConf containers stored in the checkpoint. The
+        # helper only exists in newer PyTorch releases, hence the guard.
+        import typing
+        import omegaconf
+        if hasattr(torch.serialization, "add_safe_globals"):
+            torch.serialization.add_safe_globals([
+                omegaconf.dictconfig.DictConfig,
+                omegaconf.base.ContainerMetadata,
+                omegaconf.listconfig.ListConfig,
+                typing.Any,
+            ])
+        # weights_only=False performs a full unpickle; acceptable here because
+        # the checkpoint was produced by this very run.
+        checkpoint = torch.load(best_model_path, map_location='cpu', weights_only=False)
+        model.load_state_dict(checkpoint['state_dict'])
+        # ✅ Save the fine-tuned model to .nemo format
+        output_model_path = os.path.join(SAVE_DIR, "finetuned_model_best.nemo")
+        model.save_to(output_model_path)
+        print(f"\n💾 Final model saved to: {output_model_path}")
+    else:
+        print("⚠️ No best checkpoint was recorded; skipping .nemo export.")
+    # Save training summary
+    summary_path = os.path.join(SAVE_DIR, "training_summary.txt")
+    with open(summary_path, 'w', encoding='utf-8') as f:
+        f.write("Training Summary\n")
+        f.write("================\n")
+        f.write(f"Base Model: {BASE_MODEL_PATH}\n")
+        f.write(f"Training Samples: {valid_count}\n")
+        f.write(f"Final Epochs: {trainer.current_epoch}\n")
+        f.write(f"Best Loss: {best_loss_text}\n")
+        f.write(f"Learning Rate: {LEARNING_RATE}\n")
+        f.write(f"Batch Size: {BATCH_SIZE} (effective: {effective_batch})\n")
+    print(f"📝 Training summary saved to: {summary_path}")
+    print("\n" + "=" * 60)
+    print("🎉 OPTIMIZATION COMPLETE!")
+    print("=" * 60)
+    print("Next steps:")
+    print("1. Test your model on validation data to measure WER")
+    print("2. If WER is still high, consider:")
+    print("   - Increasing training data")
+    print("   - Training for more epochs")
+    print("   - Adjusting loss_alpha (try 0.5 or 0.9)")
+    print("   - Using data augmentation if needed")
+    print("=" * 60)
+except Exception as e:
+    # Top-level boundary of the script: report the failure instead of crashing
+    print(f"\n❌ Training failed: {e}")
+    import traceback
+    traceback.print_exc()
+    print("\n💡 Troubleshooting tips:")
+    print("1. Check if all audio files exist and are valid")
+    print("2. Verify manifest format is correct")
+    print("3. Ensure sufficient disk space for checkpoints")
+    print("4. Try reducing batch_size if out of memory")
+# import os
+# import io
+# import json
+# import torch
+# from pytorch_lightning import Trainer
+# from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
+# from nemo.collections.asr.models import EncDecHybridRNNTCTCBPEModel
+# from omegaconf import OmegaConf, open_dict
+# # ============================================
+# # CRITICAL: Windows CUDA/Numba Fix
+# # ============================================
+# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+# os.environ["NUMBA_CUDA_USE_NVIDIA_BINDING"] = "1"
+# os.environ["NUMBA_DISABLE_JIT"] = "0"
+# os.environ["NUMBA_CUDA_DRIVER"] = "cuda"
+# os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Force CPU for Windows stability
+# # ============================================
+# # UTF-8 Encoding Fix
+# # ============================================
+# manifest_path = "train_manifest.jsonl"
+# with io.open(manifest_path, 'r', encoding='utf-8', errors='ignore') as f:
+#     content = f.read()
+# with io.open(manifest_path, 'w', encoding='utf-8') as f:
+#     f.write(content)
+# print("✅ train_manifest.jsonl converted to UTF-8")
+# # Patch builtins.open for UTF-8
+# import builtins
+# _old_open = open
+# def open_utf8(file, *args, **kwargs):
+#     if isinstance(file, str) and file.endswith('.jsonl') and 'encoding' not in kwargs:
+#         kwargs['encoding'] = 'utf-8'
+#     return _old_open(file, *args, **kwargs)
+# builtins.open = open_utf8
+# # ============================================
+# # Validate Manifest (With Optional Validation Split)
+# # ============================================
+# USE_VALIDATION = True  # Set to False if you don't want validation split
+# def validate_manifest(manifest_path, create_val_split=True, val_split=0.1):
+#     """Validate manifest and optionally create train/val split"""
+#     valid_entries = []
+#     with open(manifest_path, "r", encoding="utf-8") as f:
+#         for i, line in enumerate(f, 1):
+#             try:
+#                 item = json.loads(line.strip())
+#                 assert os.path.exists(item["audio_filepath"]), f"Missing: {item['audio_filepath']}"
+#                 assert "text" in item and item["text"].strip(), "Empty text"
+#                 # Optional: Filter by duration for quality
+#                 duration = item.get("duration", 0)
+#                 if 0.5 <= duration <= 20.0:  # Keep reasonable lengths
+#                     valid_entries.append(item)
+#             except Exception as e:
+#                 print(f"⚠️ Skipping line {i}: {e}")
+#     print(f"✅ Total valid entries: {len(valid_entries)}")
+#     if not create_val_split:
+#         # Use entire dataset for training
+#         print("📊 Using all data for training (no validation split)")
+#         return manifest_path, None, len(valid_entries), 0
+#     # Split into train/val
+#     import random
+#     random.seed(42)
+#     random.shuffle(valid_entries)
+#     split_idx = int(len(valid_entries) * (1 - val_split))
+#     train_entries = valid_entries[:split_idx]
+#     val_entries = valid_entries[split_idx:]
+#     # Save splits
+#     train_manifest = "train_split.jsonl"
+#     val_manifest = "val_split.jsonl"
+#     with open(train_manifest, "w", encoding="utf-8") as f:
+#         for entry in train_entries:
+#             f.write(json.dumps(entry, ensure_ascii=False) + "\n")
+#     with open(val_manifest, "w", encoding="utf-8") as f:
+#         for entry in val_entries:
+#             f.write(json.dumps(entry, ensure_ascii=False) + "\n")
+#     print(f"📊 Train samples: {len(train_entries)}")
+#     print(f"📊 Validation samples: {len(val_entries)}")
+#     return train_manifest, val_manifest, len(train_entries), len(val_entries)
+# train_manifest, val_manifest, train_count, val_count = validate_manifest(
+#     manifest_path,
+#     create_val_split=USE_VALIDATION
+# )
+# if train_count == 0:
+#     raise ValueError("No valid training samples found!")
+# # ============================================
+# # Configuration (OPTIMIZED FOR 4000+ SAMPLES)
+# # ============================================
+# BASE_MODEL_PATH = "stt_ar_fastconformer_hybrid_large_pcd_v1.0.nemo"
+# SAVE_DIR = "output_finetuned"
+# # OPTIMIZED HYPERPARAMETERS FOR LARGE DATASET
+# BATCH_SIZE = 8  # Larger batch for 4000+ samples (adjust based on RAM)
+# MAX_EPOCHS = 100  # Sufficient for convergence with large dataset
+# LEARNING_RATE = 5e-5  # Moderate LR for large dataset
+# WARMUP_RATIO = 0.05  # 5% warmup of total steps
+# WEIGHT_DECAY = 0.0001  # Regularization for generalization
+# ACCUMULATE_GRAD_BATCHES = 4  # Effective batch = 8*4 = 32
+# os.makedirs(SAVE_DIR, exist_ok=True)
+# # ============================================
+# # Load Model and Fix Tokenizer Path
+# # ============================================
+# print("🔹 Loading pretrained model...")
+# model = EncDecHybridRNNTCTCBPEModel.restore_from(BASE_MODEL_PATH)
+# print("🔹 Fixing tokenizer configuration...")
+# with open_dict(model.cfg):
+#     tokenizer_dir = os.path.join(os.path.dirname(BASE_MODEL_PATH), "tokenizer")
+#     os.makedirs(tokenizer_dir, exist_ok=True)
+#     if hasattr(model, 'tokenizer'):
+#         model.cfg.tokenizer.dir = tokenizer_dir
+#         model.cfg.tokenizer.type = "bpe"
+#     # CRITICAL: Properly disable validation dataset
+#     if 'validation_ds' in model.cfg:
+#         model.cfg.validation_ds = None
+#     # Disable test dataset
+#     if 'test_ds' in model.cfg:
+#         model.cfg.test_ds = None
+# # ============================================
+# # Setup Training Data (OPTIMIZED FOR ACCURACY)
+# # ============================================
+# print("🔹 Setting up training data...")
+# train_ds_config = {
+#     "manifest_filepath": train_manifest,
+#     "batch_size": BATCH_SIZE,
+#     "shuffle": True,
+#     "num_workers": 0,
+#     "pin_memory": False,
+#     "sample_rate": 16000,
+#     "max_duration": 20.0,
+#     "min_duration": 0.5,
+#     "trim_silence": True,
+#     "use_start_end_token": True,
+#     "normalize_transcripts": True,
+#     "parser": "ar",
+#     # Add augmentation for better generalization (light for fine-tuning)
+#     "augmentor": None,  # Disable for now, can enable if needed
+# }
+# model.setup_training_data(train_ds_config)
+# # ============================================
+# # Setup Validation Data (OPTIONAL)
+# # ============================================
+# if USE_VALIDATION and val_manifest:
+#     print("🔹 Setting up validation data...")
+#     val_ds_config = {
+#         "manifest_filepath": val_manifest,
+#         "batch_size": BATCH_SIZE,
+#         "shuffle": False,
+#         "num_workers": 0,
+#         "pin_memory": False,
+#         "sample_rate": 16000,
+#         "normalize_transcripts": True,
+#         "parser": "ar",
+#     }
+#     model.setup_validation_data(val_ds_config)
+# else:
+#     print("⚠️ No validation data - monitoring training loss only")
+# # ============================================
+# # Configure Optimizer (OPTIMIZED FOR CONVERGENCE)
+# # ============================================
+# print("🔹 Configuring optimizer...")
+# # Calculate total steps for scheduling
+# steps_per_epoch = train_count // (BATCH_SIZE * ACCUMULATE_GRAD_BATCHES)
+# total_steps = steps_per_epoch * MAX_EPOCHS
+# warmup_steps = int(total_steps * WARMUP_RATIO)
+# print(f"   Steps per epoch: {steps_per_epoch}")
+# print(f"   Total training steps: {total_steps}")
+# print(f"   Warmup steps: {warmup_steps}")
+# with open_dict(model.cfg):
+#     # AdamW optimizer with optimal settings
+#     model.cfg.optim.name = "adamw"
+#     model.cfg.optim.lr = LEARNING_RATE
+#     model.cfg.optim.betas = [0.9, 0.999]
+#     model.cfg.optim.weight_decay = WEIGHT_DECAY
+#     model.cfg.optim.eps = 1e-8
+#     # Polynomial decay with warmup (better than cosine for fine-tuning)
+#     model.cfg.optim.sched = {
+#         "name": "PolynomialDecayAnnealing",
+#         "warmup_steps": warmup_steps,
+#         "warmup_ratio": None,
+#         "min_lr": 1e-7,
+#         "power": 1.0,
+#         "last_epoch": -1,
+#     }
+#     # LIGHT augmentation for fine-tuning (prevents overfitting)
+#     if 'spec_augment' in model.cfg:
+#         model.cfg.spec_augment.freq_masks = 1
+#         model.cfg.spec_augment.time_masks = 2
+#         model.cfg.spec_augment.freq_width = 10
+#         model.cfg.spec_augment.time_width = 0.025
+# # ============================================
+# # Configure Loss Weights (OPTIMIZED FOR HYBRID)
+# # ============================================
+# print("🔹 Optimizing loss weights...")
+# if hasattr(model, 'loss_alpha'):
+#     # For Arabic: CTC often works better for fine-tuning
+#     model.loss_alpha = 0.8  # 80% CTC, 20% RNNT
+#     print(f"   Loss alpha: {model.loss_alpha} (CTC-focused)")
+# # ============================================
+# # Callbacks for Best Model Selection
+# # ============================================
+# print("🔹 Setting up callbacks...")
+# # Choose monitor metric based on validation availability
+# monitor_metric = 'val_loss' if USE_VALIDATION else 'train_loss'
+# monitor_mode = 'min'
+# # Save best model based on available metric
+# checkpoint_callback = ModelCheckpoint(
+#     dirpath=SAVE_DIR,
+#     filename=f'best-{{epoch:02d}}-{{{monitor_metric}:.4f}}',
+#     save_top_k=3,
+#     monitor=monitor_metric,
+#     mode=monitor_mode,
+#     save_last=True,
+#     every_n_epochs=1,
+#     verbose=True,
+# )
+# # Early stopping based on available metric
+# early_stop_callback = EarlyStopping(
+#     monitor=monitor_metric,
+#     patience=15,  # Stop if no improvement for 15 epochs
+#     mode=monitor_mode,
+#     verbose=True,
+#     min_delta=0.0001,
+# )
+# lr_monitor = LearningRateMonitor(logging_interval='step')
+# # ============================================
+# # Trainer Configuration (OPTIMIZED FOR CPU)
+# # ============================================
+# print("🔹 Configuring trainer...")
+# trainer = Trainer(
+#     accelerator="cpu",
+#     devices=1,
+#     max_epochs=MAX_EPOCHS,
+#     log_every_n_steps=5,
+#     enable_checkpointing=True,
+#     default_root_dir=SAVE_DIR,
+#     callbacks=[checkpoint_callback, early_stop_callback, lr_monitor],
+#     gradient_clip_val=1.0,
+#     gradient_clip_algorithm="norm",
+#     accumulate_grad_batches=ACCUMULATE_GRAD_BATCHES,
+#     val_check_interval=1.0,  # Validate every epoch
+#     enable_progress_bar=True,
+#     enable_model_summary=True,
+#     deterministic=False,  # Faster training
+#     benchmark=False,
+# )
+# # ============================================
+# # Start Training
+# # ============================================
+# print("=" * 70)
+# print("🚀 STARTING OPTIMIZED FINE-TUNING FOR 4000+ SAMPLES")
+# print("=" * 70)
+# print(f"   Base Model: {BASE_MODEL_PATH}")
+# print(f"   Training samples: {train_count}")
+# print(f"   Validation samples: {val_count if USE_VALIDATION else 'None (using training loss)'}")
+# print(f"   Max epochs: {MAX_EPOCHS}")
+# print(f"   Batch size: {BATCH_SIZE} (effective: {BATCH_SIZE * ACCUMULATE_GRAD_BATCHES})")
+# print(f"   Learning rate: {LEARNING_RATE}")
+# print(f"   Warmup steps: {warmup_steps}")
+# print(f"   Weight decay: {WEIGHT_DECAY}")
+# print(f"   Loss weighting: CTC={model.loss_alpha if hasattr(model, 'loss_alpha') else 'N/A'}")
+# print(f"   Monitoring: {monitor_metric}")
+# print(f"   Early stopping: 15 epochs patience")
+# print("=" * 70)
+# print("⏱️  Estimated time: ~{:.1f} hours (depends on CPU)".format(
+#     train_count * MAX_EPOCHS / (BATCH_SIZE * 3600 * 0.5)  # Rough estimate
+# ))
+# print("=" * 70)
+# try:
+#     trainer.fit(model)
+#     print("\n✅ Training completed successfully!")
+#     # Load and save the best checkpoint
+#     best_model_path = checkpoint_callback.best_model_path
+#     if best_model_path:
+#         print(f"\n📊 Best model checkpoint: {best_model_path}")
+#         print(f"   Best {monitor_metric}: {checkpoint_callback.best_model_score:.4f}")
+#         # Safe load for PyTorch 2.6+
+#         import typing
+#         import omegaconf
+#         torch.serialization.add_safe_globals([
+#             omegaconf.dictconfig.DictConfig,
+#             omegaconf.base.ContainerMetadata,
+#             omegaconf.listconfig.ListConfig,
+#             typing.Any,
+#         ])
+#         checkpoint = torch.load(best_model_path, map_location='cpu', weights_only=False)
+#         model.load_state_dict(checkpoint['state_dict'])
+#         # Save final model
+#         output_model_path = os.path.join(SAVE_DIR, "finetuned_model_best.nemo")
+#         model.save_to(output_model_path)
+#         print(f"\n💾 Final model saved to: {output_model_path}")
+#     # Save training summary
+#     summary_path = os.path.join(SAVE_DIR, "training_summary.txt")
+#     with open(summary_path, 'w', encoding='utf-8') as f:
+#         f.write(f"Training Summary - 4000+ Samples\n")
+#         f.write(f"=================================\n")
+#         f.write(f"Base Model: {BASE_MODEL_PATH}\n")
+#         f.write(f"Training Samples: {train_count}\n")
+#         f.write(f"Validation Samples: {val_count if USE_VALIDATION else 'None'}\n")
+#         f.write(f"Final Epoch: {trainer.current_epoch}\n")
+#         f.write(f"Best {monitor_metric}: {checkpoint_callback.best_model_score:.4f}\n")
+#         f.write(f"Learning Rate: {LEARNING_RATE}\n")
+#         f.write(f"Batch Size: {BATCH_SIZE} (effective: {BATCH_SIZE * ACCUMULATE_GRAD_BATCHES})\n")
+#         f.write(f"Warmup Steps: {warmup_steps}\n")
+#         f.write(f"Weight Decay: {WEIGHT_DECAY}\n")
+#     print(f"📝 Training summary saved to: {summary_path}")
+#     print("\n" + "=" * 70)
+#     print("🎉 TRAINING COMPLETE!")
+#     print("=" * 70)
+#     print("Next steps:")
+#     print("1. Evaluate WER/CER on test set using the best model")
+#     print("2. If WER is still high, try:")
+#     print("   - Train for more epochs (increase MAX_EPOCHS)")
+#     print("   - Adjust loss_alpha (try 0.5 or 0.9)")
+#     print("   - Add more training data")
+#     print("   - Enable light data augmentation")
+#     print("3. Use the validation manifest to monitor overfitting")
+#     print("=" * 70)
+# except KeyboardInterrupt:
+#     print("\n⚠️ Training interrupted by user")
+#     print("💾 Saving last checkpoint...")
+#     if hasattr(checkpoint_callback, 'last_model_path'):
+#         print(f"   Last checkpoint: {checkpoint_callback.last_model_path}")
+# except Exception as e:
+#     print(f"\n❌ Training failed: {e}")
+#     import traceback
+#     traceback.print_exc()
+#     print("\n💡 Troubleshooting:")
+#     print("1. Reduce BATCH_SIZE if out of memory")
+#     print("2. Check audio file paths in manifest")
+#     print("3. Verify all audio files are valid WAV format")
+#     print("4. Ensure sufficient disk space for checkpoints")

testing_main.py ADDED Viewed

	@@ -0,0 +1,192 @@

+# import sounddevice as sd
+# import scipy.io.wavfile as wav
+# import nemo.collections.asr as nemo_asr
+# # ===== SETTINGS =====
+# SAMPLE_RATE = 16000
+# DURATION = 10  # seconds
+# OUTPUT_FILE = "arabic_recording.wav"
+# # ===== STEP 1: Record audio =====
+# print("🎙️ Recording... Speak Arabic now!")
+# audio = sd.rec(int(SAMPLE_RATE * DURATION), samplerate=SAMPLE_RATE, channels=1, dtype='int16')
+# sd.wait()
+# wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
+# print(f"✅ Recording finished. Saved as {OUTPUT_FILE}")
+# # ===== STEP 2: Load ASR model =====
+# print("📥 Loading Arabic ASR model...")
+# asr_model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(
+#     "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo"
+# )
+# # ===== STEP 3: Configure Decoding =====
+# print("🔍 Configuring decoding strategy...")
+# # Get decoding config
+# decoding_cfg = asr_model.cfg.decoding
+# # Print available parameters to debug
+# print(f"Available decoding strategies: {decoding_cfg.keys() if hasattr(decoding_cfg, 'keys') else 'N/A'}")
+# # After loading the model, add this to inspect the config:
+# print("🔍 Beam config structure:")
+# print(decoding_cfg.beam)
+# # Set beam search strategy
+# decoding_cfg.strategy = "beam"
+# decoding_cfg.beam.beam_size = 128
+# decoding_cfg.beam.return_best_hypothesis = True
+# # Only set parameters that exist
+# if hasattr(decoding_cfg.beam, 'beam_alpha'):
+#     decoding_cfg.beam.beam_alpha = 0.3
+#     print("✓ Set beam_alpha")
+# if hasattr(decoding_cfg.beam, 'beam_beta'):
+#     decoding_cfg.beam.beam_beta = 0.5
+#     print("✓ Set beam_beta")
+# # Remove softmax_temperature - it's not supported in this config
+# # If you need temperature sampling, you might need to use a different strategy
+# # Apply the decoding configuration
+# asr_model.change_decoding_strategy(decoding_cfg)
+# # ===== STEP 4: Transcribe =====
+# print("🔍 Transcribing...")
+# transcription = asr_model.transcribe(
+#     [OUTPUT_FILE],
+#     batch_size=1,
+#     num_workers=0
+# )
+# print("📝 Transcription:", transcription[0])
+# import sounddevice as sd
+# import scipy.io.wavfile as wav
+# import nemo.collections.asr as nemo_asr
+# # ===== SETTINGS =====
+# SAMPLE_RATE = 16000
+# DURATION = 10
+# OUTPUT_FILE = "arabic_recording.wav"
+# # ===== STEP 1: Record audio =====
+# print("🎙️ Recording... Speak Arabic now!")
+# audio = sd.rec(int(SAMPLE_RATE * DURATION), samplerate=SAMPLE_RATE, channels=1, dtype='int16')
+# sd.wait()
+# wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
+# print(f"✅ Recording finished. Saved as {OUTPUT_FILE}")
+# # ===== STEP 2: Load ASR model =====
+# print("📥 Loading Arabic ASR model...")
+# asr_model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(
+#     "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo"
+# )
+# # ===== STEP 3: Configure for LITERAL transcription =====
+# print("🔍 Configuring greedy decoding for literal output...")
+# decoding_cfg = asr_model.cfg.decoding
+# decoding_cfg.strategy = "greedy"
+# # CRITICAL: Increase max_symbols to avoid truncating repetitions
+# # The default is only 10, which is very restrictive!
+# decoding_cfg.greedy.max_symbols = 1000  # Allow much longer sequences
+# decoding_cfg.beam.beam_size = 64
+# decoding_cfg.beam.search_type = "beam"
+# print(f"✓ Set max_symbols to {decoding_cfg.greedy.max_symbols}")
+# print("Updated config:", decoding_cfg)
+# # Apply configuration
+# asr_model.change_decoding_strategy(decoding_cfg)
+# # ===== STEP 4: Transcribe =====
+# print("🔍 Transcribing...")
+# transcription = asr_model.transcribe(
+#     [OUTPUT_FILE],
+#     batch_size=1,
+#     num_workers=0
+# )
+# print("📝 Literal Transcription:", transcription[0])
+import sounddevice as sd
+import scipy.io.wavfile as wav
+import nemo.collections.asr as nemo_asr
+from omegaconf import OmegaConf
+# ===== SETTINGS =====
+SAMPLE_RATE = 16000
+DURATION = 10
+OUTPUT_FILE = "arabic_recording.wav"
+# ===== STEP 2: Load ASR model =====
+print("📥 Loading Arabic ASR model...")
+asr_model = nemo_asr.models.EncDecHybridRNNTCTCModel.restore_from(
+    "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo"
+)
+# Add this right after loading the model to see what's actually available:
+print("Available greedy parameters:")
+print(OmegaConf.to_yaml(asr_model.cfg.decoding.greedy))
+# ===== STEP 3: Configure for LITERAL transcription =====
+print("🔍 Configuring greedy decoding for literal output...")
+# Set struct mode to False temporarily to allow modifications
+OmegaConf.set_struct(asr_model.cfg.decoding, False)
+OmegaConf.set_struct(asr_model.cfg.decoding.greedy, False)
+decoding_cfg = asr_model.cfg.decoding
+decoding_cfg.strategy = "maes"
+# Now try setting the parameters
+try:
+    decoding_cfg.greedy.max_symbols_per_step = 300
+    print(f"✓ max_symbols_per_step: {decoding_cfg.greedy.max_symbols_per_step}")
+except:
+    print("⚠ Could not set max_symbols_per_step")
+decoding_cfg.greedy.max_symbols = 500
+decoding_cfg.greedy.loop_labels = True
+decoding_cfg.greedy.preserve_alignments = True
+decoding_cfg.preserve_alignments = True
+decoding_cfg.compute_timestamps = True
+decoding_cfg.temperature = 1.3
+decoding_cfg.beam.beam_size = 64
+decoding_cfg.beam.softmax_temperature = 1.3
+decoding_cfg.beam.search_type = "beam"
+print(f"✓ max_symbols: {decoding_cfg.greedy.max_symbols}")
+print(f"✓ loop_labels: {decoding_cfg.greedy.loop_labels}")
+print(f"✓ temperature: {decoding_cfg.temperature}")
+# Re-enable struct mode
+OmegaConf.set_struct(asr_model.cfg.decoding, True)
+OmegaConf.set_struct(asr_model.cfg.decoding.greedy, True)
+# Apply configuration
+asr_model.change_decoding_strategy(decoding_cfg)
+# ===== STEP 1: Record audio =====
+print("🎙️ Recording... Speak Arabic now!")
+audio = sd.rec(int(SAMPLE_RATE * DURATION), samplerate=SAMPLE_RATE, channels=1, dtype='int16')
+sd.wait()
+wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
+print(f"✅ Recording finished. Saved as {OUTPUT_FILE}")
+# ===== STEP 4: Transcribe =====
+print("🔍 Transcribing...")
+transcription = asr_model.transcribe(
+    [OUTPUT_FILE],
+    batch_size=1,
+    num_workers=0
+)
+print("📝 Literal Transcription:", transcription[0])

testing_main_v2.py ADDED Viewed

	@@ -0,0 +1,473 @@

+# import sounddevice as sd
+# import scipy.io.wavfile as wav
+# import nemo.collections.asr as nemo_asr
+# import torch
+# import numpy as np
+# from typing import List, Tuple
+# # ===== SETTINGS =====
+# SAMPLE_RATE = 16000
+# DURATION = 10  # seconds
+# OUTPUT_FILE = "arabic_recording.wav"
+# class RepetitionAwareTranscriber:
+#     def __init__(self, model_path: str):
+#         """Initialize ASR model with repetition-aware configuration"""
+#         print("📥 Loading Arabic ASR model...")
+#         self.asr_model = nemo_asr.models.EncDecCTCModel.restore_from(model_path)
+#         self._configure_decoding()
+#     def _configure_decoding(self):
+#         """Configure advanced decoding strategy"""
+#         decoding_cfg = self.asr_model.cfg.decoding
+#         # Use beam search for better sequence modeling
+#         decoding_cfg.strategy = "beam"
+#         decoding_cfg.beam.beam_size = 128  # Larger beam for more candidates
+#         decoding_cfg.beam.return_best_hypothesis = False  # Get multiple hypotheses
+#         # Language model parameters (if available)
+#         if hasattr(decoding_cfg.beam, 'beam_alpha'):
+#             decoding_cfg.beam.beam_alpha = 0.3  # LM weight (lower = less LM influence)
+#         if hasattr(decoding_cfg.beam, 'beam_beta'):
+#             decoding_cfg.beam.beam_beta = 0.5   # Word insertion bonus
+#         self.asr_model.change_decoding_strategy(decoding_cfg)
+#     def transcribe_with_logprobs(self, audio_file: str, temperature: float = 1.0):
+#         """
+#         Transcribe with log probabilities and temperature scaling
+#         Args:
+#             audio_file: Path to audio file
+#             temperature: Controls randomness (lower = more conservative, higher = more diverse)
+#                         0.5 = more deterministic
+#                         1.0 = standard
+#                         1.5 = more exploratory
+#         """
+#         print(f"🔍 Transcribing with temperature={temperature}...")
+#         # Update temperature in decoding config
+#         if hasattr(self.asr_model.cfg.decoding, 'temperature'):
+#             self.asr_model.cfg.decoding.temperature = temperature
+#         if hasattr(self.asr_model.cfg.decoding.beam, 'softmax_temperature'):
+#             self.asr_model.cfg.decoding.beam.softmax_temperature = temperature
+#         self.asr_model.change_decoding_strategy(self.asr_model.cfg.decoding)
+#         # Get multiple hypotheses with their scores
+#         hypotheses = self.asr_model.transcribe(
+#             [audio_file],
+#             batch_size=1,
+#             return_hypotheses=True,
+#             num_workers=0
+#         )
+#         # Handle different return types
+#         if isinstance(hypotheses, list) and len(hypotheses) > 0:
+#             hyp = hypotheses[0]
+#             # Check if it's a Hypothesis object or a list
+#             if isinstance(hyp, list):
+#                 # It's already a list of transcriptions
+#                 best_text = hyp[0] if len(hyp) > 0 else ""
+#                 print(f"\n📊 Top hypothesis: {best_text}")
+#                 return best_text
+#             elif hasattr(hyp, 'text'):
+#                 # It's a Hypothesis object
+#                 text = hyp.text
+#                 # Check for nbest hypotheses
+#                 if hasattr(hyp, 'nbest') and len(hyp.nbest) > 1:
+#                     print(f"\n📊 Top {min(5, len(hyp.nbest))} hypotheses:")
+#                     for i, nbest_hyp in enumerate(hyp.nbest[:5]):
+#                         score = nbest_hyp.score if hasattr(nbest_hyp, 'score') else 'N/A'
+#                         hyp_text = nbest_hyp.text if hasattr(nbest_hyp, 'text') else str(nbest_hyp)
+#                         print(f"  {i+1}. [{score}] {hyp_text}")
+#                 return text
+#             else:
+#                 # Fallback: convert to string
+#                 return str(hyp)
+#         return ""
+#     def transcribe_with_frame_analysis(self, audio_file: str):
+#         """
+#         Analyze frame-level predictions to detect repetitions
+#         This examines the raw CTC outputs before collapsing
+#         """
+#         print("🔍 Performing frame-level analysis...")
+#         # Get log probabilities at frame level
+#         log_probs = self.asr_model.transcribe(
+#             [audio_file],
+#             batch_size=1,
+#             logprobs=True
+#         )
+#         # Standard transcription
+#         transcription = self.asr_model.transcribe([audio_file])
+#         return transcription[0], log_probs
+#     def transcribe_with_all_methods(self, audio_file: str):
+#         """Try multiple decoding strategies and return all results"""
+#         results = {}
+#         # Method 1: Standard beam search
+#         print("\n--- Method 1: Standard Beam Search ---")
+#         results['beam_standard'] = self.transcribe_with_logprobs(audio_file, temperature=1.0)
+#         # Method 2: Lower temperature (more conservative)
+#         print("\n--- Method 2: Conservative (temp=0.5) ---")
+#         results['beam_conservative'] = self.transcribe_with_logprobs(audio_file, temperature=0.5)
+#         # Method 3: Higher temperature (more exploratory)
+#         print("\n--- Method 3: Exploratory (temp=1.5) ---")
+#         results['beam_exploratory'] = self.transcribe_with_logprobs(audio_file, temperature=1.5)
+#         # Method 4: Frame-level analysis
+#         print("\n--- Method 4: Frame-level Analysis ---")
+#         results['frame_analysis'], _ = self.transcribe_with_frame_analysis(audio_file)
+#         return results
+# def post_process_repetitions(text: str, audio_duration: float, expected_word_count: int = None) -> str:
+#     """
+#     Heuristic post-processing to restore repetitions
+#     Args:
+#         text: Transcribed text
+#         audio_duration: Duration of audio in seconds
+#         expected_word_count: Expected number of words (if known)
+#     """
+#     words = text.split()
+#     # Calculate speaking rate (words per second)
+#     speaking_rate = len(words) / audio_duration
+#     # Normal Arabic speaking rate is 2-3 words per second
+#     # For numbers, it's often slower (1-2 words per second)
+#     # If rate is too high, likely missing repetitions
+#     if speaking_rate > 3.0 and expected_word_count:
+#         print(f"⚠️ Speaking rate unusually high ({speaking_rate:.1f} w/s)")
+#         print(f"   Expected ~{expected_word_count} words, got {len(words)}")
+#         print("   Possible missing repetitions detected")
+#     return text
+# def detect_number_patterns(text: str) -> List[str]:
+#     """Detect if text contains Arabic number words"""
+#     arabic_numbers = [
+#         'صفر', 'زيرو', 'واحد', 'اثنين', 'ثلاثة', 'أربعة',
+#         'خمسة', 'ستة', 'سبعة', 'ثمانية', 'تسعة'
+#     ]
+#     words = text.split()
+#     detected = [w for w in words if w in arabic_numbers]
+#     if detected:
+#         print(f"🔢 Detected number words: {' '.join(detected)}")
+#     return detected
+# # ===== MAIN EXECUTION =====
+# if __name__ == "__main__":
+#     # ===== STEP 1: Record audio =====
+#     print("🎙️ Recording... Speak Arabic now!")
+#     print("💡 TIP: For repeated numbers, pause slightly between each repetition")
+#     print("   Example: 'زيرو [pause] زيرو [pause] واحد [pause] واحد'\n")
+#     audio = sd.rec(int(SAMPLE_RATE * DURATION), samplerate=SAMPLE_RATE, channels=1, dtype='int16')
+#     sd.wait()
+#     wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
+#     print(f"✅ Recording finished. Saved as {OUTPUT_FILE}\n")
+#     # ===== STEP 2: Initialize transcriber =====
+#     model_path = "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo"
+#     transcriber = RepetitionAwareTranscriber(model_path)
+#     # ===== STEP 3: Transcribe with all methods =====
+#     results = transcriber.transcribe_with_all_methods(OUTPUT_FILE)
+#     # ===== STEP 4: Display all results =====
+#     print("\n" + "="*60)
+#     print("📝 FINAL RESULTS:")
+#     print("="*60)
+#     for method, transcription in results.items():
+#         print(f"\n{method.upper()}:")
+#         print(f"  {transcription}")
+#         detect_number_patterns(transcription)
+#     # ===== STEP 5: Post-processing analysis =====
+#     print("\n" + "="*60)
+#     print("🔍 POST-PROCESSING ANALYSIS:")
+#     print("="*60)
+#     best_transcription = results['beam_standard']
+#     processed = post_process_repetitions(best_transcription, DURATION)
+#     print(f"\nBest transcription: {best_transcription}")
+#     print(f"Word count: {len(best_transcription.split())}")
+#     print(f"Speaking rate: {len(best_transcription.split()) / DURATION:.2f} words/sec")
+#     # ===== STEP 6: Recommendations =====
+#     print("\n" + "="*60)
+#     print("💡 RECOMMENDATIONS:")
+#     print("="*60)
+#     print("1. Compare all method outputs above")
+#     print("2. If all methods miss repetitions, the issue is in the trained model")
+#     print("3. Consider retraining with more repetitive sequences in training data")
+#     print("4. When speaking, add slight pauses between repeated words")
+#     print("5. If transcribing phone numbers, use digit-by-digit model instead")
+import sounddevice as sd
+import scipy.io.wavfile as wav
+import nemo.collections.asr as nemo_asr
+import torch
+import numpy as np
+from typing import List, Tuple
+# ===== SETTINGS =====
+# Sample rate in Hz used for both the microphone capture and the WAV file that is written.
+SAMPLE_RATE = 16000
+# Recording length; also used as the audio duration in the speaking-rate calculation.
+DURATION = 10  # seconds
+# Path the recording is saved to and later passed to the transcriber.
+OUTPUT_FILE = "arabic_recording.wav"
+class RepetitionAwareTranscriber:
+    def __init__(self, model_path: str):
+        """Initialize ASR model with repetition-aware configuration"""
+        print("📥 Loading Arabic ASR model...")
+        # Try to load as Hybrid RNNT-CTC first (better for repetitions!)
+        try:
+            self.asr_model = nemo_asr.models.EncDecHybridRNNTCTCModel.restore_from(model_path)
+            self.model_type = "hybrid_rnnt_ctc"
+            print("✅ Loaded as Hybrid RNNT-CTC model (excellent for repetitions!)")
+        except:
+            try:
+                self.asr_model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(model_path)
+                self.model_type = "rnnt"
+                print("✅ Loaded as RNNT model")
+            except:
+                self.asr_model = nemo_asr.models.EncDecCTCModel.restore_from(model_path)
+                self.model_type = "ctc"
+                print("✅ Loaded as CTC model")
+        self._configure_decoding()
+    def _configure_decoding(self):
+        """Configure advanced decoding strategy"""
+        decoding_cfg = self.asr_model.cfg.decoding
+        # Use beam search for better sequence modeling
+        decoding_cfg.strategy = "beam"
+        decoding_cfg.beam.beam_size = 128  # Larger beam for more candidates
+        decoding_cfg.beam.return_best_hypothesis = False  # Get multiple hypotheses
+        # Language model parameters (if available)
+        if hasattr(decoding_cfg.beam, 'beam_alpha'):
+            decoding_cfg.beam.beam_alpha = 0.3  # LM weight (lower = less LM influence)
+        if hasattr(decoding_cfg.beam, 'beam_beta'):
+            decoding_cfg.beam.beam_beta = 0.5   # Word insertion bonus
+        self.asr_model.change_decoding_strategy(decoding_cfg)
+    def transcribe_with_logprobs(self, audio_file: str, temperature: float = 1.0):
+        """
+        Transcribe with log probabilities and temperature scaling
+        Args:
+            audio_file: Path to audio file
+            temperature: Controls randomness (lower = more conservative, higher = more diverse)
+                        0.5 = more deterministic
+                        1.0 = standard
+                        1.5 = more exploratory
+        """
+        print(f"🔍 Transcribing with temperature={temperature}...")
+        # Update temperature in decoding config
+        if hasattr(self.asr_model.cfg.decoding, 'temperature'):
+            self.asr_model.cfg.decoding.temperature = temperature
+        if hasattr(self.asr_model.cfg.decoding.beam, 'softmax_temperature'):
+            self.asr_model.cfg.decoding.beam.softmax_temperature = temperature
+        self.asr_model.change_decoding_strategy(self.asr_model.cfg.decoding)
+        # Get multiple hypotheses with their scores
+        hypotheses = self.asr_model.transcribe(
+            [audio_file],
+            batch_size=1,
+            return_hypotheses=True,
+            num_workers=0
+        )
+        print(hypotheses)
+        # Handle different return types
+        if isinstance(hypotheses, list) and len(hypotheses) > 0:
+            hyp = hypotheses[0]
+            # Check if it's a Hypothesis object or a list
+            if isinstance(hyp, list):
+                # It's already a list of transcriptions
+                best_text = hyp[0] if len(hyp) > 0 else ""
+                print(f"\n📊 Top hypothesis: {best_text}")
+                return best_text
+            elif hasattr(hyp, 'text'):
+                # It's a Hypothesis object
+                text = hyp.text
+                # Check for nbest hypotheses
+                if hasattr(hyp, 'nbest') and len(hyp.nbest) > 1:
+                    print(f"\n📊 Top {min(5, len(hyp.nbest))} hypotheses:")
+                    for i, nbest_hyp in enumerate(hyp.nbest[:5]):
+                        score = nbest_hyp.score if hasattr(nbest_hyp, 'score') else 'N/A'
+                        hyp_text = nbest_hyp.text if hasattr(nbest_hyp, 'text') else str(nbest_hyp)
+                        print(f"  {i+1}. [{score}] {hyp_text}")
+                return text
+            else:
+                # Fallback: convert to string
+                return str(hyp)
+        return ""
+    def transcribe_with_frame_analysis(self, audio_file: str):
+        """
+        Analyze frame-level predictions to detect repetitions
+        This examines the raw CTC outputs before collapsing
+        """
+        print("🔍 Performing frame-level analysis...")
+        # Get log probabilities at frame level
+        log_probs = self.asr_model.transcribe(
+            [audio_file],
+            batch_size=1,
+            logprobs=True
+        )
+        # Standard transcription
+        transcription = self.asr_model.transcribe([audio_file])
+        return transcription[0], log_probs
+    def transcribe_with_all_methods(self, audio_file: str):
+        """Try multiple decoding strategies and return all results"""
+        results = {}
+        # Method 1: Standard beam search
+        print("\n--- Method 1: Standard Beam Search ---")
+        results['beam_standard'] = self.transcribe_with_logprobs(audio_file, temperature=1.0)
+        print(f"Results with Temp 1.0 : {results['beam_standard']}")
+        # Method 2: Lower temperature (more conservative)
+        print("\n--- Method 2: Conservative (temp=0.5) ---")
+        results['beam_conservative'] = self.transcribe_with_logprobs(audio_file, temperature=0.5)
+        print(f"Results with Temp 0.5 : {results['beam_conservative']}")
+        # Method 3: Higher temperature (more exploratory)
+        print("\n--- Method 3: Exploratory (temp=1.5) ---")
+        results['beam_exploratory'] = self.transcribe_with_logprobs(audio_file, temperature=1.5)
+        print(f"Results with Temp 1.5 : {results['beam_exploratory']}")
+        # Method 4: Frame-level analysis
+        # print("\n--- Method 4: Frame-level Analysis ---")
+        # results['frame_analysis'], _ = self.transcribe_with_frame_analysis(audio_file)
+        return results
+def post_process_repetitions(text: str, audio_duration: float, expected_word_count: int = None) -> str:
+    """
+    Heuristic post-processing to restore repetitions
+    Args:
+        text: Transcribed text
+        audio_duration: Duration of audio in seconds
+        expected_word_count: Expected number of words (if known)
+    """
+    words = text.split()
+    # Calculate speaking rate (words per second)
+    speaking_rate = len(words) / audio_duration
+    # Normal Arabic speaking rate is 2-3 words per second
+    # For numbers, it's often slower (1-2 words per second)
+    # If rate is too high, likely missing repetitions
+    if speaking_rate > 3.0 and expected_word_count:
+        print(f"⚠️ Speaking rate unusually high ({speaking_rate:.1f} w/s)")
+        print(f"   Expected ~{expected_word_count} words, got {len(words)}")
+        print("   Possible missing repetitions detected")
+    return text
+def detect_number_patterns(text: str) -> List[str]:
+    """Detect if text contains Arabic number words"""
+    arabic_numbers = [
+        'صفر', 'زيرو', 'واحد', 'اثنين', 'ثلاثة', 'أربعة',
+        'خمسة', 'ستة', 'سبعة', 'ثمانية', 'تسعة'
+    ]
+    words = text.split()
+    detected = [w for w in words if w in arabic_numbers]
+    if detected:
+        print(f"🔢 Detected number words: {' '.join(detected)}")
+    return detected
# ===== MAIN EXECUTION =====
if __name__ == "__main__":
    # Horizontal rule reused by every section header below.
    banner = "=" * 60

    # ===== STEP 1: Record audio =====
    print("🎙️ Recording... Speak Arabic now!")
    print("💡 TIP: For repeated numbers, pause slightly between each repetition")
    print("   Example: 'زيرو [pause] زيرو [pause] واحد [pause] واحد'\n")
    audio = sd.rec(int(SAMPLE_RATE * DURATION), samplerate=SAMPLE_RATE, channels=1, dtype='int16')
    # Block until the recording started above has finished.
    sd.wait()
    wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
    print(f"✅ Recording finished. Saved as {OUTPUT_FILE}\n")

    # ===== STEP 2: Initialize transcriber =====
    # NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
    model_path = "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo"
    transcriber = RepetitionAwareTranscriber(model_path)

    # ===== STEP 3: Transcribe with all methods =====
    results = transcriber.transcribe_with_all_methods(OUTPUT_FILE)

    # ===== STEP 4: Display all results =====
    print("\n" + banner)
    print("📝 FINAL RESULTS:")
    print(banner)
    for method, transcription in results.items():
        print(f"\n{method.upper()}:")
        print(f"  {transcription}")
        detect_number_patterns(transcription)

    # ===== STEP 5: Post-processing analysis =====
    print("\n" + banner)
    print("🔍 POST-PROCESSING ANALYSIS:")
    print(banner)
    best_transcription = results['beam_standard']
    # Called for its diagnostic output only; the return value was never used.
    # No expected_word_count is passed, so its high-rate warning cannot fire here.
    post_process_repetitions(best_transcription, DURATION)
    word_count = len(best_transcription.split())
    print(f"\nBest transcription: {best_transcription}")
    print(f"Word count: {word_count}")
    print(f"Speaking rate: {word_count / DURATION:.2f} words/sec")

    # ===== STEP 6: Recommendations =====
    print("\n" + banner)
    print("💡 RECOMMENDATIONS:")
    print(banner)
    for tip in (
        "1. Compare all method outputs above",
        "2. If all methods miss repetitions, the issue is in the trained model",
        "3. Consider retraining with more repetitive sequences in training data",
        "4. When speaking, add slight pauses between repeated words",
        "5. If transcribing phone numbers, use digit-by-digit model instead",
    ):
        print(tip)

train_manifest.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

train_split.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff