alaatiger989 commited on
Commit
b5e57ee
·
verified ·
1 Parent(s): d6bee05

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +6 -0
  2. Extracting_tokenizer_dir_from_Nemo_model.py +126 -0
  3. StartingServer.txt +2 -0
  4. UploadingtoGitlab.txt +4 -0
  5. WER_CER_eval.py +123 -0
  6. WER_calc.py +64 -0
  7. app_api_2.py +345 -0
  8. continue_finetuning_nemo.py +199 -0
  9. converting_dataset_to_8khz.py +95 -0
  10. data_tts/gcloud_tts_sample_001.wav +0 -0
  11. data_tts/gcloud_tts_sample_002.wav +0 -0
  12. data_tts/gcloud_tts_sample_053.wav +0 -0
  13. data_tts/gcloud_tts_sample_060.wav +0 -0
  14. data_tts/gcloud_tts_sample_1065.wav +3 -0
  15. data_tts/gcloud_tts_sample_1067.wav +3 -0
  16. data_tts/gcloud_tts_sample_107.wav +3 -0
  17. data_tts/gcloud_tts_sample_1078.wav +3 -0
  18. data_tts/gcloud_tts_sample_1080.wav +3 -0
  19. data_tts/gcloud_tts_sample_1082.wav +3 -0
  20. data_tts/gcloud_tts_sample_1189.wav +0 -0
  21. data_tts/gcloud_tts_sample_1190.wav +0 -0
  22. data_tts/gcloud_tts_sample_1191.wav +0 -0
  23. data_tts/gcloud_tts_sample_1192.wav +0 -0
  24. data_tts/gcloud_tts_sample_1193.wav +0 -0
  25. data_tts/gcloud_tts_sample_1221.wav +0 -0
  26. data_tts/gcloud_tts_sample_1222.wav +0 -0
  27. data_tts/gcloud_tts_sample_1236.wav +0 -0
  28. data_tts/gcloud_tts_sample_1241.wav +0 -0
  29. data_tts/gcloud_tts_sample_1277.wav +0 -0
  30. data_tts/gcloud_tts_sample_1278.wav +0 -0
  31. data_tts/gcloud_tts_sample_1279.wav +0 -0
  32. data_tts/gcloud_tts_sample_1280.wav +0 -0
  33. data_tts/gcloud_tts_sample_1286.wav +0 -0
  34. data_tts/gcloud_tts_sample_1287.wav +0 -0
  35. data_tts/gcloud_tts_sample_1295.wav +0 -0
  36. data_tts/gcloud_tts_sample_1296.wav +0 -0
  37. data_tts/gcloud_tts_sample_1297.wav +0 -0
  38. data_tts/gcloud_tts_sample_1304.wav +0 -0
  39. data_tts/gcloud_tts_sample_1305.wav +0 -0
  40. data_tts/gcloud_tts_sample_1306.wav +0 -0
  41. data_tts/gcloud_tts_sample_1313.wav +0 -0
  42. data_tts/gcloud_tts_sample_1314.wav +0 -0
  43. data_tts/gcloud_tts_sample_1322.wav +0 -0
  44. eval_manifest.jsonl +163 -0
  45. evaluation_results.csv +164 -0
  46. finetune_asr.py +711 -0
  47. testing_main.py +192 -0
  48. testing_main_v2.py +473 -0
  49. train_manifest.jsonl +0 -0
  50. train_split.jsonl +0 -0
.gitattributes CHANGED
@@ -968,3 +968,9 @@ data_tts/gcloud_tts_sample_1073.wav filter=lfs diff=lfs merge=lfs -text
968
  data_tts/gcloud_tts_sample_1075.wav filter=lfs diff=lfs merge=lfs -text
969
  data_tts/gcloud_tts_sample_1072.wav filter=lfs diff=lfs merge=lfs -text
970
  data_tts/gcloud_tts_sample_1079.wav filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
968
  data_tts/gcloud_tts_sample_1075.wav filter=lfs diff=lfs merge=lfs -text
969
  data_tts/gcloud_tts_sample_1072.wav filter=lfs diff=lfs merge=lfs -text
970
  data_tts/gcloud_tts_sample_1079.wav filter=lfs diff=lfs merge=lfs -text
971
+ data_tts/gcloud_tts_sample_107.wav filter=lfs diff=lfs merge=lfs -text
972
+ data_tts/gcloud_tts_sample_1065.wav filter=lfs diff=lfs merge=lfs -text
973
+ data_tts/gcloud_tts_sample_1078.wav filter=lfs diff=lfs merge=lfs -text
974
+ data_tts/gcloud_tts_sample_1067.wav filter=lfs diff=lfs merge=lfs -text
975
+ data_tts/gcloud_tts_sample_1082.wav filter=lfs diff=lfs merge=lfs -text
976
+ data_tts/gcloud_tts_sample_1080.wav filter=lfs diff=lfs merge=lfs -text
Extracting_tokenizer_dir_from_Nemo_model.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Run this script FIRST to extract the tokenizer from the .nemo file
3
+ This creates the tokenizer folder that the training script needs
4
+ """
5
+ import os
6
+ import tarfile
7
+ import zipfile
8
+ import shutil
9
+
10
+ MODEL_PATH = "stt_ar_fastconformer_hybrid_large_pcd_v1.0.nemo"
11
+ OUTPUT_DIR = "tokenizer"
12
+
13
+ print("🔹 Detecting .nemo file format...")
14
+
15
def try_extract_tokenizer():
    """Try different methods to extract tokenizer"""
    # Each attempt: (progress label, failure label, archive opener, extractor).
    # Order matters and mirrors the original probing sequence:
    # plain tar -> gzipped tar -> zip -> tar auto-detect.
    attempts = (
        ("Regular tar format...", "Not a regular tar",
         lambda: tarfile.open(MODEL_PATH, 'r:'), extract_from_tar),
        ("Gzipped tar format...", "Not gzipped tar",
         lambda: tarfile.open(MODEL_PATH, 'r:gz'), extract_from_tar),
        ("ZIP format...", "Not a ZIP file",
         lambda: zipfile.ZipFile(MODEL_PATH, 'r'), extract_from_zip),
        ("Auto-detect format...", "Auto-detect failed",
         lambda: tarfile.open(MODEL_PATH, 'r:*'), extract_from_tar),
    )

    for label, failure, opener, extractor in attempts:
        try:
            print(f"Trying: {label}")
            with opener() as archive:
                # First format that opens wins; its extractor's verdict is final.
                return extractor(archive)
        except Exception as exc:
            print(f" ✗ {failure}: {exc}")

    return False
51
+
52
def extract_from_tar(tar, output_dir=None):
    """Extract tokenizer files from a tar archive.

    Args:
        tar: An open ``tarfile.TarFile`` to search.
        output_dir: Destination directory for tokenizer files.
            Defaults to the module-level ``OUTPUT_DIR``.

    Returns:
        True if tokenizer members were found (and the safe ones copied),
        False if the archive contains no tokenizer files.
    """
    members = tar.getmembers()
    tokenizer_files = [m for m in members if 'tokenizer' in m.name.lower()]

    if not tokenizer_files:
        print("\n📋 Available files in archive:")
        for member in members[:20]:  # Show first 20
            print(f" - {member.name}")
        if len(members) > 20:
            print(f" ... and {len(members) - 20} more files")
        return False

    dest = output_dir if output_dir is not None else OUTPUT_DIR
    os.makedirs(dest, exist_ok=True)

    tmp_dir = "temp_extract"
    try:
        for member in tokenizer_files:
            # Security: member names come from the archive. Reject absolute
            # paths and '..' components so a crafted .nemo cannot write
            # outside the staging directory (path traversal).
            name = member.name
            if os.path.isabs(name) or ".." in name.replace("\\", "/").split("/"):
                print(f"⚠️ Skipping unsafe member name: {name}")
                continue

            # Stage in a temp directory, then copy flat into the output dir.
            tar.extract(member, path=tmp_dir)
            src = os.path.join(tmp_dir, name)
            if os.path.isfile(src):
                dst = os.path.join(dest, os.path.basename(name))
                shutil.copy2(src, dst)
                print(f"✅ Extracted: {os.path.basename(name)}")
    finally:
        # Always clean up the staging directory, even if extraction failed
        # halfway (the original leaked it on exceptions).
        if os.path.exists(tmp_dir):
            shutil.rmtree(tmp_dir)

    return True
82
+
83
def extract_from_zip(zf, output_dir=None):
    """Extract tokenizer files from a zip archive.

    Args:
        zf: An open ``zipfile.ZipFile`` to search.
        output_dir: Destination directory for tokenizer files.
            Defaults to the module-level ``OUTPUT_DIR``.

    Returns:
        True if tokenizer entries were found (and the safe ones copied),
        False if the archive contains no tokenizer files.
    """
    names = zf.namelist()
    tokenizer_files = [n for n in names if 'tokenizer' in n.lower()]

    if not tokenizer_files:
        print("\n📋 Available files in archive:")
        for name in names[:20]:  # Show first 20
            print(f" - {name}")
        if len(names) > 20:
            print(f" ... and {len(names) - 20} more files")
        return False

    dest = output_dir if output_dir is not None else OUTPUT_DIR
    os.makedirs(dest, exist_ok=True)

    tmp_dir = "temp_extract"
    try:
        for name in tokenizer_files:
            # Security: entry names come from the archive. Reject absolute
            # paths and '..' components (zip-slip / path traversal).
            if os.path.isabs(name) or ".." in name.replace("\\", "/").split("/"):
                print(f"⚠️ Skipping unsafe member name: {name}")
                continue

            # Stage in a temp directory, then copy flat into the output dir.
            zf.extract(name, path=tmp_dir)
            src = os.path.join(tmp_dir, name)
            if os.path.isfile(src):
                dst = os.path.join(dest, os.path.basename(name))
                shutil.copy2(src, dst)
                print(f"✅ Extracted: {os.path.basename(name)}")
    finally:
        # Always clean up the staging directory, even on failure
        # (the original leaked it on exceptions).
        if os.path.exists(tmp_dir):
            shutil.rmtree(tmp_dir)

    return True
113
+
114
# Try extraction
success = try_extract_tokenizer()

if not success:
    # Extraction failed — the training path can still fall back to the
    # tokenizer packaged inside the .nemo checkpoint.
    print("\n❌ Could not extract tokenizer from .nemo file")
    print("\n🔧 Alternative solution: The training script will use the embedded tokenizer")
    print("   No action needed - proceed with training!")
else:
    print(f"\n✅ Tokenizer extracted to: {OUTPUT_DIR}")
    print("\n📁 Tokenizer files:")
    for file in os.listdir(OUTPUT_DIR):
        print(f" - {file}")
    print("\n✅ Now you can run the training script!")
StartingServer.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ python -m streamlit run app.py
2
+ python -m uvicorn app_api:app --host 0.0.0.0 --port 8070 --reload
UploadingtoGitlab.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ cd existing_repo
2
+ git remote add origin https://gitlab.expertflow.com/bot/ai/contextual_asr.git
3
+ git branch -M main
4
+ git push -uf origin main
WER_CER_eval.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import torch
4
+ import pandas as pd
5
+ import Levenshtein as lev
6
+ from nemo.collections.asr.models import EncDecHybridRNNTCTCBPEModel
7
+ from nemo.collections.asr.metrics.wer import word_error_rate # ✅ Keep this
8
+
9
+ # ==========================
10
+ # CONFIGURATION
11
+ # ==========================
12
+ MODEL_PATH = "output_finetuned/finetuned_model_best.nemo"
13
+ EVAL_MANIFEST = "eval_manifest.jsonl"
14
+
15
+ # ==========================
16
+ # LOAD MODEL
17
+ # ==========================
18
+ device = "cuda" if torch.cuda.is_available() else "cpu"
19
+ print(f"Loading model on: {device}")
20
+
21
+ try:
22
+ model = EncDecHybridRNNTCTCBPEModel.restore_from(restore_path=MODEL_PATH, map_location=device)
23
+ model = model.to(device)
24
+ model.eval()
25
+ print("✅ Model loaded successfully.")
26
+ except Exception as e:
27
+ print(f"❌ Failed to load model: {e}")
28
+ exit()
29
+
30
+ # ==========================
31
+ # LOAD MANIFEST
32
+ # ==========================
33
def load_manifest(manifest_path):
    """Load (audio_path, text) pairs from a JSONL manifest file.

    Entries whose audio file is missing or whose text is empty are
    reported and skipped; malformed JSON lines are reported and skipped.
    """
    samples = []
    with open(manifest_path, "r", encoding="utf-8") as fh:
        for raw_line in fh:
            try:
                entry = json.loads(raw_line.strip())
            except json.JSONDecodeError as err:
                print(f"❌ Invalid JSON line: {err}")
                continue
            wav_path = entry["audio_filepath"]
            transcript = entry.get("text", "").strip()
            if os.path.exists(wav_path) and transcript:
                samples.append((wav_path, transcript))
            else:
                print(f"⚠️ Skipping invalid entry: {wav_path}")
    print(f"\n📁 Loaded {len(samples)} valid samples from manifest.")
    return samples
50
+
51
+ # ==========================
52
+ # CER FUNCTION
53
+ # ==========================
54
def calculate_cer(reference, hypothesis):
    """Compute Character Error Rate (CER).

    CER = Levenshtein distance between reference and hypothesis with all
    spaces removed, divided by the (space-stripped) reference length.

    Args:
        reference: Ground-truth transcription.
        hypothesis: Predicted transcription.

    Returns:
        A float in [0, inf); 0.0 when the reference is empty.
    """
    reference = reference.replace(" ", "")
    hypothesis = hypothesis.replace(" ", "")
    if len(reference) == 0:
        return 0.0
    # Stdlib implementation — drops the third-party `Levenshtein` dependency
    # for this one call (result is identical edit distance).
    return _levenshtein(reference, hypothesis) / len(reference)


def _levenshtein(a, b):
    """Classic O(len(a)*len(b)) edit distance using a rolling row."""
    if len(a) < len(b):
        a, b = b, a  # iterate over the longer string, keep the row short
    previous = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        current = [i]
        for j, cb in enumerate(b, 1):
            current.append(min(
                previous[j] + 1,                 # deletion
                current[j - 1] + 1,              # insertion
                previous[j - 1] + (ca != cb),    # substitution / match
            ))
        previous = current
    return previous[-1]
61
+
62
+ # ==========================
63
+ # EVALUATION FUNCTION
64
+ # ==========================
65
def evaluate_model(model, dataset):
    """Transcribe each (audio, text) pair and report per-sample + average WER/CER.

    Returns a (results, avg_wer, avg_cer) tuple, where results is a list of
    per-sample dicts suitable for a DataFrame.
    """
    results = []
    total_wer = 0.0
    total_cer = 0.0

    for idx, (wav_path, reference) in enumerate(dataset, start=1):
        print(f"\n🔍 [{idx}/{len(dataset)}] Evaluating: {wav_path}")

        # Inference only — no autograd bookkeeping needed.
        with torch.no_grad():
            raw = model.transcribe([wav_path])

        # transcribe() may return (hypotheses, tokens) or just hypotheses,
        # and hypotheses may be a list or a single string.
        hypotheses = raw[0] if isinstance(raw, tuple) else raw
        prediction = hypotheses[0] if isinstance(hypotheses, list) else hypotheses

        sample_wer = word_error_rate([reference], [prediction])
        sample_cer = calculate_cer(reference, prediction)

        print(f"Expected : {reference}")
        print(f"Predicted: {prediction}")
        print(f"WER={sample_wer:.3f}, CER={sample_cer:.3f}")

        results.append({
            "audio": os.path.basename(wav_path),
            "expected": reference,
            "predicted": prediction,
            "WER": sample_wer,
            "CER": sample_cer,
        })
        total_wer += sample_wer
        total_cer += sample_cer

    avg_wer = total_wer / len(dataset)
    avg_cer = total_cer / len(dataset)
    accuracy = (1 - avg_wer) * 100  # rough accuracy proxy derived from WER
    print("\n==============================")
    print(f"📊 Average WER: {avg_wer:.3f}")
    print(f"🎯 Accuracy: {accuracy:.2f}%")
    print(f"📊 Average CER: {avg_cer:.3f}")
    print("==============================")

    return results, avg_wer, avg_cer
109
+
110
+ # ==========================
111
+ # RUN EVALUATION
112
+ # ==========================
113
+ if __name__ == "__main__":
114
+ dataset = load_manifest(EVAL_MANIFEST)
115
+ if not dataset:
116
+ print("❌ No valid data found in manifest.")
117
+ exit()
118
+
119
+ results, avg_wer, avg_cer = evaluate_model(model, dataset)
120
+
121
+ df = pd.DataFrame(results)
122
+ df.to_csv("evaluation_results.csv", index=False, encoding="utf-8-sig")
123
+ print("\n💾 Results saved to: evaluation_results.csv")
WER_calc.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ from nemo.collections.asr.models import EncDecHybridRNNTCTCBPEModel
4
+ from nemo.collections.asr.metrics.wer import word_error_rate
5
+
6
+ # ==========================
7
+ # CONFIGURATION
8
+ # ==========================
9
+ MODEL_PATH = "output_finetuned/finetuned_model_best.nemo"
10
+ SAMPLE_AUDIO = "arabic_recording.wav"
11
+ EXPECTED_TEXT = "زيرو واحد واحد واحد واحد واحد واحد اتنين اربعة ستة"
12
+
13
+ # ==========================
14
+ # LOAD MODEL
15
+ # ==========================
16
+ device = "cuda" if torch.cuda.is_available() else "cpu"
17
+ print(f"Loading model on: {device}")
18
+
19
+ try:
20
+ model = EncDecHybridRNNTCTCBPEModel.restore_from(restore_path=MODEL_PATH, map_location=device)
21
+ model.eval()
22
+ print("✅ Model loaded successfully.")
23
+ except Exception as e:
24
+ print(f"❌ Failed to load model: {e}")
25
+ exit()
26
+
27
+ # ==========================
28
+ # TEST FUNCTION
29
+ # ==========================
30
+ def test_model(model, sample_audio, expected_text):
31
+ if not os.path.exists(sample_audio):
32
+ print(f"❌ Audio file not found: {sample_audio}")
33
+ return
34
+
35
+ print(f"\n🔍 Testing on: {sample_audio}")
36
+
37
+ # Transcribe
38
+ with torch.no_grad():
39
+ output = model.transcribe([sample_audio])
40
+
41
+ # Handle different return types
42
+ if isinstance(output, tuple):
43
+ # Sometimes returns (predictions, tokens)
44
+ prediction_list = output[0]
45
+ else:
46
+ prediction_list = output
47
+
48
+ # Ensure it's a single string
49
+ prediction = prediction_list[0] if isinstance(prediction_list, list) else prediction_list
50
+
51
+ # Display results
52
+ print(f"\nPredicted: {prediction}")
53
+ print(f"Expected : {expected_text}")
54
+
55
+ # Compute WER
56
+ wer = word_error_rate([expected_text], [prediction])
57
+ print(f"\n📊 Word Error Rate (WER): {wer:.3f}")
58
+ return prediction, wer
59
+
60
+ # ==========================
61
+ # RUN TEST
62
+ # ==========================
63
+ if __name__ == "__main__":
64
+ prediction, wer = test_model(model, SAMPLE_AUDIO, EXPECTED_TEXT)
app_api_2.py ADDED
@@ -0,0 +1,345 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from fastapi import FastAPI, File, UploadFile
2
+ # from fastapi.responses import JSONResponse
3
+ # import uvicorn
4
+ # import tempfile
5
+ # import nemo.collections.asr as nemo_asr
6
+ # import re
7
+ # import os
8
+ # import librosa
9
+ # import soundfile as sf
10
+
11
+ # # ===== Arabic number mapping (expanded) =====
12
+ # arabic_numbers = {
13
+ # "صفر": "0", "زيرو": "0", "٠": "0", "زيو": "0", "زير": "0",
14
+ # "واحد": "1", "واحدة": "1", "١": "1",
15
+ # "اتنين": "2", "اثنين": "2", "اثنان": "2", "٢": "2",
16
+ # "تلاتة": "3", "ثلاثة": "3", "٣": "3","ثلاث": "3","تلات": "3",
17
+ # "اربعة": "4", "أربعة": "4", "٤": "4",
18
+ # "خمسة": "5", "٥": "5","خمسه": "5",
19
+ # "ستة": "6", "٦": "6",
20
+ # "سبعة": "7", "٧": "7","سبعه": "7",
21
+ # "تمانية": "8", "ثمانية": "8", "٨": "8",
22
+ # "تسعة": "9", "٩": "9",
23
+ # "عشرة": "10", "١٠": "10","عشره": "10",
24
+ # "حداشر": "11", "احد عشر": "11", "احداشر": "11",
25
+ # "اتناشر": "12", "اثنا عشر": "12",
26
+ # "تلتاشر": "13", "ثلاثة عشر": "13",
27
+ # "اربعتاشر": "14", "أربعة عشر": "14",
28
+ # "خمستاشر": "15", "خمسة عشر": "15",
29
+ # "ستاشر": "16", "ستة عشر": "16",
30
+ # "سبعتاشر": "17", "سبعة عشر": "17",
31
+ # "طمنتاشر": "18", "ثمانية عشر": "18",
32
+ # "تسعتاشر": "19", "تسعة عشر": "19",
33
+ # "عشرين": "20", "٢٠": "20",
34
+ # "تلاتين": "30", "ثلاثين": "30", "٣٠": "30",
35
+ # "اربعين": "40", "أربعين": "40", "٤٠": "40",
36
+ # "خمسين": "50", "٥٠": "50",
37
+ # "ستين": "60", "٦٠": "60",
38
+ # "سبعين": "70", "٧٠": "70",
39
+ # "تمانين": "80", "ثمانين": "80", "٨٠": "80",
40
+ # "تسعين": "90", "٩٠": "90",
41
+ # "مية": "100", "مائة": "100", "مئة": "100", "١٠٠": "100",
42
+ # "ميتين": "200", "مائتين": "200",
43
+ # "تلاتمية": "300", "ثلاثمائة": "300",
44
+ # "اربعمية": "400", "أربعمائة": "400",
45
+ # "خمسمية": "500", "خمسمائة": "500",
46
+ # "ستمية": "600", "ستمائة": "600",
47
+ # "سبعمية": "700", "سبعمائة": "700",
48
+ # "تمانمية": "800", "ثمانمائة": "800",
49
+ # "تسعمية": "900", "تسعمائة": "900",
50
+ # "ألف": "1000", "الف": "1000", "١٠٠٠": "1000",
51
+ # "ألفين": "2000", "الفين": "2000",
52
+ # "تلات تلاف": "3000", "ثلاثة آلاف": "3000",
53
+ # "اربعة آلاف": "4000", "أربعة آلاف": "4000",
54
+ # "خمسة آلاف": "5000",
55
+ # "ستة آلاف": "6000",
56
+ # "سبعة آلاف": "7000",
57
+ # "تمانية آلاف": "8000", "ثمانية آلاف": "8000",
58
+ # "تسعة آلاف": "9000",
59
+ # "عشرة آلاف": "10000",
60
+ # "مية ألف": "100000", "مائة ألف": "100000",
61
+ # "مليون": "1000000", "ملايين": "1000000",
62
+ # "مليار": "1000000000"
63
+ # }
64
+
65
+ # # ===== Helpers =====
66
+ # def normalize_arabic(text: str) -> str:
67
+ # diacritics = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
68
+ # text = re.sub(diacritics, '', text)
69
+ # text = re.sub(r'[إأآا]', 'ا', text)
70
+ # text = re.sub(r'ى', 'ي', text)
71
+ # text = re.sub(r'ؤ', 'و', text)
72
+ # text = re.sub(r'ئ', 'ي', text)
73
+ # text = re.sub(r'ة', 'ه', text)
74
+ # return text
75
+
76
+ # def replace_arabic_numbers(text: str) -> str:
77
+ # for word, digit in arabic_numbers.items():
78
+ # text = re.sub(fr"(?:^|\s){word}(?:$|\s)", f" {digit} ", text)
79
+ # return " ".join(text.split())
80
+
81
+ # def join_digit_sequences(text: str) -> str:
82
+ # tokens = text.split()
83
+ # out, buffer = [], []
84
+ # for tok in tokens:
85
+ # if tok.isdigit() and len(tok) == 1:
86
+ # buffer.append(tok)
87
+ # else:
88
+ # if buffer:
89
+ # out.append("".join(buffer))
90
+ # buffer = []
91
+ # out.append(tok)
92
+ # if buffer:
93
+ # out.append("".join(buffer))
94
+ # return " ".join(out)
95
+
96
+ # def ensure_16k_wav(input_path, output_path):
97
+ # y, sr = librosa.load(input_path, sr=16000, mono=True)
98
+ # sf.write(output_path, y, 16000)
99
+
100
+ # # ===== FastAPI app =====
101
+ # app = FastAPI(title="Arabic ASR API", description="ASR API with NeMo and Arabic digit conversion")
102
+
103
+ # @app.on_event("startup")
104
+ # def load_model():
105
+ # global asr_model
106
+ # model_path = "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Peter_Projects/NP_Detection_Nvidia_conformer/asr-egyptian-nemo-v2.0.nemo"
107
+ # asr_model = nemo_asr.models.EncDecCTCModel.restore_from(model_path)
108
+
109
+ # @app.post("/transcribe")
110
+ # async def transcribe_audio(file: UploadFile = File(...)):
111
+ # with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
112
+ # tmp.write(await file.read())
113
+ # tmp_path = tmp.name
114
+
115
+ # # Resample to 16kHz
116
+ # resampled_path = tmp_path.replace(".wav", "_16k.wav")
117
+ # ensure_16k_wav(tmp_path, resampled_path)
118
+
119
+ # try:
120
+ # result = asr_model.transcribe([resampled_path])
121
+ # raw_text = result[0].text
122
+
123
+ # raw_text = normalize_arabic(raw_text)
124
+ # cleaned_text = replace_arabic_numbers(raw_text)
125
+ # cleaned_text = join_digit_sequences(cleaned_text)
126
+
127
+ # return JSONResponse(content={"transcription": cleaned_text})
128
+
129
+ # finally:
130
+ # os.remove(tmp_path)
131
+ # if os.path.exists(resampled_path):
132
+ # os.remove(resampled_path)
133
+
134
+ # @app.post("/transcribe-bytes")
135
+ # async def transcribe_audio_bytes(audio_bytes: bytes = File(...)):
136
+ # with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
137
+ # tmp.write(audio_bytes)
138
+ # tmp_path = tmp.name
139
+
140
+ # resampled_path = tmp_path.replace(".wav", "_16k.wav")
141
+ # ensure_16k_wav(tmp_path, resampled_path)
142
+
143
+ # try:
144
+ # result = asr_model.transcribe([resampled_path])
145
+ # raw_text = result[0].text
146
+
147
+ # raw_text = normalize_arabic(raw_text)
148
+ # cleaned_text = replace_arabic_numbers(raw_text)
149
+ # cleaned_text = join_digit_sequences(cleaned_text)
150
+
151
+ # return JSONResponse(content={"transcription": cleaned_text})
152
+
153
+ # finally:
154
+ # os.remove(tmp_path)
155
+ # if os.path.exists(resampled_path):
156
+ # os.remove(resampled_path)
157
+
158
+ # if __name__ == "__main__":
159
+ # uvicorn.run(app, host="0.0.0.0", port=8000, reload=True)
160
+ from fastapi import FastAPI, File, UploadFile
161
+ from fastapi.responses import JSONResponse
162
+ import uvicorn
163
+ import tempfile
164
+ import nemo.collections.asr as nemo_asr
165
+ import re
166
+ import os
167
+ import librosa
168
+ import soundfile as sf
169
+ from omegaconf import OmegaConf
170
+ # ===== Arabic + English number mapping (expanded) =====
171
+ arabic_numbers = {
172
+ "صفر": "0", "زيرو": "0", "زيو": "0", "زير": "0", "٠": "0",
173
+ "واحد": "1", "واحدة": "1", "واحده": "1", "١": "1",
174
+ "اثنين": "2", "اثنان": "2", "اتنين": "2", "٢": "2",
175
+ "ثلاثة": "3", "ثلاث": "3", "تلاتة": "3", "تلات": "3", "ثلاثه": "3", "تلاته": "3",
176
+ "أربعة": "4", "اربعة": "4", "٤": "4","أربعه": "4","اربعه": "4",
177
+ "خمسة": "5", "خمسه": "5", "٥": "5",
178
+ "ستة": "6", "ست": "6", "٦": "6","سته": "6",
179
+ "سبعة": "7", "سبعه": "7", "٧": "7",
180
+ "ثمانية": "8", "تمانية": "8", "تمنية": "8", "ثمان": "8", "٨": "8","تمانيه": "8",
181
+ "تسعة": "9", "تسعه": "9", "٩": "9"
182
+ }
183
+
184
+
185
+ # ===== Helpers =====
186
def normalize_arabic(text: str) -> str:
    """Normalize Arabic text: strip diacritics and unify letter variants."""
    # Remove tashkeel / diacritic marks.
    text = re.sub(r'[\u0617-\u061A\u064B-\u0652]', '', text)
    # Collapse hamza/alef variants and common letter substitutions,
    # applied in the same order as chained re.sub calls.
    substitutions = (
        (r'[إأآا]', 'ا'),
        (r'ى', 'ي'),
        (r'ؤ', 'و'),
        (r'ئ', 'ي'),
        (r'ة', 'ه'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text
195
+
196
def replace_arabic_numbers(text: str, numbers=None) -> str:
    """Replace spelled-out Arabic digit words (0-9) with ASCII digits.

    Args:
        text: Input transcription.
        numbers: Optional word -> digit mapping. Defaults to the
            module-level ``arabic_numbers`` table, so existing callers
            are unaffected.

    Returns:
        Text with each whole-word occurrence replaced by its digit.
    """
    mapping = arabic_numbers if numbers is None else numbers
    for word, digit in mapping.items():
        # \b restricts matches to whole words so we never rewrite a
        # digit-word that appears inside a longer word.
        text = re.sub(rf'\b{re.escape(word)}\b', digit, text)
    return text
201
+
202
def join_digit_sequences(text: str) -> str:
    """Collapse runs of consecutive digit tokens into one number token."""
    merged = []
    run = []
    for token in text.split():
        if token.isdigit():
            run.append(token)
            continue
        # Non-digit token: flush any pending digit run first.
        if run:
            merged.append("".join(run))
            run = []
        merged.append(token)
    if run:  # trailing digit run
        merged.append("".join(run))
    return " ".join(merged)
217
+
218
def ensure_16k_wav(input_path, output_path):
    """Resample the input audio to 16 kHz mono and write it as WAV."""
    target_sr = 16000
    audio, _ = librosa.load(input_path, sr=target_sr, mono=True)
    sf.write(output_path, audio, target_sr)
221
+
222
# ===== FastAPI app =====
app = FastAPI(title="Arabic ASR API", description="ASR API with NeMo and Arabic/English digit conversion")

@app.on_event("startup")
def load_model():
    """Load the fine-tuned NeMo model once at startup and configure decoding.

    Stores the model in the module-level ``asr_model`` global that the
    transcription endpoints use.
    """
    global asr_model
    model_path = "output_finetuned/finetuned_model_best.nemo"
    asr_model = nemo_asr.models.EncDecHybridRNNTCTCModel.restore_from(model_path)
    # Show which greedy-decoding knobs this checkpoint actually exposes.
    print("Available greedy parameters:")
    print(OmegaConf.to_yaml(asr_model.cfg.decoding.greedy))

    # ===== STEP 3: Configure for LITERAL transcription =====
    print("🔍 Configuring greedy decoding for literal output...")

    # Relax struct mode temporarily so new keys can be added to the config.
    OmegaConf.set_struct(asr_model.cfg.decoding, False)
    OmegaConf.set_struct(asr_model.cfg.decoding.greedy, False)

    decoding_cfg = asr_model.cfg.decoding
    decoding_cfg.strategy = "maes"

    # Not every NeMo version exposes max_symbols_per_step; tolerate its
    # absence, but don't hide unrelated failures behind a bare `except:`
    # (bug fix: the original swallowed KeyboardInterrupt/SystemExit too).
    try:
        decoding_cfg.greedy.max_symbols_per_step = 300
        print(f"✓ max_symbols_per_step: {decoding_cfg.greedy.max_symbols_per_step}")
    except Exception:
        print("⚠ Could not set max_symbols_per_step")

    decoding_cfg.greedy.max_symbols = 500
    decoding_cfg.greedy.loop_labels = True
    decoding_cfg.greedy.preserve_alignments = True
    decoding_cfg.preserve_alignments = True
    decoding_cfg.compute_timestamps = True
    decoding_cfg.temperature = 1.3

    decoding_cfg.beam.beam_size = 64
    decoding_cfg.beam.softmax_temperature = 1.3
    decoding_cfg.beam.search_type = "beam"
    print(f"✓ max_symbols: {decoding_cfg.greedy.max_symbols}")
    print(f"✓ loop_labels: {decoding_cfg.greedy.loop_labels}")
    print(f"✓ temperature: {decoding_cfg.temperature}")

    # Re-enable struct mode
    OmegaConf.set_struct(asr_model.cfg.decoding, True)
    OmegaConf.set_struct(asr_model.cfg.decoding.greedy, True)

    # Apply configuration
    asr_model.change_decoding_strategy(decoding_cfg)
271
+
272
+
273
+ @app.post("/transcribe")
274
+ async def transcribe_audio(file: UploadFile = File(...)):
275
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
276
+ tmp.write(await file.read())
277
+ tmp_path = tmp.name
278
+
279
+ resampled_path = tmp_path.replace(".wav", "_16k.wav")
280
+ ensure_16k_wav(tmp_path, resampled_path)
281
+
282
+ try:
283
+ result = asr_model.transcribe([resampled_path])
284
+
285
+ # Handle NeMo tuple/list structure robustly
286
+ if isinstance(result, tuple):
287
+ result = result[0] # take first element if tuple
288
+ if isinstance(result, list):
289
+ raw_text = result[0]
290
+ else:
291
+ raw_text = str(result)
292
+
293
+ # Normalize and replace Arabic numerals
294
+ raw_text = normalize_arabic(raw_text)
295
+ cleaned_text = replace_arabic_numbers(raw_text)
296
+ cleaned_text = join_digit_sequences(cleaned_text)
297
+
298
+ print("📝 Cleaned Transcription:", cleaned_text) # for debug
299
+ return JSONResponse(content={"transcription": cleaned_text})
300
+
301
+ finally:
302
+ os.remove(tmp_path)
303
+ if os.path.exists(resampled_path):
304
+ os.remove(resampled_path)
305
+
306
+
307
+ @app.post("/transcribe-bytes")
308
+ async def transcribe_audio_bytes(audio_bytes: bytes = File(...)):
309
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
310
+ tmp.write(audio_bytes)
311
+ tmp_path = tmp.name
312
+
313
+ resampled_path = tmp_path.replace(".wav", "_16k.wav")
314
+ ensure_16k_wav(tmp_path, resampled_path)
315
+
316
+ try:
317
+ result = asr_model.transcribe([resampled_path])
318
+ # Robust extraction
319
+ if isinstance(result, list):
320
+ # if first element is also a list, flatten it
321
+ first = result[0]
322
+ if isinstance(first, list):
323
+ raw_text = first[0]
324
+ elif isinstance(first, str):
325
+ raw_text = first
326
+ elif hasattr(first, "text"): # sometimes result contains objects with 'text'
327
+ raw_text = first.text
328
+ else:
329
+ raw_text = str(first) # fallback to string
330
+ else:
331
+ raw_text = str(result)
332
+ #print("Raw text:", raw_text)
333
+
334
+ raw_text = normalize_arabic(raw_text)
335
+ cleaned_text = replace_arabic_numbers(raw_text)
336
+ cleaned_text = join_digit_sequences(cleaned_text)
337
+
338
+ return JSONResponse(content={"transcription": cleaned_text})
339
+ finally:
340
+ os.remove(tmp_path)
341
+ if os.path.exists(resampled_path):
342
+ os.remove(resampled_path)
343
+
344
+ if __name__ == "__main__":
345
+ uvicorn.run(app, host="0.0.0.0", port=8000, reload=True)
continue_finetuning_nemo.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import json
4
+ import torch
5
+ from pytorch_lightning import Trainer
6
+ from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
7
+ from nemo.collections.asr.models import EncDecHybridRNNTCTCBPEModel
8
+ from omegaconf import open_dict , DictConfig
9
+
10
+ # ============================================================
11
+ # Environment Fixes (Windows / CUDA)
12
+ # ============================================================
13
+ os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
14
+ os.environ["NUMBA_CUDA_USE_NVIDIA_BINDING"] = "1"
15
+ os.environ["NUMBA_DISABLE_JIT"] = "0"
16
+ os.environ["NUMBA_CUDA_DRIVER"] = "cuda"
17
+
18
+ # Uncomment to use GPU (recommended for RTX 3070)
19
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
20
+
21
+ # ============================================================
22
+ # UTF-8 Fix for Manifest
23
+ # ============================================================
24
+ manifest_path = "train_manifest.jsonl"
25
+ with io.open(manifest_path, 'r', encoding='utf-8', errors='ignore') as f:
26
+ content = f.read()
27
+ with io.open(manifest_path, 'w', encoding='utf-8') as f:
28
+ f.write(content)
29
+ print("✅ train_manifest.jsonl converted to UTF-8")
30
+
31
+ # Patch builtins.open for UTF-8
32
+ import builtins
33
+ _old_open = open
34
+ def open_utf8(file, *args, **kwargs):
35
+ if isinstance(file, str) and file.endswith('.jsonl') and 'encoding' not in kwargs:
36
+ kwargs['encoding'] = 'utf-8'
37
+ return _old_open(file, *args, **kwargs)
38
+ builtins.open = open_utf8
39
+
40
+ # ============================================================
41
+ # Validate Manifest
42
+ # ============================================================
43
def validate_manifest(manifest_path):
    """Validate a JSONL training manifest line by line.

    Each entry must parse as JSON, reference an existing audio file, and
    carry non-empty text. Bad lines are reported and skipped.

    Args:
        manifest_path: Path to the JSONL manifest.

    Returns:
        Number of valid entries.
    """
    count = 0
    with open(manifest_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f, 1):
            try:
                item = json.loads(line.strip())
                # Raise instead of assert: asserts are stripped under
                # `python -O`, which would silently disable validation.
                if not os.path.exists(item["audio_filepath"]):
                    raise ValueError(f"Missing: {item['audio_filepath']}")
                if not item.get("text", "").strip():
                    raise ValueError("Empty text")
                count += 1
            except Exception as e:
                print(f"❌ Line {i} error: {e}")
                print(f"   Content: {line[:100]}")
    print(f"✅ Valid entries: {count}")
    return count
57
+
58
# Abort early: continuing with an empty dataset would only crash later
# inside the NeMo dataloader with a far less helpful error.
valid_count = validate_manifest(manifest_path)
if valid_count == 0:
    raise ValueError("No valid training samples found!")

# ============================================================
# Paths and Hyperparameters
# ============================================================
BASE_MODEL_PATH = "stt_ar_fastconformer_hybrid_large_pcd_v1.0.nemo"  # Arabic FastConformer hybrid RNNT/CTC checkpoint
SAVE_DIR = "output_finetuned"                    # all checkpoints and the final .nemo land here
LAST_CKPT = os.path.join(SAVE_DIR, "last.ckpt")  # written by ModelCheckpoint(save_last=True)

BATCH_SIZE = 4
ADDITIONAL_EPOCHS = 50     # epochs to ADD on top of the last checkpoint's epoch
LEARNING_RATE = 1e-5       # small LR: fine-tuning, not training from scratch
WARMUP_STEPS = 500
WEIGHT_DECAY = 0.00001

os.makedirs(SAVE_DIR, exist_ok=True)

# ============================================================
# Load Model
# ============================================================
print("🔹 Loading pretrained or last fine-tuned model...")
# restore_from unpacks the .nemo archive (weights + config + tokenizer).
model = EncDecHybridRNNTCTCBPEModel.restore_from(BASE_MODEL_PATH)

# ============================================================
# Tokenizer Fix
# ============================================================
# Point the config at a local tokenizer directory so the model can be
# re-saved and reloaded outside the original .nemo archive.
with open_dict(model.cfg):
    tokenizer_dir = os.path.join(os.path.dirname(BASE_MODEL_PATH), "tokenizer")
    os.makedirs(tokenizer_dir, exist_ok=True)
    model.cfg.tokenizer.dir = tokenizer_dir
    model.cfg.tokenizer.type = "bpe"

# Disable validation/test manifests: this run trains (and early-stops)
# on train_loss only.
if 'validation_ds' in model.cfg:
    model.cfg.validation_ds.manifest_filepath = None
if 'test_ds' in model.cfg:
    model.cfg.test_ds.manifest_filepath = None

# ============================================================
# Setup Training Data
# ============================================================
train_ds_config = {
    "manifest_filepath": manifest_path,
    "batch_size": BATCH_SIZE,
    "shuffle": True,
    "num_workers": 0,        # single-process loading (Windows-safe)
    "pin_memory": False,
    "sample_rate": 16000,    # model's expected input rate
    "max_duration": 20.0,    # drop clips outside the 0.5–20 s window
    "min_duration": 0.5,
    "trim_silence": True,
    "use_start_end_token": True,
    "normalize_transcripts": True,
    "parser": "ar",          # Arabic text parser/normalizer
}
model.setup_training_data(train_ds_config)

# ============================================================
# Optimizer & Scheduler
# ============================================================
with open_dict(model.cfg):
    model.cfg.optim.name = "adamw"
    model.cfg.optim.lr = LEARNING_RATE
    model.cfg.optim.betas = [0.9, 0.98]
    model.cfg.optim.weight_decay = WEIGHT_DECAY
    model.cfg.optim.eps = 1e-8
    model.cfg.optim.sched = {
        "name": "CosineAnnealing",
        "warmup_steps": WARMUP_STEPS,
        "min_lr": 1e-7,
        "last_epoch": -1,
    }

# ============================================================
# Callbacks
# ============================================================
# NOTE(review): everything monitors train_loss because validation is
# disabled above; early stopping on training loss mainly guards against
# divergence, not overfitting — confirm this is intended.
checkpoint_callback = ModelCheckpoint(
    dirpath=SAVE_DIR,
    filename='continued-{epoch:02d}-{train_loss:.4f}',
    save_top_k=3,
    monitor='train_loss',
    mode='min',
    save_last=True,           # produces SAVE_DIR/last.ckpt for resuming
)

early_stop_callback = EarlyStopping(
    monitor='train_loss',
    patience=20,
    mode='min',
    verbose=True,
)

lr_monitor = LearningRateMonitor(logging_interval='step')

# ============================================================
# Determine Max Epochs Based on Last Checkpoint
# ============================================================
# ============================================================
# Allow loading full NeMo checkpoint (trusted source)
# ============================================================
# Newer torch defaults torch.load to weights_only=True; allow-list
# DictConfig so the full Lightning checkpoint (trusted, locally produced)
# can be unpickled.
torch.serialization.add_safe_globals([DictConfig])

if os.path.exists(LAST_CKPT):
    # weights_only=False performs a full pickle load — only safe because
    # the checkpoint was produced by this very script.
    ckpt_data = torch.load(LAST_CKPT, map_location="cpu", weights_only=False)
    last_epoch = ckpt_data.get("epoch", 0)
    # Trainer's max_epochs is absolute, so extend it past the stored epoch.
    new_max_epochs = last_epoch + ADDITIONAL_EPOCHS
    print(f"🧩 Last checkpoint epoch: {last_epoch} → continuing up to {new_max_epochs} epochs total.")
else:
    new_max_epochs = ADDITIONAL_EPOCHS

# ============================================================
# Trainer
# ============================================================
trainer = Trainer(
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1,
    max_epochs=new_max_epochs,
    log_every_n_steps=1,
    enable_checkpointing=True,
    default_root_dir=SAVE_DIR,
    callbacks=[checkpoint_callback, early_stop_callback, lr_monitor],
    gradient_clip_val=1.0,       # clip to stabilize RNNT training
    accumulate_grad_batches=4,   # effective batch ≈ 4 * BATCH_SIZE
)

# ============================================================
# Continue Training
# ============================================================
if os.path.exists(LAST_CKPT):
    print(f"🚀 Continuing training from checkpoint: {LAST_CKPT}")
    # ckpt_path restores optimizer/scheduler state and the epoch counter.
    trainer.fit(model, ckpt_path=LAST_CKPT)
else:
    print("⚠️ No checkpoint found, training from base model...")
    trainer.fit(model)

# ============================================================
# Save Final Model
# ============================================================
final_model_path = os.path.join(SAVE_DIR, "finetuned_model_continued.nemo")
model.save_to(final_model_path)
print(f"\n✅ Continued fine-tuned model saved to: {final_model_path}")
converting_dataset_to_8khz.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import json
import subprocess
import soundfile as sf

# ==============================
# CONFIGURATION
# ==============================
input_folder = "data_tts_evaluation"
output_folder = "data_tts_eval_8k_ulaw"
old_manifest = "eval_manifest.jsonl"  # Optional: reuse text references from a prior manifest
new_manifest = "eval_manifest_8k_ulaw.jsonl"

# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Supported audio formats
valid_ext = (".wav", ".mp3", ".flac", ".ogg", ".m4a")

# ==============================
# Load Texts from Old Manifest (if exists)
# ==============================
# Map basename (without extension) -> transcript, so converted files can
# inherit the text of their source file.
text_map = {}
if os.path.exists(old_manifest):
    print(f"🔹 Loading existing manifest: {old_manifest}")
    with open(old_manifest, "r", encoding="utf-8") as f:
        for line in f:
            try:
                item = json.loads(line.strip())
                key = os.path.splitext(os.path.basename(item["audio_filepath"]))[0]
                text_map[key] = item.get("text", "")
            except Exception as e:
                # Best-effort: a bad manifest line loses only its text mapping.
                print(f"⚠️ Error reading line: {e}")

# ==============================
# CONVERSION LOOP + MANIFEST CREATION
# ==============================
converted_entries = []

for filename in os.listdir(input_folder):
    if not filename.lower().endswith(valid_ext):
        continue

    input_path = os.path.join(input_folder, filename)
    base_name = os.path.splitext(filename)[0]
    output_name = base_name + "_8k_ulaw.wav"
    output_path = os.path.join(output_folder, output_name)

    # FFmpeg command: convert to mono 8kHz u-law (telephony format)
    cmd = [
        "ffmpeg",
        "-y",                 # overwrite existing output
        "-i", input_path,
        "-ar", "8000",        # 8 kHz sample rate
        "-ac", "1",           # mono
        "-c:a", "pcm_mulaw",  # u-law codec
        output_path
    ]

    try:
        # Capture stderr so a failure can be reported per file below.
        subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        # Get duration of new file
        data, samplerate = sf.read(output_path)
        duration = round(len(data) / samplerate, 2)

        # Get text (if exists from old manifest)
        text = text_map.get(base_name, "")

        # Add entry to new manifest (forward slashes for portability)
        converted_entries.append({
            "audio_filepath": output_path.replace("\\", "/"),
            "duration": duration,
            "text": text
        })

        # Name the source file in the log so progress is traceable.
        print(f"✅ Converted: {filename} → {output_name} ({duration}s)")

    except subprocess.CalledProcessError as e:
        # Name the failing file — without it the error log is undiagnosable.
        print(f"❌ Error converting {filename}: {e.stderr.decode('utf-8', errors='ignore')}")

# ==============================
# SAVE NEW MANIFEST
# ==============================
if converted_entries:
    with open(new_manifest, "w", encoding="utf-8") as f:
        for entry in converted_entries:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")
    print(f"\n💾 Manifest saved to: {new_manifest}")
    print(f"📊 Total entries: {len(converted_entries)}")
else:
    print("⚠️ No audio files converted or manifest empty!")

print(f"\n🎯 Conversion complete! {len(converted_entries)} files saved in '{output_folder}'.")
data_tts/gcloud_tts_sample_001.wav ADDED
Binary file (94.6 kB). View file
 
data_tts/gcloud_tts_sample_002.wav ADDED
Binary file (82.5 kB). View file
 
data_tts/gcloud_tts_sample_053.wav ADDED
Binary file (98.5 kB). View file
 
data_tts/gcloud_tts_sample_060.wav ADDED
Binary file (99.7 kB). View file
 
data_tts/gcloud_tts_sample_1065.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3299bdc8d6906ab1152f326a4b2839966a842deab4762c837aa073b7c4b286dd
3
+ size 183062
data_tts/gcloud_tts_sample_1067.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:211ce2911a62fc66fd143e6434b4f053f0ccde0ce5b007f0a99c98a5caeefa8f
3
+ size 133318
data_tts/gcloud_tts_sample_107.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c840bba5d03b45f415219171e226055fc94735f43187c816f4d723a9eb162d7a
3
+ size 158232
data_tts/gcloud_tts_sample_1078.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1774d6b6b212dac76b782be61c89a466a900e2ab9620b223e78df14c70f30d3d
3
+ size 172362
data_tts/gcloud_tts_sample_1080.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fae59de9a6431baadb0b50aa9ff7e2adc12808f498827c3913e4519adcc51849
3
+ size 179680
data_tts/gcloud_tts_sample_1082.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9dfddaf1051afae6e4fa8298fe948d6ffcfc4681ef35c883195969c8ba5a22c1
3
+ size 162944
data_tts/gcloud_tts_sample_1189.wav ADDED
Binary file (77.5 kB). View file
 
data_tts/gcloud_tts_sample_1190.wav ADDED
Binary file (69.7 kB). View file
 
data_tts/gcloud_tts_sample_1191.wav ADDED
Binary file (77 kB). View file
 
data_tts/gcloud_tts_sample_1192.wav ADDED
Binary file (97.9 kB). View file
 
data_tts/gcloud_tts_sample_1193.wav ADDED
Binary file (74.6 kB). View file
 
data_tts/gcloud_tts_sample_1221.wav ADDED
Binary file (99.7 kB). View file
 
data_tts/gcloud_tts_sample_1222.wav ADDED
Binary file (95.8 kB). View file
 
data_tts/gcloud_tts_sample_1236.wav ADDED
Binary file (83.7 kB). View file
 
data_tts/gcloud_tts_sample_1241.wav ADDED
Binary file (99.7 kB). View file
 
data_tts/gcloud_tts_sample_1277.wav ADDED
Binary file (32.9 kB). View file
 
data_tts/gcloud_tts_sample_1278.wav ADDED
Binary file (48.7 kB). View file
 
data_tts/gcloud_tts_sample_1279.wav ADDED
Binary file (66.2 kB). View file
 
data_tts/gcloud_tts_sample_1280.wav ADDED
Binary file (83.7 kB). View file
 
data_tts/gcloud_tts_sample_1286.wav ADDED
Binary file (47.6 kB). View file
 
data_tts/gcloud_tts_sample_1287.wav ADDED
Binary file (75.2 kB). View file
 
data_tts/gcloud_tts_sample_1295.wav ADDED
Binary file (43 kB). View file
 
data_tts/gcloud_tts_sample_1296.wav ADDED
Binary file (75.1 kB). View file
 
data_tts/gcloud_tts_sample_1297.wav ADDED
Binary file (95.8 kB). View file
 
data_tts/gcloud_tts_sample_1304.wav ADDED
Binary file (41.8 kB). View file
 
data_tts/gcloud_tts_sample_1305.wav ADDED
Binary file (64.5 kB). View file
 
data_tts/gcloud_tts_sample_1306.wav ADDED
Binary file (88.5 kB). View file
 
data_tts/gcloud_tts_sample_1313.wav ADDED
Binary file (50 kB). View file
 
data_tts/gcloud_tts_sample_1314.wav ADDED
Binary file (81.7 kB). View file
 
data_tts/gcloud_tts_sample_1322.wav ADDED
Binary file (51.5 kB). View file
 
eval_manifest.jsonl ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_000.wav", "duration": 1.6, "text": "علاء سيد عبد الله"}
2
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_001.wav", "duration": 2.0, "text": "محمد أحمد عبد الرحمن"}
3
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_002.wav", "duration": 2.7, "text": "كريم محمود عبد الغفار"}
4
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_003.wav", "duration": 1.7, "text": "يوسف علي عبد الحليم"}
5
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_004.wav", "duration": 1.6, "text": "مصطفى طارق حسن"}
6
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_005.wav", "duration": 2.55, "text": "إبراهيم محمد عبد العزيز"}
7
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_006.wav", "duration": 2.85, "text": "خالد عمر عبد السميع"}
8
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_007.wav", "duration": 1.6, "text": "أحمد سامي حسين"}
9
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_008.wav", "duration": 2.0, "text": "محمود ناصر عبد اللطيف"}
10
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_009.wav", "duration": 1.9, "text": "عمر عبد الله محمد"}
11
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_010.wav", "duration": 1.65, "text": "مينا فادي نصيف"}
12
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_011.wav", "duration": 1.65, "text": "بيتر عادل صليب"}
13
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_012.wav", "duration": 1.65, "text": "جرجس سامح حكيم"}
14
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_013.wav", "duration": 1.75, "text": "رامي فوزي بشارة"}
15
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_014.wav", "duration": 1.7, "text": "فادي منير عوض"}
16
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_015.wav", "duration": 1.5, "text": "مريم يوسف فؤاد"}
17
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_016.wav", "duration": 2.0, "text": "نانسي شريف عياد"}
18
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_017.wav", "duration": 1.35, "text": "كيرلس ممدوح سمعان"}
19
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_018.wav", "duration": 1.65, "text": "هالة فؤاد حبيب"}
20
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_019.wav", "duration": 1.7, "text": "مارجريت جرجس فخري"}
21
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_020.wav", "duration": 1.8, "text": "ريم أحمد عبد الباري"}
22
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_021.wav", "duration": 1.9, "text": "شروق محمد عبد الرحيم"}
23
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_022.wav", "duration": 1.65, "text": "إيمان حسن مصطفى"}
24
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_023.wav", "duration": 2.5, "text": "فاطمة الزهراء عبد الله"}
25
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_024.wav", "duration": 2.7, "text": "سارة خالد عبد الباقي"}
26
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_025.wav", "duration": 1.8, "text": "ندى إبراهيم حسن"}
27
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_026.wav", "duration": 1.45, "text": "دينا محمود فوزي"}
28
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_027.wav", "duration": 2.15, "text": "لبنى عبد الرحمن السيد"}
29
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_028.wav", "duration": 1.7, "text": "آية طارق عبد الجليل"}
30
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_029.wav", "duration": 1.85, "text": "أسماء علي إبراهيم"}
31
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_030.wav", "duration": 1.9, "text": "أحمد عصام عبد الرحمن"}
32
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_031.wav", "duration": 1.75, "text": "نور هشام عبد الله"}
33
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_032.wav", "duration": 1.4, "text": "نجلاء سامي فؤاد"}
34
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_033.wav", "duration": 1.7, "text": "رنا علاء الدين أحمد"}
35
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_034.wav", "duration": 2.55, "text": "عادل فخري سمعان"}
36
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_035.wav", "duration": 1.4, "text": "بولا هاني رزق"}
37
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_036.wav", "duration": 1.45, "text": "مينا يوسف بشاي"}
38
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_037.wav", "duration": 1.75, "text": "أبانوب فادي كامل"}
39
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_038.wav", "duration": 2.25, "text": "مارينا جرجس جاد"}
40
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_039.wav", "duration": 2.1, "text": "كريستين فؤاد صموئيل"}
41
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_040.wav", "duration": 2.1, "text": "سليم أحمد عبد المقصود"}
42
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_041.wav", "duration": 1.95, "text": "عمار محمد عبد الرحيم"}
43
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_042.wav", "duration": 2.6, "text": "أنس عبد الله محمود"}
44
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_043.wav", "duration": 1.7, "text": "زياد عمرو ناصر"}
45
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_044.wav", "duration": 2.0, "text": "أمير يوسف عبد الغفار"}
46
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_045.wav", "duration": 2.4, "text": "خالد مصطفى عبد الحميد"}
47
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_046.wav", "duration": 1.75, "text": "جرجس عادل لبيب"}
48
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_047.wav", "duration": 1.4, "text": "بولا فخري بطرس"}
49
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_048.wav", "duration": 1.75, "text": "مارينا فادي صادق"}
50
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_049.wav", "duration": 2.3, "text": "جوليان جورج عزيز"}
51
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_050.wav", "duration": 1.45, "text": "نادر سامي رزق"}
52
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_051.wav", "duration": 1.75, "text": "عبد الرحمن أحمد عبد الله"}
53
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_052.wav", "duration": 1.5, "text": "محمد طه السيد"}
54
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_053.wav", "duration": 1.75, "text": "أحمد ياسر مصطفى"}
55
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_054.wav", "duration": 2.1, "text": "سيد عبد الفتاح عبد الغني"}
56
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_055.wav", "duration": 3.05, "text": "محمد رمضان عبد الحكيم"}
57
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_056.wav", "duration": 2.1, "text": "عبد الله حمدي عبد الفتاح"}
58
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_057.wav", "duration": 2.1, "text": "أيمن جمال عبد الناصر"}
59
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_058.wav", "duration": 2.45, "text": "أحمد عبد الرازق حسن"}
60
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_059.wav", "duration": 1.65, "text": "محمود خالد محمد"}
61
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_060.wav", "duration": 1.6, "text": "مروان عماد عبد الله"}
62
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_061.wav", "duration": 2.7, "text": "عبد الرحمن محمد شريف"}
63
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_062.wav", "duration": 1.95, "text": "أحمد محروس عبد اللطيف"}
64
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_063.wav", "duration": 2.4, "text": "مصطفى عبد القادر عبد السميع"}
65
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_064.wav", "duration": 1.9, "text": "عبد العزيز حسن عبد الله"}
66
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_065.wav", "duration": 2.25, "text": "مينا شنودة فخري"}
67
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_066.wav", "duration": 1.5, "text": "بولا يوسف بطرس"}
68
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_067.wav", "duration": 1.45, "text": "فادي عادل رسمي"}
69
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_068.wav", "duration": 2.2, "text": "جرجس فوزي منصور"}
70
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_069.wav", "duration": 1.95, "text": "كيرلس رأفت نجيب"}
71
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_070.wav", "duration": 1.5, "text": "مارينا جورج عادل"}
72
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_071.wav", "duration": 1.85, "text": "ديفيد ماهر منير"}
73
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_072.wav", "duration": 1.75, "text": "كارولين فادي شكر"}
74
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_073.wav", "duration": 1.9, "text": "مريم سامي فؤاد"}
75
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_074.wav", "duration": 1.6, "text": "أندرو فؤاد رزق"}
76
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_075.wav", "duration": 1.9, "text": "ريهام عبد الله محمد"}
77
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_076.wav", "duration": 2.7, "text": "سارة عماد حسن"}
78
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_077.wav", "duration": 1.9, "text": "ميادة عبد الحميد ناصر"}
79
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_078.wav", "duration": 1.7, "text": "آية أحمد عبد الله"}
80
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_079.wav", "duration": 1.95, "text": "نورهان عبد الفتاح علي"}
81
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_080.wav", "duration": 1.35, "text": "هدير خالد حسن"}
82
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_081.wav", "duration": 1.9, "text": "بسمة إبراهيم عبد الغني"}
83
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_082.wav", "duration": 2.3, "text": "أسماء طارق عبد الرحمن"}
84
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_083.wav", "duration": 1.75, "text": "يمنى محمد عبد الحليم"}
85
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_084.wav", "duration": 2.25, "text": "صفاء عبد الرحمن السيد"}
86
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_085.wav", "duration": 1.9, "text": "منال أحمد حسن"}
87
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_086.wav", "duration": 2.65, "text": "رحمة عبد الله محمود"}
88
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_087.wav", "duration": 2.35, "text": "ياسمين خالد عبد الرحمن"}
89
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_088.wav", "duration": 2.2, "text": "شيماء أحمد عبد الغفار"}
90
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_089.wav", "duration": 1.95, "text": "علا سامي عبد المقصود"}
91
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_090.wav", "duration": 1.95, "text": "رغدة علي عبد الباري"}
92
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_091.wav", "duration": 1.95, "text": "هايدي جرجس بطرس"}
93
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_092.wav", "duration": 1.5, "text": "نيرمين مينا فؤاد"}
94
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_093.wav", "duration": 1.7, "text": "جيسيكا بولا منصور"}
95
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_094.wav", "duration": 1.65, "text": "ماريان يوسف رسمي"}
96
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_095.wav", "duration": 1.5, "text": "كارين فادي شنودة"}
97
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_096.wav", "duration": 1.75, "text": "أميرة أحمد عبد الله"}
98
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_097.wav", "duration": 2.3, "text": "نورا إبراهيم حسن"}
99
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_098.wav", "duration": 1.6, "text": "هبة طارق عبد الرحمن"}
100
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_099.wav", "duration": 1.65, "text": "دعاء عبد الله السيد"}
101
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_100.wav", "duration": 1.9, "text": "عبير خالد عبد العزيز"}
102
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_101.wav", "duration": 1.7, "text": "خلود ناصر عبد الغفار"}
103
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_102.wav", "duration": 2.25, "text": "جيهان عبد الرحمن محمود"}
104
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_103.wav", "duration": 7.6, "text": "اثنين ثلاثة زيرو واحد واحد اثنين زيرو سبعة واحد زيرو زيرو واحد واحد اثنين"}
105
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_104.wav", "duration": 7.35, "text": "اثنين خمسة زيرو تسعة زيرو خمسة اثنين واحد واحد تسعة زيرو زيرو ثلاثة"}
106
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_105.wav", "duration": 7.15, "text": "ثلاثة زيرو واحد واحد اثنين ثلاثة زيرو سبعة واحد سبعة تسعة زيرو خمسة"}
107
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_106.wav", "duration": 6.75, "text": "اثنين تسعة زيرو سبعة واحد واحد زيرو اثنين واحد زيرو خمسة زيرو اثنين"}
108
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_107.wav", "duration": 8.2, "text": "ثلاثة واحد زيرو اثنين واحد اثنين زيرو تسعة واحد زيرو سبعة زيرو واحد"}
109
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_108.wav", "duration": 6.9, "text": "ثلاثة اثنين زيرو ثلاثة واحد واحد زيرو سبعة واحد تسعة زيرو زيرو أربعة"}
110
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_109.wav", "duration": 6.7, "text": "اثنين ثمانية زيرو تسعة واحد واحد زيرو خمسة واحد زيرو زيرو زيرو ستة"}
111
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_110.wav", "duration": 8.25, "text": "ثلاثة زيرو واحد واحد زيرو تسعة زيرو تسعة واحد ثمانية زيرو زيرو سبعة"}
112
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_111.wav", "duration": 5.25, "text": "اثنين خمسة زيرو اثنين زيرو اثنين زيرو سبعة واحد زيرو زيرو زيرو خمسة"}
113
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_112.wav", "duration": 7.95, "text": "ثلاثة واحد زيرو سبعة واحد زيرو زيرو خمسة واحد زيرو تسعة زيرو واحد"}
114
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_113.wav", "duration": 5.45, "text": "اثنين ستة زيرو واحد زيرو ثلاثة زيرو سبعة واحد زيرو زيرو زيرو تسعة"}
115
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_114.wav", "duration": 7.3, "text": "اثنين تسعة زيرو ثمانية واحد اثنين زيرو أربعة واحد زيرو زيرو زيرو اثنين"}
116
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_115.wav", "duration": 6.35, "text": "ثلاثة اثنين زيرو سبعة واحد واحد زيرو تسعة واحد زيرو زيرو زيرو خمسة"}
117
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_116.wav", "duration": 7.15, "text": "ثلاثة واحد زيرو واحد واحد اثنين زيرو خمسة واحد زيرو زيرو زيرو أربعة"}
118
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_117.wav", "duration": 7.3, "text": "اثنين ثمانية زيرو تسعة واحد زيرو زيرو سبعة واحد زيرو زيرو زيرو اثنين"}
119
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_118.wav", "duration": 7.65, "text": "ثلاثة واحد زيرو خمسة واحد اثنين زيرو تسعة واحد زيرو تسعة زيرو ثلاثة"}
120
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_119.wav", "duration": 7.45, "text": "اثنين تسعة زيرو اثنين واحد واحد زيرو ثمانية واحد زيرو زيرو زيرو خمسة"}
121
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_120.wav", "duration": 8.25, "text": "ثلاثة اثنين زيرو واحد واحد اثنين زيرو أربعة واحد زيرو زيرو زيرو تسعة"}
122
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_121.wav", "duration": 6.25, "text": "اثنين واحد زيرو ثلاثة واحد زيرو زيرو سبعة واحد زيرو زيرو زيرو أربعة"}
123
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_122.wav", "duration": 6.65, "text": "ثلاثة اثنين زيرو ثمانية واحد واحد زيرو خمسة واحد زيرو تسعة زيرو أربعة"}
124
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_123.wav", "duration": 6.7, "text": "زيرو واحد زيرو واحد اثنين ثلاثة أربعة خمسة ستة سبعة ثمانية"}
125
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_124.wav", "duration": 5.65, "text": "زيرو واحد واحد تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين"}
126
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_125.wav", "duration": 5.15, "text": "زيرو واحد اثنين زيرو تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة"}
127
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_126.wav", "duration": 5.15, "text": "زيرو واحد خمسة سبعة ثمانية تسعة أربعة ثلاثة اثنين واحد زيرو"}
128
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_127.wav", "duration": 5.6, "text": "زيرو واحد زيرو تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين واحد"}
129
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_128.wav", "duration": 7.25, "text": "زيرو واحد واحد زيرو أربعة خمسة ستة سبعة ثمانية تسعة زيرو اثنين"}
130
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_129.wav", "duration": 8.25, "text": "زيرو واحد اثنين واحد تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين"}
131
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_130.wav", "duration": 8.9, "text": "زيرو واحد خمسة زيرو تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين"}
132
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_131.wav", "duration": 7.3, "text": "زيرو واحد واحد خمسة ستة سبعة ثمانية تسعة أربعة ثلاثة اثنين زيرو"}
133
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_132.wav", "duration": 7.15, "text": "زيرو واحد اثنين أربعة خمسة ستة سبعة ثمانية تسعة زيرو واحد اثنين"}
134
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_133.wav", "duration": 2.1, "text": "في حادث عربية عند كوبري عباس"}
135
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_134.wav", "duration": 2.05, "text": "فيه حريق في عمارة في شارع فيصل"}
136
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_135.wav", "duration": 1.5, "text": "لقيت طفل تاه في المول"}
137
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_136.wav", "duration": 2.0, "text": "في خناقة كبيرة في ميدان الجيزة"}
138
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_137.wav", "duration": 2.75, "text": "عربية مقلوبة على الطريق الدائري"}
139
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_138.wav", "duration": 1.6, "text": "في صوت ضرب نار في الهرم"}
140
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_139.wav", "duration": 1.7, "text": "جارتي وقعت من البلكونة"}
141
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_140.wav", "duration": 2.95, "text": "حصلت سرقة في الشارع عند السوبرماركت"}
142
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_141.wav", "duration": 2.25, "text": "في واحد بيعتدي على بنت في الشارع"}
143
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_142.wav", "duration": 2.4, "text": "حصل انفجار صغير في محل الغاز"}
144
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_143.wav", "duration": 2.2, "text": "شفت عربية بتخبط موتوسيكل وهربت"}
145
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_144.wav", "duration": 2.35, "text": "طفل محبوس في الأسانسير"}
146
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_145.wav", "duration": 2.4, "text": "في شاب مصاب قدام محطة المترو"}
147
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_146.wav", "duration": 1.95, "text": "العربية عطلت في نص الطريق"}
148
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_147.wav", "duration": 1.9, "text": "في تسريب غاز في العمارة"}
149
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_148.wav", "duration": 2.9, "text": "واحد كبير في السن مغمى عليه في المسجد"}
150
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_149.wav", "duration": 3.0, "text": "حصلت مشاجرة بالسكاكين في السوق"}
151
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_150.wav", "duration": 2.55, "text": "عربية إسعاف اتأخرت على المكان"}
152
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_151.wav", "duration": 2.95, "text": "فيه كلب شرس بيهاجم الناس في الشارع"}
153
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_152.wav", "duration": 3.7, "text": "في بنت اتخطفِت من عند المدرسة"}
154
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_153.wav", "duration": 2.85, "text": "في حادث تصادم في محور 26 يوليو"}
155
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_154.wav", "duration": 4.15, "text": "واحد وقع من فوق سلم البيت"}
156
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_155.wav", "duration": 1.65, "text": "النور قاطع في الشارع كله"}
157
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_156.wav", "duration": 1.85, "text": "صوت انفجار جامد في المنطقة"}
158
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_157.wav", "duration": 3.05, "text": "العربية دخلت في محل في الهرم"}
159
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_158.wav", "duration": 1.75, "text": "طفلة ضايعة في المول"}
160
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_159.wav", "duration": 2.9, "text": "في تسريب مياه من الدور الرابع"}
161
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_160.wav", "duration": 2.15, "text": "خناقة بين الجيران فوق السطح"}
162
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_161.wav", "duration": 3.35, "text": "فيه عربية مركونة غلط قافلة الشارع"}
163
+ {"audio_filepath": "data_tts_evaluation\\openai_tts_sample_162.wav", "duration": 3.45, "text": "الغاز بيخرج من البوتاجاز ومفيش حد في الشقة"}
evaluation_results.csv ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio,expected,predicted,WER,CER
2
+ openai_tts_sample_000.wav,علاء سيد عبد الله,علاء سيد عبد الله,0.0,0.0
3
+ openai_tts_sample_001.wav,محمد أحمد عبد الرحمن,محمد أحمد عبد الرحمن,0.0,0.0
4
+ openai_tts_sample_002.wav,كريم محمود عبد الغفار,كريم محمود عبد الغفار,0.0,0.0
5
+ openai_tts_sample_003.wav,يوسف علي عبد الحليم,يوسف علي عبد الحليم,0.0,0.0
6
+ openai_tts_sample_004.wav,مصطفى طارق حسن,مصطفى طارق حسن,0.0,0.0
7
+ openai_tts_sample_005.wav,إبراهيم محمد عبد العزيز,إبراهيم محمد عبد العزيز,0.0,0.0
8
+ openai_tts_sample_006.wav,خالد عمر عبد السميع,خالد عمر عبد السميع,0.0,0.0
9
+ openai_tts_sample_007.wav,أحمد سامي حسين,أحمد سامي حسين,0.0,0.0
10
+ openai_tts_sample_008.wav,محمود ناصر عبد اللطيف,محمود ناصر عبد اللطيف,0.0,0.0
11
+ openai_tts_sample_009.wav,عمر عبد الله محمد,عمر عبد الله محمد,0.0,0.0
12
+ openai_tts_sample_010.wav,مينا فادي نصيف,مينا فادي نصيف,0.0,0.0
13
+ openai_tts_sample_011.wav,بيتر عادل صليب,بيتر عادل صليب,0.0,0.0
14
+ openai_tts_sample_012.wav,جرجس سامح حكيم,جرجس سامح حكيم,0.0,0.0
15
+ openai_tts_sample_013.wav,رامي فوزي بشارة,رامي فوزي بشارة,0.0,0.0
16
+ openai_tts_sample_014.wav,فادي منير عوض,فادي منير عوض,0.0,0.0
17
+ openai_tts_sample_015.wav,مريم يوسف فؤاد,مريم يوسف فؤاد,0.0,0.0
18
+ openai_tts_sample_016.wav,نانسي شريف عياد,نانسي شريف عياد,0.0,0.0
19
+ openai_tts_sample_017.wav,كيرلس ممدوح سمعان,كيرلس ممدوح سمعان,0.0,0.0
20
+ openai_tts_sample_018.wav,هالة فؤاد حبيب,هانا فؤاد حبيب,0.3333333333333333,0.16666666666666666
21
+ openai_tts_sample_019.wav,مارجريت جرجس فخري,ماربريت جرجس فخري,0.3333333333333333,0.06666666666666667
22
+ openai_tts_sample_020.wav,ريم أحمد عبد الباري,ريم أحمد عبد الباري,0.0,0.0
23
+ openai_tts_sample_021.wav,شروق محمد عبد الرحيم,شروق محمد عبد الرحيم,0.0,0.0
24
+ openai_tts_sample_022.wav,إيمان حسن مصطفى,إيمان حسن مصطفى,0.0,0.0
25
+ openai_tts_sample_023.wav,فاطمة الزهراء عبد الله,فاطمة زحراء عبد الله,0.25,0.15789473684210525
26
+ openai_tts_sample_024.wav,سارة خالد عبد الباقي,سارة خالد عبد الباقي,0.0,0.0
27
+ openai_tts_sample_025.wav,ندى إبراهيم حسن,ندى إبراهيم حسن,0.0,0.0
28
+ openai_tts_sample_026.wav,دينا محمود فوزي,دينا محمود فوزي,0.0,0.0
29
+ openai_tts_sample_027.wav,لبنى عبد الرحمن السيد,لبنى عبد الرحمن السيد,0.0,0.0
30
+ openai_tts_sample_028.wav,آية طارق عبد الجليل,آية طارق عبد الجليل,0.0,0.0
31
+ openai_tts_sample_029.wav,أسماء علي إبراهيم,أسماء علي إبراهيم,0.0,0.0
32
+ openai_tts_sample_030.wav,أحمد عصام عبد الرحمن,أحمد عصام عبد الرحمن,0.0,0.0
33
+ openai_tts_sample_031.wav,نور هشام عبد الله,نور هشام عبد الله,0.0,0.0
34
+ openai_tts_sample_032.wav,نجلاء سامي فؤاد,نجلاء سامي فؤاد,0.0,0.0
35
+ openai_tts_sample_033.wav,رنا علاء الدين أحمد,رنا علاء الدين أحمد,0.0,0.0
36
+ openai_tts_sample_034.wav,عادل فخري سمعان,عادل فخري سمعان,0.0,0.0
37
+ openai_tts_sample_035.wav,بولا هاني رزق,بولا هاني رزق,0.0,0.0
38
+ openai_tts_sample_036.wav,مينا يوسف بشاي,مينا يوسف بيشاي,0.3333333333333333,0.08333333333333333
39
+ openai_tts_sample_037.wav,أبانوب فادي كامل,أبانوب فادي كامل,0.0,0.0
40
+ openai_tts_sample_038.wav,مارينا جرجس جاد,مارينا جرجس كاد,0.3333333333333333,0.07692307692307693
41
+ openai_tts_sample_039.wav,كريستين فؤاد صموئيل,كريستين فؤاد صموئيل,0.0,0.0
42
+ openai_tts_sample_040.wav,سليم أحمد عبد المقصود,سليم أحمد عبد المقصود,0.0,0.0
43
+ openai_tts_sample_041.wav,عمار محمد عبد الرحيم,أنار محمد عبد الرحيم,0.25,0.11764705882352941
44
+ openai_tts_sample_042.wav,أنس عبد الله محمود,أنس عبد الله محمود,0.0,0.0
45
+ openai_tts_sample_043.wav,زياد عمرو ناصر,زياد عمرو ناصر,0.0,0.0
46
+ openai_tts_sample_044.wav,أمير يوسف عبد الغفار,أمير يوسف عبد الغفار,0.0,0.0
47
+ openai_tts_sample_045.wav,خالد مصطفى عبد الحميد,خالد مصطفى عبد الحميد,0.0,0.0
48
+ openai_tts_sample_046.wav,جرجس عادل لبيب,جرجس عادل لبيب,0.0,0.0
49
+ openai_tts_sample_047.wav,بولا فخري بطرس,ولا فخري بطرس,0.3333333333333333,0.08333333333333333
50
+ openai_tts_sample_048.wav,مارينا فادي صادق,مارينا فادي صادق,0.0,0.0
51
+ openai_tts_sample_049.wav,جوليان جورج عزيز,جوليان جورج عزيز,0.0,0.0
52
+ openai_tts_sample_050.wav,نادر سامي رزق,نادر سامي رزق,0.0,0.0
53
+ openai_tts_sample_051.wav,عبد الرحمن أحمد عبد الله,عبد الرحمن أحمد عبد الله,0.0,0.0
54
+ openai_tts_sample_052.wav,محمد طه السيد,محمد طه السيد,0.0,0.0
55
+ openai_tts_sample_053.wav,أحمد ياسر مصطفى,أحمد ياسر مصطفى,0.0,0.0
56
+ openai_tts_sample_054.wav,سيد عبد الفتاح عبد الغني,سيد عبد الفتاح عبد الغني,0.0,0.0
57
+ openai_tts_sample_055.wav,محمد رمضان عبد الحكيم,محمد رمضان عبد الحكيم,0.0,0.0
58
+ openai_tts_sample_056.wav,عبد الله حمدي عبد الفتاح,عبد الله حمدي عبد الفتاح,0.0,0.0
59
+ openai_tts_sample_057.wav,أيمن جمال عبد الناصر,أيمن جمال عبد الناصر,0.0,0.0
60
+ openai_tts_sample_058.wav,أحمد عبد الرازق حسن,أحمد عبد الرازق حسن,0.0,0.0
61
+ openai_tts_sample_059.wav,محمود خالد محمد,محمود خالد محمد,0.0,0.0
62
+ openai_tts_sample_060.wav,مروان عماد عبد الله,مروان عماد عبد الله,0.0,0.0
63
+ openai_tts_sample_061.wav,عبد الرحمن محمد شريف,عبد الرحمن محمد شريف,0.0,0.0
64
+ openai_tts_sample_062.wav,أحمد محروس عبد اللطيف,أحمد محروس عبد اللطيف,0.0,0.0
65
+ openai_tts_sample_063.wav,مصطفى عبد القادر عبد السميع,مصطفى عبد القادر عبد السميع,0.0,0.0
66
+ openai_tts_sample_064.wav,عبد العزيز حسن عبد الله,عبد العزيز حسن عبد الله,0.0,0.0
67
+ openai_tts_sample_065.wav,مينا شنودة فخري,مينا شنودة فخري,0.0,0.0
68
+ openai_tts_sample_066.wav,بولا يوسف بطرس,بولا يوسف بطرس,0.0,0.0
69
+ openai_tts_sample_067.wav,فادي عادل رسمي,فادي عادل لسمي,0.3333333333333333,0.08333333333333333
70
+ openai_tts_sample_068.wav,جرجس فوزي منصور,جرجس فوزي منصور,0.0,0.0
71
+ openai_tts_sample_069.wav,كيرلس رأفت نجيب,كيرلس رأفت نجيب,0.0,0.0
72
+ openai_tts_sample_070.wav,مارينا جورج عادل,مارينا جورج عادل,0.0,0.0
73
+ openai_tts_sample_071.wav,ديفيد ماهر منير,ديفيد ماهر منير,0.0,0.0
74
+ openai_tts_sample_072.wav,كارولين فادي شكر,كارولين فادي شكر,0.0,0.0
75
+ openai_tts_sample_073.wav,مريم سامي فؤاد,مريم سامي فؤاد,0.0,0.0
76
+ openai_tts_sample_074.wav,أندرو فؤاد رزق,أندرو فؤاد رزق,0.0,0.0
77
+ openai_tts_sample_075.wav,ريهام عبد الله محمد,ريهام عبد الله محمد,0.0,0.0
78
+ openai_tts_sample_076.wav,سارة عماد حسن,سارة عماد حسن,0.0,0.0
79
+ openai_tts_sample_077.wav,ميادة عبد الحميد ناصر,مادة عبد الحميد ناصر,0.25,0.05555555555555555
80
+ openai_tts_sample_078.wav,آية أحمد عبد الله,آية أحمد عبد الله,0.0,0.0
81
+ openai_tts_sample_079.wav,نورهان عبد الفتاح علي,نرهان عبد الفتاح علي,0.25,0.05555555555555555
82
+ openai_tts_sample_080.wav,هدير خالد حسن,هدير خالد حسن,0.0,0.0
83
+ openai_tts_sample_081.wav,بسمة إبراهيم عبد الغني,بسمة إبراهيم عبد الغني,0.0,0.0
84
+ openai_tts_sample_082.wav,أسماء طارق عبد الرحمن,أسماء طارق عبد الرحمن,0.0,0.0
85
+ openai_tts_sample_083.wav,يمنى محمد عبد الحليم,يمنى محمد عبد الحليم,0.0,0.0
86
+ openai_tts_sample_084.wav,صفاء عبد الرحمن السيد,صفاء عبد الرحمن السيد,0.0,0.0
87
+ openai_tts_sample_085.wav,منال أحمد حسن,منال أحمد حسن,0.0,0.0
88
+ openai_tts_sample_086.wav,رحمة عبد الله محمود,رحمة عبد الله محمود,0.0,0.0
89
+ openai_tts_sample_087.wav,ياسمين خالد عبد الرحمن,ياسمين خالد عبد الرحمن,0.0,0.0
90
+ openai_tts_sample_088.wav,شيماء أحمد عبد الغفار,شيماء أحمد عبد الغفار,0.0,0.0
91
+ openai_tts_sample_089.wav,علا سامي عبد المقصود,علا سامي عبد المقصود,0.0,0.0
92
+ openai_tts_sample_090.wav,رغدة علي عبد الباري,رغدة علي عبد الباري,0.0,0.0
93
+ openai_tts_sample_091.wav,هايدي جرجس بطرس,هايدي جرجس بطرس,0.0,0.0
94
+ openai_tts_sample_092.wav,نيرمين مينا فؤاد,نرمين مينا فؤاد,0.3333333333333333,0.07142857142857142
95
+ openai_tts_sample_093.wav,جيسيكا بولا منصور,كيسيك بولا منصور,0.3333333333333333,0.13333333333333333
96
+ openai_tts_sample_094.wav,ماريان يوسف رسمي,ماريان يوسف رسمي,0.0,0.0
97
+ openai_tts_sample_095.wav,كارين فادي شنودة,كريم فادي شنودة,0.3333333333333333,0.14285714285714285
98
+ openai_tts_sample_096.wav,أميرة أحمد عبد الله,أميرة أحمد عبد الله,0.0,0.0
99
+ openai_tts_sample_097.wav,نورا إبراهيم حسن,نورا إبراهيم حسن,0.0,0.0
100
+ openai_tts_sample_098.wav,هبة طارق عبد الرحمن,هبة طارق عبد الرحمن,0.0,0.0
101
+ openai_tts_sample_099.wav,دعاء عبد الله السيد,دعاء عبد الله السيد,0.0,0.0
102
+ openai_tts_sample_100.wav,عبير خالد عبد العزيز,أمير خالد عبد العزيز,0.25,0.11764705882352941
103
+ openai_tts_sample_101.wav,خلود ناصر عبد الغفار,ولود ناصر عبد الغفار,0.25,0.058823529411764705
104
+ openai_tts_sample_102.wav,جيهان عبد الرحمن محمود,جيهان عبد الرحمن محمود,0.0,0.0
105
+ openai_tts_sample_103.wav,اثنين ثلاثة زيرو واحد واحد اثنين زيرو سبعة واحد زيرو زيرو واحد واحد اثنين,اثنين ثلاثة زيرو واحد واحد اثنين زيرو سبعة واحد زيرو زيرو واحد واحد اثنين,0.0,0.0
106
+ openai_tts_sample_104.wav,اثنين خمسة زيرو تسعة زيرو خمسة اثنين واحد واحد تسعة زيرو زيرو ثلاثة,اتنين خمسة زيرو تسعة زيرو خمسة اتنين واحد واحد تسعة زيرو زيرو تلاتة,0.23076923076923078,0.07272727272727272
107
+ openai_tts_sample_105.wav,ثلاثة زيرو واحد واحد اثنين ثلاثة زيرو سبعة واحد سبعة تسعة زيرو خمسة,ثلاثة زيرو واحد واحد اثنين ثلاثة زيرو سبعة واحد سبعة تسعة زيرو خمسة,0.0,0.0
108
+ openai_tts_sample_106.wav,اثنين تسعة زيرو سبعة واحد واحد زيرو اثنين واحد زيرو خمسة زيرو اثنين,اتنين تسعة زيرو سبعة واحد واحد زيرو اثنين واحد واحد زيرو خمسة زيرو اثنين,0.14285714285714285,0.09090909090909091
109
+ openai_tts_sample_107.wav,ثلاثة واحد زيرو اثنين واحد اثنين زيرو تسعة واحد زيرو سبعة زيرو واحد,ثلاثة واحد زيرو اثنين واحد اثنين زيرو تسعة واحد زيرو سبعة زيرو واحد,0.0,0.0
110
+ openai_tts_sample_108.wav,ثلاثة اثنين زيرو ثلاثة واحد واحد زيرو سبعة واحد تسعة زيرو زيرو أربعة,ثلاثة اتنين زيرو ثلاثة واحد زيرو سبعة واحد تسعة زيرو زيرو أربعة,0.16666666666666666,0.08928571428571429
111
+ openai_tts_sample_109.wav,اثنين ثمانية زيرو تسعة واحد واحد زيرو خمسة واحد زيرو زيرو زيرو ستة,اتنين تمانية زيرو تسعة واحد زيرو خمسة واحد زيرو زيرو زيرو ستة,0.25,0.1111111111111111
112
+ openai_tts_sample_110.wav,ثلاثة زيرو واحد واحد زيرو تسعة زيرو تسعة واحد ثمانية زيرو زيرو سبعة,ثلاثة زيرو واحد واحد زيرو تسعة زيرو تسعة واحد ثمانية زيرو سبعة,0.08333333333333333,0.07272727272727272
113
+ openai_tts_sample_111.wav,اثنين خمسة زيرو اثنين زيرو اثنين زيرو سبعة واحد زيرو زيرو زيرو خمسة,اتنين خمسة زيرو اتنين زيرو اتنين زيرو سبعة واحد زيرو زيرو زيرو خمسة,0.23076923076923078,0.05454545454545454
114
+ openai_tts_sample_112.wav,ثلاثة واحد زيرو سبعة واحد زيرو زيرو خمسة واحد زيرو تسعة زيرو واحد,ثلاثة واحد زيرو سبعة واحد زيرو زيرو خمسة واحد زيرو تسعة زيرو واحد,0.0,0.0
115
+ openai_tts_sample_113.wav,اثنين ستة زيرو واحد زيرو ثلاثة زيرو سبعة واحد زيرو زيرو زيرو تسعة,اتنين ستة زيرو واحد زيرو ثلاثة زيرو سبعة واحد زيرو زيرو زيرو تسعة,0.07692307692307693,0.018867924528301886
116
+ openai_tts_sample_114.wav,اثنين تسعة زيرو ثمانية واحد اثنين زيرو أربعة واحد زيرو زيرو زيرو اثنين,اتنين تسعة زيرو تمانية واحد اثنين زيرو اربعة واحد زيرو زيرو اتنين,0.4166666666666667,0.13793103448275862
117
+ openai_tts_sample_115.wav,ثلاثة اثنين زيرو سبعة واحد واحد زيرو تسعة واحد زيرو زيرو زيرو خمسة,ثلاثة اثنين زيرو سبعة واحد زيرو زيرو زيرو خمسة,0.4444444444444444,0.2962962962962963
118
+ openai_tts_sample_116.wav,ثلاثة واحد زيرو واحد واحد اثنين زيرو خمسة واحد زيرو زيرو زيرو أربعة,ثلاثة زيرو واحد واحد اثنين زيرو خمسة واحد زيرو زيرو زيرو أربعة,0.08333333333333333,0.07272727272727272
119
+ openai_tts_sample_117.wav,اثنين ثمانية زيرو تسعة واحد زيرو زيرو سبعة واحد زيرو زيرو زيرو اثنين,اتنين تمانية زيرو تسعة واحد زيرو زيرو سبعة واحد زيرو زيرو زيرو اتنين,0.23076923076923078,0.05357142857142857
120
+ openai_tts_sample_118.wav,ثلاثة واحد زيرو خمسة واحد اثنين زيرو تسعة واحد زيرو تسعة زيرو ثلاثة,تلاتة واحد زيرو خمسة واحد اتنين زيرو تسعة واحد زيرو تسعة,0.36363636363636365,0.21818181818181817
121
+ openai_tts_sample_119.wav,اثنين تسعة زيرو اثنين واحد واحد زيرو ثمانية واحد زيرو زيرو زيرو خمسة,اتنين تسعة زيرو اثنين واحد واحد زيرو اثنين زيرو زيرو خمسة,0.36363636363636365,0.23214285714285715
122
+ openai_tts_sample_120.wav,ثلاثة اثنين زيرو واحد واحد اثنين زيرو أربعة واحد زيرو زيرو زيرو تسعة,ثلاثة اثنين زيرو واحد واحد اثنين زيرو أربعة واحد زيرو زيرو زيرو تسعة,0.0,0.0
123
+ openai_tts_sample_121.wav,اثنين واحد زيرو ثلاثة واحد زيرو زيرو سبعة واحد زيرو زيرو زيرو أربعة,اتنين واحد زيرو ثلاثة واحد زيرو زيرو سبعة واحد زيرو زيرو زيرو أربعة,0.07692307692307693,0.01818181818181818
124
+ openai_tts_sample_122.wav,ثلاثة اثنين زيرو ثمانية واحد واحد زيرو خمسة واحد زيرو تسعة زيرو أربعة,ثلاثة اتنين زيرو تمانية واحد زيرو خمسة واحد زيرو تسعة زيرو أربعة,0.25,0.10526315789473684
125
+ openai_tts_sample_123.wav,زيرو واحد زيرو واحد اثنين ثلاثة أربعة خمسة ستة سبعة ثمانية,زيرو واحد زيرو واحد اثنين تلاتة أربعة خمسة ستة سبعة تمانية,0.18181818181818182,0.0625
126
+ openai_tts_sample_124.wav,زيرو واحد واحد تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين,زيرو واحد واحد تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين,0.0,0.0
127
+ openai_tts_sample_125.wav,زيرو واحد اثنين زيرو تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة,زيرو واحد اتنين زيرو تسعة تمانية سبعة ستة خمسة أربعة تلاتة,0.2727272727272727,0.08333333333333333
128
+ openai_tts_sample_126.wav,زيرو واحد خمسة سبعة ثمانية تسعة أربعة ثلاثة اثنين واحد زيرو,زيرو واحد خمسة سبعة اتنين تسعة أربعة تلاتة اتنين واحد زيرو,0.2727272727272727,0.14285714285714285
129
+ openai_tts_sample_127.wav,زيرو واحد زيرو تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين واحد,زيرو واحد زيرو تسعة تمانية سبعة ستة خمسة أربعة تلاتة اتنين واحد,0.25,0.07692307692307693
130
+ openai_tts_sample_128.wav,زيرو واحد واحد زيرو أربعة خمسة ستة سبعة ثمانية تسعة زيرو اثنين,زيرو واحد واحد زيرو أربعة خمسة ستة سبعة ثمانية تسعة زيرو اثنين,0.0,0.0
131
+ openai_tts_sample_129.wav,زيرو واحد اثنين واحد تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين,زيرو واحد اثنين واحد تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين,0.0,0.0
132
+ openai_tts_sample_130.wav,زيرو واحد خمسة زيرو تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين,زيرو واحد خمسة زيرو تسعة تمانية سبعة ستة خمسة أربعة تلاتة اتنين,0.25,0.07692307692307693
133
+ openai_tts_sample_131.wav,زيرو واحد واحد خمسة ستة سبعة ثمانية تسعة أربعة ثلاثة اثنين زيرو,زيرو واحد واحد خمسة ستة سبعة ثمانية تسعة أربعة ثلاثة اثنين زيرو,0.0,0.0
134
+ openai_tts_sample_132.wav,زيرو واحد اثنين أربعة خمسة ستة سبعة ثمانية تسعة زيرو واحد اثنين,زيرو واحد اتنين أربعة خمسة ستة سبعة ثمانية تسعة زيرو واحد اتنين,0.16666666666666666,0.038461538461538464
135
+ openai_tts_sample_133.wav,في حادث عربية عند كوبري عباس,في حادث عربية عند كوبري عباس,0.0,0.0
136
+ openai_tts_sample_134.wav,فيه حريق في عمارة في شارع فيصل,فيه حريق في عمارة في شارع فيصل.,0.14285714285714285,0.041666666666666664
137
+ openai_tts_sample_135.wav,لقيت طفل تاه في المول,ماجيت طفل تايه في المول,0.4,0.23529411764705882
138
+ openai_tts_sample_136.wav,في خناقة كبيرة في ميدان الجيزة,في خناقة كبيرة في ميدان الجيزة.,0.16666666666666666,0.04
139
+ openai_tts_sample_137.wav,ع��بية مقلوبة على الطريق الدائري,عربية مقلوبة على الطريق الدائري.,0.2,0.037037037037037035
140
+ openai_tts_sample_138.wav,في صوت ضرب نار في الهرم,في سود ضرب نار في الهرم.,0.3333333333333333,0.16666666666666666
141
+ openai_tts_sample_139.wav,جارتي وقعت من البلكونة,جارتي وقعت من البلكونة,0.0,0.0
142
+ openai_tts_sample_140.wav,حصلت سرقة في الشارع عند السوبرماركت,حصلت سرقة في الشارع عند السوبرماركت,0.0,0.0
143
+ openai_tts_sample_141.wav,في واحد بيعتدي على بنت في الشارع,في واحد بيعتدي على بنت في الشارع,0.0,0.0
144
+ openai_tts_sample_142.wav,حصل انفجار صغير في محل الغاز,حصل انفجار صغير في محل الغاز,0.0,0.0
145
+ openai_tts_sample_143.wav,شفت عربية بتخبط موتوسيكل وهربت,شفت عربية بتخبط موتوسيكل وهربت.,0.2,0.038461538461538464
146
+ openai_tts_sample_144.wav,طفل محبوس في الأسانسير,طفل محبوس في الأسانسير,0.0,0.0
147
+ openai_tts_sample_145.wav,في شاب مصاب قدام محطة المترو,في شاب مصاب قدام محطة المترو.,0.16666666666666666,0.043478260869565216
148
+ openai_tts_sample_146.wav,العربية عطلت في نص الطريق,العربية عطلت في نص الطريق.,0.2,0.047619047619047616
149
+ openai_tts_sample_147.wav,في تسريب غاز في العمارة,في تسريب غاز في العمارة.,0.2,0.05263157894736842
150
+ openai_tts_sample_148.wav,واحد كبير في السن مغمى عليه في المسجد,واحد كبير في السن مغمى عليه في المسجد,0.0,0.0
151
+ openai_tts_sample_149.wav,حصلت مشاجرة بالسكاكين في السوق,حصلت مشاجرة بالسكاكين في السوق,0.0,0.0
152
+ openai_tts_sample_150.wav,عربية إسعاف اتأخرت على المكان,عربية اسعاف اتأخرت على المكان.,0.4,0.08
153
+ openai_tts_sample_151.wav,فيه كلب شرس بيهاجم الناس في الشارع,فيه كلب شرس بيهاجم الناس في الشارع,0.0,0.0
154
+ openai_tts_sample_152.wav,في بنت اتخطفِت من عند المدرسة,في بنت اتخطفت من عند المدرسة,0.16666666666666666,0.041666666666666664
155
+ openai_tts_sample_153.wav,في حادث تصادم في محور 26 يوليو,في حادث تصادم في محور ⁇ يوليو.,0.2857142857142857,0.125
156
+ openai_tts_sample_154.wav,واحد وقع من فوق سلم البيت,واحد وقع من فوق سلم البيت,0.0,0.0
157
+ openai_tts_sample_155.wav,النور قاطع في الشارع كله,النور قاطع في الشارع كله,0.0,0.0
158
+ openai_tts_sample_156.wav,صوت انفجار جامد في المنطقة,صوت انفجار جامد في المنطقة,0.0,0.0
159
+ openai_tts_sample_157.wav,العربية دخلت في محل في الهرم,العربية دخلت في محل في الهرم.,0.16666666666666666,0.043478260869565216
160
+ openai_tts_sample_158.wav,طفلة ضايعة في المول,طفلة ضايعة في المول,0.0,0.0
161
+ openai_tts_sample_159.wav,في تسريب مياه من الدور الرابع,في تسريب مياه من الدور الرابع.,0.16666666666666666,0.041666666666666664
162
+ openai_tts_sample_160.wav,خناقة بين الجيران فوق السطح,خناقة بين الجيران فوق السطح,0.0,0.0
163
+ openai_tts_sample_161.wav,فيه عربية مركونة غلط قافلة الشارع,فيه عربية مركونة غلط قافل الشارع,0.16666666666666666,0.03571428571428571
164
+ openai_tts_sample_162.wav,الغاز بيخرج من البوتاجاز ومفيش حد في الشقة,الغاز بيخرج من البوتاجاز ومفيش حد في الشقة,0.0,0.0
finetune_asr.py ADDED
@@ -0,0 +1,711 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import json
4
+ import torch
5
+ from pytorch_lightning import Trainer
6
+ from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
7
+ from nemo.collections.asr.models import EncDecHybridRNNTCTCBPEModel
8
+ from omegaconf import OmegaConf, open_dict
9
+
10
+ # ============================================
11
+ # CRITICAL: Windows CUDA/Numba Fix
12
+ # ============================================
13
+ os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
14
+ os.environ["NUMBA_CUDA_USE_NVIDIA_BINDING"] = "1"
15
+ os.environ["NUMBA_DISABLE_JIT"] = "0"
16
+ os.environ["NUMBA_CUDA_DRIVER"] = "cuda"
17
+
18
+ # Force CPU for RNNT loss on Windows (prevents access violation)
19
+ os.environ["CUDA_VISIBLE_DEVICES"] = ""
20
+
21
+ # ============================================
22
+ # UTF-8 Encoding Fix
23
+ # ============================================
24
+ manifest_path = "train_manifest.jsonl"
25
+
26
+ with io.open(manifest_path, 'r', encoding='utf-8', errors='ignore') as f:
27
+ content = f.read()
28
+ with io.open(manifest_path, 'w', encoding='utf-8') as f:
29
+ f.write(content)
30
+ print("✅ train_manifest.jsonl converted to UTF-8")
31
+
32
+ # Patch builtins.open for UTF-8
33
+ import builtins
34
+ _old_open = open
35
+ def open_utf8(file, *args, **kwargs):
36
+ if isinstance(file, str) and file.endswith('.jsonl') and 'encoding' not in kwargs:
37
+ kwargs['encoding'] = 'utf-8'
38
+ return _old_open(file, *args, **kwargs)
39
+ builtins.open = open_utf8
40
+
41
+ # ============================================
42
+ # Validate Manifest
43
+ # ============================================
44
+ def validate_manifest(manifest_path):
45
+ count = 0
46
+ with open(manifest_path, "r", encoding="utf-8") as f:
47
+ for i, line in enumerate(f, 1):
48
+ try:
49
+ item = json.loads(line.strip())
50
+ assert os.path.exists(item["audio_filepath"]), f"Missing: {item['audio_filepath']}"
51
+ assert "text" in item and item["text"].strip(), "Empty text"
52
+ count += 1
53
+ except Exception as e:
54
+ print(f"❌ Line {i} error: {e}")
55
+ print(f" Content: {line[:100]}")
56
+ print(f"✅ Valid entries: {count}")
57
+ return count
58
+
59
+ valid_count = validate_manifest(manifest_path)
60
+ if valid_count == 0:
61
+ raise ValueError("No valid training samples found!")
62
+
63
+ # ============================================
64
+ # Configuration (OPTIMIZED FOR ACCURACY)
65
+ # ============================================
66
+ BASE_MODEL_PATH = "stt_ar_fastconformer_hybrid_large_pcd_v1.0.nemo"
67
+ # TRAIN_MANIFEST = "train_manifest_hf_converted.jsonl"
68
+ TRAIN_MANIFEST = "train_manifest.jsonl"
69
+ SAVE_DIR = "output_finetuned"
70
+
71
+ # OPTIMIZED HYPERPARAMETERS FOR LOWER WER
72
+ BATCH_SIZE = 8 #4 # Slightly smaller for better gradient stability
73
+ MAX_EPOCHS = 250 # More epochs for better convergence
74
+ LEARNING_RATE = 5e-5#3e-5#1e-5 # Lower LR prevents catastrophic forgetting
75
+ WARMUP_STEPS = 500 # Gradual warmup for stability
76
+ WEIGHT_DECAY = 0.00001 # Lighter regularization
77
+ ACCUMULATE_GRAD_BATCHES = 4 # Effective batch = 8*4 = 32
78
+
79
+
80
+ os.makedirs(SAVE_DIR, exist_ok=True)
81
+
82
+ # ============================================
83
+ # Load Model and Fix Tokenizer Path
84
+ # ============================================
85
+ print("🔹 Loading pretrained model...")
86
+ model = EncDecHybridRNNTCTCBPEModel.restore_from(BASE_MODEL_PATH)
87
+
88
+ # CRITICAL FIX: Set tokenizer directory to current model directory
89
+ print("🔹 Fixing tokenizer configuration...")
90
+ with open_dict(model.cfg):
91
+ # Set tokenizer directory to the extracted location
92
+ tokenizer_dir = os.path.join(os.path.dirname(BASE_MODEL_PATH), "tokenizer")
93
+ os.makedirs(tokenizer_dir, exist_ok=True)
94
+
95
+ if hasattr(model, 'tokenizer'):
96
+ print(f"ℹ️ Using existing SentencePiece tokenizer at: {tokenizer_dir}")
97
+ model.cfg.tokenizer.dir = tokenizer_dir
98
+ model.cfg.tokenizer.type = "bpe"
99
+ else:
100
+ print("⚠️ No tokenizer object found in model — check model restoration path.")
101
+
102
+ # Disable validation/test datasets (will add custom validation)
103
+ if 'validation_ds' in model.cfg:
104
+ model.cfg.validation_ds.manifest_filepath = None
105
+ if 'test_ds' in model.cfg:
106
+ model.cfg.test_ds.manifest_filepath = None
107
+
108
+ # ============================================
109
+ # Setup Training Data (OPTIMIZED)
110
+ # ============================================
111
+ print("🔹 Setting up training data...")
112
+ train_ds_config = {
113
+ "manifest_filepath": TRAIN_MANIFEST,
114
+ "batch_size": BATCH_SIZE,
115
+ "shuffle": True,
116
+ "num_workers": 0,
117
+ "pin_memory": False,
118
+ "sample_rate": 16000,
119
+ "max_duration": 20.0, # Limit very long utterances
120
+ "min_duration": 0.5, # Filter out very short clips
121
+ "trim_silence": True,
122
+ "use_start_end_token": True,
123
+ # CRITICAL: Enable these for better training
124
+ "normalize_transcripts": True,
125
+ "parser": "ar", # Arabic language parser
126
+ }
127
+
128
+ model.setup_training_data(train_ds_config)
129
+
130
+ # ============================================
131
+ # Configure Optimizer (OPTIMIZED FOR CONVERGENCE)
132
+ # ============================================
133
+ print("🔹 Configuring optimizer...")
134
+ with open_dict(model.cfg):
135
+ # Use AdamW with lower learning rate for fine-tuning
136
+ model.cfg.optim.name = "adamw"
137
+ model.cfg.optim.lr = LEARNING_RATE
138
+ model.cfg.optim.betas = [0.9, 0.98] # Better for transformers
139
+ model.cfg.optim.weight_decay = WEIGHT_DECAY
140
+ model.cfg.optim.eps = 1e-8
141
+
142
+ # Add learning rate scheduling for better convergence
143
+ model.cfg.optim.sched = {
144
+ "name": "CosineAnnealing",
145
+ "warmup_steps": 1000,
146
+ "warmup_ratio": None,
147
+ "min_lr": 1e-7,
148
+ "last_epoch": -1,
149
+ }
150
+
151
+ # CRITICAL: Disable aggressive augmentation during fine-tuning
152
+ if 'spec_augment' in model.cfg:
153
+ model.cfg.spec_augment.freq_masks = 0 # Reduce from default
154
+ model.cfg.spec_augment.time_masks = 0 # Reduce from default
155
+ model.cfg.spec_augment.freq_width = 15 # Reduce masking
156
+ model.cfg.spec_augment.time_width = 0.03 # Reduce masking
157
+
158
+ # ============================================
159
+ # Configure Loss Weights for Hybrid Model (OPTIMIZED)
160
+ # ============================================
161
+ print("🔹 Optimizing loss weights...")
162
+ # For Hybrid RNNT-CTC models, balance the losses
163
+ if hasattr(model, 'loss_alpha'):
164
+ # 0.5 = balanced, adjust based on your data
165
+ # Higher CTC weight (0.7-0.9) often works better for fine-tuning
166
+ model.loss_alpha = 0.9 # 70% CTC, 30% RNNT
167
+ print(f" Loss alpha set to: {model.loss_alpha}")
168
+
169
+ # ============================================
170
+ # Callbacks for Best Model Selection
171
+ # ============================================
172
+ print("🔹 Setting up model checkpointing...")
173
+
174
+ # Save best model based on training loss (since no validation set)
175
+ checkpoint_callback = ModelCheckpoint(
176
+ dirpath=SAVE_DIR,
177
+ filename='best-model-{epoch:02d}-{train_loss:.4f}',
178
+ save_top_k=5,
179
+ monitor='train_loss',
180
+ mode='min',
181
+ save_last=True,
182
+ every_n_epochs=2,
183
+ )
184
+
185
+ early_stop_callback = EarlyStopping(
186
+ monitor='train_loss',
187
+ patience=20,
188
+ mode='min',
189
+ verbose=True,
190
+ )
191
+
192
+
193
+ # Monitor learning rate
194
+ lr_monitor = LearningRateMonitor(logging_interval='step')
195
+
196
+ # ============================================
197
+ # Trainer Configuration (CPU Mode - OPTIMIZED)
198
+ # ============================================
199
+ print("🔹 Configuring trainer for CPU...")
200
+ trainer = Trainer(
201
+ accelerator="cpu",
202
+ devices=1,
203
+ max_epochs=MAX_EPOCHS,
204
+ log_every_n_steps=1,
205
+ enable_checkpointing=True,
206
+ default_root_dir=SAVE_DIR,
207
+ callbacks=[checkpoint_callback, early_stop_callback, lr_monitor],
208
+ gradient_clip_val=1.0, # Prevent gradient explosion
209
+ gradient_clip_algorithm="norm",
210
+ accumulate_grad_batches=8, # Effective batch size = 4*8 = 32
211
+ val_check_interval=1.0, # Validate every epoch
212
+ enable_progress_bar=True,
213
+ enable_model_summary=True,
214
+ )
215
+
216
+ # ============================================
217
+ # Start Training
218
+ # ============================================
219
+ print("=" * 60)
220
+ print("🚀 STARTING OPTIMIZED FINE-TUNING")
221
+ print("=" * 60)
222
+ print(f" Model: {BASE_MODEL_PATH}")
223
+ print(f" Training samples: {valid_count}")
224
+ print(f" Max epochs: {MAX_EPOCHS}")
225
+ print(f" Batch size: {BATCH_SIZE} (effective: {BATCH_SIZE * 8})")
226
+ print(f" Learning rate: {LEARNING_RATE}")
227
+ print(f" Warmup steps: {WARMUP_STEPS}")
228
+ print(f" Loss weighting: CTC={model.loss_alpha if hasattr(model, 'loss_alpha') else 'N/A'}")
229
+ print(f" Early stopping patience: 20 epochs")
230
+ print("=" * 60)
231
+ print("⚠️ CPU training will be slow. For faster training, use Google Colab GPU.")
232
+ print("=" * 60)
233
+
234
+ try:
235
+ trainer.fit(model)
236
+ print("\n✅ Training completed successfully!")
237
+
238
+ # Load the best checkpoint
239
+ best_model_path = checkpoint_callback.best_model_path
240
+ if best_model_path:
241
+ print(f"📊 Best model checkpoint: {best_model_path}")
242
+ print(f" Best loss: {checkpoint_callback.best_model_score:.4f}")
243
+
244
+ # ✅ Safe load for PyTorch 2.6+ (NeMo-compatible)
245
+ import typing # Add this import at the top if not already there
246
+ import omegaconf
247
+ torch.serialization.add_safe_globals([
248
+ omegaconf.dictconfig.DictConfig,
249
+ omegaconf.base.ContainerMetadata,
250
+ omegaconf.listconfig.ListConfig,
251
+ typing.Any, # Add this line
252
+ ])
253
+
254
+
255
+
256
+ checkpoint = torch.load(best_model_path, map_location='cpu', weights_only=False)
257
+ model.load_state_dict(checkpoint['state_dict'])
258
+
259
+ # ✅ Save the fine-tuned model to .nemo format
260
+ output_model_path = os.path.join(SAVE_DIR, "finetuned_model_best.nemo")
261
+ model.save_to(output_model_path)
262
+ print(f"\n💾 Final model saved to: {output_model_path}")
263
+
264
+ # Save training summary
265
+ summary_path = os.path.join(SAVE_DIR, "training_summary.txt")
266
+ with open(summary_path, 'w', encoding='utf-8') as f:
267
+ f.write(f"Training Summary\n")
268
+ f.write(f"================\n")
269
+ f.write(f"Base Model: {BASE_MODEL_PATH}\n")
270
+ f.write(f"Training Samples: {valid_count}\n")
271
+ f.write(f"Final Epochs: {trainer.current_epoch}\n")
272
+ f.write(f"Best Loss: {checkpoint_callback.best_model_score:.4f}\n")
273
+ f.write(f"Learning Rate: {LEARNING_RATE}\n")
274
+ f.write(f"Batch Size: {BATCH_SIZE} (effective: {BATCH_SIZE * 8})\n")
275
+ print(f"📝 Training summary saved to: {summary_path}")
276
+
277
+ print("\n" + "=" * 60)
278
+ print("🎉 OPTIMIZATION COMPLETE!")
279
+ print("=" * 60)
280
+ print("Next steps:")
281
+ print("1. Test your model on validation data to measure WER")
282
+ print("2. If WER is still high, consider:")
283
+ print(" - Increasing training data")
284
+ print(" - Training for more epochs")
285
+ print(" - Adjusting loss_alpha (try 0.5 or 0.9)")
286
+ print(" - Using data augmentation if needed")
287
+ print("=" * 60)
288
+
289
+ except Exception as e:
290
+ print(f"\n❌ Training failed: {e}")
291
+ import traceback
292
+ traceback.print_exc()
293
+ print("\n💡 Troubleshooting tips:")
294
+ print("1. Check if all audio files exist and are valid")
295
+ print("2. Verify manifest format is correct")
296
+ print("3. Ensure sufficient disk space for checkpoints")
297
+ print("4. Try reducing batch_size if out of memory")
298
+
299
+
300
+
301
+
302
+
303
+
304
+
305
+
306
+
307
+
308
+
309
+
310
+
311
+
312
+
313
+
314
+
315
+
316
+
317
+
318
+
319
+
320
+
321
+
322
+
323
+
324
+
325
+
326
+
327
+
328
+
329
+
330
+
331
+
332
+ # import os
333
+ # import io
334
+ # import json
335
+ # import torch
336
+ # from pytorch_lightning import Trainer
337
+ # from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
338
+ # from nemo.collections.asr.models import EncDecHybridRNNTCTCBPEModel
339
+ # from omegaconf import OmegaConf, open_dict
340
+
341
+ # # ============================================
342
+ # # CRITICAL: Windows CUDA/Numba Fix
343
+ # # ============================================
344
+ # os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
345
+ # os.environ["NUMBA_CUDA_USE_NVIDIA_BINDING"] = "1"
346
+ # os.environ["NUMBA_DISABLE_JIT"] = "0"
347
+ # os.environ["NUMBA_CUDA_DRIVER"] = "cuda"
348
+ # os.environ["CUDA_VISIBLE_DEVICES"] = "" # Force CPU for Windows stability
349
+
350
+ # # ============================================
351
+ # # UTF-8 Encoding Fix
352
+ # # ============================================
353
+ # manifest_path = "train_manifest.jsonl"
354
+
355
+ # with io.open(manifest_path, 'r', encoding='utf-8', errors='ignore') as f:
356
+ # content = f.read()
357
+ # with io.open(manifest_path, 'w', encoding='utf-8') as f:
358
+ # f.write(content)
359
+ # print("✅ train_manifest.jsonl converted to UTF-8")
360
+
361
+ # # Patch builtins.open for UTF-8
362
+ # import builtins
363
+ # _old_open = open
364
+ # def open_utf8(file, *args, **kwargs):
365
+ # if isinstance(file, str) and file.endswith('.jsonl') and 'encoding' not in kwargs:
366
+ # kwargs['encoding'] = 'utf-8'
367
+ # return _old_open(file, *args, **kwargs)
368
+ # builtins.open = open_utf8
369
+
370
+ # # ============================================
371
+ # # Validate Manifest (With Optional Validation Split)
372
+ # # ============================================
373
+ # USE_VALIDATION = True # Set to False if you don't want validation split
374
+
375
+ # def validate_manifest(manifest_path, create_val_split=True, val_split=0.1):
376
+ # """Validate manifest and optionally create train/val split"""
377
+ # valid_entries = []
378
+
379
+ # with open(manifest_path, "r", encoding="utf-8") as f:
380
+ # for i, line in enumerate(f, 1):
381
+ # try:
382
+ # item = json.loads(line.strip())
383
+ # assert os.path.exists(item["audio_filepath"]), f"Missing: {item['audio_filepath']}"
384
+ # assert "text" in item and item["text"].strip(), "Empty text"
385
+
386
+ # # Optional: Filter by duration for quality
387
+ # duration = item.get("duration", 0)
388
+ # if 0.5 <= duration <= 20.0: # Keep reasonable lengths
389
+ # valid_entries.append(item)
390
+ # except Exception as e:
391
+ # print(f"⚠️ Skipping line {i}: {e}")
392
+
393
+ # print(f"✅ Total valid entries: {len(valid_entries)}")
394
+
395
+ # if not create_val_split:
396
+ # # Use entire dataset for training
397
+ # print("📊 Using all data for training (no validation split)")
398
+ # return manifest_path, None, len(valid_entries), 0
399
+
400
+ # # Split into train/val
401
+ # import random
402
+ # random.seed(42)
403
+ # random.shuffle(valid_entries)
404
+
405
+ # split_idx = int(len(valid_entries) * (1 - val_split))
406
+ # train_entries = valid_entries[:split_idx]
407
+ # val_entries = valid_entries[split_idx:]
408
+
409
+ # # Save splits
410
+ # train_manifest = "train_split.jsonl"
411
+ # val_manifest = "val_split.jsonl"
412
+
413
+ # with open(train_manifest, "w", encoding="utf-8") as f:
414
+ # for entry in train_entries:
415
+ # f.write(json.dumps(entry, ensure_ascii=False) + "\n")
416
+
417
+ # with open(val_manifest, "w", encoding="utf-8") as f:
418
+ # for entry in val_entries:
419
+ # f.write(json.dumps(entry, ensure_ascii=False) + "\n")
420
+
421
+ # print(f"📊 Train samples: {len(train_entries)}")
422
+ # print(f"📊 Validation samples: {len(val_entries)}")
423
+
424
+ # return train_manifest, val_manifest, len(train_entries), len(val_entries)
425
+
426
+ # train_manifest, val_manifest, train_count, val_count = validate_manifest(
427
+ # manifest_path,
428
+ # create_val_split=USE_VALIDATION
429
+ # )
430
+
431
+ # if train_count == 0:
432
+ # raise ValueError("No valid training samples found!")
433
+
434
+ # # ============================================
435
+ # # Configuration (OPTIMIZED FOR 4000+ SAMPLES)
436
+ # # ============================================
437
+ # BASE_MODEL_PATH = "stt_ar_fastconformer_hybrid_large_pcd_v1.0.nemo"
438
+ # SAVE_DIR = "output_finetuned"
439
+
440
+ # # OPTIMIZED HYPERPARAMETERS FOR LARGE DATASET
441
+ # BATCH_SIZE = 8 # Larger batch for 4000+ samples (adjust based on RAM)
442
+ # MAX_EPOCHS = 100 # Sufficient for convergence with large dataset
443
+ # LEARNING_RATE = 5e-5 # Moderate LR for large dataset
444
+ # WARMUP_RATIO = 0.05 # 5% warmup of total steps
445
+ # WEIGHT_DECAY = 0.0001 # Regularization for generalization
446
+ # ACCUMULATE_GRAD_BATCHES = 4 # Effective batch = 8*4 = 32
447
+
448
+ # os.makedirs(SAVE_DIR, exist_ok=True)
449
+
450
+ # # ============================================
451
+ # # Load Model and Fix Tokenizer Path
452
+ # # ============================================
453
+ # print("🔹 Loading pretrained model...")
454
+ # model = EncDecHybridRNNTCTCBPEModel.restore_from(BASE_MODEL_PATH)
455
+
456
+ # print("🔹 Fixing tokenizer configuration...")
457
+ # with open_dict(model.cfg):
458
+ # tokenizer_dir = os.path.join(os.path.dirname(BASE_MODEL_PATH), "tokenizer")
459
+ # os.makedirs(tokenizer_dir, exist_ok=True)
460
+
461
+ # if hasattr(model, 'tokenizer'):
462
+ # model.cfg.tokenizer.dir = tokenizer_dir
463
+ # model.cfg.tokenizer.type = "bpe"
464
+
465
+ # # CRITICAL: Properly disable validation dataset
466
+ # if 'validation_ds' in model.cfg:
467
+ # model.cfg.validation_ds = None
468
+
469
+ # # Disable test dataset
470
+ # if 'test_ds' in model.cfg:
471
+ # model.cfg.test_ds = None
472
+
473
+ # # ============================================
474
+ # # Setup Training Data (OPTIMIZED FOR ACCURACY)
475
+ # # ============================================
476
+ # print("🔹 Setting up training data...")
477
+ # train_ds_config = {
478
+ # "manifest_filepath": train_manifest,
479
+ # "batch_size": BATCH_SIZE,
480
+ # "shuffle": True,
481
+ # "num_workers": 0,
482
+ # "pin_memory": False,
483
+ # "sample_rate": 16000,
484
+ # "max_duration": 20.0,
485
+ # "min_duration": 0.5,
486
+ # "trim_silence": True,
487
+ # "use_start_end_token": True,
488
+ # "normalize_transcripts": True,
489
+ # "parser": "ar",
490
+ # # Add augmentation for better generalization (light for fine-tuning)
491
+ # "augmentor": None, # Disable for now, can enable if needed
492
+ # }
493
+
494
+ # model.setup_training_data(train_ds_config)
495
+
496
+ # # ============================================
497
+ # # Setup Validation Data (OPTIONAL)
498
+ # # ============================================
499
+ # if USE_VALIDATION and val_manifest:
500
+ # print("🔹 Setting up validation data...")
501
+ # val_ds_config = {
502
+ # "manifest_filepath": val_manifest,
503
+ # "batch_size": BATCH_SIZE,
504
+ # "shuffle": False,
505
+ # "num_workers": 0,
506
+ # "pin_memory": False,
507
+ # "sample_rate": 16000,
508
+ # "normalize_transcripts": True,
509
+ # "parser": "ar",
510
+ # }
511
+ # model.setup_validation_data(val_ds_config)
512
+ # else:
513
+ # print("⚠️ No validation data - monitoring training loss only")
514
+
515
+ # # ============================================
516
+ # # Configure Optimizer (OPTIMIZED FOR CONVERGENCE)
517
+ # # ============================================
518
+ # print("🔹 Configuring optimizer...")
519
+
520
+ # # Calculate total steps for scheduling
521
+ # steps_per_epoch = train_count // (BATCH_SIZE * ACCUMULATE_GRAD_BATCHES)
522
+ # total_steps = steps_per_epoch * MAX_EPOCHS
523
+ # warmup_steps = int(total_steps * WARMUP_RATIO)
524
+
525
+ # print(f" Steps per epoch: {steps_per_epoch}")
526
+ # print(f" Total training steps: {total_steps}")
527
+ # print(f" Warmup steps: {warmup_steps}")
528
+
529
+ # with open_dict(model.cfg):
530
+ # # AdamW optimizer with optimal settings
531
+ # model.cfg.optim.name = "adamw"
532
+ # model.cfg.optim.lr = LEARNING_RATE
533
+ # model.cfg.optim.betas = [0.9, 0.999]
534
+ # model.cfg.optim.weight_decay = WEIGHT_DECAY
535
+ # model.cfg.optim.eps = 1e-8
536
+
537
+ # # Polynomial decay with warmup (better than cosine for fine-tuning)
538
+ # model.cfg.optim.sched = {
539
+ # "name": "PolynomialDecayAnnealing",
540
+ # "warmup_steps": warmup_steps,
541
+ # "warmup_ratio": None,
542
+ # "min_lr": 1e-7,
543
+ # "power": 1.0,
544
+ # "last_epoch": -1,
545
+ # }
546
+
547
+ # # LIGHT augmentation for fine-tuning (prevents overfitting)
548
+ # if 'spec_augment' in model.cfg:
549
+ # model.cfg.spec_augment.freq_masks = 1
550
+ # model.cfg.spec_augment.time_masks = 2
551
+ # model.cfg.spec_augment.freq_width = 10
552
+ # model.cfg.spec_augment.time_width = 0.025
553
+
554
+ # # ============================================
555
+ # # Configure Loss Weights (OPTIMIZED FOR HYBRID)
556
+ # # ============================================
557
+ # print("🔹 Optimizing loss weights...")
558
+ # if hasattr(model, 'loss_alpha'):
559
+ # # For Arabic: CTC often works better for fine-tuning
560
+ # model.loss_alpha = 0.8 # 80% CTC, 20% RNNT
561
+ # print(f" Loss alpha: {model.loss_alpha} (CTC-focused)")
562
+
563
+ # # ============================================
564
+ # # Callbacks for Best Model Selection
565
+ # # ============================================
566
+ # print("🔹 Setting up callbacks...")
567
+
568
+ # # Choose monitor metric based on validation availability
569
+ # monitor_metric = 'val_loss' if USE_VALIDATION else 'train_loss'
570
+ # monitor_mode = 'min'
571
+
572
+ # # Save best model based on available metric
573
+ # checkpoint_callback = ModelCheckpoint(
574
+ # dirpath=SAVE_DIR,
575
+ # filename=f'best-{{epoch:02d}}-{{{monitor_metric}:.4f}}',
576
+ # save_top_k=3,
577
+ # monitor=monitor_metric,
578
+ # mode=monitor_mode,
579
+ # save_last=True,
580
+ # every_n_epochs=1,
581
+ # verbose=True,
582
+ # )
583
+
584
+ # # Early stopping based on available metric
585
+ # early_stop_callback = EarlyStopping(
586
+ # monitor=monitor_metric,
587
+ # patience=15, # Stop if no improvement for 15 epochs
588
+ # mode=monitor_mode,
589
+ # verbose=True,
590
+ # min_delta=0.0001,
591
+ # )
592
+
593
+ # lr_monitor = LearningRateMonitor(logging_interval='step')
594
+
595
+ # # ============================================
596
+ # # Trainer Configuration (OPTIMIZED FOR CPU)
597
+ # # ============================================
598
+ # print("🔹 Configuring trainer...")
599
+ # trainer = Trainer(
600
+ # accelerator="cpu",
601
+ # devices=1,
602
+ # max_epochs=MAX_EPOCHS,
603
+ # log_every_n_steps=5,
604
+ # enable_checkpointing=True,
605
+ # default_root_dir=SAVE_DIR,
606
+ # callbacks=[checkpoint_callback, early_stop_callback, lr_monitor],
607
+ # gradient_clip_val=1.0,
608
+ # gradient_clip_algorithm="norm",
609
+ # accumulate_grad_batches=ACCUMULATE_GRAD_BATCHES,
610
+ # val_check_interval=1.0, # Validate every epoch
611
+ # enable_progress_bar=True,
612
+ # enable_model_summary=True,
613
+ # deterministic=False, # Faster training
614
+ # benchmark=False,
615
+ # )
616
+
617
+ # # ============================================
618
+ # # Start Training
619
+ # # ============================================
620
+ # print("=" * 70)
621
+ # print("🚀 STARTING OPTIMIZED FINE-TUNING FOR 4000+ SAMPLES")
622
+ # print("=" * 70)
623
+ # print(f" Base Model: {BASE_MODEL_PATH}")
624
+ # print(f" Training samples: {train_count}")
625
+ # print(f" Validation samples: {val_count if USE_VALIDATION else 'None (using training loss)'}")
626
+ # print(f" Max epochs: {MAX_EPOCHS}")
627
+ # print(f" Batch size: {BATCH_SIZE} (effective: {BATCH_SIZE * ACCUMULATE_GRAD_BATCHES})")
628
+ # print(f" Learning rate: {LEARNING_RATE}")
629
+ # print(f" Warmup steps: {warmup_steps}")
630
+ # print(f" Weight decay: {WEIGHT_DECAY}")
631
+ # print(f" Loss weighting: CTC={model.loss_alpha if hasattr(model, 'loss_alpha') else 'N/A'}")
632
+ # print(f" Monitoring: {monitor_metric}")
633
+ # print(f" Early stopping: 15 epochs patience")
634
+ # print("=" * 70)
635
+ # print("⏱️ Estimated time: ~{:.1f} hours (depends on CPU)".format(
636
+ # train_count * MAX_EPOCHS / (BATCH_SIZE * 3600 * 0.5) # Rough estimate
637
+ # ))
638
+ # print("=" * 70)
639
+
640
+ # try:
641
+ # trainer.fit(model)
642
+ # print("\n✅ Training completed successfully!")
643
+
644
+ # # Load and save the best checkpoint
645
+ # best_model_path = checkpoint_callback.best_model_path
646
+ # if best_model_path:
647
+ # print(f"\n📊 Best model checkpoint: {best_model_path}")
648
+ # print(f" Best {monitor_metric}: {checkpoint_callback.best_model_score:.4f}")
649
+
650
+ # # Safe load for PyTorch 2.6+
651
+ # import typing
652
+ # import omegaconf
653
+ # torch.serialization.add_safe_globals([
654
+ # omegaconf.dictconfig.DictConfig,
655
+ # omegaconf.base.ContainerMetadata,
656
+ # omegaconf.listconfig.ListConfig,
657
+ # typing.Any,
658
+ # ])
659
+
660
+ # checkpoint = torch.load(best_model_path, map_location='cpu', weights_only=False)
661
+ # model.load_state_dict(checkpoint['state_dict'])
662
+
663
+ # # Save final model
664
+ # output_model_path = os.path.join(SAVE_DIR, "finetuned_model_best.nemo")
665
+ # model.save_to(output_model_path)
666
+ # print(f"\n💾 Final model saved to: {output_model_path}")
667
+
668
+ # # Save training summary
669
+ # summary_path = os.path.join(SAVE_DIR, "training_summary.txt")
670
+ # with open(summary_path, 'w', encoding='utf-8') as f:
671
+ # f.write(f"Training Summary - 4000+ Samples\n")
672
+ # f.write(f"=================================\n")
673
+ # f.write(f"Base Model: {BASE_MODEL_PATH}\n")
674
+ # f.write(f"Training Samples: {train_count}\n")
675
+ # f.write(f"Validation Samples: {val_count if USE_VALIDATION else 'None'}\n")
676
+ # f.write(f"Final Epoch: {trainer.current_epoch}\n")
677
+ # f.write(f"Best {monitor_metric}: {checkpoint_callback.best_model_score:.4f}\n")
678
+ # f.write(f"Learning Rate: {LEARNING_RATE}\n")
679
+ # f.write(f"Batch Size: {BATCH_SIZE} (effective: {BATCH_SIZE * ACCUMULATE_GRAD_BATCHES})\n")
680
+ # f.write(f"Warmup Steps: {warmup_steps}\n")
681
+ # f.write(f"Weight Decay: {WEIGHT_DECAY}\n")
682
+ # print(f"📝 Training summary saved to: {summary_path}")
683
+
684
+ # print("\n" + "=" * 70)
685
+ # print("🎉 TRAINING COMPLETE!")
686
+ # print("=" * 70)
687
+ # print("Next steps:")
688
+ # print("1. Evaluate WER/CER on test set using the best model")
689
+ # print("2. If WER is still high, try:")
690
+ # print(" - Train for more epochs (increase MAX_EPOCHS)")
691
+ # print(" - Adjust loss_alpha (try 0.5 or 0.9)")
692
+ # print(" - Add more training data")
693
+ # print(" - Enable light data augmentation")
694
+ # print("3. Use the validation manifest to monitor overfitting")
695
+ # print("=" * 70)
696
+
697
+ # except KeyboardInterrupt:
698
+ # print("\n⚠️ Training interrupted by user")
699
+ # print("💾 Saving last checkpoint...")
700
+ # if hasattr(checkpoint_callback, 'last_model_path'):
701
+ # print(f" Last checkpoint: {checkpoint_callback.last_model_path}")
702
+
703
+ # except Exception as e:
704
+ # print(f"\n❌ Training failed: {e}")
705
+ # import traceback
706
+ # traceback.print_exc()
707
+ # print("\n💡 Troubleshooting:")
708
+ # print("1. Reduce BATCH_SIZE if out of memory")
709
+ # print("2. Check audio file paths in manifest")
710
+ # print("3. Verify all audio files are valid WAV format")
711
+ # print("4. Ensure sufficient disk space for checkpoints")
testing_main.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import sounddevice as sd
2
+ # import scipy.io.wavfile as wav
3
+ # import nemo.collections.asr as nemo_asr
4
+
5
+ # # ===== SETTINGS =====
6
+ # SAMPLE_RATE = 16000
7
+ # DURATION = 10 # seconds
8
+ # OUTPUT_FILE = "arabic_recording.wav"
9
+
10
+ # # ===== STEP 1: Record audio =====
11
+ # print("🎙️ Recording... Speak Arabic now!")
12
+ # audio = sd.rec(int(SAMPLE_RATE * DURATION), samplerate=SAMPLE_RATE, channels=1, dtype='int16')
13
+ # sd.wait()
14
+ # wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
15
+ # print(f"✅ Recording finished. Saved as {OUTPUT_FILE}")
16
+
17
+ # # ===== STEP 2: Load ASR model =====
18
+ # print("📥 Loading Arabic ASR model...")
19
+ # asr_model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(
20
+ # "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo"
21
+ # )
22
+
23
+ # # ===== STEP 3: Configure Decoding =====
24
+ # print("🔍 Configuring decoding strategy...")
25
+
26
+ # # Get decoding config
27
+ # decoding_cfg = asr_model.cfg.decoding
28
+
29
+ # # Print available parameters to debug
30
+ # print(f"Available decoding strategies: {decoding_cfg.keys() if hasattr(decoding_cfg, 'keys') else 'N/A'}")
31
+ # # After loading the model, add this to inspect the config:
32
+ # print("🔍 Beam config structure:")
33
+ # print(decoding_cfg.beam)
34
+ # # Set beam search strategy
35
+ # decoding_cfg.strategy = "beam"
36
+ # decoding_cfg.beam.beam_size = 128
37
+ # decoding_cfg.beam.return_best_hypothesis = True
38
+
39
+ # # Only set parameters that exist
40
+ # if hasattr(decoding_cfg.beam, 'beam_alpha'):
41
+ # decoding_cfg.beam.beam_alpha = 0.3
42
+ # print("✓ Set beam_alpha")
43
+
44
+ # if hasattr(decoding_cfg.beam, 'beam_beta'):
45
+ # decoding_cfg.beam.beam_beta = 0.5
46
+ # print("✓ Set beam_beta")
47
+
48
+ # # Remove softmax_temperature - it's not supported in this config
49
+ # # If you need temperature sampling, you might need to use a different strategy
50
+
51
+ # # Apply the decoding configuration
52
+ # asr_model.change_decoding_strategy(decoding_cfg)
53
+
54
+ # # ===== STEP 4: Transcribe =====
55
+ # print("🔍 Transcribing...")
56
+ # transcription = asr_model.transcribe(
57
+ # [OUTPUT_FILE],
58
+ # batch_size=1,
59
+ # num_workers=0
60
+ # )
61
+
62
+ # print("📝 Transcription:", transcription[0])
63
+
64
+
65
+
66
+ # import sounddevice as sd
67
+ # import scipy.io.wavfile as wav
68
+ # import nemo.collections.asr as nemo_asr
69
+
70
+ # # ===== SETTINGS =====
71
+ # SAMPLE_RATE = 16000
72
+ # DURATION = 10
73
+ # OUTPUT_FILE = "arabic_recording.wav"
74
+
75
+ # # ===== STEP 1: Record audio =====
76
+ # print("🎙️ Recording... Speak Arabic now!")
77
+ # audio = sd.rec(int(SAMPLE_RATE * DURATION), samplerate=SAMPLE_RATE, channels=1, dtype='int16')
78
+ # sd.wait()
79
+ # wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
80
+ # print(f"✅ Recording finished. Saved as {OUTPUT_FILE}")
81
+
82
+ # # ===== STEP 2: Load ASR model =====
83
+ # print("📥 Loading Arabic ASR model...")
84
+ # asr_model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(
85
+ # "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo"
86
+ # )
87
+
88
+ # # ===== STEP 3: Configure for LITERAL transcription =====
89
+ # print("🔍 Configuring greedy decoding for literal output...")
90
+
91
+ # decoding_cfg = asr_model.cfg.decoding
92
+ # decoding_cfg.strategy = "greedy"
93
+
94
+ # # CRITICAL: Increase max_symbols to avoid truncating repetitions
95
+ # # The default is only 10, which is very restrictive!
96
+ # decoding_cfg.greedy.max_symbols = 1000 # Allow much longer sequences
97
+ # decoding_cfg.beam.beam_size = 64
98
+ # decoding_cfg.beam.search_type = "beam"
99
+ # print(f"✓ Set max_symbols to {decoding_cfg.greedy.max_symbols}")
100
+ # print("Updated config:", decoding_cfg)
101
+
102
+ # # Apply configuration
103
+ # asr_model.change_decoding_strategy(decoding_cfg)
104
+
105
+ # # ===== STEP 4: Transcribe =====
106
+ # print("🔍 Transcribing...")
107
+ # transcription = asr_model.transcribe(
108
+ # [OUTPUT_FILE],
109
+ # batch_size=1,
110
+ # num_workers=0
111
+ # )
112
+
113
+ # print("📝 Literal Transcription:", transcription[0])
114
+
115
+
116
+ import sounddevice as sd
117
+ import scipy.io.wavfile as wav
118
+ import nemo.collections.asr as nemo_asr
119
+ from omegaconf import OmegaConf
120
+
121
+ # ===== SETTINGS =====
122
+ SAMPLE_RATE = 16000
123
+ DURATION = 10
124
+ OUTPUT_FILE = "arabic_recording.wav"
125
+ # ===== STEP 2: Load ASR model =====
126
+ print("📥 Loading Arabic ASR model...")
127
+ asr_model = nemo_asr.models.EncDecHybridRNNTCTCModel.restore_from(
128
+ "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo"
129
+ )
130
+
131
+
132
+ # Add this right after loading the model to see what's actually available:
133
+ print("Available greedy parameters:")
134
+ print(OmegaConf.to_yaml(asr_model.cfg.decoding.greedy))
135
+
136
+ # ===== STEP 3: Configure for LITERAL transcription =====
137
+ print("🔍 Configuring greedy decoding for literal output...")
138
+
139
+ # Set struct mode to False temporarily to allow modifications
140
+ OmegaConf.set_struct(asr_model.cfg.decoding, False)
141
+ OmegaConf.set_struct(asr_model.cfg.decoding.greedy, False)
142
+
143
+ decoding_cfg = asr_model.cfg.decoding
144
+ decoding_cfg.strategy = "maes"
145
+
146
+ # Now try setting the parameters
147
+ try:
148
+ decoding_cfg.greedy.max_symbols_per_step = 300
149
+ print(f"✓ max_symbols_per_step: {decoding_cfg.greedy.max_symbols_per_step}")
150
+ except:
151
+ print("⚠ Could not set max_symbols_per_step")
152
+
153
+ decoding_cfg.greedy.max_symbols = 500
154
+ decoding_cfg.greedy.loop_labels = True
155
+ decoding_cfg.greedy.preserve_alignments = True
156
+ decoding_cfg.preserve_alignments = True
157
+ decoding_cfg.compute_timestamps = True
158
+ decoding_cfg.temperature = 1.3
159
+
160
+ decoding_cfg.beam.beam_size = 64
161
+ decoding_cfg.beam.softmax_temperature = 1.3
162
+ decoding_cfg.beam.search_type = "beam"
163
+ print(f"✓ max_symbols: {decoding_cfg.greedy.max_symbols}")
164
+ print(f"✓ loop_labels: {decoding_cfg.greedy.loop_labels}")
165
+ print(f"✓ temperature: {decoding_cfg.temperature}")
166
+
167
+ # Re-enable struct mode
168
+ OmegaConf.set_struct(asr_model.cfg.decoding, True)
169
+ OmegaConf.set_struct(asr_model.cfg.decoding.greedy, True)
170
+
171
+ # Apply configuration
172
+ asr_model.change_decoding_strategy(decoding_cfg)
173
+
174
+
175
+ # ===== STEP 1: Record audio =====
176
+ print("🎙️ Recording... Speak Arabic now!")
177
+ audio = sd.rec(int(SAMPLE_RATE * DURATION), samplerate=SAMPLE_RATE, channels=1, dtype='int16')
178
+ sd.wait()
179
+ wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
180
+ print(f"✅ Recording finished. Saved as {OUTPUT_FILE}")
181
+
182
+
183
+
184
+ # ===== STEP 4: Transcribe =====
185
+ print("🔍 Transcribing...")
186
+ transcription = asr_model.transcribe(
187
+ [OUTPUT_FILE],
188
+ batch_size=1,
189
+ num_workers=0
190
+ )
191
+
192
+ print("📝 Literal Transcription:", transcription[0])
testing_main_v2.py ADDED
@@ -0,0 +1,473 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import sounddevice as sd
2
+ # import scipy.io.wavfile as wav
3
+ # import nemo.collections.asr as nemo_asr
4
+ # import torch
5
+ # import numpy as np
6
+ # from typing import List, Tuple
7
+
8
+ # # ===== SETTINGS =====
9
+ # SAMPLE_RATE = 16000
10
+ # DURATION = 10 # seconds
11
+ # OUTPUT_FILE = "arabic_recording.wav"
12
+
13
+ # class RepetitionAwareTranscriber:
14
+ # def __init__(self, model_path: str):
15
+ # """Initialize ASR model with repetition-aware configuration"""
16
+ # print("📥 Loading Arabic ASR model...")
17
+ # self.asr_model = nemo_asr.models.EncDecCTCModel.restore_from(model_path)
18
+ # self._configure_decoding()
19
+
20
+ # def _configure_decoding(self):
21
+ # """Configure advanced decoding strategy"""
22
+ # decoding_cfg = self.asr_model.cfg.decoding
23
+
24
+ # # Use beam search for better sequence modeling
25
+ # decoding_cfg.strategy = "beam"
26
+ # decoding_cfg.beam.beam_size = 128 # Larger beam for more candidates
27
+ # decoding_cfg.beam.return_best_hypothesis = False # Get multiple hypotheses
28
+
29
+ # # Language model parameters (if available)
30
+ # if hasattr(decoding_cfg.beam, 'beam_alpha'):
31
+ # decoding_cfg.beam.beam_alpha = 0.3 # LM weight (lower = less LM influence)
32
+ # if hasattr(decoding_cfg.beam, 'beam_beta'):
33
+ # decoding_cfg.beam.beam_beta = 0.5 # Word insertion bonus
34
+
35
+ # self.asr_model.change_decoding_strategy(decoding_cfg)
36
+
37
+ # def transcribe_with_logprobs(self, audio_file: str, temperature: float = 1.0):
38
+ # """
39
+ # Transcribe with log probabilities and temperature scaling
40
+
41
+ # Args:
42
+ # audio_file: Path to audio file
43
+ # temperature: Controls randomness (lower = more conservative, higher = more diverse)
44
+ # 0.5 = more deterministic
45
+ # 1.0 = standard
46
+ # 1.5 = more exploratory
47
+ # """
48
+ # print(f"🔍 Transcribing with temperature={temperature}...")
49
+
50
+ # # Update temperature in decoding config
51
+ # if hasattr(self.asr_model.cfg.decoding, 'temperature'):
52
+ # self.asr_model.cfg.decoding.temperature = temperature
53
+ # if hasattr(self.asr_model.cfg.decoding.beam, 'softmax_temperature'):
54
+ # self.asr_model.cfg.decoding.beam.softmax_temperature = temperature
55
+
56
+ # self.asr_model.change_decoding_strategy(self.asr_model.cfg.decoding)
57
+
58
+ # # Get multiple hypotheses with their scores
59
+ # hypotheses = self.asr_model.transcribe(
60
+ # [audio_file],
61
+ # batch_size=1,
62
+ # return_hypotheses=True,
63
+ # num_workers=0
64
+ # )
65
+
66
+ # # Handle different return types
67
+ # if isinstance(hypotheses, list) and len(hypotheses) > 0:
68
+ # hyp = hypotheses[0]
69
+
70
+ # # Check if it's a Hypothesis object or a list
71
+ # if isinstance(hyp, list):
72
+ # # It's already a list of transcriptions
73
+ # best_text = hyp[0] if len(hyp) > 0 else ""
74
+ # print(f"\n📊 Top hypothesis: {best_text}")
75
+ # return best_text
76
+ # elif hasattr(hyp, 'text'):
77
+ # # It's a Hypothesis object
78
+ # text = hyp.text
79
+
80
+ # # Check for nbest hypotheses
81
+ # if hasattr(hyp, 'nbest') and len(hyp.nbest) > 1:
82
+ # print(f"\n📊 Top {min(5, len(hyp.nbest))} hypotheses:")
83
+ # for i, nbest_hyp in enumerate(hyp.nbest[:5]):
84
+ # score = nbest_hyp.score if hasattr(nbest_hyp, 'score') else 'N/A'
85
+ # hyp_text = nbest_hyp.text if hasattr(nbest_hyp, 'text') else str(nbest_hyp)
86
+ # print(f" {i+1}. [{score}] {hyp_text}")
87
+
88
+ # return text
89
+ # else:
90
+ # # Fallback: convert to string
91
+ # return str(hyp)
92
+
93
+ # return ""
94
+
95
+ # def transcribe_with_frame_analysis(self, audio_file: str):
96
+ # """
97
+ # Analyze frame-level predictions to detect repetitions
98
+ # This examines the raw CTC outputs before collapsing
99
+ # """
100
+ # print("🔍 Performing frame-level analysis...")
101
+
102
+ # # Get log probabilities at frame level
103
+ # log_probs = self.asr_model.transcribe(
104
+ # [audio_file],
105
+ # batch_size=1,
106
+ # logprobs=True
107
+ # )
108
+
109
+ # # Standard transcription
110
+ # transcription = self.asr_model.transcribe([audio_file])
111
+
112
+ # return transcription[0], log_probs
113
+
114
+ # def transcribe_with_all_methods(self, audio_file: str):
115
+ # """Try multiple decoding strategies and return all results"""
116
+ # results = {}
117
+
118
+ # # Method 1: Standard beam search
119
+ # print("\n--- Method 1: Standard Beam Search ---")
120
+ # results['beam_standard'] = self.transcribe_with_logprobs(audio_file, temperature=1.0)
121
+
122
+ # # Method 2: Lower temperature (more conservative)
123
+ # print("\n--- Method 2: Conservative (temp=0.5) ---")
124
+ # results['beam_conservative'] = self.transcribe_with_logprobs(audio_file, temperature=0.5)
125
+
126
+ # # Method 3: Higher temperature (more exploratory)
127
+ # print("\n--- Method 3: Exploratory (temp=1.5) ---")
128
+ # results['beam_exploratory'] = self.transcribe_with_logprobs(audio_file, temperature=1.5)
129
+
130
+ # # Method 4: Frame-level analysis
131
+ # print("\n--- Method 4: Frame-level Analysis ---")
132
+ # results['frame_analysis'], _ = self.transcribe_with_frame_analysis(audio_file)
133
+
134
+ # return results
135
+
136
+
137
+ # def post_process_repetitions(text: str, audio_duration: float, expected_word_count: int = None) -> str:
138
+ # """
139
+ # Heuristic post-processing to restore repetitions
140
+
141
+ # Args:
142
+ # text: Transcribed text
143
+ # audio_duration: Duration of audio in seconds
144
+ # expected_word_count: Expected number of words (if known)
145
+ # """
146
+ # words = text.split()
147
+
148
+ # # Calculate speaking rate (words per second)
149
+ # speaking_rate = len(words) / audio_duration
150
+
151
+ # # Normal Arabic speaking rate is 2-3 words per second
152
+ # # For numbers, it's often slower (1-2 words per second)
153
+ # # If rate is too high, likely missing repetitions
154
+
155
+ # if speaking_rate > 3.0 and expected_word_count:
156
+ # print(f"⚠️ Speaking rate unusually high ({speaking_rate:.1f} w/s)")
157
+ # print(f" Expected ~{expected_word_count} words, got {len(words)}")
158
+ # print(" Possible missing repetitions detected")
159
+
160
+ # return text
161
+
162
+
163
+ # def detect_number_patterns(text: str) -> List[str]:
164
+ # """Detect if text contains Arabic number words"""
165
+ # arabic_numbers = [
166
+ # 'صفر', 'زيرو', 'واحد', 'اثنين', 'ثلاثة', 'أربعة',
167
+ # 'خمسة', 'ستة', 'سبعة', 'ثمانية', 'تسعة'
168
+ # ]
169
+
170
+ # words = text.split()
171
+ # detected = [w for w in words if w in arabic_numbers]
172
+
173
+ # if detected:
174
+ # print(f"🔢 Detected number words: {' '.join(detected)}")
175
+
176
+ # return detected
177
+
178
+
179
+ # # ===== MAIN EXECUTION =====
180
+ # if __name__ == "__main__":
181
+ # # ===== STEP 1: Record audio =====
182
+ # print("🎙️ Recording... Speak Arabic now!")
183
+ # print("💡 TIP: For repeated numbers, pause slightly between each repetition")
184
+ # print(" Example: 'زيرو [pause] زيرو [pause] واحد [pause] واحد'\n")
185
+
186
+ # audio = sd.rec(int(SAMPLE_RATE * DURATION), samplerate=SAMPLE_RATE, channels=1, dtype='int16')
187
+ # sd.wait()
188
+ # wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
189
+ # print(f"✅ Recording finished. Saved as {OUTPUT_FILE}\n")
190
+
191
+ # # ===== STEP 2: Initialize transcriber =====
192
+ # model_path = "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo"
193
+ # transcriber = RepetitionAwareTranscriber(model_path)
194
+
195
+ # # ===== STEP 3: Transcribe with all methods =====
196
+ # results = transcriber.transcribe_with_all_methods(OUTPUT_FILE)
197
+
198
+ # # ===== STEP 4: Display all results =====
199
+ # print("\n" + "="*60)
200
+ # print("📝 FINAL RESULTS:")
201
+ # print("="*60)
202
+
203
+ # for method, transcription in results.items():
204
+ # print(f"\n{method.upper()}:")
205
+ # print(f" {transcription}")
206
+ # detect_number_patterns(transcription)
207
+
208
+ # # ===== STEP 5: Post-processing analysis =====
209
+ # print("\n" + "="*60)
210
+ # print("🔍 POST-PROCESSING ANALYSIS:")
211
+ # print("="*60)
212
+
213
+ # best_transcription = results['beam_standard']
214
+ # processed = post_process_repetitions(best_transcription, DURATION)
215
+
216
+ # print(f"\nBest transcription: {best_transcription}")
217
+ # print(f"Word count: {len(best_transcription.split())}")
218
+ # print(f"Speaking rate: {len(best_transcription.split()) / DURATION:.2f} words/sec")
219
+
220
+ # # ===== STEP 6: Recommendations =====
221
+ # print("\n" + "="*60)
222
+ # print("💡 RECOMMENDATIONS:")
223
+ # print("="*60)
224
+ # print("1. Compare all method outputs above")
225
+ # print("2. If all methods miss repetitions, the issue is in the trained model")
226
+ # print("3. Consider retraining with more repetitive sequences in training data")
227
+ # print("4. When speaking, add slight pauses between repeated words")
228
+ # print("5. If transcribing phone numbers, use digit-by-digit model instead")
229
+
230
+
231
+ import sounddevice as sd
232
+ import scipy.io.wavfile as wav
233
+ import nemo.collections.asr as nemo_asr
234
+ import torch
235
+ import numpy as np
236
+ from typing import List, Tuple
237
+
238
# ===== SETTINGS =====
SAMPLE_RATE = 16000  # Hz — 16 kHz mono, the rate the NeMo ASR model expects
DURATION = 10 # seconds of audio captured per recording session
OUTPUT_FILE = "arabic_recording.wav"  # where the raw microphone capture is written
242
+
243
class RepetitionAwareTranscriber:
    """NeMo ASR wrapper whose decoding is tuned to preserve repeated words.

    The checkpoint is probed as a Hybrid RNNT-CTC model first (best at
    modelling repetitions), then RNNT, then plain CTC.  Decoding uses a
    wide beam with n-best output so hypotheses containing repetitions
    survive the search.
    """

    def __init__(self, model_path: str):
        """Load the ASR model from disk and switch it to beam decoding.

        Args:
            model_path: Filesystem path to a ``.nemo`` checkpoint.

        Raises:
            Exception: whatever ``EncDecCTCModel.restore_from`` raises if
                none of the three model classes can load the checkpoint.
        """
        print("📥 Loading Arabic ASR model...")
        # Try to load as Hybrid RNNT-CTC first (better for repetitions!).
        # Narrowed from bare `except:` so KeyboardInterrupt / SystemExit
        # are not silently swallowed while probing model classes.
        try:
            self.asr_model = nemo_asr.models.EncDecHybridRNNTCTCModel.restore_from(model_path)
            self.model_type = "hybrid_rnnt_ctc"
            print("✅ Loaded as Hybrid RNNT-CTC model (excellent for repetitions!)")
        except Exception:
            try:
                self.asr_model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(model_path)
                self.model_type = "rnnt"
                print("✅ Loaded as RNNT model")
            except Exception:
                # Last resort: plain CTC. If this also fails, let the
                # exception propagate — there is nothing left to try.
                self.asr_model = nemo_asr.models.EncDecCTCModel.restore_from(model_path)
                self.model_type = "ctc"
                print("✅ Loaded as CTC model")

        self._configure_decoding()

    def _configure_decoding(self):
        """Switch the model to wide beam-search decoding with n-best output."""
        decoding_cfg = self.asr_model.cfg.decoding

        # Use beam search for better sequence modeling
        decoding_cfg.strategy = "beam"
        decoding_cfg.beam.beam_size = 128  # Larger beam for more candidates
        decoding_cfg.beam.return_best_hypothesis = False  # Get multiple hypotheses

        # Language model parameters — only present on some NeMo versions,
        # hence the hasattr guards.
        if hasattr(decoding_cfg.beam, 'beam_alpha'):
            decoding_cfg.beam.beam_alpha = 0.3  # LM weight (lower = less LM influence)
        if hasattr(decoding_cfg.beam, 'beam_beta'):
            decoding_cfg.beam.beam_beta = 0.5  # Word insertion bonus

        self.asr_model.change_decoding_strategy(decoding_cfg)

    def transcribe_with_logprobs(self, audio_file: str, temperature: float = 1.0):
        """Transcribe *audio_file* and return the best hypothesis text.

        Args:
            audio_file: Path to an audio file to transcribe.
            temperature: Controls randomness of decoding.
                0.5 = more deterministic, 1.0 = standard, 1.5 = more exploratory.

        Returns:
            The top transcription as a string ("" if nothing was produced).
        """
        print(f"🔍 Transcribing with temperature={temperature}...")

        # Update temperature in whichever config slots this NeMo version exposes.
        if hasattr(self.asr_model.cfg.decoding, 'temperature'):
            self.asr_model.cfg.decoding.temperature = temperature
        if hasattr(self.asr_model.cfg.decoding.beam, 'softmax_temperature'):
            self.asr_model.cfg.decoding.beam.softmax_temperature = temperature

        self.asr_model.change_decoding_strategy(self.asr_model.cfg.decoding)

        # Get multiple hypotheses with their scores
        hypotheses = self.asr_model.transcribe(
            [audio_file],
            batch_size=1,
            return_hypotheses=True,
            num_workers=0
        )
        print(hypotheses)  # debug: raw return shape varies across NeMo versions

        # Handle the different return types NeMo versions produce.
        if isinstance(hypotheses, list) and len(hypotheses) > 0:
            hyp = hypotheses[0]

            # Check if it's a Hypothesis object or a list
            if isinstance(hyp, list):
                # It's already a list of plain transcriptions
                best_text = hyp[0] if len(hyp) > 0 else ""
                print(f"\n📊 Top hypothesis: {best_text}")
                return best_text
            elif hasattr(hyp, 'text'):
                # It's a Hypothesis object
                text = hyp.text

                # Show the n-best alternatives when available
                if hasattr(hyp, 'nbest') and len(hyp.nbest) > 1:
                    print(f"\n📊 Top {min(5, len(hyp.nbest))} hypotheses:")
                    for i, nbest_hyp in enumerate(hyp.nbest[:5]):
                        score = nbest_hyp.score if hasattr(nbest_hyp, 'score') else 'N/A'
                        hyp_text = nbest_hyp.text if hasattr(nbest_hyp, 'text') else str(nbest_hyp)
                        print(f"  {i+1}. [{score}] {hyp_text}")

                return text
            else:
                # Fallback: convert to string
                return str(hyp)

        return ""

    def transcribe_with_frame_analysis(self, audio_file: str):
        """Return (transcription, frame-level log-probs) for *audio_file*.

        NOTE(review): ``logprobs=True`` is only accepted by ``transcribe()``
        on CTC-style models in some NeMo releases — confirm against the
        installed version before relying on this method.
        """
        print("🔍 Performing frame-level analysis...")

        # Get log probabilities at frame level
        log_probs = self.asr_model.transcribe(
            [audio_file],
            batch_size=1,
            logprobs=True
        )

        # Standard transcription
        transcription = self.asr_model.transcribe([audio_file])

        return transcription[0], log_probs

    def transcribe_with_all_methods(self, audio_file: str):
        """Run every decoding strategy and return ``{method_name: transcription}``."""
        results = {}

        # Method 1: Standard beam search
        print("\n--- Method 1: Standard Beam Search ---")
        results['beam_standard'] = self.transcribe_with_logprobs(audio_file, temperature=1.0)
        print(f"Results with Temp 1.0 : {results['beam_standard']}")

        # Method 2: Lower temperature (more conservative)
        print("\n--- Method 2: Conservative (temp=0.5) ---")
        results['beam_conservative'] = self.transcribe_with_logprobs(audio_file, temperature=0.5)
        print(f"Results with Temp 0.5 : {results['beam_conservative']}")

        # Method 3: Higher temperature (more exploratory)
        print("\n--- Method 3: Exploratory (temp=1.5) ---")
        results['beam_exploratory'] = self.transcribe_with_logprobs(audio_file, temperature=1.5)
        print(f"Results with Temp 1.5 : {results['beam_exploratory']}")

        # Method 4: Frame-level analysis — disabled; see
        # transcribe_with_frame_analysis for the version caveat.
        # results['frame_analysis'], _ = self.transcribe_with_frame_analysis(audio_file)

        return results
380
+
381
+
382
def post_process_repetitions(text: str, audio_duration: float, expected_word_count: int = None) -> str:
    """Heuristically flag transcriptions that likely dropped repetitions.

    Compares the observed speaking rate against a plausible ceiling and
    prints a warning when the rate is suspiciously high.  The text itself
    is returned unchanged — this is a diagnostic, not a rewriter.

    Args:
        text: Transcribed text.
        audio_duration: Duration of the audio in seconds.
        expected_word_count: Expected number of words, if known; the
            warning only fires when this is provided.

    Returns:
        *text*, unmodified.
    """
    words = text.split()

    # Guard: a zero/negative duration would raise ZeroDivisionError and
    # makes any rate estimate meaningless.
    if audio_duration <= 0:
        return text

    # Calculate speaking rate (words per second)
    speaking_rate = len(words) / audio_duration

    # Normal Arabic speaking rate is 2-3 words per second; for numbers it
    # is often slower (1-2 w/s).  A rate above 3 with a known expected
    # count suggests the decoder collapsed repeated words.
    if speaking_rate > 3.0 and expected_word_count:
        print(f"⚠️ Speaking rate unusually high ({speaking_rate:.1f} w/s)")
        print(f"   Expected ~{expected_word_count} words, got {len(words)}")
        print("   Possible missing repetitions detected")

    return text
406
+
407
+
408
def detect_number_patterns(text: str) -> List[str]:
    """Return the Arabic digit words found in *text*, in order of appearance."""
    digit_words = {
        'صفر', 'زيرو', 'واحد', 'اثنين', 'ثلاثة', 'أربعة',
        'خمسة', 'ستة', 'سبعة', 'ثمانية', 'تسعة',
    }

    # Set membership keeps each lookup O(1); iteration order of the
    # result follows the original word order.
    found = [token for token in text.split() if token in digit_words]

    if found:
        print(f"🔢 Detected number words: {' '.join(found)}")

    return found
422
+
423
+
424
# ===== MAIN EXECUTION =====
def _main() -> None:
    """Record a clip, transcribe it with every strategy, and print a report."""
    # ===== STEP 1: Record audio =====
    print("🎙️ Recording... Speak Arabic now!")
    print("💡 TIP: For repeated numbers, pause slightly between each repetition")
    print("   Example: 'زيرو [pause] زيرو [pause] واحد [pause] واحد'\n")

    audio = sd.rec(int(SAMPLE_RATE * DURATION), samplerate=SAMPLE_RATE, channels=1, dtype='int16')
    sd.wait()  # block until the full DURATION has been captured
    wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
    print(f"✅ Recording finished. Saved as {OUTPUT_FILE}\n")

    # ===== STEP 2: Initialize transcriber =====
    model_path = "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo"
    transcriber = RepetitionAwareTranscriber(model_path)

    # ===== STEP 3: Transcribe with all methods =====
    results = transcriber.transcribe_with_all_methods(OUTPUT_FILE)

    # ===== STEP 4: Display all results =====
    print("\n" + "="*60)
    print("📝 FINAL RESULTS:")
    print("="*60)

    for method, transcription in results.items():
        print(f"\n{method.upper()}:")
        print(f"  {transcription}")
        detect_number_patterns(transcription)

    # ===== STEP 5: Post-processing analysis =====
    print("\n" + "="*60)
    print("🔍 POST-PROCESSING ANALYSIS:")
    print("="*60)

    best_transcription = results['beam_standard']
    # Called for its diagnostic prints; its return value was never used.
    post_process_repetitions(best_transcription, DURATION)

    print(f"\nBest transcription: {best_transcription}")
    print(f"Word count: {len(best_transcription.split())}")
    print(f"Speaking rate: {len(best_transcription.split()) / DURATION:.2f} words/sec")

    # ===== STEP 6: Recommendations =====
    print("\n" + "="*60)
    print("💡 RECOMMENDATIONS:")
    print("="*60)
    print("1. Compare all method outputs above")
    print("2. If all methods miss repetitions, the issue is in the trained model")
    print("3. Consider retraining with more repetitive sequences in training data")
    print("4. When speaking, add slight pauses between repeated words")
    print("5. If transcribing phone numbers, use digit-by-digit model instead")


if __name__ == "__main__":
    _main()
train_manifest.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
train_split.jsonl ADDED
The diff for this file is too large to render. See raw diff