Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +6 -0
- Extracting_tokenizer_dir_from_Nemo_model.py +126 -0
- StartingServer.txt +2 -0
- UploadingtoGitlab.txt +4 -0
- WER_CER_eval.py +123 -0
- WER_calc.py +64 -0
- app_api_2.py +345 -0
- continue_finetuning_nemo.py +199 -0
- converting_dataset_to_8khz.py +95 -0
- data_tts/gcloud_tts_sample_001.wav +0 -0
- data_tts/gcloud_tts_sample_002.wav +0 -0
- data_tts/gcloud_tts_sample_053.wav +0 -0
- data_tts/gcloud_tts_sample_060.wav +0 -0
- data_tts/gcloud_tts_sample_1065.wav +3 -0
- data_tts/gcloud_tts_sample_1067.wav +3 -0
- data_tts/gcloud_tts_sample_107.wav +3 -0
- data_tts/gcloud_tts_sample_1078.wav +3 -0
- data_tts/gcloud_tts_sample_1080.wav +3 -0
- data_tts/gcloud_tts_sample_1082.wav +3 -0
- data_tts/gcloud_tts_sample_1189.wav +0 -0
- data_tts/gcloud_tts_sample_1190.wav +0 -0
- data_tts/gcloud_tts_sample_1191.wav +0 -0
- data_tts/gcloud_tts_sample_1192.wav +0 -0
- data_tts/gcloud_tts_sample_1193.wav +0 -0
- data_tts/gcloud_tts_sample_1221.wav +0 -0
- data_tts/gcloud_tts_sample_1222.wav +0 -0
- data_tts/gcloud_tts_sample_1236.wav +0 -0
- data_tts/gcloud_tts_sample_1241.wav +0 -0
- data_tts/gcloud_tts_sample_1277.wav +0 -0
- data_tts/gcloud_tts_sample_1278.wav +0 -0
- data_tts/gcloud_tts_sample_1279.wav +0 -0
- data_tts/gcloud_tts_sample_1280.wav +0 -0
- data_tts/gcloud_tts_sample_1286.wav +0 -0
- data_tts/gcloud_tts_sample_1287.wav +0 -0
- data_tts/gcloud_tts_sample_1295.wav +0 -0
- data_tts/gcloud_tts_sample_1296.wav +0 -0
- data_tts/gcloud_tts_sample_1297.wav +0 -0
- data_tts/gcloud_tts_sample_1304.wav +0 -0
- data_tts/gcloud_tts_sample_1305.wav +0 -0
- data_tts/gcloud_tts_sample_1306.wav +0 -0
- data_tts/gcloud_tts_sample_1313.wav +0 -0
- data_tts/gcloud_tts_sample_1314.wav +0 -0
- data_tts/gcloud_tts_sample_1322.wav +0 -0
- eval_manifest.jsonl +163 -0
- evaluation_results.csv +164 -0
- finetune_asr.py +711 -0
- testing_main.py +192 -0
- testing_main_v2.py +473 -0
- train_manifest.jsonl +0 -0
- train_split.jsonl +0 -0
.gitattributes
CHANGED
|
@@ -968,3 +968,9 @@ data_tts/gcloud_tts_sample_1073.wav filter=lfs diff=lfs merge=lfs -text
|
|
| 968 |
data_tts/gcloud_tts_sample_1075.wav filter=lfs diff=lfs merge=lfs -text
|
| 969 |
data_tts/gcloud_tts_sample_1072.wav filter=lfs diff=lfs merge=lfs -text
|
| 970 |
data_tts/gcloud_tts_sample_1079.wav filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 968 |
data_tts/gcloud_tts_sample_1075.wav filter=lfs diff=lfs merge=lfs -text
|
| 969 |
data_tts/gcloud_tts_sample_1072.wav filter=lfs diff=lfs merge=lfs -text
|
| 970 |
data_tts/gcloud_tts_sample_1079.wav filter=lfs diff=lfs merge=lfs -text
|
| 971 |
+
data_tts/gcloud_tts_sample_107.wav filter=lfs diff=lfs merge=lfs -text
|
| 972 |
+
data_tts/gcloud_tts_sample_1065.wav filter=lfs diff=lfs merge=lfs -text
|
| 973 |
+
data_tts/gcloud_tts_sample_1078.wav filter=lfs diff=lfs merge=lfs -text
|
| 974 |
+
data_tts/gcloud_tts_sample_1067.wav filter=lfs diff=lfs merge=lfs -text
|
| 975 |
+
data_tts/gcloud_tts_sample_1082.wav filter=lfs diff=lfs merge=lfs -text
|
| 976 |
+
data_tts/gcloud_tts_sample_1080.wav filter=lfs diff=lfs merge=lfs -text
|
Extracting_tokenizer_dir_from_Nemo_model.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Run this script FIRST to extract the tokenizer from the .nemo file
|
| 3 |
+
This creates the tokenizer folder that the training script needs
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
import tarfile
|
| 7 |
+
import zipfile
|
| 8 |
+
import shutil
|
| 9 |
+
|
| 10 |
+
MODEL_PATH = "stt_ar_fastconformer_hybrid_large_pcd_v1.0.nemo"
|
| 11 |
+
OUTPUT_DIR = "tokenizer"
|
| 12 |
+
|
| 13 |
+
print("🔹 Detecting .nemo file format...")
|
| 14 |
+
|
| 15 |
+
def try_extract_tokenizer():
    """Open MODEL_PATH with each supported archive reader until one works.

    Readers are tried in a fixed order: plain tar, gzipped tar, ZIP and
    finally tarfile's own format auto-detection. Any exception raised while
    opening the archive *or* while the extraction helper runs is reported
    and the next reader is tried.

    Returns:
        The value returned by ``extract_from_tar`` / ``extract_from_zip``
        for the first reader that completed without raising, or ``False``
        when every reader raised.
    """
    # (progress label, failure label, archive opener, extraction helper)
    attempts = (
        ("Regular tar format", "Not a regular tar",
         lambda: tarfile.open(MODEL_PATH, 'r:'), extract_from_tar),
        ("Gzipped tar format", "Not gzipped tar",
         lambda: tarfile.open(MODEL_PATH, 'r:gz'), extract_from_tar),
        ("ZIP format", "Not a ZIP file",
         lambda: zipfile.ZipFile(MODEL_PATH, 'r'), extract_from_zip),
        ("Auto-detect format", "Auto-detect failed",
         lambda: tarfile.open(MODEL_PATH, 'r:*'), extract_from_tar),
    )

    for trying, failed, open_archive, extract in attempts:
        try:
            print(f"Trying: {trying}...")
            with open_archive() as archive:
                return extract(archive)
        except Exception as e:
            print(f" ✗ {failed}: {e}")

    return False
|
| 51 |
+
|
| 52 |
+
def extract_from_tar(tar, output_dir=None):
    """Copy tokenizer-related regular files out of an open tar archive.

    A member is selected when its path contains "tokenizer"
    (case-insensitive). Each selected regular file is streamed directly into
    the output directory under its base name, so member paths such as
    ``../x`` or absolute paths can never write outside that directory and no
    temporary extraction folder is left behind if something fails.

    Args:
        tar: An open ``tarfile.TarFile``.
        output_dir: Destination directory. Defaults to the module-level
            ``OUTPUT_DIR`` when omitted.

    Returns:
        ``True`` when at least one file was written. ``False`` when the
        archive has no matching member (a short listing of the archive is
        printed) or when the matches contained no regular file.
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR

    members = tar.getmembers()
    tokenizer_files = [m for m in members if 'tokenizer' in m.name.lower()]

    if not tokenizer_files:
        print("\n📋 Available files in archive:")
        for member in members[:20]:  # Show first 20
            print(f" - {member.name}")
        if len(members) > 20:
            print(f" ... and {len(members) - 20} more files")
        return False

    os.makedirs(output_dir, exist_ok=True)

    extracted = 0
    for member in tokenizer_files:
        # Directories, links and devices carry no payload to copy.
        if not member.isfile():
            continue
        filename = os.path.basename(member.name)
        if not filename:
            continue
        source = tar.extractfile(member)
        if source is None:
            continue
        with source, open(os.path.join(output_dir, filename), "wb") as target:
            shutil.copyfileobj(source, target)
        print(f"✅ Extracted: {filename}")
        extracted += 1

    if not extracted:
        print("⚠️ Matching archive entries contained no regular files")

    return extracted > 0
|
| 82 |
+
|
| 83 |
+
def extract_from_zip(zf, output_dir=None):
    """Copy tokenizer-related files out of an open ZIP archive.

    An entry is selected when its name contains "tokenizer"
    (case-insensitive). Each selected file entry is streamed directly into
    the output directory under its base name; no temporary extraction folder
    is created, so nothing is left behind if copying fails part-way.

    Args:
        zf: An open ``zipfile.ZipFile``.
        output_dir: Destination directory. Defaults to the module-level
            ``OUTPUT_DIR`` when omitted.

    Returns:
        ``True`` when at least one file was written. ``False`` when the
        archive has no matching entry (a short listing of the archive is
        printed) or when the matches were all directory entries.
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR

    names = zf.namelist()
    tokenizer_files = [n for n in names if 'tokenizer' in n.lower()]

    if not tokenizer_files:
        print("\n📋 Available files in archive:")
        for name in names[:20]:
            print(f" - {name}")
        if len(names) > 20:
            print(f" ... and {len(names) - 20} more files")
        return False

    os.makedirs(output_dir, exist_ok=True)

    extracted = 0
    for name in tokenizer_files:
        filename = os.path.basename(name)
        # Directory entries end with "/" and therefore have no base name.
        if name.endswith("/") or not filename:
            continue
        with zf.open(name) as source, \
                open(os.path.join(output_dir, filename), "wb") as target:
            shutil.copyfileobj(source, target)
        print(f"✅ Extracted: {filename}")
        extracted += 1

    if not extracted:
        print("⚠️ Matching archive entries contained no regular files")

    return extracted > 0
|
| 113 |
+
|
| 114 |
+
# Run the extraction once and report the outcome to the user.
success = try_extract_tokenizer()

if not success:
    # Nothing was extracted: tell the user that training can still proceed.
    print("\n❌ Could not extract tokenizer from .nemo file")
    print("\n🔧 Alternative solution: The training script will use the embedded tokenizer")
    print(" No action needed - proceed with training!")
else:
    # List what ended up in the output directory.
    print(f"\n✅ Tokenizer extracted to: {OUTPUT_DIR}")
    print("\n📁 Tokenizer files:")
    for file in os.listdir(OUTPUT_DIR):
        print(f" - {file}")
    print("\n✅ Now you can run the training script!")
|
StartingServer.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
python -m streamlit run app.py
|
| 2 |
+
python -m uvicorn app_api:app --host 0.0.0.0 --port 8070 --reload
|
UploadingtoGitlab.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
cd existing_repo
|
| 2 |
+
git remote add origin https://gitlab.expertflow.com/bot/ai/contextual_asr.git
|
| 3 |
+
git branch -M main
|
| 4 |
+
git push -uf origin main
|
WER_CER_eval.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import torch
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import Levenshtein as lev
|
| 6 |
+
from nemo.collections.asr.models import EncDecHybridRNNTCTCBPEModel
|
| 7 |
+
from nemo.collections.asr.metrics.wer import word_error_rate # ✅ Keep this
|
| 8 |
+
|
| 9 |
+
# ==========================
|
| 10 |
+
# CONFIGURATION
|
| 11 |
+
# ==========================
|
| 12 |
+
MODEL_PATH = "output_finetuned/finetuned_model_best.nemo"
|
| 13 |
+
EVAL_MANIFEST = "eval_manifest.jsonl"
|
| 14 |
+
|
| 15 |
+
# ==========================
|
| 16 |
+
# LOAD MODEL
|
| 17 |
+
# ==========================
|
| 18 |
+
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Loading model on: {device}")

try:
    model = EncDecHybridRNNTCTCBPEModel.restore_from(restore_path=MODEL_PATH, map_location=device)
    model = model.to(device)
    model.eval()
    print("✅ Model loaded successfully.")
except Exception as e:
    print(f"❌ Failed to load model: {e}")
    # Exit with a non-zero status so shells and CI see the failure. The
    # previous exit() helper reported status 0 and is only injected by the
    # ``site`` module, so it is not guaranteed to exist.
    raise SystemExit(1)
|
| 29 |
+
|
| 30 |
+
# ==========================
|
| 31 |
+
# LOAD MANIFEST
|
| 32 |
+
# ==========================
|
| 33 |
+
def load_manifest(manifest_path):
    """Load (audio path, transcript) pairs from a JSONL manifest file.

    Each non-blank line must be a JSON object with an ``audio_filepath``
    string and a ``text`` string. A row is kept only when the audio file
    exists on disk and the stripped transcript is non-empty; every other row
    is reported and skipped instead of aborting the whole load.

    Args:
        manifest_path: Path to the UTF-8 encoded JSONL manifest.

    Returns:
        A list of ``(audio_path, text)`` tuples in file order, with ``text``
        stripped of surrounding whitespace.
    """
    data = []
    with open(manifest_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank separator / trailing lines are not entries.
                continue
            try:
                item = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"❌ Invalid JSON line: {e}")
                continue

            if not isinstance(item, dict):
                print(f"⚠️ Skipping invalid entry: {item!r}")
                continue

            audio_path = item.get("audio_filepath")
            text = item.get("text")
            # A missing or null transcript counts as empty.
            text = text.strip() if isinstance(text, str) else ""

            if isinstance(audio_path, str) and os.path.exists(audio_path) and text:
                data.append((audio_path, text))
            else:
                print(f"⚠️ Skipping invalid entry: {audio_path}")
    print(f"\n📁 Loaded {len(data)} valid samples from manifest.")
    return data
|
| 50 |
+
|
| 51 |
+
# ==========================
|
| 52 |
+
# CER FUNCTION
|
| 53 |
+
# ==========================
|
| 54 |
+
def calculate_cer(reference, hypothesis):
    """Return the character error rate of *hypothesis* against *reference*.

    Space characters are removed from both strings, then the Levenshtein
    edit distance is divided by the length of the space-free reference.
    A reference that is empty after removing spaces yields 0.0.
    """
    ref_chars = reference.replace(" ", "")
    hyp_chars = hypothesis.replace(" ", "")
    if not ref_chars:
        return 0.0
    edits = lev.distance(ref_chars, hyp_chars)
    return edits / len(ref_chars)
|
| 61 |
+
|
| 62 |
+
# ==========================
|
| 63 |
+
# EVALUATION FUNCTION
|
| 64 |
+
# ==========================
|
| 65 |
+
def _transcript_text(output):
    """Reduce the result of ``model.transcribe([one_file])`` to a string."""
    # The result may be a tuple whose first element holds the predictions.
    if isinstance(output, tuple):
        output = output[0]
    if isinstance(output, list):
        output = output[0] if output else ""
    # Presumably some NeMo versions return hypothesis objects that carry the
    # transcript in ``.text`` (TODO confirm); plain strings pass through.
    text = getattr(output, "text", output)
    if isinstance(text, str):
        return text
    return "" if text is None else str(text)


def evaluate_model(model, dataset):
    """Transcribe every sample and report per-file and average WER / CER.

    Args:
        model: Object exposing ``transcribe(list_of_paths)``.
        dataset: Sequence of ``(audio_path, expected_text)`` pairs.

    Returns:
        ``(results, avg_wer, avg_cer)`` where ``results`` is a list of dicts
        with the keys ``audio``, ``expected``, ``predicted``, ``WER`` and
        ``CER``. An empty dataset returns ``([], 0.0, 0.0)`` instead of
        dividing by zero.
    """
    if not dataset:
        print("⚠️ Empty dataset: nothing to evaluate.")
        return [], 0.0, 0.0

    total_wer, total_cer = 0.0, 0.0
    results = []

    for i, (audio_path, expected_text) in enumerate(dataset, 1):
        print(f"\n🔍 [{i}/{len(dataset)}] Evaluating: {audio_path}")

        with torch.no_grad():
            output = model.transcribe([audio_path])
        prediction = _transcript_text(output)

        # Compute WER & CER
        wer = word_error_rate([expected_text], [prediction])
        cer = calculate_cer(expected_text, prediction)

        print(f"Expected : {expected_text}")
        print(f"Predicted: {prediction}")
        print(f"WER={wer:.3f}, CER={cer:.3f}")

        results.append({
            "audio": os.path.basename(audio_path),
            "expected": expected_text,
            "predicted": prediction,
            "WER": wer,
            "CER": cer,
        })

        total_wer += wer
        total_cer += cer

    avg_wer = total_wer / len(dataset)
    avg_cer = total_cer / len(dataset)
    # WER can exceed 1.0 when the hypothesis has many insertions; clamp so
    # the reported accuracy never goes negative.
    accuracy = max(0.0, 1 - avg_wer) * 100
    print("\n==============================")
    print(f"📊 Average WER: {avg_wer:.3f}")
    print(f"🎯 Accuracy: {accuracy:.2f}%")
    print(f"📊 Average CER: {avg_cer:.3f}")
    print("==============================")

    return results, avg_wer, avg_cer
|
| 109 |
+
|
| 110 |
+
# ==========================
|
| 111 |
+
# RUN EVALUATION
|
| 112 |
+
# ==========================
|
| 113 |
+
if __name__ == "__main__":
    dataset = load_manifest(EVAL_MANIFEST)
    if not dataset:
        print("❌ No valid data found in manifest.")
        # Signal the failure to the caller; exit() reported status 0 and
        # depends on the ``site`` module being loaded.
        raise SystemExit(1)

    results, avg_wer, avg_cer = evaluate_model(model, dataset)

    # One CSV row per evaluated file. "utf-8-sig" prepends a BOM, presumably
    # so spreadsheet tools detect the encoding of the Arabic text.
    df = pd.DataFrame(results)
    df.to_csv("evaluation_results.csv", index=False, encoding="utf-8-sig")
    print("\n💾 Results saved to: evaluation_results.csv")
|
WER_calc.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
from nemo.collections.asr.models import EncDecHybridRNNTCTCBPEModel
|
| 4 |
+
from nemo.collections.asr.metrics.wer import word_error_rate
|
| 5 |
+
|
| 6 |
+
# ==========================
|
| 7 |
+
# CONFIGURATION
|
| 8 |
+
# ==========================
|
| 9 |
+
MODEL_PATH = "output_finetuned/finetuned_model_best.nemo"
|
| 10 |
+
SAMPLE_AUDIO = "arabic_recording.wav"
|
| 11 |
+
EXPECTED_TEXT = "زيرو واحد واحد واحد واحد واحد واحد اتنين اربعة ستة"
|
| 12 |
+
|
| 13 |
+
# ==========================
|
| 14 |
+
# LOAD MODEL
|
| 15 |
+
# ==========================
|
| 16 |
+
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Loading model on: {device}")

try:
    model = EncDecHybridRNNTCTCBPEModel.restore_from(restore_path=MODEL_PATH, map_location=device)
    model.eval()
    print("✅ Model loaded successfully.")
except Exception as e:
    print(f"❌ Failed to load model: {e}")
    # Exit with a non-zero status so shells and CI see the failure. The
    # previous exit() helper reported status 0 and is only injected by the
    # ``site`` module, so it is not guaranteed to exist.
    raise SystemExit(1)
|
| 26 |
+
|
| 27 |
+
# ==========================
|
| 28 |
+
# TEST FUNCTION
|
| 29 |
+
# ==========================
|
| 30 |
+
def test_model(model, sample_audio, expected_text):
    """Transcribe one audio file and print its WER against *expected_text*.

    Args:
        model: Object exposing ``transcribe(list_of_paths)``.
        sample_audio: Path of the audio file to transcribe.
        expected_text: Reference transcript.

    Returns:
        ``(prediction, wer)``. When *sample_audio* does not exist the result
        is ``(None, None)`` so callers can always unpack two values.
    """
    if not os.path.exists(sample_audio):
        print(f"❌ Audio file not found: {sample_audio}")
        return None, None

    print(f"\n🔍 Testing on: {sample_audio}")

    # Transcribe
    with torch.no_grad():
        output = model.transcribe([sample_audio])

    # The result may be a tuple whose first element holds the predictions.
    if isinstance(output, tuple):
        output = output[0]
    if isinstance(output, list):
        output = output[0] if output else ""
    # Presumably some NeMo versions return hypothesis objects that carry the
    # transcript in ``.text`` (TODO confirm); plain strings pass through.
    prediction = getattr(output, "text", output)
    if not isinstance(prediction, str):
        prediction = "" if prediction is None else str(prediction)

    # Display results
    print(f"\nPredicted: {prediction}")
    print(f"Expected : {expected_text}")

    # Compute WER
    wer = word_error_rate([expected_text], [prediction])
    print(f"\n📊 Word Error Rate (WER): {wer:.3f}")
    return prediction, wer


# The name starts with "test_" but this is a script helper that needs real
# arguments; opt out of pytest collection.
test_model.__test__ = False
|
| 59 |
+
|
| 60 |
+
# ==========================
|
| 61 |
+
# RUN TEST
|
| 62 |
+
# ==========================
|
| 63 |
+
if __name__ == "__main__":
    outcome = test_model(model, SAMPLE_AUDIO, EXPECTED_TEXT)
    # test_model produces no usable result when the audio file is missing;
    # stop with a failure status instead of crashing on the unpacking below.
    if outcome is None or outcome[0] is None:
        raise SystemExit(1)
    prediction, wer = outcome
|
app_api_2.py
ADDED
|
@@ -0,0 +1,345 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# from fastapi import FastAPI, File, UploadFile
|
| 2 |
+
# from fastapi.responses import JSONResponse
|
| 3 |
+
# import uvicorn
|
| 4 |
+
# import tempfile
|
| 5 |
+
# import nemo.collections.asr as nemo_asr
|
| 6 |
+
# import re
|
| 7 |
+
# import os
|
| 8 |
+
# import librosa
|
| 9 |
+
# import soundfile as sf
|
| 10 |
+
|
| 11 |
+
# # ===== Arabic number mapping (expanded) =====
|
| 12 |
+
# arabic_numbers = {
|
| 13 |
+
# "صفر": "0", "زيرو": "0", "٠": "0", "زيو": "0", "زير": "0",
|
| 14 |
+
# "واحد": "1", "واحدة": "1", "١": "1",
|
| 15 |
+
# "اتنين": "2", "اثنين": "2", "اثنان": "2", "٢": "2",
|
| 16 |
+
# "تلاتة": "3", "ثلاثة": "3", "٣": "3","ثلاث": "3","تلات": "3",
|
| 17 |
+
# "اربعة": "4", "أربعة": "4", "٤": "4",
|
| 18 |
+
# "خمسة": "5", "٥": "5","خمسه": "5",
|
| 19 |
+
# "ستة": "6", "٦": "6",
|
| 20 |
+
# "سبعة": "7", "٧": "7","سبعه": "7",
|
| 21 |
+
# "تمانية": "8", "ثمانية": "8", "٨": "8",
|
| 22 |
+
# "تسعة": "9", "٩": "9",
|
| 23 |
+
# "عشرة": "10", "١٠": "10","عشره": "10",
|
| 24 |
+
# "حداشر": "11", "احد عشر": "11", "احداشر": "11",
|
| 25 |
+
# "اتناشر": "12", "اثنا عشر": "12",
|
| 26 |
+
# "تلتاشر": "13", "ثلاثة عشر": "13",
|
| 27 |
+
# "اربعتاشر": "14", "أربعة عشر": "14",
|
| 28 |
+
# "خمستاشر": "15", "خمسة عشر": "15",
|
| 29 |
+
# "ستاشر": "16", "ستة عشر": "16",
|
| 30 |
+
# "سبعتاشر": "17", "سبعة عشر": "17",
|
| 31 |
+
# "طمنتاشر": "18", "ثمانية عشر": "18",
|
| 32 |
+
# "تسعتاشر": "19", "تسعة عشر": "19",
|
| 33 |
+
# "عشرين": "20", "٢٠": "20",
|
| 34 |
+
# "تلاتين": "30", "ثلاثين": "30", "٣٠": "30",
|
| 35 |
+
# "اربعين": "40", "أربعين": "40", "٤٠": "40",
|
| 36 |
+
# "خمسين": "50", "٥٠": "50",
|
| 37 |
+
# "ستين": "60", "٦٠": "60",
|
| 38 |
+
# "سبعين": "70", "٧٠": "70",
|
| 39 |
+
# "تمانين": "80", "ثمانين": "80", "٨٠": "80",
|
| 40 |
+
# "تسعين": "90", "٩٠": "90",
|
| 41 |
+
# "مية": "100", "مائة": "100", "مئة": "100", "١٠٠": "100",
|
| 42 |
+
# "ميتين": "200", "مائتين": "200",
|
| 43 |
+
# "تلاتمية": "300", "ثلاثمائة": "300",
|
| 44 |
+
# "اربعمية": "400", "أربعمائة": "400",
|
| 45 |
+
# "خمسمية": "500", "خمسمائة": "500",
|
| 46 |
+
# "ستمية": "600", "ستمائة": "600",
|
| 47 |
+
# "سبعمية": "700", "سبعمائة": "700",
|
| 48 |
+
# "تمانمية": "800", "ثمانمائة": "800",
|
| 49 |
+
# "تسعمية": "900", "تسعمائة": "900",
|
| 50 |
+
# "ألف": "1000", "الف": "1000", "١٠٠٠": "1000",
|
| 51 |
+
# "ألفين": "2000", "الفين": "2000",
|
| 52 |
+
# "تلات تلاف": "3000", "ثلاثة آلاف": "3000",
|
| 53 |
+
# "اربعة آلاف": "4000", "أربعة آلاف": "4000",
|
| 54 |
+
# "خمسة آلاف": "5000",
|
| 55 |
+
# "ستة آلاف": "6000",
|
| 56 |
+
# "سبعة آلاف": "7000",
|
| 57 |
+
# "تمانية آلاف": "8000", "ثمانية آلاف": "8000",
|
| 58 |
+
# "تسعة آلاف": "9000",
|
| 59 |
+
# "عشرة آلاف": "10000",
|
| 60 |
+
# "مية ألف": "100000", "مائة ألف": "100000",
|
| 61 |
+
# "مليون": "1000000", "ملايين": "1000000",
|
| 62 |
+
# "مليار": "1000000000"
|
| 63 |
+
# }
|
| 64 |
+
|
| 65 |
+
# # ===== Helpers =====
|
| 66 |
+
# def normalize_arabic(text: str) -> str:
|
| 67 |
+
# diacritics = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
|
| 68 |
+
# text = re.sub(diacritics, '', text)
|
| 69 |
+
# text = re.sub(r'[إأآا]', 'ا', text)
|
| 70 |
+
# text = re.sub(r'ى', 'ي', text)
|
| 71 |
+
# text = re.sub(r'ؤ', 'و', text)
|
| 72 |
+
# text = re.sub(r'ئ', 'ي', text)
|
| 73 |
+
# text = re.sub(r'ة', 'ه', text)
|
| 74 |
+
# return text
|
| 75 |
+
|
| 76 |
+
# def replace_arabic_numbers(text: str) -> str:
|
| 77 |
+
# for word, digit in arabic_numbers.items():
|
| 78 |
+
# text = re.sub(fr"(?:^|\s){word}(?:$|\s)", f" {digit} ", text)
|
| 79 |
+
# return " ".join(text.split())
|
| 80 |
+
|
| 81 |
+
# def join_digit_sequences(text: str) -> str:
|
| 82 |
+
# tokens = text.split()
|
| 83 |
+
# out, buffer = [], []
|
| 84 |
+
# for tok in tokens:
|
| 85 |
+
# if tok.isdigit() and len(tok) == 1:
|
| 86 |
+
# buffer.append(tok)
|
| 87 |
+
# else:
|
| 88 |
+
# if buffer:
|
| 89 |
+
# out.append("".join(buffer))
|
| 90 |
+
# buffer = []
|
| 91 |
+
# out.append(tok)
|
| 92 |
+
# if buffer:
|
| 93 |
+
# out.append("".join(buffer))
|
| 94 |
+
# return " ".join(out)
|
| 95 |
+
|
| 96 |
+
# def ensure_16k_wav(input_path, output_path):
|
| 97 |
+
# y, sr = librosa.load(input_path, sr=16000, mono=True)
|
| 98 |
+
# sf.write(output_path, y, 16000)
|
| 99 |
+
|
| 100 |
+
# # ===== FastAPI app =====
|
| 101 |
+
# app = FastAPI(title="Arabic ASR API", description="ASR API with NeMo and Arabic digit conversion")
|
| 102 |
+
|
| 103 |
+
# @app.on_event("startup")
|
| 104 |
+
# def load_model():
|
| 105 |
+
# global asr_model
|
| 106 |
+
# model_path = "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Peter_Projects/NP_Detection_Nvidia_conformer/asr-egyptian-nemo-v2.0.nemo"
|
| 107 |
+
# asr_model = nemo_asr.models.EncDecCTCModel.restore_from(model_path)
|
| 108 |
+
|
| 109 |
+
# @app.post("/transcribe")
|
| 110 |
+
# async def transcribe_audio(file: UploadFile = File(...)):
|
| 111 |
+
# with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
|
| 112 |
+
# tmp.write(await file.read())
|
| 113 |
+
# tmp_path = tmp.name
|
| 114 |
+
|
| 115 |
+
# # Resample to 16kHz
|
| 116 |
+
# resampled_path = tmp_path.replace(".wav", "_16k.wav")
|
| 117 |
+
# ensure_16k_wav(tmp_path, resampled_path)
|
| 118 |
+
|
| 119 |
+
# try:
|
| 120 |
+
# result = asr_model.transcribe([resampled_path])
|
| 121 |
+
# raw_text = result[0].text
|
| 122 |
+
|
| 123 |
+
# raw_text = normalize_arabic(raw_text)
|
| 124 |
+
# cleaned_text = replace_arabic_numbers(raw_text)
|
| 125 |
+
# cleaned_text = join_digit_sequences(cleaned_text)
|
| 126 |
+
|
| 127 |
+
# return JSONResponse(content={"transcription": cleaned_text})
|
| 128 |
+
|
| 129 |
+
# finally:
|
| 130 |
+
# os.remove(tmp_path)
|
| 131 |
+
# if os.path.exists(resampled_path):
|
| 132 |
+
# os.remove(resampled_path)
|
| 133 |
+
|
| 134 |
+
# @app.post("/transcribe-bytes")
|
| 135 |
+
# async def transcribe_audio_bytes(audio_bytes: bytes = File(...)):
|
| 136 |
+
# with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
|
| 137 |
+
# tmp.write(audio_bytes)
|
| 138 |
+
# tmp_path = tmp.name
|
| 139 |
+
|
| 140 |
+
# resampled_path = tmp_path.replace(".wav", "_16k.wav")
|
| 141 |
+
# ensure_16k_wav(tmp_path, resampled_path)
|
| 142 |
+
|
| 143 |
+
# try:
|
| 144 |
+
# result = asr_model.transcribe([resampled_path])
|
| 145 |
+
# raw_text = result[0].text
|
| 146 |
+
|
| 147 |
+
# raw_text = normalize_arabic(raw_text)
|
| 148 |
+
# cleaned_text = replace_arabic_numbers(raw_text)
|
| 149 |
+
# cleaned_text = join_digit_sequences(cleaned_text)
|
| 150 |
+
|
| 151 |
+
# return JSONResponse(content={"transcription": cleaned_text})
|
| 152 |
+
|
| 153 |
+
# finally:
|
| 154 |
+
# os.remove(tmp_path)
|
| 155 |
+
# if os.path.exists(resampled_path):
|
| 156 |
+
# os.remove(resampled_path)
|
| 157 |
+
|
| 158 |
+
# if __name__ == "__main__":
|
| 159 |
+
# uvicorn.run(app, host="0.0.0.0", port=8000, reload=True)
|
| 160 |
+
from fastapi import FastAPI, File, UploadFile
|
| 161 |
+
from fastapi.responses import JSONResponse
|
| 162 |
+
import uvicorn
|
| 163 |
+
import tempfile
|
| 164 |
+
import nemo.collections.asr as nemo_asr
|
| 165 |
+
import re
|
| 166 |
+
import os
|
| 167 |
+
import librosa
|
| 168 |
+
import soundfile as sf
|
| 169 |
+
from omegaconf import OmegaConf
|
| 170 |
+
# ===== Arabic number-word mapping (digits 0-9) =====
# The request handlers run normalize_arabic() before looking words up here,
# which rewrites the hamza forms of alef to "ا" and a final "ة" to "ه".
# Every spelling therefore also needs its normalized form as a key,
# otherwise it can never match. Arabic-Indic digit characters map to their
# ASCII digits.
arabic_numbers = {
    "صفر": "0", "زيرو": "0", "زيو": "0", "زير": "0", "٠": "0",
    "واحد": "1", "واحدة": "1", "واحده": "1", "١": "1",
    "اثنين": "2", "اثنان": "2", "اتنين": "2", "٢": "2",
    "ثلاثة": "3", "ثلاث": "3", "تلاتة": "3", "تلات": "3", "ثلاثه": "3", "تلاته": "3", "٣": "3",
    "أربعة": "4", "اربعة": "4", "٤": "4", "أربعه": "4", "اربعه": "4",
    "خمسة": "5", "خمسه": "5", "٥": "5",
    "ستة": "6", "ست": "6", "٦": "6", "سته": "6",
    "سبعة": "7", "سبعه": "7", "٧": "7",
    "ثمانية": "8", "تمانية": "8", "تمنية": "8", "ثمان": "8", "٨": "8", "تمانيه": "8",
    "ثمانيه": "8", "تمنيه": "8",
    "تسعة": "9", "تسعه": "9", "٩": "9",
}
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
# ===== Helpers =====
|
| 186 |
+
def normalize_arabic(text: str) -> str:
    """Strip diacritics and fold common Arabic letter variants.

    Characters in U+0617-U+061A and U+064B-U+0652 are removed. Then the
    alef variants (إ أ آ) become ا, ى becomes ي, ؤ becomes و, ئ becomes ي
    and ة becomes ه. All other characters are left unchanged.
    """
    without_marks = re.sub(r'[\u0617-\u061A\u064B-\u0652]', '', text)
    # The letter substitutions are independent single-character swaps, so
    # one translation pass gives the same result as applying them in turn.
    letter_folds = str.maketrans({
        'إ': 'ا',
        'أ': 'ا',
        'آ': 'ا',
        'ى': 'ي',
        'ؤ': 'و',
        'ئ': 'ي',
        'ة': 'ه',
    })
    return without_marks.translate(letter_folds)
|
| 195 |
+
|
| 196 |
+
def replace_arabic_numbers(text: str) -> str:
    """Replace each whole-word key of ``arabic_numbers`` with its digit.

    Keys are applied one after another in the mapping's iteration order;
    ``\\b`` anchors keep a key from matching inside a longer word.
    """
    converted = text
    for spoken, numeral in arabic_numbers.items():
        whole_word = rf'\b{re.escape(spoken)}\b'
        converted = re.sub(whole_word, numeral, converted)
    return converted
|
| 201 |
+
|
| 202 |
+
def join_digit_sequences(text: str) -> str:
    """Glue runs of adjacent all-digit tokens into single tokens.

    The text is split on whitespace; consecutive tokens for which
    ``str.isdigit()`` is true are concatenated, all other tokens are kept
    as they are, and the result is re-joined with single spaces.
    """
    merged = []
    run = ""  # digits collected since the last non-digit token
    for piece in text.split():
        if piece.isdigit():
            run += piece
            continue
        if run:
            merged.append(run)
            run = ""
        merged.append(piece)
    if run:
        merged.append(run)
    return " ".join(merged)
|
| 217 |
+
|
| 218 |
+
def ensure_16k_wav(input_path, output_path):
    """Load *input_path* as mono 16 kHz audio and write it to *output_path*."""
    target_rate = 16000
    samples, _ = librosa.load(input_path, sr=target_rate, mono=True)
    sf.write(output_path, samples, target_rate)
|
| 221 |
+
|
| 222 |
+
# ===== FastAPI app =====
|
| 223 |
+
app = FastAPI(title="Arabic ASR API", description="ASR API with NeMo and Arabic/English digit conversion")
|
| 224 |
+
|
| 225 |
+
@app.on_event("startup")
def load_model():
    """Restore the fine-tuned model into ``asr_model`` and set up decoding.

    Runs on application startup. The checkpoint's decoding config is
    unlocked, edited in place, locked again and handed to
    ``change_decoding_strategy``.
    """
    global asr_model
    model_path = "output_finetuned/finetuned_model_best.nemo"
    asr_model = nemo_asr.models.EncDecHybridRNNTCTCModel.restore_from(model_path)
    # Dump the greedy section so the available keys are visible in the log.
    print("Available greedy parameters:")
    print(OmegaConf.to_yaml(asr_model.cfg.decoding.greedy))

    # ===== STEP 3: Configure for LITERAL transcription =====
    # NOTE(review): the log lines talk about greedy decoding, but the
    # strategy selected below is "maes" and beam settings are written too.
    # Confirm which of these settings the decoder actually reads.
    print("🔍 Configuring greedy decoding for literal output...")

    # Set struct mode to False temporarily to allow modifications
    OmegaConf.set_struct(asr_model.cfg.decoding, False)
    OmegaConf.set_struct(asr_model.cfg.decoding.greedy, False)

    decoding_cfg = asr_model.cfg.decoding
    decoding_cfg.strategy = "maes"

    # Best effort: this key may be rejected by the config. Catch only
    # ordinary errors (a bare except would also swallow KeyboardInterrupt
    # and SystemExit) and report the reason.
    try:
        decoding_cfg.greedy.max_symbols_per_step = 300
        print(f"✓ max_symbols_per_step: {decoding_cfg.greedy.max_symbols_per_step}")
    except Exception as exc:
        print(f"⚠ Could not set max_symbols_per_step: {exc}")

    decoding_cfg.greedy.max_symbols = 500
    decoding_cfg.greedy.loop_labels = True
    decoding_cfg.greedy.preserve_alignments = True
    decoding_cfg.preserve_alignments = True
    decoding_cfg.compute_timestamps = True
    decoding_cfg.temperature = 1.3

    decoding_cfg.beam.beam_size = 64
    decoding_cfg.beam.softmax_temperature = 1.3
    decoding_cfg.beam.search_type = "beam"
    print(f"✓ max_symbols: {decoding_cfg.greedy.max_symbols}")
    print(f"✓ loop_labels: {decoding_cfg.greedy.loop_labels}")
    print(f"✓ temperature: {decoding_cfg.temperature}")

    # Re-enable struct mode
    OmegaConf.set_struct(asr_model.cfg.decoding, True)
    OmegaConf.set_struct(asr_model.cfg.decoding.greedy, True)

    # Apply configuration
    asr_model.change_decoding_strategy(decoding_cfg)
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
@app.post("/transcribe")
async def transcribe_audio(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file and return the cleaned text.

    The upload is written to a temporary ``.wav`` file, resampled to 16 kHz
    mono, transcribed with ``asr_model``, normalized, and number words are
    converted to joined digit sequences.

    Args:
        file: The uploaded audio file.

    Returns:
        ``JSONResponse`` with a single ``"transcription"`` key.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        tmp.write(await file.read())
        tmp_path = tmp.name

    # Swap only the trailing suffix; str.replace would also rewrite a ".wav"
    # that happens to occur earlier in the temp directory path.
    resampled_path = tmp_path[:-len(".wav")] + "_16k.wav"

    try:
        # Resampling sits inside the try block so that an undecodable upload
        # still gets its temporary file removed by the finally clause.
        ensure_16k_wav(tmp_path, resampled_path)
        result = asr_model.transcribe([resampled_path])

        # The result may be a tuple whose first element holds the predictions.
        if isinstance(result, tuple):
            result = result[0]
        if isinstance(result, list):
            result = result[0] if result else ""
        # Presumably some NeMo versions return hypothesis objects that carry
        # the transcript in ``.text`` (TODO confirm); strings pass through.
        raw_text = getattr(result, "text", result)
        if not isinstance(raw_text, str):
            raw_text = "" if raw_text is None else str(raw_text)

        # Normalize and replace Arabic numerals
        raw_text = normalize_arabic(raw_text)
        cleaned_text = replace_arabic_numbers(raw_text)
        cleaned_text = join_digit_sequences(cleaned_text)

        print("📝 Cleaned Transcription:", cleaned_text)  # for debug
        return JSONResponse(content={"transcription": cleaned_text})

    finally:
        os.remove(tmp_path)
        if os.path.exists(resampled_path):
            os.remove(resampled_path)
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
@app.post("/transcribe-bytes")
async def transcribe_audio_bytes(audio_bytes: bytes = File(...)):
    """Transcribe raw audio bytes and return the cleaned text as JSON.

    The bytes are written to a temporary ``.wav`` file, passed through
    ``ensure_16k_wav`` to produce a second temporary file, and that file is
    handed to ``asr_model.transcribe``. The first transcription is then
    normalised (``normalize_arabic``, ``replace_arabic_numbers``,
    ``join_digit_sequences``) and returned under the ``"transcription"`` key.

    Both temporary files are removed whether or not any step fails.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        tmp.write(audio_bytes)
        tmp_path = tmp.name

    # Only the trailing extension is swapped, even if ".wav" appears elsewhere
    # in the temporary directory path.
    resampled_path = os.path.splitext(tmp_path)[0] + "_16k.wav"

    try:
        # Inside the try block so a conversion failure still cleans up tmp_path.
        ensure_16k_wav(tmp_path, resampled_path)
        result = asr_model.transcribe([resampled_path])

        # Robust extraction: unwrap a tuple first, then a list / nested list.
        if isinstance(result, tuple):
            result = result[0]
        if isinstance(result, list):
            first = result[0] if result else ""
            if isinstance(first, list):
                # if first element is also a list, flatten it
                first = first[0] if first else ""
            if isinstance(first, str):
                raw_text = first
            elif hasattr(first, "text"):  # sometimes result contains objects with 'text'
                raw_text = first.text
            else:
                raw_text = str(first)  # fallback to string
        else:
            raw_text = str(result)

        raw_text = normalize_arabic(raw_text)
        cleaned_text = replace_arabic_numbers(raw_text)
        cleaned_text = join_digit_sequences(cleaned_text)

        return JSONResponse(content={"transcription": cleaned_text})
    finally:
        os.remove(tmp_path)
        if os.path.exists(resampled_path):
            os.remove(resampled_path)
|
| 343 |
+
|
| 344 |
+
if __name__ == "__main__":
    # `reload=True` is only honoured when the application is passed as an
    # import string; with an app object uvicorn logs a warning and exits, so
    # the server never started when this file was run directly.
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
continue_finetuning_nemo.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import io
|
| 3 |
+
import json
|
| 4 |
+
import torch
|
| 5 |
+
from pytorch_lightning import Trainer
|
| 6 |
+
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
|
| 7 |
+
from nemo.collections.asr.models import EncDecHybridRNNTCTCBPEModel
|
| 8 |
+
from omegaconf import open_dict , DictConfig
|
| 9 |
+
|
| 10 |
+
# ============================================================
|
| 11 |
+
# Environment Fixes (Windows / CUDA)
|
| 12 |
+
# ============================================================
|
| 13 |
+
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
|
| 14 |
+
os.environ["NUMBA_CUDA_USE_NVIDIA_BINDING"] = "1"
|
| 15 |
+
os.environ["NUMBA_DISABLE_JIT"] = "0"
|
| 16 |
+
os.environ["NUMBA_CUDA_DRIVER"] = "cuda"
|
| 17 |
+
|
| 18 |
+
# Restrict CUDA to the first GPU (device 0, e.g. an RTX 3070)
|
| 19 |
+
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
| 20 |
+
|
| 21 |
+
# ============================================================
|
| 22 |
+
# UTF-8 Fix for Manifest
|
| 23 |
+
# ============================================================
|
| 24 |
+
manifest_path = "train_manifest.jsonl"
|
| 25 |
+
with io.open(manifest_path, 'r', encoding='utf-8', errors='ignore') as f:
|
| 26 |
+
content = f.read()
|
| 27 |
+
with io.open(manifest_path, 'w', encoding='utf-8') as f:
|
| 28 |
+
f.write(content)
|
| 29 |
+
print("✅ train_manifest.jsonl converted to UTF-8")
|
| 30 |
+
|
| 31 |
+
# Patch builtins.open for UTF-8
|
| 32 |
+
import builtins

_old_open = open


def open_utf8(file, *args, **kwargs):
    """Open *file* like the built-in ``open``, defaulting ``.jsonl`` text files to UTF-8.

    The UTF-8 default is applied only when all of the following hold:

    * ``file`` is a ``str`` ending in ``.jsonl``;
    * the caller did not supply an encoding, by keyword or positionally
      (``open(file, mode, buffering, encoding)``);
    * the mode is a text mode — binary modes reject an ``encoding`` argument.

    Every other call is forwarded to the original ``open`` unchanged.
    """
    if isinstance(file, str) and file.endswith('.jsonl'):
        mode = args[0] if args else kwargs.get('mode', 'r')
        # encoding is the third positional argument after `file`
        encoding_given = 'encoding' in kwargs or len(args) >= 3
        is_binary = isinstance(mode, str) and 'b' in mode
        if not encoding_given and not is_binary:
            kwargs['encoding'] = 'utf-8'
    return _old_open(file, *args, **kwargs)


# Install the wrapper process-wide so code that calls plain open() on a
# .jsonl path gets the UTF-8 default as well.
builtins.open = open_utf8
|
| 39 |
+
|
| 40 |
+
# ============================================================
|
| 41 |
+
# Validate Manifest
|
| 42 |
+
# ============================================================
|
| 43 |
+
def validate_manifest(manifest_path):
    """Count the usable entries in a JSON-lines manifest and report the rest.

    An entry is counted when its line parses as JSON, its ``audio_filepath``
    exists on disk, and its ``text`` is present and not blank. Every other
    line is printed with its line number, the reason, and its first 100
    characters.

    Args:
        manifest_path: Path of the UTF-8 encoded manifest file.

    Returns:
        The number of valid entries.
    """
    count = 0
    with open(manifest_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f, 1):
            try:
                item = json.loads(line.strip())
                # Explicit raises instead of `assert`, so the checks still run
                # when Python is started with -O.
                if not os.path.exists(item["audio_filepath"]):
                    raise ValueError(f"Missing: {item['audio_filepath']}")
                if "text" not in item or not item["text"].strip():
                    raise ValueError("Empty text")
                count += 1
            except Exception as e:
                # Malformed JSON, missing keys and failed checks are all
                # reported here; validation continues with the next line.
                print(f"❌ Line {i} error: {e}")
                print(f"   Content: {line[:100]}")
    print(f"✅ Valid entries: {count}")
    return count
|
| 57 |
+
|
| 58 |
+
valid_count = validate_manifest(manifest_path)
|
| 59 |
+
if valid_count == 0:
|
| 60 |
+
raise ValueError("No valid training samples found!")
|
| 61 |
+
|
| 62 |
+
# ============================================================
|
| 63 |
+
# Paths and Hyperparameters
|
| 64 |
+
# ============================================================
|
| 65 |
+
BASE_MODEL_PATH = "stt_ar_fastconformer_hybrid_large_pcd_v1.0.nemo"
|
| 66 |
+
SAVE_DIR = "output_finetuned"
|
| 67 |
+
LAST_CKPT = os.path.join(SAVE_DIR, "last.ckpt")
|
| 68 |
+
|
| 69 |
+
BATCH_SIZE = 4
|
| 70 |
+
ADDITIONAL_EPOCHS = 50
|
| 71 |
+
LEARNING_RATE = 1e-5
|
| 72 |
+
WARMUP_STEPS = 500
|
| 73 |
+
WEIGHT_DECAY = 0.00001
|
| 74 |
+
|
| 75 |
+
os.makedirs(SAVE_DIR, exist_ok=True)
|
| 76 |
+
|
| 77 |
+
# ============================================================
|
| 78 |
+
# Load Model
|
| 79 |
+
# ============================================================
|
| 80 |
+
print("🔹 Loading pretrained or last fine-tuned model...")
|
| 81 |
+
model = EncDecHybridRNNTCTCBPEModel.restore_from(BASE_MODEL_PATH)
|
| 82 |
+
|
| 83 |
+
# ============================================================
|
| 84 |
+
# Tokenizer Fix
|
| 85 |
+
# ============================================================
|
| 86 |
+
with open_dict(model.cfg):
|
| 87 |
+
tokenizer_dir = os.path.join(os.path.dirname(BASE_MODEL_PATH), "tokenizer")
|
| 88 |
+
os.makedirs(tokenizer_dir, exist_ok=True)
|
| 89 |
+
model.cfg.tokenizer.dir = tokenizer_dir
|
| 90 |
+
model.cfg.tokenizer.type = "bpe"
|
| 91 |
+
|
| 92 |
+
if 'validation_ds' in model.cfg:
|
| 93 |
+
model.cfg.validation_ds.manifest_filepath = None
|
| 94 |
+
if 'test_ds' in model.cfg:
|
| 95 |
+
model.cfg.test_ds.manifest_filepath = None
|
| 96 |
+
|
| 97 |
+
# ============================================================
|
| 98 |
+
# Setup Training Data
|
| 99 |
+
# ============================================================
|
| 100 |
+
train_ds_config = {
|
| 101 |
+
"manifest_filepath": manifest_path,
|
| 102 |
+
"batch_size": BATCH_SIZE,
|
| 103 |
+
"shuffle": True,
|
| 104 |
+
"num_workers": 0,
|
| 105 |
+
"pin_memory": False,
|
| 106 |
+
"sample_rate": 16000,
|
| 107 |
+
"max_duration": 20.0,
|
| 108 |
+
"min_duration": 0.5,
|
| 109 |
+
"trim_silence": True,
|
| 110 |
+
"use_start_end_token": True,
|
| 111 |
+
"normalize_transcripts": True,
|
| 112 |
+
"parser": "ar",
|
| 113 |
+
}
|
| 114 |
+
model.setup_training_data(train_ds_config)
|
| 115 |
+
|
| 116 |
+
# ============================================================
|
| 117 |
+
# Optimizer & Scheduler
|
| 118 |
+
# ============================================================
|
| 119 |
+
with open_dict(model.cfg):
|
| 120 |
+
model.cfg.optim.name = "adamw"
|
| 121 |
+
model.cfg.optim.lr = LEARNING_RATE
|
| 122 |
+
model.cfg.optim.betas = [0.9, 0.98]
|
| 123 |
+
model.cfg.optim.weight_decay = WEIGHT_DECAY
|
| 124 |
+
model.cfg.optim.eps = 1e-8
|
| 125 |
+
model.cfg.optim.sched = {
|
| 126 |
+
"name": "CosineAnnealing",
|
| 127 |
+
"warmup_steps": WARMUP_STEPS,
|
| 128 |
+
"min_lr": 1e-7,
|
| 129 |
+
"last_epoch": -1,
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
# ============================================================
|
| 133 |
+
# Callbacks
|
| 134 |
+
# ============================================================
|
| 135 |
+
checkpoint_callback = ModelCheckpoint(
|
| 136 |
+
dirpath=SAVE_DIR,
|
| 137 |
+
filename='continued-{epoch:02d}-{train_loss:.4f}',
|
| 138 |
+
save_top_k=3,
|
| 139 |
+
monitor='train_loss',
|
| 140 |
+
mode='min',
|
| 141 |
+
save_last=True,
|
| 142 |
+
)
|
| 143 |
+
|
| 144 |
+
early_stop_callback = EarlyStopping(
|
| 145 |
+
monitor='train_loss',
|
| 146 |
+
patience=20,
|
| 147 |
+
mode='min',
|
| 148 |
+
verbose=True,
|
| 149 |
+
)
|
| 150 |
+
|
| 151 |
+
lr_monitor = LearningRateMonitor(logging_interval='step')
|
| 152 |
+
|
| 153 |
+
# ============================================================
|
| 154 |
+
# Determine Max Epochs Based on Last Checkpoint
|
| 155 |
+
# ============================================================
|
| 156 |
+
# ============================================================
|
| 157 |
+
# Allow loading full NeMo checkpoint (trusted source)
|
| 158 |
+
# ============================================================
|
| 159 |
+
torch.serialization.add_safe_globals([DictConfig])
|
| 160 |
+
|
| 161 |
+
if os.path.exists(LAST_CKPT):
|
| 162 |
+
ckpt_data = torch.load(LAST_CKPT, map_location="cpu", weights_only=False)
|
| 163 |
+
last_epoch = ckpt_data.get("epoch", 0)
|
| 164 |
+
new_max_epochs = last_epoch + ADDITIONAL_EPOCHS
|
| 165 |
+
print(f"🧩 Last checkpoint epoch: {last_epoch} → continuing up to {new_max_epochs} epochs total.")
|
| 166 |
+
else:
|
| 167 |
+
new_max_epochs = ADDITIONAL_EPOCHS
|
| 168 |
+
|
| 169 |
+
# ============================================================
|
| 170 |
+
# Trainer
|
| 171 |
+
# ============================================================
|
| 172 |
+
trainer = Trainer(
|
| 173 |
+
accelerator="gpu" if torch.cuda.is_available() else "cpu",
|
| 174 |
+
devices=1,
|
| 175 |
+
max_epochs=new_max_epochs,
|
| 176 |
+
log_every_n_steps=1,
|
| 177 |
+
enable_checkpointing=True,
|
| 178 |
+
default_root_dir=SAVE_DIR,
|
| 179 |
+
callbacks=[checkpoint_callback, early_stop_callback, lr_monitor],
|
| 180 |
+
gradient_clip_val=1.0,
|
| 181 |
+
accumulate_grad_batches=4,
|
| 182 |
+
)
|
| 183 |
+
|
| 184 |
+
# ============================================================
|
| 185 |
+
# Continue Training
|
| 186 |
+
# ============================================================
|
| 187 |
+
if os.path.exists(LAST_CKPT):
|
| 188 |
+
print(f"🚀 Continuing training from checkpoint: {LAST_CKPT}")
|
| 189 |
+
trainer.fit(model, ckpt_path=LAST_CKPT)
|
| 190 |
+
else:
|
| 191 |
+
print("⚠️ No checkpoint found, training from base model...")
|
| 192 |
+
trainer.fit(model)
|
| 193 |
+
|
| 194 |
+
# ============================================================
|
| 195 |
+
# Save Final Model
|
| 196 |
+
# ============================================================
|
| 197 |
+
final_model_path = os.path.join(SAVE_DIR, "finetuned_model_continued.nemo")
|
| 198 |
+
model.save_to(final_model_path)
|
| 199 |
+
print(f"\n✅ Continued fine-tuned model saved to: {final_model_path}")
|
converting_dataset_to_8khz.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import subprocess
|
| 4 |
+
import soundfile as sf
|
| 5 |
+
|
| 6 |
+
# ==============================
|
| 7 |
+
# CONFIGURATION
|
| 8 |
+
# ==============================
|
| 9 |
+
input_folder = "data_tts_evaluation"
|
| 10 |
+
output_folder = "data_tts_eval_8k_ulaw"
|
| 11 |
+
old_manifest = "eval_manifest.jsonl" # Optional: use old text references
|
| 12 |
+
new_manifest = "eval_manifest_8k_ulaw.jsonl"
|
| 13 |
+
|
| 14 |
+
# Create output folder if it doesn’t exist
|
| 15 |
+
os.makedirs(output_folder, exist_ok=True)
|
| 16 |
+
|
| 17 |
+
# Supported audio formats
|
| 18 |
+
valid_ext = (".wav", ".mp3", ".flac", ".ogg", ".m4a")
|
| 19 |
+
|
| 20 |
+
# ==============================
|
| 21 |
+
# Load Texts from Old Manifest (if exists)
|
| 22 |
+
# ==============================
|
| 23 |
+
# Maps an audio file's base name (no directory, no extension) to the
# reference text recorded for it in the old manifest.
text_map = {}
if os.path.exists(old_manifest):
    print(f"🔹 Loading existing manifest: {old_manifest}")
    with open(old_manifest, "r", encoding="utf-8") as f:
        for line in f:
            try:
                item = json.loads(line.strip())
                # Manifest paths may be written with Windows separators
                # ("folder\\file.wav"); normalise them so the base name is
                # extracted correctly on every platform.
                audio_name = os.path.basename(item["audio_filepath"].replace("\\", "/"))
                # Extract filename without extension for mapping
                key = os.path.splitext(audio_name)[0]
                text_map[key] = item.get("text", "")
            except Exception as e:
                # A bad line is reported and skipped; the rest still load.
                print(f"⚠️ Error reading line: {e}")
|
| 35 |
+
|
| 36 |
+
# ==============================
|
| 37 |
+
# CONVERSION LOOP + MANIFEST CREATION
|
| 38 |
+
# ==============================
|
| 39 |
+
# One manifest entry per successfully converted file.
converted_entries = []

# Sorted so the generated manifest has a reproducible order
# (os.listdir returns names in arbitrary order).
for filename in sorted(os.listdir(input_folder)):
    if not filename.lower().endswith(valid_ext):
        continue

    input_path = os.path.join(input_folder, filename)
    base_name = os.path.splitext(filename)[0]
    output_name = base_name + "_8k_ulaw.wav"
    output_path = os.path.join(output_folder, output_name)

    # FFmpeg command: convert to mono 8kHz u-law
    cmd = [
        "ffmpeg",
        "-y",  # overwrite
        "-i", input_path,
        "-ar", "8000",  # 8kHz
        "-ac", "1",  # mono
        "-c:a", "pcm_mulaw",
        output_path,
    ]

    try:
        subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error converting {filename}: {e.stderr.decode('utf-8', errors='ignore')}")
        continue

    # Get duration of new file from its header rather than decoding every
    # sample; an unreadable output is reported and skipped instead of
    # aborting the whole run.
    try:
        duration = round(sf.info(output_path).duration, 2)
    except RuntimeError as e:
        print(f"❌ Error converting {filename}: {e}")
        continue

    # Get text (if exists from old manifest)
    text = text_map.get(base_name, "")

    # Add entry to new manifest
    converted_entries.append({
        "audio_filepath": output_path.replace("\\", "/"),
        "duration": duration,
        "text": text,
    })

    print(f"✅ Converted: {filename} → {output_name} ({duration}s)")
|
| 82 |
+
|
| 83 |
+
# ==============================
|
| 84 |
+
# SAVE NEW MANIFEST
|
| 85 |
+
# ==============================
|
| 86 |
+
# Write one JSON object per line; nothing is written when no file converted.
if not converted_entries:
    print("⚠️ No audio files converted or manifest empty!")
else:
    with open(new_manifest, "w", encoding="utf-8") as manifest_file:
        for record in converted_entries:
            serialized = json.dumps(record, ensure_ascii=False)
            manifest_file.write(serialized + "\n")
    print(f"\n💾 Manifest saved to: {new_manifest}")
    total_entries = len(converted_entries)
    print(f"📊 Total entries: {total_entries}")

print(f"\n🎯 Conversion complete! {len(converted_entries)} files saved in '{output_folder}'.")
|
data_tts/gcloud_tts_sample_001.wav
ADDED
|
Binary file (94.6 kB). View file
|
|
|
data_tts/gcloud_tts_sample_002.wav
ADDED
|
Binary file (82.5 kB). View file
|
|
|
data_tts/gcloud_tts_sample_053.wav
ADDED
|
Binary file (98.5 kB). View file
|
|
|
data_tts/gcloud_tts_sample_060.wav
ADDED
|
Binary file (99.7 kB). View file
|
|
|
data_tts/gcloud_tts_sample_1065.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3299bdc8d6906ab1152f326a4b2839966a842deab4762c837aa073b7c4b286dd
|
| 3 |
+
size 183062
|
data_tts/gcloud_tts_sample_1067.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:211ce2911a62fc66fd143e6434b4f053f0ccde0ce5b007f0a99c98a5caeefa8f
|
| 3 |
+
size 133318
|
data_tts/gcloud_tts_sample_107.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c840bba5d03b45f415219171e226055fc94735f43187c816f4d723a9eb162d7a
|
| 3 |
+
size 158232
|
data_tts/gcloud_tts_sample_1078.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1774d6b6b212dac76b782be61c89a466a900e2ab9620b223e78df14c70f30d3d
|
| 3 |
+
size 172362
|
data_tts/gcloud_tts_sample_1080.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fae59de9a6431baadb0b50aa9ff7e2adc12808f498827c3913e4519adcc51849
|
| 3 |
+
size 179680
|
data_tts/gcloud_tts_sample_1082.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9dfddaf1051afae6e4fa8298fe948d6ffcfc4681ef35c883195969c8ba5a22c1
|
| 3 |
+
size 162944
|
data_tts/gcloud_tts_sample_1189.wav
ADDED
|
Binary file (77.5 kB). View file
|
|
|
data_tts/gcloud_tts_sample_1190.wav
ADDED
|
Binary file (69.7 kB). View file
|
|
|
data_tts/gcloud_tts_sample_1191.wav
ADDED
|
Binary file (77 kB). View file
|
|
|
data_tts/gcloud_tts_sample_1192.wav
ADDED
|
Binary file (97.9 kB). View file
|
|
|
data_tts/gcloud_tts_sample_1193.wav
ADDED
|
Binary file (74.6 kB). View file
|
|
|
data_tts/gcloud_tts_sample_1221.wav
ADDED
|
Binary file (99.7 kB). View file
|
|
|
data_tts/gcloud_tts_sample_1222.wav
ADDED
|
Binary file (95.8 kB). View file
|
|
|
data_tts/gcloud_tts_sample_1236.wav
ADDED
|
Binary file (83.7 kB). View file
|
|
|
data_tts/gcloud_tts_sample_1241.wav
ADDED
|
Binary file (99.7 kB). View file
|
|
|
data_tts/gcloud_tts_sample_1277.wav
ADDED
|
Binary file (32.9 kB). View file
|
|
|
data_tts/gcloud_tts_sample_1278.wav
ADDED
|
Binary file (48.7 kB). View file
|
|
|
data_tts/gcloud_tts_sample_1279.wav
ADDED
|
Binary file (66.2 kB). View file
|
|
|
data_tts/gcloud_tts_sample_1280.wav
ADDED
|
Binary file (83.7 kB). View file
|
|
|
data_tts/gcloud_tts_sample_1286.wav
ADDED
|
Binary file (47.6 kB). View file
|
|
|
data_tts/gcloud_tts_sample_1287.wav
ADDED
|
Binary file (75.2 kB). View file
|
|
|
data_tts/gcloud_tts_sample_1295.wav
ADDED
|
Binary file (43 kB). View file
|
|
|
data_tts/gcloud_tts_sample_1296.wav
ADDED
|
Binary file (75.1 kB). View file
|
|
|
data_tts/gcloud_tts_sample_1297.wav
ADDED
|
Binary file (95.8 kB). View file
|
|
|
data_tts/gcloud_tts_sample_1304.wav
ADDED
|
Binary file (41.8 kB). View file
|
|
|
data_tts/gcloud_tts_sample_1305.wav
ADDED
|
Binary file (64.5 kB). View file
|
|
|
data_tts/gcloud_tts_sample_1306.wav
ADDED
|
Binary file (88.5 kB). View file
|
|
|
data_tts/gcloud_tts_sample_1313.wav
ADDED
|
Binary file (50 kB). View file
|
|
|
data_tts/gcloud_tts_sample_1314.wav
ADDED
|
Binary file (81.7 kB). View file
|
|
|
data_tts/gcloud_tts_sample_1322.wav
ADDED
|
Binary file (51.5 kB). View file
|
|
|
eval_manifest.jsonl
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_000.wav", "duration": 1.6, "text": "علاء سيد عبد الله"}
|
| 2 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_001.wav", "duration": 2.0, "text": "محمد أحمد عبد الرحمن"}
|
| 3 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_002.wav", "duration": 2.7, "text": "كريم محمود عبد الغفار"}
|
| 4 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_003.wav", "duration": 1.7, "text": "يوسف علي عبد الحليم"}
|
| 5 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_004.wav", "duration": 1.6, "text": "مصطفى طارق حسن"}
|
| 6 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_005.wav", "duration": 2.55, "text": "إبراهيم محمد عبد العزيز"}
|
| 7 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_006.wav", "duration": 2.85, "text": "خالد عمر عبد السميع"}
|
| 8 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_007.wav", "duration": 1.6, "text": "أحمد سامي حسين"}
|
| 9 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_008.wav", "duration": 2.0, "text": "محمود ناصر عبد اللطيف"}
|
| 10 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_009.wav", "duration": 1.9, "text": "عمر عبد الله محمد"}
|
| 11 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_010.wav", "duration": 1.65, "text": "مينا فادي نصيف"}
|
| 12 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_011.wav", "duration": 1.65, "text": "بيتر عادل صليب"}
|
| 13 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_012.wav", "duration": 1.65, "text": "جرجس سامح حكيم"}
|
| 14 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_013.wav", "duration": 1.75, "text": "رامي فوزي بشارة"}
|
| 15 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_014.wav", "duration": 1.7, "text": "فادي منير عوض"}
|
| 16 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_015.wav", "duration": 1.5, "text": "مريم يوسف فؤاد"}
|
| 17 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_016.wav", "duration": 2.0, "text": "نانسي شريف عياد"}
|
| 18 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_017.wav", "duration": 1.35, "text": "كيرلس ممدوح سمعان"}
|
| 19 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_018.wav", "duration": 1.65, "text": "هالة فؤاد حبيب"}
|
| 20 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_019.wav", "duration": 1.7, "text": "مارجريت جرجس فخري"}
|
| 21 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_020.wav", "duration": 1.8, "text": "ريم أحمد عبد الباري"}
|
| 22 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_021.wav", "duration": 1.9, "text": "شروق محمد عبد الرحيم"}
|
| 23 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_022.wav", "duration": 1.65, "text": "إيمان حسن مصطفى"}
|
| 24 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_023.wav", "duration": 2.5, "text": "فاطمة الزهراء عبد الله"}
|
| 25 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_024.wav", "duration": 2.7, "text": "سارة خالد عبد الباقي"}
|
| 26 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_025.wav", "duration": 1.8, "text": "ندى إبراهيم حسن"}
|
| 27 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_026.wav", "duration": 1.45, "text": "دينا محمود فوزي"}
|
| 28 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_027.wav", "duration": 2.15, "text": "لبنى عبد الرحمن السيد"}
|
| 29 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_028.wav", "duration": 1.7, "text": "آية طارق عبد الجليل"}
|
| 30 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_029.wav", "duration": 1.85, "text": "أسماء علي إبراهيم"}
|
| 31 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_030.wav", "duration": 1.9, "text": "أحمد عصام عبد الرحمن"}
|
| 32 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_031.wav", "duration": 1.75, "text": "نور هشام عبد الله"}
|
| 33 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_032.wav", "duration": 1.4, "text": "نجلاء سامي فؤاد"}
|
| 34 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_033.wav", "duration": 1.7, "text": "رنا علاء الدين أحمد"}
|
| 35 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_034.wav", "duration": 2.55, "text": "عادل فخري سمعان"}
|
| 36 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_035.wav", "duration": 1.4, "text": "بولا هاني رزق"}
|
| 37 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_036.wav", "duration": 1.45, "text": "مينا يوسف بشاي"}
|
| 38 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_037.wav", "duration": 1.75, "text": "أبانوب فادي كامل"}
|
| 39 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_038.wav", "duration": 2.25, "text": "مارينا جرجس جاد"}
|
| 40 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_039.wav", "duration": 2.1, "text": "كريستين فؤاد صموئيل"}
|
| 41 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_040.wav", "duration": 2.1, "text": "سليم أحمد عبد المقصود"}
|
| 42 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_041.wav", "duration": 1.95, "text": "عمار محمد عبد الرحيم"}
|
| 43 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_042.wav", "duration": 2.6, "text": "أنس عبد الله محمود"}
|
| 44 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_043.wav", "duration": 1.7, "text": "زياد عمرو ناصر"}
|
| 45 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_044.wav", "duration": 2.0, "text": "أمير يوسف عبد الغفار"}
|
| 46 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_045.wav", "duration": 2.4, "text": "خالد مصطفى عبد الحميد"}
|
| 47 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_046.wav", "duration": 1.75, "text": "جرجس عادل لبيب"}
|
| 48 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_047.wav", "duration": 1.4, "text": "بولا فخري بطرس"}
|
| 49 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_048.wav", "duration": 1.75, "text": "مارينا فادي صادق"}
|
| 50 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_049.wav", "duration": 2.3, "text": "جوليان جورج عزيز"}
|
| 51 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_050.wav", "duration": 1.45, "text": "نادر سامي رزق"}
|
| 52 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_051.wav", "duration": 1.75, "text": "عبد الرحمن أحمد عبد الله"}
|
| 53 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_052.wav", "duration": 1.5, "text": "محمد طه السيد"}
|
| 54 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_053.wav", "duration": 1.75, "text": "أحمد ياسر مصطفى"}
|
| 55 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_054.wav", "duration": 2.1, "text": "سيد عبد الفتاح عبد الغني"}
|
| 56 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_055.wav", "duration": 3.05, "text": "محمد رمضان عبد الحكيم"}
|
| 57 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_056.wav", "duration": 2.1, "text": "عبد الله حمدي عبد الفتاح"}
|
| 58 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_057.wav", "duration": 2.1, "text": "أيمن جمال عبد الناصر"}
|
| 59 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_058.wav", "duration": 2.45, "text": "أحمد عبد الرازق حسن"}
|
| 60 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_059.wav", "duration": 1.65, "text": "محمود خالد محمد"}
|
| 61 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_060.wav", "duration": 1.6, "text": "مروان عماد عبد الله"}
|
| 62 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_061.wav", "duration": 2.7, "text": "عبد الرحمن محمد شريف"}
|
| 63 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_062.wav", "duration": 1.95, "text": "أحمد محروس عبد اللطيف"}
|
| 64 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_063.wav", "duration": 2.4, "text": "مصطفى عبد القادر عبد السميع"}
|
| 65 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_064.wav", "duration": 1.9, "text": "عبد العزيز حسن عبد الله"}
|
| 66 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_065.wav", "duration": 2.25, "text": "مينا شنودة فخري"}
|
| 67 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_066.wav", "duration": 1.5, "text": "بولا يوسف بطرس"}
|
| 68 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_067.wav", "duration": 1.45, "text": "فادي عادل رسمي"}
|
| 69 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_068.wav", "duration": 2.2, "text": "جرجس فوزي منصور"}
|
| 70 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_069.wav", "duration": 1.95, "text": "كيرلس رأفت نجيب"}
|
| 71 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_070.wav", "duration": 1.5, "text": "مارينا جورج عادل"}
|
| 72 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_071.wav", "duration": 1.85, "text": "ديفيد ماهر منير"}
|
| 73 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_072.wav", "duration": 1.75, "text": "كارولين فادي شكر"}
|
| 74 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_073.wav", "duration": 1.9, "text": "مريم سامي فؤاد"}
|
| 75 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_074.wav", "duration": 1.6, "text": "أندرو فؤاد رزق"}
|
| 76 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_075.wav", "duration": 1.9, "text": "ريهام عبد الله محمد"}
|
| 77 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_076.wav", "duration": 2.7, "text": "سارة عماد حسن"}
|
| 78 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_077.wav", "duration": 1.9, "text": "ميادة عبد الحميد ناصر"}
|
| 79 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_078.wav", "duration": 1.7, "text": "آية أحمد عبد الله"}
|
| 80 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_079.wav", "duration": 1.95, "text": "نورهان عبد الفتاح علي"}
|
| 81 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_080.wav", "duration": 1.35, "text": "هدير خالد حسن"}
|
| 82 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_081.wav", "duration": 1.9, "text": "بسمة إبراهيم عبد الغني"}
|
| 83 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_082.wav", "duration": 2.3, "text": "أسماء طارق عبد الرحمن"}
|
| 84 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_083.wav", "duration": 1.75, "text": "يمنى محمد عبد الحليم"}
|
| 85 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_084.wav", "duration": 2.25, "text": "صفاء عبد الرحمن السيد"}
|
| 86 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_085.wav", "duration": 1.9, "text": "منال أحمد حسن"}
|
| 87 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_086.wav", "duration": 2.65, "text": "رحمة عبد الله محمود"}
|
| 88 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_087.wav", "duration": 2.35, "text": "ياسمين خالد عبد الرحمن"}
|
| 89 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_088.wav", "duration": 2.2, "text": "شيماء أحمد عبد الغفار"}
|
| 90 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_089.wav", "duration": 1.95, "text": "علا سامي عبد المقصود"}
|
| 91 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_090.wav", "duration": 1.95, "text": "رغدة علي عبد الباري"}
|
| 92 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_091.wav", "duration": 1.95, "text": "هايدي جرجس بطرس"}
|
| 93 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_092.wav", "duration": 1.5, "text": "نيرمين مينا فؤاد"}
|
| 94 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_093.wav", "duration": 1.7, "text": "جيسيكا بولا منصور"}
|
| 95 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_094.wav", "duration": 1.65, "text": "ماريان يوسف رسمي"}
|
| 96 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_095.wav", "duration": 1.5, "text": "كارين فادي شنودة"}
|
| 97 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_096.wav", "duration": 1.75, "text": "أميرة أحمد عبد الله"}
|
| 98 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_097.wav", "duration": 2.3, "text": "نورا إبراهيم حسن"}
|
| 99 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_098.wav", "duration": 1.6, "text": "هبة طارق عبد الرحمن"}
|
| 100 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_099.wav", "duration": 1.65, "text": "دعاء عبد الله السيد"}
|
| 101 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_100.wav", "duration": 1.9, "text": "عبير خالد عبد العزيز"}
|
| 102 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_101.wav", "duration": 1.7, "text": "خلود ناصر عبد الغفار"}
|
| 103 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_102.wav", "duration": 2.25, "text": "جيهان عبد الرحمن محمود"}
|
| 104 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_103.wav", "duration": 7.6, "text": "اثنين ثلاثة زيرو واحد واحد اثنين زيرو سبعة واحد زيرو زيرو واحد واحد اثنين"}
|
| 105 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_104.wav", "duration": 7.35, "text": "اثنين خمسة زيرو تسعة زيرو خمسة اثنين واحد واحد تسعة زيرو زيرو ثلاثة"}
|
| 106 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_105.wav", "duration": 7.15, "text": "ثلاثة زيرو واحد واحد اثنين ثلاثة زيرو سبعة واحد سبعة تسعة زيرو خمسة"}
|
| 107 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_106.wav", "duration": 6.75, "text": "اثنين تسعة زيرو سبعة واحد واحد زيرو اثنين واحد زيرو خمسة زيرو اثنين"}
|
| 108 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_107.wav", "duration": 8.2, "text": "ثلاثة واحد زيرو اثنين واحد اثنين زيرو تسعة واحد زيرو سبعة زيرو واحد"}
|
| 109 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_108.wav", "duration": 6.9, "text": "ثلاثة اثنين زيرو ثلاثة واحد واحد زيرو سبعة واحد تسعة زيرو زيرو أربعة"}
|
| 110 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_109.wav", "duration": 6.7, "text": "اثنين ثمانية زيرو تسعة واحد واحد زيرو خمسة واحد زيرو زيرو زيرو ستة"}
|
| 111 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_110.wav", "duration": 8.25, "text": "ثلاثة زيرو واحد واحد زيرو تسعة زيرو تسعة واحد ثمانية زيرو زيرو سبعة"}
|
| 112 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_111.wav", "duration": 5.25, "text": "اثنين خمسة زيرو اثنين زيرو اثنين زيرو سبعة واحد زيرو زيرو زيرو خمسة"}
|
| 113 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_112.wav", "duration": 7.95, "text": "ثلاثة واحد زيرو سبعة واحد زيرو زيرو خمسة واحد زيرو تسعة زيرو واحد"}
|
| 114 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_113.wav", "duration": 5.45, "text": "اثنين ستة زيرو واحد زيرو ثلاثة زيرو سبعة واحد زيرو زيرو زيرو تسعة"}
|
| 115 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_114.wav", "duration": 7.3, "text": "اثنين تسعة زيرو ثمانية واحد اثنين زيرو أربعة واحد زيرو زيرو زيرو اثنين"}
|
| 116 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_115.wav", "duration": 6.35, "text": "ثلاثة اثنين زيرو سبعة واحد واحد زيرو تسعة واحد زيرو زيرو زيرو خمسة"}
|
| 117 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_116.wav", "duration": 7.15, "text": "ثلاثة واحد زيرو واحد واحد اثنين زيرو خمسة واحد زيرو زيرو زيرو أربعة"}
|
| 118 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_117.wav", "duration": 7.3, "text": "اثنين ثمانية زيرو تسعة واحد زيرو زيرو سبعة واحد زيرو زيرو زيرو اثنين"}
|
| 119 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_118.wav", "duration": 7.65, "text": "ثلاثة واحد زيرو خمسة واحد اثنين زيرو تسعة واحد زيرو تسعة زيرو ثلاثة"}
|
| 120 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_119.wav", "duration": 7.45, "text": "اثنين تسعة زيرو اثنين واحد واحد زيرو ثمانية واحد زيرو زيرو زيرو خمسة"}
|
| 121 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_120.wav", "duration": 8.25, "text": "ثلاثة اثنين زيرو واحد واحد اثنين زيرو أربعة واحد زيرو زيرو زيرو تسعة"}
|
| 122 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_121.wav", "duration": 6.25, "text": "اثنين واحد زيرو ثلاثة واحد زيرو زيرو سبعة واحد زيرو زيرو زيرو أربعة"}
|
| 123 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_122.wav", "duration": 6.65, "text": "ثلاثة اثنين زيرو ثمانية واحد واحد زيرو خمسة واحد زيرو تسعة زيرو أربعة"}
|
| 124 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_123.wav", "duration": 6.7, "text": "زيرو واحد زيرو واحد اثنين ثلاثة أربعة خمسة ستة سبعة ثمانية"}
|
| 125 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_124.wav", "duration": 5.65, "text": "زيرو واحد واحد تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين"}
|
| 126 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_125.wav", "duration": 5.15, "text": "زيرو واحد اثنين زيرو تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة"}
|
| 127 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_126.wav", "duration": 5.15, "text": "زيرو واحد خمسة سبعة ثمانية تسعة أربعة ثلاثة اثنين واحد زيرو"}
|
| 128 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_127.wav", "duration": 5.6, "text": "زيرو واحد زيرو تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين واحد"}
|
| 129 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_128.wav", "duration": 7.25, "text": "زيرو واحد واحد زيرو أربعة خمسة ستة سبعة ثمانية تسعة زيرو اثنين"}
|
| 130 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_129.wav", "duration": 8.25, "text": "زيرو واحد اثنين واحد تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين"}
|
| 131 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_130.wav", "duration": 8.9, "text": "زيرو واحد خمسة زيرو تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين"}
|
| 132 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_131.wav", "duration": 7.3, "text": "زيرو واحد واحد خمسة ستة سبعة ثمانية تسعة أربعة ثلاثة اثنين زيرو"}
|
| 133 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_132.wav", "duration": 7.15, "text": "زيرو واحد اثنين أربعة خمسة ستة سبعة ثمانية تسعة زيرو واحد اثنين"}
|
| 134 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_133.wav", "duration": 2.1, "text": "في حادث عربية عند كوبري عباس"}
|
| 135 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_134.wav", "duration": 2.05, "text": "فيه حريق في عمارة في شارع فيصل"}
|
| 136 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_135.wav", "duration": 1.5, "text": "لقيت طفل تاه في المول"}
|
| 137 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_136.wav", "duration": 2.0, "text": "في خناقة كبيرة في ميدان الجيزة"}
|
| 138 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_137.wav", "duration": 2.75, "text": "عربية مقلوبة على الطريق الدائري"}
|
| 139 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_138.wav", "duration": 1.6, "text": "في صوت ضرب نار في الهرم"}
|
| 140 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_139.wav", "duration": 1.7, "text": "جارتي وقعت من البلكونة"}
|
| 141 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_140.wav", "duration": 2.95, "text": "حصلت سرقة في الشارع عند السوبرماركت"}
|
| 142 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_141.wav", "duration": 2.25, "text": "في واحد بيعتدي على بنت في الشارع"}
|
| 143 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_142.wav", "duration": 2.4, "text": "حصل انفجار صغير في محل الغاز"}
|
| 144 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_143.wav", "duration": 2.2, "text": "شفت عربية بتخبط موتوسيكل وهربت"}
|
| 145 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_144.wav", "duration": 2.35, "text": "طفل محبوس في الأسانسير"}
|
| 146 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_145.wav", "duration": 2.4, "text": "في شاب مصاب قدام محطة المترو"}
|
| 147 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_146.wav", "duration": 1.95, "text": "العربية عطلت في نص الطريق"}
|
| 148 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_147.wav", "duration": 1.9, "text": "في تسريب غاز في العمارة"}
|
| 149 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_148.wav", "duration": 2.9, "text": "واحد كبير في السن مغمى عليه في المسجد"}
|
| 150 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_149.wav", "duration": 3.0, "text": "حصلت مشاجرة بالسكاكين في السوق"}
|
| 151 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_150.wav", "duration": 2.55, "text": "عربية إسعاف اتأخرت على المكان"}
|
| 152 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_151.wav", "duration": 2.95, "text": "فيه كلب شرس بيهاجم الناس في الشارع"}
|
| 153 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_152.wav", "duration": 3.7, "text": "في بنت اتخطفِت من عند المدرسة"}
|
| 154 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_153.wav", "duration": 2.85, "text": "في حادث تصادم في محور 26 يوليو"}
|
| 155 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_154.wav", "duration": 4.15, "text": "واحد وقع من فوق سلم البيت"}
|
| 156 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_155.wav", "duration": 1.65, "text": "النور قاطع في الشارع كله"}
|
| 157 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_156.wav", "duration": 1.85, "text": "صوت انفجار جامد في المنطقة"}
|
| 158 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_157.wav", "duration": 3.05, "text": "العربية دخلت في محل في الهرم"}
|
| 159 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_158.wav", "duration": 1.75, "text": "طفلة ضايعة في المول"}
|
| 160 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_159.wav", "duration": 2.9, "text": "في تسريب مياه من الدور الرابع"}
|
| 161 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_160.wav", "duration": 2.15, "text": "خناقة بين الجيران فوق السطح"}
|
| 162 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_161.wav", "duration": 3.35, "text": "فيه عربية مركونة غلط قافلة الشارع"}
|
| 163 |
+
{"audio_filepath": "data_tts_evaluation\\openai_tts_sample_162.wav", "duration": 3.45, "text": "الغاز بيخرج من البوتاجاز ومفيش حد في الشقة"}
|
evaluation_results.csv
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio,expected,predicted,WER,CER
|
| 2 |
+
openai_tts_sample_000.wav,علاء سيد عبد الله,علاء سيد عبد الله,0.0,0.0
|
| 3 |
+
openai_tts_sample_001.wav,محمد أحمد عبد الرحمن,محمد أحمد عبد الرحمن,0.0,0.0
|
| 4 |
+
openai_tts_sample_002.wav,كريم محمود عبد الغفار,كريم محمود عبد الغفار,0.0,0.0
|
| 5 |
+
openai_tts_sample_003.wav,يوسف علي عبد الحليم,يوسف علي عبد الحليم,0.0,0.0
|
| 6 |
+
openai_tts_sample_004.wav,مصطفى طارق حسن,مصطفى طارق حسن,0.0,0.0
|
| 7 |
+
openai_tts_sample_005.wav,إبراهيم محمد عبد العزيز,إبراهيم محمد عبد العزيز,0.0,0.0
|
| 8 |
+
openai_tts_sample_006.wav,خالد عمر عبد السميع,خالد عمر عبد السميع,0.0,0.0
|
| 9 |
+
openai_tts_sample_007.wav,أحمد سامي حسين,أحمد سامي حسين,0.0,0.0
|
| 10 |
+
openai_tts_sample_008.wav,محمود ناصر عبد اللطيف,محمود ناصر عبد اللطيف,0.0,0.0
|
| 11 |
+
openai_tts_sample_009.wav,عمر عبد الله محمد,عمر عبد الله محمد,0.0,0.0
|
| 12 |
+
openai_tts_sample_010.wav,مينا فادي نصيف,مينا فادي نصيف,0.0,0.0
|
| 13 |
+
openai_tts_sample_011.wav,بيتر عادل صليب,بيتر عادل صليب,0.0,0.0
|
| 14 |
+
openai_tts_sample_012.wav,جرجس سامح حكيم,جرجس سامح حكيم,0.0,0.0
|
| 15 |
+
openai_tts_sample_013.wav,رامي فوزي بشارة,رامي فوزي بشارة,0.0,0.0
|
| 16 |
+
openai_tts_sample_014.wav,فادي منير عوض,فادي منير عوض,0.0,0.0
|
| 17 |
+
openai_tts_sample_015.wav,مريم يوسف فؤاد,مريم يوسف فؤاد,0.0,0.0
|
| 18 |
+
openai_tts_sample_016.wav,نانسي شريف عياد,نانسي شريف عياد,0.0,0.0
|
| 19 |
+
openai_tts_sample_017.wav,كيرلس ممدوح سمعان,كيرلس ممدوح سمعان,0.0,0.0
|
| 20 |
+
openai_tts_sample_018.wav,هالة فؤاد حبيب,هانا فؤاد حبيب,0.3333333333333333,0.16666666666666666
|
| 21 |
+
openai_tts_sample_019.wav,مارجريت جرجس فخري,ماربريت جرجس فخري,0.3333333333333333,0.06666666666666667
|
| 22 |
+
openai_tts_sample_020.wav,ريم أحمد عبد الباري,ريم أحمد عبد الباري,0.0,0.0
|
| 23 |
+
openai_tts_sample_021.wav,شروق محمد عبد الرحيم,شروق محمد عبد الرحيم,0.0,0.0
|
| 24 |
+
openai_tts_sample_022.wav,إيمان حسن مصطفى,إيمان حسن مصطفى,0.0,0.0
|
| 25 |
+
openai_tts_sample_023.wav,فاطمة الزهراء عبد الله,فاطمة زحراء عبد الله,0.25,0.15789473684210525
|
| 26 |
+
openai_tts_sample_024.wav,سارة خالد عبد الباقي,سارة خالد عبد الباقي,0.0,0.0
|
| 27 |
+
openai_tts_sample_025.wav,ندى إبراهيم حسن,ندى إبراهيم حسن,0.0,0.0
|
| 28 |
+
openai_tts_sample_026.wav,دينا محمود فوزي,دينا محمود فوزي,0.0,0.0
|
| 29 |
+
openai_tts_sample_027.wav,لبنى عبد الرحمن السيد,لبنى عبد الرحمن السيد,0.0,0.0
|
| 30 |
+
openai_tts_sample_028.wav,آية طارق عبد الجليل,آية طارق عبد الجليل,0.0,0.0
|
| 31 |
+
openai_tts_sample_029.wav,أسماء علي إبراهيم,أسماء علي إبراهيم,0.0,0.0
|
| 32 |
+
openai_tts_sample_030.wav,أحمد عصام عبد الرحمن,أحمد عصام عبد الرحمن,0.0,0.0
|
| 33 |
+
openai_tts_sample_031.wav,نور هشام عبد الله,نور هشام عبد الله,0.0,0.0
|
| 34 |
+
openai_tts_sample_032.wav,نجلاء سامي فؤاد,نجلاء سامي فؤاد,0.0,0.0
|
| 35 |
+
openai_tts_sample_033.wav,رنا علاء الدين أحمد,رنا علاء الدين أحمد,0.0,0.0
|
| 36 |
+
openai_tts_sample_034.wav,عادل فخري سمعان,عادل فخري سمعان,0.0,0.0
|
| 37 |
+
openai_tts_sample_035.wav,بولا هاني رزق,بولا هاني رزق,0.0,0.0
|
| 38 |
+
openai_tts_sample_036.wav,مينا يوسف بشاي,مينا يوسف بيشاي,0.3333333333333333,0.08333333333333333
|
| 39 |
+
openai_tts_sample_037.wav,أبانوب فادي كامل,أبانوب فادي كامل,0.0,0.0
|
| 40 |
+
openai_tts_sample_038.wav,مارينا جرجس جاد,مارينا جرجس كاد,0.3333333333333333,0.07692307692307693
|
| 41 |
+
openai_tts_sample_039.wav,كريستين فؤاد صموئيل,كريستين فؤاد صموئيل,0.0,0.0
|
| 42 |
+
openai_tts_sample_040.wav,سليم أحمد عبد المقصود,سليم أحمد عبد المقصود,0.0,0.0
|
| 43 |
+
openai_tts_sample_041.wav,عمار محمد عبد الرحيم,أنار محمد عبد الرحيم,0.25,0.11764705882352941
|
| 44 |
+
openai_tts_sample_042.wav,أنس عبد الله محمود,أنس عبد الله محمود,0.0,0.0
|
| 45 |
+
openai_tts_sample_043.wav,زياد عمرو ناصر,زياد عمرو ناصر,0.0,0.0
|
| 46 |
+
openai_tts_sample_044.wav,أمير يوسف عبد الغفار,أمير يوسف عبد الغفار,0.0,0.0
|
| 47 |
+
openai_tts_sample_045.wav,خالد مصطفى عبد الحميد,خالد مصطفى عبد الحميد,0.0,0.0
|
| 48 |
+
openai_tts_sample_046.wav,جرجس عادل لبيب,جرجس عادل لبيب,0.0,0.0
|
| 49 |
+
openai_tts_sample_047.wav,بولا فخري بطرس,ولا فخري بطرس,0.3333333333333333,0.08333333333333333
|
| 50 |
+
openai_tts_sample_048.wav,مارينا فادي صادق,مارينا فادي صادق,0.0,0.0
|
| 51 |
+
openai_tts_sample_049.wav,جوليان جورج عزيز,جوليان جورج عزيز,0.0,0.0
|
| 52 |
+
openai_tts_sample_050.wav,نادر سامي رزق,نادر سامي رزق,0.0,0.0
|
| 53 |
+
openai_tts_sample_051.wav,عبد الرحمن أحمد عبد الله,عبد الرحمن أحمد عبد الله,0.0,0.0
|
| 54 |
+
openai_tts_sample_052.wav,محمد طه السيد,محمد طه السيد,0.0,0.0
|
| 55 |
+
openai_tts_sample_053.wav,أحمد ياسر مصطفى,أحمد ياسر مصطفى,0.0,0.0
|
| 56 |
+
openai_tts_sample_054.wav,سيد عبد الفتاح عبد الغني,سيد عبد الفتاح عبد الغني,0.0,0.0
|
| 57 |
+
openai_tts_sample_055.wav,محمد رمضان عبد الحكيم,محمد رمضان عبد الحكيم,0.0,0.0
|
| 58 |
+
openai_tts_sample_056.wav,عبد الله حمدي عبد الفتاح,عبد الله حمدي عبد الفتاح,0.0,0.0
|
| 59 |
+
openai_tts_sample_057.wav,أيمن جمال عبد الناصر,أيمن جمال عبد الناصر,0.0,0.0
|
| 60 |
+
openai_tts_sample_058.wav,أحمد عبد الرازق حسن,أحمد عبد الرازق حسن,0.0,0.0
|
| 61 |
+
openai_tts_sample_059.wav,محمود خالد محمد,محمود خالد محمد,0.0,0.0
|
| 62 |
+
openai_tts_sample_060.wav,مروان عماد عبد الله,مروان عماد عبد الله,0.0,0.0
|
| 63 |
+
openai_tts_sample_061.wav,عبد الرحمن محمد شريف,عبد الرحمن محمد شريف,0.0,0.0
|
| 64 |
+
openai_tts_sample_062.wav,أحمد محروس عبد اللطيف,أحمد محروس عبد اللطيف,0.0,0.0
|
| 65 |
+
openai_tts_sample_063.wav,مصطفى عبد القادر عبد السميع,مصطفى عبد القادر عبد السميع,0.0,0.0
|
| 66 |
+
openai_tts_sample_064.wav,عبد العزيز حسن عبد الله,عبد العزيز حسن عبد الله,0.0,0.0
|
| 67 |
+
openai_tts_sample_065.wav,مينا شنودة فخري,مينا شنودة فخري,0.0,0.0
|
| 68 |
+
openai_tts_sample_066.wav,بولا يوسف بطرس,بولا يوسف بطرس,0.0,0.0
|
| 69 |
+
openai_tts_sample_067.wav,فادي عادل رسمي,فادي عادل لسمي,0.3333333333333333,0.08333333333333333
|
| 70 |
+
openai_tts_sample_068.wav,جرجس فوزي منصور,جرجس فوزي منصور,0.0,0.0
|
| 71 |
+
openai_tts_sample_069.wav,كيرلس رأفت نجيب,كيرلس رأفت نجيب,0.0,0.0
|
| 72 |
+
openai_tts_sample_070.wav,مارينا جورج عادل,مارينا جورج عادل,0.0,0.0
|
| 73 |
+
openai_tts_sample_071.wav,ديفيد ماهر منير,ديفيد ماهر منير,0.0,0.0
|
| 74 |
+
openai_tts_sample_072.wav,كارولين فادي شكر,كارولين فادي شكر,0.0,0.0
|
| 75 |
+
openai_tts_sample_073.wav,مريم سامي فؤاد,مريم سامي فؤاد,0.0,0.0
|
| 76 |
+
openai_tts_sample_074.wav,أندرو فؤاد رزق,أندرو فؤاد رزق,0.0,0.0
|
| 77 |
+
openai_tts_sample_075.wav,ريهام عبد الله محمد,ريهام عبد الله محمد,0.0,0.0
|
| 78 |
+
openai_tts_sample_076.wav,سارة عماد حسن,سارة عماد حسن,0.0,0.0
|
| 79 |
+
openai_tts_sample_077.wav,ميادة عبد الحميد ناصر,مادة عبد الحميد ناصر,0.25,0.05555555555555555
|
| 80 |
+
openai_tts_sample_078.wav,آية أحمد عبد الله,آية أحمد عبد الله,0.0,0.0
|
| 81 |
+
openai_tts_sample_079.wav,نورهان عبد الفتاح علي,نرهان عبد الفتاح علي,0.25,0.05555555555555555
|
| 82 |
+
openai_tts_sample_080.wav,هدير خالد حسن,هدير خالد حسن,0.0,0.0
|
| 83 |
+
openai_tts_sample_081.wav,بسمة إبراهيم عبد الغني,بسمة إبراهيم عبد الغني,0.0,0.0
|
| 84 |
+
openai_tts_sample_082.wav,أسماء طارق عبد الرحمن,أسماء طارق عبد الرحمن,0.0,0.0
|
| 85 |
+
openai_tts_sample_083.wav,يمنى محمد عبد الحليم,يمنى محمد عبد الحليم,0.0,0.0
|
| 86 |
+
openai_tts_sample_084.wav,صفاء عبد الرحمن السيد,صفاء عبد الرحمن السيد,0.0,0.0
|
| 87 |
+
openai_tts_sample_085.wav,منال أحمد حسن,منال أحمد حسن,0.0,0.0
|
| 88 |
+
openai_tts_sample_086.wav,رحمة عبد الله محمود,رحمة عبد الله محمود,0.0,0.0
|
| 89 |
+
openai_tts_sample_087.wav,ياسمين خالد عبد الرحمن,ياسمين خالد عبد الرحمن,0.0,0.0
|
| 90 |
+
openai_tts_sample_088.wav,شيماء أحمد عبد الغفار,شيماء أحمد عبد الغفار,0.0,0.0
|
| 91 |
+
openai_tts_sample_089.wav,علا سامي عبد المقصود,علا سامي عبد المقصود,0.0,0.0
|
| 92 |
+
openai_tts_sample_090.wav,رغدة علي عبد الباري,رغدة علي عبد الباري,0.0,0.0
|
| 93 |
+
openai_tts_sample_091.wav,هايدي جرجس بطرس,هايدي جرجس بطرس,0.0,0.0
|
| 94 |
+
openai_tts_sample_092.wav,نيرمين مينا فؤاد,نرمين مينا فؤاد,0.3333333333333333,0.07142857142857142
|
| 95 |
+
openai_tts_sample_093.wav,جيسيكا بولا منصور,كيسيك بولا منصور,0.3333333333333333,0.13333333333333333
|
| 96 |
+
openai_tts_sample_094.wav,ماريان يوسف رسمي,ماريان يوسف رسمي,0.0,0.0
|
| 97 |
+
openai_tts_sample_095.wav,كارين فادي شنودة,كريم فادي شنودة,0.3333333333333333,0.14285714285714285
|
| 98 |
+
openai_tts_sample_096.wav,أميرة أحمد عبد الله,أميرة أحمد عبد الله,0.0,0.0
|
| 99 |
+
openai_tts_sample_097.wav,نورا إبراهيم حسن,نورا إبراهيم حسن,0.0,0.0
|
| 100 |
+
openai_tts_sample_098.wav,هبة طارق عبد الرحمن,هبة طارق عبد الرحمن,0.0,0.0
|
| 101 |
+
openai_tts_sample_099.wav,دعاء عبد الله السيد,دعاء عبد الله السيد,0.0,0.0
|
| 102 |
+
openai_tts_sample_100.wav,عبير خالد عبد العزيز,أمير خالد عبد العزيز,0.25,0.11764705882352941
|
| 103 |
+
openai_tts_sample_101.wav,خلود ناصر عبد الغفار,ولود ناصر عبد الغفار,0.25,0.058823529411764705
|
| 104 |
+
openai_tts_sample_102.wav,جيهان عبد الرحمن محمود,جيهان عبد الرحمن محمود,0.0,0.0
|
| 105 |
+
openai_tts_sample_103.wav,اثنين ثلاثة زيرو واحد واحد اثنين زيرو سبعة واحد زيرو زيرو واحد واحد اثنين,اثنين ثلاثة زيرو واحد واحد اثنين زيرو سبعة واحد زيرو زيرو واحد واحد اثنين,0.0,0.0
|
| 106 |
+
openai_tts_sample_104.wav,اثنين خمسة زيرو تسعة زيرو خمسة اثنين واحد واحد تسعة زيرو زيرو ثلاثة,اتنين خمسة زيرو تسعة زيرو خمسة اتنين واحد واحد تسعة زيرو زيرو تلاتة,0.23076923076923078,0.07272727272727272
|
| 107 |
+
openai_tts_sample_105.wav,ثلاثة زيرو واحد واحد اثنين ثلاثة زيرو سبعة واحد سبعة تسعة زيرو خمسة,ثلاثة زيرو واحد واحد اثنين ثلاثة زيرو سبعة واحد سبعة تسعة زيرو خمسة,0.0,0.0
|
| 108 |
+
openai_tts_sample_106.wav,اثنين تسعة زيرو سبعة واحد واحد زيرو اثنين واحد زيرو خمسة زيرو اثنين,اتنين تسعة زيرو سبعة واحد واحد زيرو اثنين واحد واحد زيرو خمسة زيرو اثنين,0.14285714285714285,0.09090909090909091
|
| 109 |
+
openai_tts_sample_107.wav,ثلاثة واحد زيرو اثنين واحد اثنين زيرو تسعة واحد زيرو سبعة زيرو واحد,ثلاثة واحد زيرو اثنين واحد اثنين زيرو تسعة واحد زيرو سبعة زيرو واحد,0.0,0.0
|
| 110 |
+
openai_tts_sample_108.wav,ثلاثة اثنين زيرو ثلاثة واحد واحد زيرو سبعة واحد تسعة زيرو زيرو أربعة,ثلاثة اتنين زيرو ثلاثة واحد زيرو سبعة واحد تسعة زيرو زيرو أربعة,0.16666666666666666,0.08928571428571429
|
| 111 |
+
openai_tts_sample_109.wav,اثنين ثمانية زيرو تسعة واحد واحد زيرو خمسة واحد زيرو زيرو زيرو ستة,اتنين تمانية زيرو تسعة واحد زيرو خمسة واحد زيرو زيرو زيرو ستة,0.25,0.1111111111111111
|
| 112 |
+
openai_tts_sample_110.wav,ثلاثة زيرو واحد واحد زيرو تسعة زيرو تسعة واحد ثمانية زيرو زيرو سبعة,ثلاثة زيرو واحد واحد زيرو تسعة زيرو تسعة واحد ثمانية زيرو سبعة,0.08333333333333333,0.07272727272727272
|
| 113 |
+
openai_tts_sample_111.wav,اثنين خمسة زيرو اثنين زيرو اثنين زيرو سبعة واحد زيرو زيرو زيرو خمسة,اتنين خمسة زيرو اتنين زيرو اتنين زيرو سبعة واحد زيرو زيرو زيرو خمسة,0.23076923076923078,0.05454545454545454
|
| 114 |
+
openai_tts_sample_112.wav,ثلاثة واحد زيرو سبعة واحد زيرو زيرو خمسة واحد زيرو تسعة زيرو واحد,ثلاثة واحد زيرو سبعة واحد زيرو زيرو خمسة واحد زيرو تسعة زيرو واحد,0.0,0.0
|
| 115 |
+
openai_tts_sample_113.wav,اثنين ستة زيرو واحد زيرو ثلاثة زيرو سبعة واحد زيرو زيرو زيرو تسعة,اتنين ستة زيرو واحد زيرو ثلاثة زيرو سبعة واحد زيرو زيرو زيرو تسعة,0.07692307692307693,0.018867924528301886
|
| 116 |
+
openai_tts_sample_114.wav,اثنين تسعة زيرو ثمانية واحد اثنين زيرو أربعة واحد زيرو زيرو زيرو اثنين,اتنين تسعة زيرو تمانية واحد اثنين زيرو اربعة واحد زيرو زيرو اتنين,0.4166666666666667,0.13793103448275862
|
| 117 |
+
openai_tts_sample_115.wav,ثلاثة اثنين زيرو سبعة واحد واحد زيرو تسعة واحد زيرو زيرو زيرو خمسة,ثلاثة اثنين زيرو سبعة واحد زيرو زيرو زيرو خمسة,0.4444444444444444,0.2962962962962963
|
| 118 |
+
openai_tts_sample_116.wav,ثلاثة واحد زيرو واحد واحد اثنين زيرو خمسة واحد زيرو زيرو زيرو أربعة,ثلاثة زيرو واحد واحد اثنين زيرو خمسة واحد زيرو زيرو زيرو أربعة,0.08333333333333333,0.07272727272727272
|
| 119 |
+
openai_tts_sample_117.wav,اثنين ثمانية زيرو تسعة واحد زيرو زيرو سبعة واحد زيرو زيرو زيرو اثنين,اتنين تمانية زيرو تسعة واحد زيرو زيرو سبعة واحد زيرو زيرو زيرو اتنين,0.23076923076923078,0.05357142857142857
|
| 120 |
+
openai_tts_sample_118.wav,ثلاثة واحد زيرو خمسة واحد اثنين زيرو تسعة واحد زيرو تسعة زيرو ثلاثة,تلاتة واحد زيرو خمسة واحد اتنين زيرو تسعة واحد زيرو تسعة,0.36363636363636365,0.21818181818181817
|
| 121 |
+
openai_tts_sample_119.wav,اثنين تسعة زيرو اثنين واحد واحد زيرو ثمانية واحد زيرو زيرو زيرو خمسة,اتنين تسعة زيرو اثنين واحد واحد زيرو اثنين زيرو زيرو خمسة,0.36363636363636365,0.23214285714285715
|
| 122 |
+
openai_tts_sample_120.wav,ثلاثة اثنين زيرو واحد واحد اثنين زيرو أربعة واحد زيرو زيرو زيرو تسعة,ثلاثة اثنين زيرو واحد واحد اثنين زيرو أربعة واحد زيرو زيرو زيرو تسعة,0.0,0.0
|
| 123 |
+
openai_tts_sample_121.wav,اثنين واحد زيرو ثلاثة واحد زيرو زيرو سبعة واحد زيرو زيرو زيرو أربعة,اتنين واحد زيرو ثلاثة واحد زيرو زيرو سبعة واحد زيرو زيرو زيرو أربعة,0.07692307692307693,0.01818181818181818
|
| 124 |
+
openai_tts_sample_122.wav,ثلاثة اثنين زيرو ثمانية واحد واحد زيرو خمسة واحد زيرو تسعة زيرو أربعة,ثلاثة اتنين زيرو تمانية واحد زيرو خمسة واحد زيرو تسعة زيرو أربعة,0.25,0.10526315789473684
|
| 125 |
+
openai_tts_sample_123.wav,زيرو واحد زيرو واحد اثنين ثلاثة أربعة خمسة ستة سبعة ثمانية,زيرو واحد زيرو واحد اثنين تلاتة أربعة خمسة ستة سبعة تمانية,0.18181818181818182,0.0625
|
| 126 |
+
openai_tts_sample_124.wav,زيرو واحد واحد تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين,زيرو واحد واحد تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين,0.0,0.0
|
| 127 |
+
openai_tts_sample_125.wav,زيرو واحد اثنين زيرو تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة,زيرو واحد اتنين زيرو تسعة تمانية سبعة ستة خمسة أربعة تلاتة,0.2727272727272727,0.08333333333333333
|
| 128 |
+
openai_tts_sample_126.wav,زيرو واحد خمسة سبعة ثمانية تسعة أربعة ثلاثة اثنين واحد زيرو,زيرو واحد خمسة سبعة اتنين تسعة أربعة تلاتة اتنين واحد زيرو,0.2727272727272727,0.14285714285714285
|
| 129 |
+
openai_tts_sample_127.wav,زيرو واحد زيرو تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين واحد,زيرو واحد زيرو تسعة تمانية سبعة ستة خمسة أربعة تلاتة اتنين واحد,0.25,0.07692307692307693
|
| 130 |
+
openai_tts_sample_128.wav,زيرو واحد واحد زيرو أربعة خمسة ستة سبعة ثمانية تسعة زيرو اثنين,زيرو واحد واحد زيرو أربعة خمسة ستة سبعة ثمانية تسعة زيرو اثنين,0.0,0.0
|
| 131 |
+
openai_tts_sample_129.wav,زيرو واحد اثنين واحد تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين,زيرو واحد اثنين واحد تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين,0.0,0.0
|
| 132 |
+
openai_tts_sample_130.wav,زيرو واحد خمسة زيرو تسعة ثمانية سبعة ستة خمسة أربعة ثلاثة اثنين,زيرو واحد خمسة زيرو تسعة تمانية سبعة ستة خمسة أربعة تلاتة اتنين,0.25,0.07692307692307693
|
| 133 |
+
openai_tts_sample_131.wav,زيرو واحد واحد خمسة ستة سبعة ثمانية تسعة أربعة ثلاثة اثنين زيرو,زيرو واحد واحد خمسة ستة سبعة ثمانية تسعة أربعة ثلاثة اثنين زيرو,0.0,0.0
|
| 134 |
+
openai_tts_sample_132.wav,زيرو واحد اثنين أربعة خمسة ستة سبعة ثمانية تسعة زيرو واحد اثنين,زيرو واحد اتنين أربعة خمسة ستة سبعة ثمانية تسعة زيرو واحد اتنين,0.16666666666666666,0.038461538461538464
|
| 135 |
+
openai_tts_sample_133.wav,في حادث عربية عند كوبري عباس,في حادث عربية عند كوبري عباس,0.0,0.0
|
| 136 |
+
openai_tts_sample_134.wav,فيه حريق في عمارة في شارع فيصل,فيه حريق في عمارة في شارع فيصل.,0.14285714285714285,0.041666666666666664
|
| 137 |
+
openai_tts_sample_135.wav,لقيت طفل تاه في المول,ماجيت طفل تايه في المول,0.4,0.23529411764705882
|
| 138 |
+
openai_tts_sample_136.wav,في خناقة كبيرة في ميدان الجيزة,في خناقة كبيرة في ميدان الجيزة.,0.16666666666666666,0.04
|
| 139 |
+
openai_tts_sample_137.wav,عربية مقلوبة على الطريق الدائري,عربية مقلوبة على الطريق الدائري.,0.2,0.037037037037037035
|
| 140 |
+
openai_tts_sample_138.wav,في صوت ضرب نار في الهرم,في سود ضرب نار في الهرم.,0.3333333333333333,0.16666666666666666
|
| 141 |
+
openai_tts_sample_139.wav,جارتي وقعت من البلكونة,جارتي وقعت من البلكونة,0.0,0.0
|
| 142 |
+
openai_tts_sample_140.wav,حصلت سرقة في الشارع عند السوبرماركت,حصلت سرقة في الشارع عند السوبرماركت,0.0,0.0
|
| 143 |
+
openai_tts_sample_141.wav,في واحد بيعتدي على بنت في الشارع,في واحد بيعتدي على بنت في الشارع,0.0,0.0
|
| 144 |
+
openai_tts_sample_142.wav,حصل انفجار صغير في محل الغاز,حصل انفجار صغير في محل الغاز,0.0,0.0
|
| 145 |
+
openai_tts_sample_143.wav,شفت عربية بتخبط موتوسيكل وهربت,شفت عربية بتخبط موتوسيكل وهربت.,0.2,0.038461538461538464
|
| 146 |
+
openai_tts_sample_144.wav,طفل محبوس في الأسانسير,طفل محبوس في الأسانسير,0.0,0.0
|
| 147 |
+
openai_tts_sample_145.wav,في شاب مصاب قدام محطة المترو,في شاب مصاب قدام محطة المترو.,0.16666666666666666,0.043478260869565216
|
| 148 |
+
openai_tts_sample_146.wav,العربية عطلت في نص الطريق,العربية عطلت في نص الطريق.,0.2,0.047619047619047616
|
| 149 |
+
openai_tts_sample_147.wav,في تسريب غاز في العمارة,في تسريب غاز في العمارة.,0.2,0.05263157894736842
|
| 150 |
+
openai_tts_sample_148.wav,واحد كبير في السن مغمى عليه في المسجد,واحد كبير في السن مغمى عليه في المسجد,0.0,0.0
|
| 151 |
+
openai_tts_sample_149.wav,حصلت مشاجرة بالسكاكين في السوق,حصلت مشاجرة بالسكاكين في السوق,0.0,0.0
|
| 152 |
+
openai_tts_sample_150.wav,عربية إسعاف اتأخرت على المكان,عربية اسعاف اتأخرت على المكان.,0.4,0.08
|
| 153 |
+
openai_tts_sample_151.wav,فيه كلب شرس بيهاجم الناس في الشارع,فيه كلب شرس بيهاجم الناس في الشارع,0.0,0.0
|
| 154 |
+
openai_tts_sample_152.wav,في بنت اتخطفِت من عند المدرسة,في بنت اتخطفت من عند المدرسة,0.16666666666666666,0.041666666666666664
|
| 155 |
+
openai_tts_sample_153.wav,في حادث تصادم في محور 26 يوليو,في حادث تصادم في محور ⁇ يوليو.,0.2857142857142857,0.125
|
| 156 |
+
openai_tts_sample_154.wav,واحد وقع من فوق سلم البيت,واحد وقع من فوق سلم البيت,0.0,0.0
|
| 157 |
+
openai_tts_sample_155.wav,النور قاطع في الشارع كله,النور قاطع في الشارع كله,0.0,0.0
|
| 158 |
+
openai_tts_sample_156.wav,صوت انفجار جامد في المنطقة,صوت انفجار جامد في المنطقة,0.0,0.0
|
| 159 |
+
openai_tts_sample_157.wav,العربية دخلت في محل في الهرم,العربية دخلت في محل في الهرم.,0.16666666666666666,0.043478260869565216
|
| 160 |
+
openai_tts_sample_158.wav,طفلة ضايعة في المول,طفلة ضايعة في المول,0.0,0.0
|
| 161 |
+
openai_tts_sample_159.wav,في تسريب مياه من الدور الرابع,في تسريب مياه من الدور الرابع.,0.16666666666666666,0.041666666666666664
|
| 162 |
+
openai_tts_sample_160.wav,خناقة بين الجيران فوق السطح,خناقة بين الجيران فوق السطح,0.0,0.0
|
| 163 |
+
openai_tts_sample_161.wav,فيه عربية مركونة غلط قافلة الشارع,فيه عربية مركونة غلط قافل الشارع,0.16666666666666666,0.03571428571428571
|
| 164 |
+
openai_tts_sample_162.wav,الغاز بيخرج من البوتاجاز ومفيش حد في الشقة,الغاز بيخرج من البوتاجاز ومفيش حد في الشقة,0.0,0.0
|
finetune_asr.py
ADDED
|
@@ -0,0 +1,711 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os

# ============================================
# CRITICAL: Windows CUDA/Numba Fix
# ============================================
# These variables are exported BEFORE torch / NeMo are imported. Importing
# NeMo pulls in numba, and numba reads its NUMBA_* settings when it is first
# imported, so assigning them after the imports (as this script originally
# did) risks them being ignored.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["NUMBA_CUDA_USE_NVIDIA_BINDING"] = "1"
os.environ["NUMBA_DISABLE_JIT"] = "0"
# NOTE(review): "cuda" is kept from the original; confirm it is a value that
# numba accepts for NUMBA_CUDA_DRIVER on the target machine.
os.environ["NUMBA_CUDA_DRIVER"] = "cuda"

# Force CPU for RNNT loss on Windows (prevents access violation)
os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Heavy imports come last on purpose (see the note above).
import io  # noqa: E402
import json  # noqa: E402
import torch  # noqa: E402
from pytorch_lightning import Trainer  # noqa: E402
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor  # noqa: E402
from nemo.collections.asr.models import EncDecHybridRNNTCTCBPEModel  # noqa: E402
from omegaconf import OmegaConf, open_dict  # noqa: E402
|
| 20 |
+
|
| 21 |
+
# ============================================
|
| 22 |
+
# UTF-8 Encoding Fix
|
| 23 |
+
# ============================================
|
| 24 |
+
manifest_path = "train_manifest.jsonl"

# Read the raw bytes first so the file can be checked for valid UTF-8 without
# decoding it lossily.
with io.open(manifest_path, 'rb') as f:
    raw_bytes = f.read()

try:
    raw_bytes.decode('utf-8')
except UnicodeDecodeError as decode_error:
    # The original code always rewrote the manifest with errors='ignore',
    # silently and irreversibly deleting every undecodable byte from the
    # transcripts. Keep that best-effort clean-up, but only when it is needed,
    # keep a byte-exact backup, and tell the user that data was dropped.
    backup_path = manifest_path + ".bak"
    with io.open(backup_path, 'wb') as f:
        f.write(raw_bytes)
    cleaned_bytes = raw_bytes.decode('utf-8', errors='ignore').encode('utf-8')
    with io.open(manifest_path, 'wb') as f:
        f.write(cleaned_bytes)
    print(f"⚠️ {manifest_path} contained invalid UTF-8 ({decode_error}); "
          f"undecodable bytes were dropped, original kept at {backup_path}")
    print("✅ train_manifest.jsonl converted to UTF-8")
else:
    # Already valid: leave the file untouched.
    print("✅ train_manifest.jsonl is already valid UTF-8")
|
| 31 |
+
|
| 32 |
+
# Patch builtins.open so text-mode opens of .jsonl files default to UTF-8
import builtins
_old_open = open


def open_utf8(file, *args, **kwargs):
    """Open *file* like the built-in ``open``, defaulting ``.jsonl`` to UTF-8.

    ``encoding='utf-8'`` is injected only when all of the following hold:
    the path (``str`` or ``os.PathLike``) ends with ``.jsonl``, the file is
    opened in text mode, and the caller supplied no encoding either by
    keyword or positionally. Every other call is forwarded unchanged.

    Args:
        file: Path or file descriptor, as accepted by the built-in ``open``.
        *args: Remaining positional arguments of ``open`` (mode, buffering,
            encoding, ...).
        **kwargs: Keyword arguments of ``open``.

    Returns:
        The file object returned by the original built-in ``open``.
    """
    mode = args[0] if args else kwargs.get('mode', 'r')
    # Binary mode rejects an encoding argument (ValueError), so never add one.
    text_mode = isinstance(mode, str) and 'b' not in mode
    # open(file, mode, buffering, encoding, ...): the third extra positional
    # argument is the encoding; adding the keyword too would raise TypeError.
    encoding_given = 'encoding' in kwargs or len(args) >= 3
    path = os.fspath(file) if isinstance(file, (str, os.PathLike)) else None
    if (isinstance(path, str) and path.endswith('.jsonl')
            and text_mode and not encoding_given):
        kwargs['encoding'] = 'utf-8'
    return _old_open(file, *args, **kwargs)


builtins.open = open_utf8
|
| 40 |
+
|
| 41 |
+
# ============================================
|
| 42 |
+
# Validate Manifest
|
| 43 |
+
# ============================================
|
| 44 |
+
def validate_manifest(manifest_path):
    """Count the usable entries of a JSON-lines training manifest.

    A line is a valid entry when it parses as a JSON object, its
    ``"audio_filepath"`` names an existing path, and its ``"text"`` is a
    non-blank string. Invalid lines are reported on stdout and skipped;
    blank lines are ignored. Nothing is raised for a bad entry.

    Args:
        manifest_path: Path of the UTF-8 encoded ``.jsonl`` manifest.

    Returns:
        The number of valid entries.
    """
    count = 0
    with open(manifest_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f, 1):
            stripped = line.strip()
            if not stripped:
                # A blank line (e.g. a trailing newline) is not an entry.
                continue
            try:
                item = json.loads(stripped)
                # Explicit checks instead of assert: asserts vanish under -O.
                if not isinstance(item, dict):
                    raise ValueError("Entry is not a JSON object")
                audio_path = item.get("audio_filepath")
                if not isinstance(audio_path, str) or not os.path.exists(audio_path):
                    raise ValueError(f"Missing: {audio_path}")
                text = item.get("text")
                if not isinstance(text, str) or not text.strip():
                    raise ValueError("Empty text")
            except ValueError as e:  # json.JSONDecodeError is a ValueError
                print(f"❌ Line {i} error: {e}")
                print(f" Content: {line[:100]}")
                continue
            count += 1
    print(f"✅ Valid entries: {count}")
    return count
|
| 58 |
+
|
| 59 |
+
# ============================================
# Configuration (OPTIMIZED FOR ACCURACY)
# ============================================
BASE_MODEL_PATH = "stt_ar_fastconformer_hybrid_large_pcd_v1.0.nemo"
# Alternative manifest: "train_manifest_hf_converted.jsonl"
TRAIN_MANIFEST = "train_manifest.jsonl"
SAVE_DIR = "output_finetuned"

# Fine-tuning hyperparameters
BATCH_SIZE = 8                # Samples per forward/backward pass
MAX_EPOCHS = 250              # Upper bound on the number of epochs
LEARNING_RATE = 5e-5          # Peak learning rate
WARMUP_STEPS = 500            # Scheduler warmup length, in optimizer steps
WEIGHT_DECAY = 0.00001        # Light regularization
ACCUMULATE_GRAD_BATCHES = 4   # Intended accumulation factor (target effective batch 8*4 = 32)

# Validate the manifest that is actually used for training. The original
# validated a separately hard-coded path, so switching TRAIN_MANIFEST would
# have left the real training file unchecked.
valid_count = validate_manifest(TRAIN_MANIFEST)
if valid_count == 0:
    raise ValueError("No valid training samples found!")

os.makedirs(SAVE_DIR, exist_ok=True)
|
| 81 |
+
|
| 82 |
+
# ============================================
|
| 83 |
+
# Load Model and Fix Tokenizer Path
|
| 84 |
+
# ============================================
|
| 85 |
+
print("🔹 Loading pretrained model...")
model = EncDecHybridRNNTCTCBPEModel.restore_from(BASE_MODEL_PATH)

# Point the tokenizer config at a "tokenizer" folder next to the .nemo file
print("🔹 Fixing tokenizer configuration...")
with open_dict(model.cfg):
    model_folder = os.path.dirname(BASE_MODEL_PATH)
    tokenizer_dir = os.path.join(model_folder, "tokenizer")
    os.makedirs(tokenizer_dir, exist_ok=True)

    if not hasattr(model, 'tokenizer'):
        print("⚠️ No tokenizer object found in model — check model restoration path.")
    else:
        print(f"ℹ️ Using existing SentencePiece tokenizer at: {tokenizer_dir}")
        model.cfg.tokenizer.dir = tokenizer_dir
        model.cfg.tokenizer.type = "bpe"

    # Clear the validation/test manifests so that only training data is used
    for split_key in ('validation_ds', 'test_ds'):
        if split_key in model.cfg:
            model.cfg[split_key].manifest_filepath = None
|
| 107 |
+
|
| 108 |
+
# ============================================
|
| 109 |
+
# Setup Training Data (OPTIMIZED)
|
| 110 |
+
# ============================================
|
| 111 |
+
print("🔹 Setting up training data...")
# Data-loader options handed to the model's setup_training_data().
train_ds_config = dict(
    manifest_filepath=TRAIN_MANIFEST,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0,
    pin_memory=False,
    sample_rate=16000,
    max_duration=20.0,          # Upper bound on utterance length
    min_duration=0.5,           # Lower bound on utterance length
    trim_silence=True,
    use_start_end_token=True,
    normalize_transcripts=True,
    parser="ar",                # Arabic language parser
)

model.setup_training_data(train_ds_config)
|
| 129 |
+
|
| 130 |
+
# ============================================
|
| 131 |
+
# Configure Optimizer (OPTIMIZED FOR CONVERGENCE)
|
| 132 |
+
# ============================================
|
| 133 |
+
print("🔹 Configuring optimizer...")
with open_dict(model.cfg):
    # AdamW with a low learning rate for fine-tuning
    model.cfg.optim.name = "adamw"
    model.cfg.optim.lr = LEARNING_RATE
    model.cfg.optim.betas = [0.9, 0.98]
    model.cfg.optim.weight_decay = WEIGHT_DECAY
    model.cfg.optim.eps = 1e-8

    # Cosine schedule with warmup. The warmup length comes from WARMUP_STEPS
    # (previously a hard-coded 1000 that disagreed with the declared and
    # reported constant).
    model.cfg.optim.sched = {
        "name": "CosineAnnealing",
        "warmup_steps": WARMUP_STEPS,
        "warmup_ratio": None,
        "min_lr": 1e-7,
        "last_epoch": -1,
    }

    # Turn SpecAugment masking off for fine-tuning: zero masks of either kind
    # means nothing is masked; the widths are kept for when masks are re-enabled.
    if 'spec_augment' in model.cfg:
        model.cfg.spec_augment.freq_masks = 0
        model.cfg.spec_augment.time_masks = 0
        model.cfg.spec_augment.freq_width = 15
        model.cfg.spec_augment.time_width = 0.03

# The augmentation module was built when the model was restored, so editing
# the config alone does not change it; rebuild it from the updated config.
if 'spec_augment' in model.cfg and hasattr(model, 'spec_augmentation'):
    model.spec_augmentation = model.from_config_dict(model.cfg.spec_augment)
|
| 157 |
+
|
| 158 |
+
# ============================================
|
| 159 |
+
# Configure Loss Weights for Hybrid Model (OPTIMIZED)
|
| 160 |
+
# ============================================
|
| 161 |
+
print("🔹 Optimizing loss weights...")
# Weight of the CTC loss in the hybrid RNNT-CTC objective:
# 0.5 = balanced; 0.9 = 90% CTC, 10% RNNT.
ctc_weight = 0.9

# NeMo's hybrid RNNT-CTC models keep this weight in `ctc_loss_weight`; the
# original only looked for `loss_alpha`, so the setting was silently skipped
# when that attribute did not exist. Both names are handled here.
# NOTE(review): confirm the attribute name against the installed NeMo version.
weight_applied = False
for weight_attr in ('ctc_loss_weight', 'loss_alpha'):
    if hasattr(model, weight_attr):
        setattr(model, weight_attr, ctc_weight)
        weight_applied = True
        print(f" Loss alpha set to: {getattr(model, weight_attr)}")

if weight_applied and 'aux_ctc' in model.cfg:
    # Keep the saved config in step with the live attribute.
    with open_dict(model.cfg):
        model.cfg.aux_ctc.ctc_loss_weight = ctc_weight

if not weight_applied:
    print("⚠️ Model exposes neither 'ctc_loss_weight' nor 'loss_alpha'; loss weighting left unchanged.")
|
| 168 |
+
|
| 169 |
+
# ============================================
|
| 170 |
+
# Callbacks for Best Model Selection
|
| 171 |
+
# ============================================
|
| 172 |
+
print("🔹 Setting up model checkpointing...")

# No validation set is configured, so both callbacks watch the training loss.
checkpoint_options = dict(
    dirpath=SAVE_DIR,
    filename='best-model-{epoch:02d}-{train_loss:.4f}',
    save_top_k=5,
    monitor='train_loss',
    mode='min',
    save_last=True,
    every_n_epochs=2,
)
checkpoint_callback = ModelCheckpoint(**checkpoint_options)

early_stop_options = dict(
    monitor='train_loss',
    patience=20,
    mode='min',
    verbose=True,
)
early_stop_callback = EarlyStopping(**early_stop_options)

# Log the learning rate at every step
lr_monitor = LearningRateMonitor(logging_interval='step')
|
| 195 |
+
|
| 196 |
+
# ============================================
|
| 197 |
+
# Trainer Configuration (CPU Mode - OPTIMIZED)
|
| 198 |
+
# ============================================
|
| 199 |
+
print("🔹 Configuring trainer for CPU...")
trainer = Trainer(
    accelerator="cpu",
    devices=1,
    max_epochs=MAX_EPOCHS,
    log_every_n_steps=1,
    enable_checkpointing=True,
    default_root_dir=SAVE_DIR,
    callbacks=[checkpoint_callback, early_stop_callback, lr_monitor],
    gradient_clip_val=1.0,  # Clip gradients to guard against explosion
    gradient_clip_algorithm="norm",
    # Use the declared constant. The previous literal 8 gave an effective
    # batch of 8*8 = 64 while the comments promised 32.
    accumulate_grad_batches=ACCUMULATE_GRAD_BATCHES,
    val_check_interval=1.0,
    enable_progress_bar=True,
    enable_model_summary=True,
)
|
| 215 |
+
|
| 216 |
+
# ============================================
|
| 217 |
+
# Start Training
|
| 218 |
+
# ============================================
|
| 219 |
+
# Report the values actually in effect (read from the trainer, the model
# config and the callback) rather than duplicated literals that can drift.
effective_batch_size = BATCH_SIZE * trainer.accumulate_grad_batches
sched_cfg = model.cfg.optim.get("sched") or {}
active_warmup_steps = sched_cfg.get("warmup_steps", WARMUP_STEPS)
ctc_weight_in_use = getattr(model, 'ctc_loss_weight', getattr(model, 'loss_alpha', 'N/A'))

print("=" * 60)
print("🚀 STARTING OPTIMIZED FINE-TUNING")
print("=" * 60)
print(f" Model: {BASE_MODEL_PATH}")
print(f" Training samples: {valid_count}")
print(f" Max epochs: {MAX_EPOCHS}")
print(f" Batch size: {BATCH_SIZE} (effective: {effective_batch_size})")
print(f" Learning rate: {LEARNING_RATE}")
print(f" Warmup steps: {active_warmup_steps}")
print(f" Loss weighting: CTC={ctc_weight_in_use}")
print(f" Early stopping patience: {early_stop_callback.patience} epochs")
print("=" * 60)
print("⚠️ CPU training will be slow. For faster training, use Google Colab GPU.")
print("=" * 60)
|
| 233 |
+
|
| 234 |
+
import traceback

# Training and export are handled separately so that a failure while
# exporting is not reported as a training failure.
try:
    trainer.fit(model)
except Exception as e:
    # Top-level boundary of the script: report, show the traceback, give hints.
    print(f"\n❌ Training failed: {e}")
    traceback.print_exc()
    print("\n💡 Troubleshooting tips:")
    print("1. Check if all audio files exist and are valid")
    print("2. Verify manifest format is correct")
    print("3. Ensure sufficient disk space for checkpoints")
    print("4. Try reducing batch_size if out of memory")
else:
    print("\n✅ Training completed successfully!")
    try:
        best_model_path = checkpoint_callback.best_model_path
        if best_model_path:
            print(f"📊 Best model checkpoint: {best_model_path}")
            print(f" Best loss: {checkpoint_callback.best_model_score:.4f}")

            # The checkpoint was written by this run, so it is loaded as a
            # full pickle (weights_only=False). Safe-globals registration only
            # matters for weights_only=True, so it was dropped.
            checkpoint = torch.load(best_model_path, map_location='cpu', weights_only=False)
            model.load_state_dict(checkpoint['state_dict'])

            # Save the fine-tuned model in .nemo format
            output_model_path = os.path.join(SAVE_DIR, "finetuned_model_best.nemo")
            model.save_to(output_model_path)
            print(f"\n💾 Final model saved to: {output_model_path}")

            # Save training summary
            summary_path = os.path.join(SAVE_DIR, "training_summary.txt")
            with open(summary_path, 'w', encoding='utf-8') as f:
                f.write("Training Summary\n")
                f.write("================\n")
                f.write(f"Base Model: {BASE_MODEL_PATH}\n")
                f.write(f"Training Samples: {valid_count}\n")
                f.write(f"Final Epochs: {trainer.current_epoch}\n")
                f.write(f"Best Loss: {checkpoint_callback.best_model_score:.4f}\n")
                f.write(f"Learning Rate: {LEARNING_RATE}\n")
                f.write(
                    f"Batch Size: {BATCH_SIZE} "
                    f"(effective: {BATCH_SIZE * trainer.accumulate_grad_batches})\n"
                )
            print(f"📝 Training summary saved to: {summary_path}")
        else:
            # No "best" checkpoint was recorded; still export the weights the
            # model holds now so the run is not lost.
            output_model_path = os.path.join(SAVE_DIR, "finetuned_model_last.nemo")
            model.save_to(output_model_path)
            print(f"\n⚠️ No best checkpoint recorded; current weights saved to: {output_model_path}")

        print("\n" + "=" * 60)
        print("🎉 OPTIMIZATION COMPLETE!")
        print("=" * 60)
        print("Next steps:")
        print("1. Test your model on validation data to measure WER")
        print("2. If WER is still high, consider:")
        print(" - Increasing training data")
        print(" - Training for more epochs")
        print(" - Adjusting loss_alpha (try 0.5 or 0.9)")
        print(" - Using data augmentation if needed")
        print("=" * 60)
    except Exception as e:
        print(f"\n❌ Training finished, but exporting the model failed: {e}")
        traceback.print_exc()
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
|
| 332 |
+
# import os
|
| 333 |
+
# import io
|
| 334 |
+
# import json
|
| 335 |
+
# import torch
|
| 336 |
+
# from pytorch_lightning import Trainer
|
| 337 |
+
# from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
|
| 338 |
+
# from nemo.collections.asr.models import EncDecHybridRNNTCTCBPEModel
|
| 339 |
+
# from omegaconf import OmegaConf, open_dict
|
| 340 |
+
|
| 341 |
+
# # ============================================
|
| 342 |
+
# # CRITICAL: Windows CUDA/Numba Fix
|
| 343 |
+
# # ============================================
|
| 344 |
+
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
|
| 345 |
+
# os.environ["NUMBA_CUDA_USE_NVIDIA_BINDING"] = "1"
|
| 346 |
+
# os.environ["NUMBA_DISABLE_JIT"] = "0"
|
| 347 |
+
# os.environ["NUMBA_CUDA_DRIVER"] = "cuda"
|
| 348 |
+
# os.environ["CUDA_VISIBLE_DEVICES"] = "" # Force CPU for Windows stability
|
| 349 |
+
|
| 350 |
+
# # ============================================
|
| 351 |
+
# # UTF-8 Encoding Fix
|
| 352 |
+
# # ============================================
|
| 353 |
+
# manifest_path = "train_manifest.jsonl"
|
| 354 |
+
|
| 355 |
+
# with io.open(manifest_path, 'r', encoding='utf-8', errors='ignore') as f:
|
| 356 |
+
# content = f.read()
|
| 357 |
+
# with io.open(manifest_path, 'w', encoding='utf-8') as f:
|
| 358 |
+
# f.write(content)
|
| 359 |
+
# print("✅ train_manifest.jsonl converted to UTF-8")
|
| 360 |
+
|
| 361 |
+
# # Patch builtins.open for UTF-8
|
| 362 |
+
# import builtins
|
| 363 |
+
# _old_open = open
|
| 364 |
+
# def open_utf8(file, *args, **kwargs):
|
| 365 |
+
# if isinstance(file, str) and file.endswith('.jsonl') and 'encoding' not in kwargs:
|
| 366 |
+
# kwargs['encoding'] = 'utf-8'
|
| 367 |
+
# return _old_open(file, *args, **kwargs)
|
| 368 |
+
# builtins.open = open_utf8
|
| 369 |
+
|
| 370 |
+
# # ============================================
|
| 371 |
+
# # Validate Manifest (With Optional Validation Split)
|
| 372 |
+
# # ============================================
|
| 373 |
+
# USE_VALIDATION = True # Set to False if you don't want validation split
|
| 374 |
+
|
| 375 |
+
# def validate_manifest(manifest_path, create_val_split=True, val_split=0.1):
|
| 376 |
+
# """Validate manifest and optionally create train/val split"""
|
| 377 |
+
# valid_entries = []
|
| 378 |
+
|
| 379 |
+
# with open(manifest_path, "r", encoding="utf-8") as f:
|
| 380 |
+
# for i, line in enumerate(f, 1):
|
| 381 |
+
# try:
|
| 382 |
+
# item = json.loads(line.strip())
|
| 383 |
+
# assert os.path.exists(item["audio_filepath"]), f"Missing: {item['audio_filepath']}"
|
| 384 |
+
# assert "text" in item and item["text"].strip(), "Empty text"
|
| 385 |
+
|
| 386 |
+
# # Optional: Filter by duration for quality
|
| 387 |
+
# duration = item.get("duration", 0)
|
| 388 |
+
# if 0.5 <= duration <= 20.0: # Keep reasonable lengths
|
| 389 |
+
# valid_entries.append(item)
|
| 390 |
+
# except Exception as e:
|
| 391 |
+
# print(f"⚠️ Skipping line {i}: {e}")
|
| 392 |
+
|
| 393 |
+
# print(f"✅ Total valid entries: {len(valid_entries)}")
|
| 394 |
+
|
| 395 |
+
# if not create_val_split:
|
| 396 |
+
# # Use entire dataset for training
|
| 397 |
+
# print("📊 Using all data for training (no validation split)")
|
| 398 |
+
# return manifest_path, None, len(valid_entries), 0
|
| 399 |
+
|
| 400 |
+
# # Split into train/val
|
| 401 |
+
# import random
|
| 402 |
+
# random.seed(42)
|
| 403 |
+
# random.shuffle(valid_entries)
|
| 404 |
+
|
| 405 |
+
# split_idx = int(len(valid_entries) * (1 - val_split))
|
| 406 |
+
# train_entries = valid_entries[:split_idx]
|
| 407 |
+
# val_entries = valid_entries[split_idx:]
|
| 408 |
+
|
| 409 |
+
# # Save splits
|
| 410 |
+
# train_manifest = "train_split.jsonl"
|
| 411 |
+
# val_manifest = "val_split.jsonl"
|
| 412 |
+
|
| 413 |
+
# with open(train_manifest, "w", encoding="utf-8") as f:
|
| 414 |
+
# for entry in train_entries:
|
| 415 |
+
# f.write(json.dumps(entry, ensure_ascii=False) + "\n")
|
| 416 |
+
|
| 417 |
+
# with open(val_manifest, "w", encoding="utf-8") as f:
|
| 418 |
+
# for entry in val_entries:
|
| 419 |
+
# f.write(json.dumps(entry, ensure_ascii=False) + "\n")
|
| 420 |
+
|
| 421 |
+
# print(f"📊 Train samples: {len(train_entries)}")
|
| 422 |
+
# print(f"📊 Validation samples: {len(val_entries)}")
|
| 423 |
+
|
| 424 |
+
# return train_manifest, val_manifest, len(train_entries), len(val_entries)
|
| 425 |
+
|
| 426 |
+
# train_manifest, val_manifest, train_count, val_count = validate_manifest(
|
| 427 |
+
# manifest_path,
|
| 428 |
+
# create_val_split=USE_VALIDATION
|
| 429 |
+
# )
|
| 430 |
+
|
| 431 |
+
# if train_count == 0:
|
| 432 |
+
# raise ValueError("No valid training samples found!")
|
| 433 |
+
|
| 434 |
+
# # ============================================
|
| 435 |
+
# # Configuration (OPTIMIZED FOR 4000+ SAMPLES)
|
| 436 |
+
# # ============================================
|
| 437 |
+
# BASE_MODEL_PATH = "stt_ar_fastconformer_hybrid_large_pcd_v1.0.nemo"
|
| 438 |
+
# SAVE_DIR = "output_finetuned"
|
| 439 |
+
|
| 440 |
+
# # OPTIMIZED HYPERPARAMETERS FOR LARGE DATASET
|
| 441 |
+
# BATCH_SIZE = 8 # Larger batch for 4000+ samples (adjust based on RAM)
|
| 442 |
+
# MAX_EPOCHS = 100 # Sufficient for convergence with large dataset
|
| 443 |
+
# LEARNING_RATE = 5e-5 # Moderate LR for large dataset
|
| 444 |
+
# WARMUP_RATIO = 0.05 # 5% warmup of total steps
|
| 445 |
+
# WEIGHT_DECAY = 0.0001 # Regularization for generalization
|
| 446 |
+
# ACCUMULATE_GRAD_BATCHES = 4 # Effective batch = 8*4 = 32
|
| 447 |
+
|
| 448 |
+
# os.makedirs(SAVE_DIR, exist_ok=True)
|
| 449 |
+
|
| 450 |
+
# # ============================================
|
| 451 |
+
# # Load Model and Fix Tokenizer Path
|
| 452 |
+
# # ============================================
|
| 453 |
+
# print("🔹 Loading pretrained model...")
|
| 454 |
+
# model = EncDecHybridRNNTCTCBPEModel.restore_from(BASE_MODEL_PATH)
|
| 455 |
+
|
| 456 |
+
# print("🔹 Fixing tokenizer configuration...")
|
| 457 |
+
# with open_dict(model.cfg):
|
| 458 |
+
# tokenizer_dir = os.path.join(os.path.dirname(BASE_MODEL_PATH), "tokenizer")
|
| 459 |
+
# os.makedirs(tokenizer_dir, exist_ok=True)
|
| 460 |
+
|
| 461 |
+
# if hasattr(model, 'tokenizer'):
|
| 462 |
+
# model.cfg.tokenizer.dir = tokenizer_dir
|
| 463 |
+
# model.cfg.tokenizer.type = "bpe"
|
| 464 |
+
|
| 465 |
+
# # CRITICAL: Properly disable validation dataset
|
| 466 |
+
# if 'validation_ds' in model.cfg:
|
| 467 |
+
# model.cfg.validation_ds = None
|
| 468 |
+
|
| 469 |
+
# # Disable test dataset
|
| 470 |
+
# if 'test_ds' in model.cfg:
|
| 471 |
+
# model.cfg.test_ds = None
|
| 472 |
+
|
| 473 |
+
# # ============================================
|
| 474 |
+
# # Setup Training Data (OPTIMIZED FOR ACCURACY)
|
| 475 |
+
# # ============================================
|
| 476 |
+
# print("🔹 Setting up training data...")
|
| 477 |
+
# train_ds_config = {
|
| 478 |
+
# "manifest_filepath": train_manifest,
|
| 479 |
+
# "batch_size": BATCH_SIZE,
|
| 480 |
+
# "shuffle": True,
|
| 481 |
+
# "num_workers": 0,
|
| 482 |
+
# "pin_memory": False,
|
| 483 |
+
# "sample_rate": 16000,
|
| 484 |
+
# "max_duration": 20.0,
|
| 485 |
+
# "min_duration": 0.5,
|
| 486 |
+
# "trim_silence": True,
|
| 487 |
+
# "use_start_end_token": True,
|
| 488 |
+
# "normalize_transcripts": True,
|
| 489 |
+
# "parser": "ar",
|
| 490 |
+
# # Add augmentation for better generalization (light for fine-tuning)
|
| 491 |
+
# "augmentor": None, # Disable for now, can enable if needed
|
| 492 |
+
# }
|
| 493 |
+
|
| 494 |
+
# model.setup_training_data(train_ds_config)
|
| 495 |
+
|
| 496 |
+
# # ============================================
|
| 497 |
+
# # Setup Validation Data (OPTIONAL)
|
| 498 |
+
# # ============================================
|
| 499 |
+
# if USE_VALIDATION and val_manifest:
|
| 500 |
+
# print("🔹 Setting up validation data...")
|
| 501 |
+
# val_ds_config = {
|
| 502 |
+
# "manifest_filepath": val_manifest,
|
| 503 |
+
# "batch_size": BATCH_SIZE,
|
| 504 |
+
# "shuffle": False,
|
| 505 |
+
# "num_workers": 0,
|
| 506 |
+
# "pin_memory": False,
|
| 507 |
+
# "sample_rate": 16000,
|
| 508 |
+
# "normalize_transcripts": True,
|
| 509 |
+
# "parser": "ar",
|
| 510 |
+
# }
|
| 511 |
+
# model.setup_validation_data(val_ds_config)
|
| 512 |
+
# else:
|
| 513 |
+
# print("⚠️ No validation data - monitoring training loss only")
|
| 514 |
+
|
| 515 |
+
# # ============================================
|
| 516 |
+
# # Configure Optimizer (OPTIMIZED FOR CONVERGENCE)
|
| 517 |
+
# # ============================================
|
| 518 |
+
# print("🔹 Configuring optimizer...")
|
| 519 |
+
|
| 520 |
+
# # Calculate total steps for scheduling
|
| 521 |
+
# steps_per_epoch = train_count // (BATCH_SIZE * ACCUMULATE_GRAD_BATCHES)
|
| 522 |
+
# total_steps = steps_per_epoch * MAX_EPOCHS
|
| 523 |
+
# warmup_steps = int(total_steps * WARMUP_RATIO)
|
| 524 |
+
|
| 525 |
+
# print(f" Steps per epoch: {steps_per_epoch}")
|
| 526 |
+
# print(f" Total training steps: {total_steps}")
|
| 527 |
+
# print(f" Warmup steps: {warmup_steps}")
|
| 528 |
+
|
| 529 |
+
# with open_dict(model.cfg):
|
| 530 |
+
# # AdamW optimizer with optimal settings
|
| 531 |
+
# model.cfg.optim.name = "adamw"
|
| 532 |
+
# model.cfg.optim.lr = LEARNING_RATE
|
| 533 |
+
# model.cfg.optim.betas = [0.9, 0.999]
|
| 534 |
+
# model.cfg.optim.weight_decay = WEIGHT_DECAY
|
| 535 |
+
# model.cfg.optim.eps = 1e-8
|
| 536 |
+
|
| 537 |
+
# # Polynomial decay with warmup (better than cosine for fine-tuning)
|
| 538 |
+
# model.cfg.optim.sched = {
|
| 539 |
+
# "name": "PolynomialDecayAnnealing",
|
| 540 |
+
# "warmup_steps": warmup_steps,
|
| 541 |
+
# "warmup_ratio": None,
|
| 542 |
+
# "min_lr": 1e-7,
|
| 543 |
+
# "power": 1.0,
|
| 544 |
+
# "last_epoch": -1,
|
| 545 |
+
# }
|
| 546 |
+
|
| 547 |
+
# # LIGHT augmentation for fine-tuning (prevents overfitting)
|
| 548 |
+
# if 'spec_augment' in model.cfg:
|
| 549 |
+
# model.cfg.spec_augment.freq_masks = 1
|
| 550 |
+
# model.cfg.spec_augment.time_masks = 2
|
| 551 |
+
# model.cfg.spec_augment.freq_width = 10
|
| 552 |
+
# model.cfg.spec_augment.time_width = 0.025
|
| 553 |
+
|
| 554 |
+
# # ============================================
|
| 555 |
+
# # Configure Loss Weights (OPTIMIZED FOR HYBRID)
|
| 556 |
+
# # ============================================
|
| 557 |
+
# print("🔹 Optimizing loss weights...")
|
| 558 |
+
# if hasattr(model, 'loss_alpha'):
|
| 559 |
+
# # For Arabic: CTC often works better for fine-tuning
|
| 560 |
+
# model.loss_alpha = 0.8 # 80% CTC, 20% RNNT
|
| 561 |
+
# print(f" Loss alpha: {model.loss_alpha} (CTC-focused)")
|
| 562 |
+
|
| 563 |
+
# # ============================================
|
| 564 |
+
# # Callbacks for Best Model Selection
|
| 565 |
+
# # ============================================
|
| 566 |
+
# print("🔹 Setting up callbacks...")
|
| 567 |
+
|
| 568 |
+
# # Choose monitor metric based on validation availability
|
| 569 |
+
# monitor_metric = 'val_loss' if USE_VALIDATION else 'train_loss'
|
| 570 |
+
# monitor_mode = 'min'
|
| 571 |
+
|
| 572 |
+
# # Save best model based on available metric
|
| 573 |
+
# checkpoint_callback = ModelCheckpoint(
|
| 574 |
+
# dirpath=SAVE_DIR,
|
| 575 |
+
# filename=f'best-{{epoch:02d}}-{{{monitor_metric}:.4f}}',
|
| 576 |
+
# save_top_k=3,
|
| 577 |
+
# monitor=monitor_metric,
|
| 578 |
+
# mode=monitor_mode,
|
| 579 |
+
# save_last=True,
|
| 580 |
+
# every_n_epochs=1,
|
| 581 |
+
# verbose=True,
|
| 582 |
+
# )
|
| 583 |
+
|
| 584 |
+
# # Early stopping based on available metric
|
| 585 |
+
# early_stop_callback = EarlyStopping(
|
| 586 |
+
# monitor=monitor_metric,
|
| 587 |
+
# patience=15, # Stop if no improvement for 15 epochs
|
| 588 |
+
# mode=monitor_mode,
|
| 589 |
+
# verbose=True,
|
| 590 |
+
# min_delta=0.0001,
|
| 591 |
+
# )
|
| 592 |
+
|
| 593 |
+
# lr_monitor = LearningRateMonitor(logging_interval='step')
|
| 594 |
+
|
| 595 |
+
# # ============================================
|
| 596 |
+
# # Trainer Configuration (OPTIMIZED FOR CPU)
|
| 597 |
+
# # ============================================
|
| 598 |
+
# print("🔹 Configuring trainer...")
|
| 599 |
+
# trainer = Trainer(
|
| 600 |
+
# accelerator="cpu",
|
| 601 |
+
# devices=1,
|
| 602 |
+
# max_epochs=MAX_EPOCHS,
|
| 603 |
+
# log_every_n_steps=5,
|
| 604 |
+
# enable_checkpointing=True,
|
| 605 |
+
# default_root_dir=SAVE_DIR,
|
| 606 |
+
# callbacks=[checkpoint_callback, early_stop_callback, lr_monitor],
|
| 607 |
+
# gradient_clip_val=1.0,
|
| 608 |
+
# gradient_clip_algorithm="norm",
|
| 609 |
+
# accumulate_grad_batches=ACCUMULATE_GRAD_BATCHES,
|
| 610 |
+
# val_check_interval=1.0, # Validate every epoch
|
| 611 |
+
# enable_progress_bar=True,
|
| 612 |
+
# enable_model_summary=True,
|
| 613 |
+
# deterministic=False, # Faster training
|
| 614 |
+
# benchmark=False,
|
| 615 |
+
# )
|
| 616 |
+
|
| 617 |
+
# # ============================================
|
| 618 |
+
# # Start Training
|
| 619 |
+
# # ============================================
|
| 620 |
+
# print("=" * 70)
|
| 621 |
+
# print("🚀 STARTING OPTIMIZED FINE-TUNING FOR 4000+ SAMPLES")
|
| 622 |
+
# print("=" * 70)
|
| 623 |
+
# print(f" Base Model: {BASE_MODEL_PATH}")
|
| 624 |
+
# print(f" Training samples: {train_count}")
|
| 625 |
+
# print(f" Validation samples: {val_count if USE_VALIDATION else 'None (using training loss)'}")
|
| 626 |
+
# print(f" Max epochs: {MAX_EPOCHS}")
|
| 627 |
+
# print(f" Batch size: {BATCH_SIZE} (effective: {BATCH_SIZE * ACCUMULATE_GRAD_BATCHES})")
|
| 628 |
+
# print(f" Learning rate: {LEARNING_RATE}")
|
| 629 |
+
# print(f" Warmup steps: {warmup_steps}")
|
| 630 |
+
# print(f" Weight decay: {WEIGHT_DECAY}")
|
| 631 |
+
# print(f" Loss weighting: CTC={model.loss_alpha if hasattr(model, 'loss_alpha') else 'N/A'}")
|
| 632 |
+
# print(f" Monitoring: {monitor_metric}")
|
| 633 |
+
# print(f" Early stopping: 15 epochs patience")
|
| 634 |
+
# print("=" * 70)
|
| 635 |
+
# print("⏱️ Estimated time: ~{:.1f} hours (depends on CPU)".format(
|
| 636 |
+
# train_count * MAX_EPOCHS / (BATCH_SIZE * 3600 * 0.5) # Rough estimate
|
| 637 |
+
# ))
|
| 638 |
+
# print("=" * 70)
|
| 639 |
+
|
| 640 |
+
# try:
|
| 641 |
+
# trainer.fit(model)
|
| 642 |
+
# print("\n✅ Training completed successfully!")
|
| 643 |
+
|
| 644 |
+
# # Load and save the best checkpoint
|
| 645 |
+
# best_model_path = checkpoint_callback.best_model_path
|
| 646 |
+
# if best_model_path:
|
| 647 |
+
# print(f"\n📊 Best model checkpoint: {best_model_path}")
|
| 648 |
+
# print(f" Best {monitor_metric}: {checkpoint_callback.best_model_score:.4f}")
|
| 649 |
+
|
| 650 |
+
# # Safe load for PyTorch 2.6+
|
| 651 |
+
# import typing
|
| 652 |
+
# import omegaconf
|
| 653 |
+
# torch.serialization.add_safe_globals([
|
| 654 |
+
# omegaconf.dictconfig.DictConfig,
|
| 655 |
+
# omegaconf.base.ContainerMetadata,
|
| 656 |
+
# omegaconf.listconfig.ListConfig,
|
| 657 |
+
# typing.Any,
|
| 658 |
+
# ])
|
| 659 |
+
|
| 660 |
+
# checkpoint = torch.load(best_model_path, map_location='cpu', weights_only=False)
|
| 661 |
+
# model.load_state_dict(checkpoint['state_dict'])
|
| 662 |
+
|
| 663 |
+
# # Save final model
|
| 664 |
+
# output_model_path = os.path.join(SAVE_DIR, "finetuned_model_best.nemo")
|
| 665 |
+
# model.save_to(output_model_path)
|
| 666 |
+
# print(f"\n💾 Final model saved to: {output_model_path}")
|
| 667 |
+
|
| 668 |
+
# # Save training summary
|
| 669 |
+
# summary_path = os.path.join(SAVE_DIR, "training_summary.txt")
|
| 670 |
+
# with open(summary_path, 'w', encoding='utf-8') as f:
|
| 671 |
+
# f.write(f"Training Summary - 4000+ Samples\n")
|
| 672 |
+
# f.write(f"=================================\n")
|
| 673 |
+
# f.write(f"Base Model: {BASE_MODEL_PATH}\n")
|
| 674 |
+
# f.write(f"Training Samples: {train_count}\n")
|
| 675 |
+
# f.write(f"Validation Samples: {val_count if USE_VALIDATION else 'None'}\n")
|
| 676 |
+
# f.write(f"Final Epoch: {trainer.current_epoch}\n")
|
| 677 |
+
# f.write(f"Best {monitor_metric}: {checkpoint_callback.best_model_score:.4f}\n")
|
| 678 |
+
# f.write(f"Learning Rate: {LEARNING_RATE}\n")
|
| 679 |
+
# f.write(f"Batch Size: {BATCH_SIZE} (effective: {BATCH_SIZE * ACCUMULATE_GRAD_BATCHES})\n")
|
| 680 |
+
# f.write(f"Warmup Steps: {warmup_steps}\n")
|
| 681 |
+
# f.write(f"Weight Decay: {WEIGHT_DECAY}\n")
|
| 682 |
+
# print(f"📝 Training summary saved to: {summary_path}")
|
| 683 |
+
|
| 684 |
+
# print("\n" + "=" * 70)
|
| 685 |
+
# print("🎉 TRAINING COMPLETE!")
|
| 686 |
+
# print("=" * 70)
|
| 687 |
+
# print("Next steps:")
|
| 688 |
+
# print("1. Evaluate WER/CER on test set using the best model")
|
| 689 |
+
# print("2. If WER is still high, try:")
|
| 690 |
+
# print(" - Train for more epochs (increase MAX_EPOCHS)")
|
| 691 |
+
# print(" - Adjust loss_alpha (try 0.5 or 0.9)")
|
| 692 |
+
# print(" - Add more training data")
|
| 693 |
+
# print(" - Enable light data augmentation")
|
| 694 |
+
# print("3. Use the validation manifest to monitor overfitting")
|
| 695 |
+
# print("=" * 70)
|
| 696 |
+
|
| 697 |
+
# except KeyboardInterrupt:
|
| 698 |
+
# print("\n⚠️ Training interrupted by user")
|
| 699 |
+
# print("💾 Saving last checkpoint...")
|
| 700 |
+
# if hasattr(checkpoint_callback, 'last_model_path'):
|
| 701 |
+
# print(f" Last checkpoint: {checkpoint_callback.last_model_path}")
|
| 702 |
+
|
| 703 |
+
# except Exception as e:
|
| 704 |
+
# print(f"\n❌ Training failed: {e}")
|
| 705 |
+
# import traceback
|
| 706 |
+
# traceback.print_exc()
|
| 707 |
+
# print("\n💡 Troubleshooting:")
|
| 708 |
+
# print("1. Reduce BATCH_SIZE if out of memory")
|
| 709 |
+
# print("2. Check audio file paths in manifest")
|
| 710 |
+
# print("3. Verify all audio files are valid WAV format")
|
| 711 |
+
# print("4. Ensure sufficient disk space for checkpoints")
|
testing_main.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# import sounddevice as sd
|
| 2 |
+
# import scipy.io.wavfile as wav
|
| 3 |
+
# import nemo.collections.asr as nemo_asr
|
| 4 |
+
|
| 5 |
+
# # ===== SETTINGS =====
|
| 6 |
+
# SAMPLE_RATE = 16000
|
| 7 |
+
# DURATION = 10 # seconds
|
| 8 |
+
# OUTPUT_FILE = "arabic_recording.wav"
|
| 9 |
+
|
| 10 |
+
# # ===== STEP 1: Record audio =====
|
| 11 |
+
# print("🎙️ Recording... Speak Arabic now!")
|
| 12 |
+
# audio = sd.rec(int(SAMPLE_RATE * DURATION), samplerate=SAMPLE_RATE, channels=1, dtype='int16')
|
| 13 |
+
# sd.wait()
|
| 14 |
+
# wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
|
| 15 |
+
# print(f"✅ Recording finished. Saved as {OUTPUT_FILE}")
|
| 16 |
+
|
| 17 |
+
# # ===== STEP 2: Load ASR model =====
|
| 18 |
+
# print("📥 Loading Arabic ASR model...")
|
| 19 |
+
# asr_model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(
|
| 20 |
+
# "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo"
|
| 21 |
+
# )
|
| 22 |
+
|
| 23 |
+
# # ===== STEP 3: Configure Decoding =====
|
| 24 |
+
# print("🔍 Configuring decoding strategy...")
|
| 25 |
+
|
| 26 |
+
# # Get decoding config
|
| 27 |
+
# decoding_cfg = asr_model.cfg.decoding
|
| 28 |
+
|
| 29 |
+
# # Print available parameters to debug
|
| 30 |
+
# print(f"Available decoding strategies: {decoding_cfg.keys() if hasattr(decoding_cfg, 'keys') else 'N/A'}")
|
| 31 |
+
# # After loading the model, add this to inspect the config:
|
| 32 |
+
# print("🔍 Beam config structure:")
|
| 33 |
+
# print(decoding_cfg.beam)
|
| 34 |
+
# # Set beam search strategy
|
| 35 |
+
# decoding_cfg.strategy = "beam"
|
| 36 |
+
# decoding_cfg.beam.beam_size = 128
|
| 37 |
+
# decoding_cfg.beam.return_best_hypothesis = True
|
| 38 |
+
|
| 39 |
+
# # Only set parameters that exist
|
| 40 |
+
# if hasattr(decoding_cfg.beam, 'beam_alpha'):
|
| 41 |
+
# decoding_cfg.beam.beam_alpha = 0.3
|
| 42 |
+
# print("✓ Set beam_alpha")
|
| 43 |
+
|
| 44 |
+
# if hasattr(decoding_cfg.beam, 'beam_beta'):
|
| 45 |
+
# decoding_cfg.beam.beam_beta = 0.5
|
| 46 |
+
# print("✓ Set beam_beta")
|
| 47 |
+
|
| 48 |
+
# # Remove softmax_temperature - it's not supported in this config
|
| 49 |
+
# # If you need temperature sampling, you might need to use a different strategy
|
| 50 |
+
|
| 51 |
+
# # Apply the decoding configuration
|
| 52 |
+
# asr_model.change_decoding_strategy(decoding_cfg)
|
| 53 |
+
|
| 54 |
+
# # ===== STEP 4: Transcribe =====
|
| 55 |
+
# print("🔍 Transcribing...")
|
| 56 |
+
# transcription = asr_model.transcribe(
|
| 57 |
+
# [OUTPUT_FILE],
|
| 58 |
+
# batch_size=1,
|
| 59 |
+
# num_workers=0
|
| 60 |
+
# )
|
| 61 |
+
|
| 62 |
+
# print("📝 Transcription:", transcription[0])
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# import sounddevice as sd
|
| 67 |
+
# import scipy.io.wavfile as wav
|
| 68 |
+
# import nemo.collections.asr as nemo_asr
|
| 69 |
+
|
| 70 |
+
# # ===== SETTINGS =====
|
| 71 |
+
# SAMPLE_RATE = 16000
|
| 72 |
+
# DURATION = 10
|
| 73 |
+
# OUTPUT_FILE = "arabic_recording.wav"
|
| 74 |
+
|
| 75 |
+
# # ===== STEP 1: Record audio =====
|
| 76 |
+
# print("🎙️ Recording... Speak Arabic now!")
|
| 77 |
+
# audio = sd.rec(int(SAMPLE_RATE * DURATION), samplerate=SAMPLE_RATE, channels=1, dtype='int16')
|
| 78 |
+
# sd.wait()
|
| 79 |
+
# wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
|
| 80 |
+
# print(f"✅ Recording finished. Saved as {OUTPUT_FILE}")
|
| 81 |
+
|
| 82 |
+
# # ===== STEP 2: Load ASR model =====
|
| 83 |
+
# print("📥 Loading Arabic ASR model...")
|
| 84 |
+
# asr_model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(
|
| 85 |
+
# "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo"
|
| 86 |
+
# )
|
| 87 |
+
|
| 88 |
+
# # ===== STEP 3: Configure for LITERAL transcription =====
|
| 89 |
+
# print("🔍 Configuring greedy decoding for literal output...")
|
| 90 |
+
|
| 91 |
+
# decoding_cfg = asr_model.cfg.decoding
|
| 92 |
+
# decoding_cfg.strategy = "greedy"
|
| 93 |
+
|
| 94 |
+
# # CRITICAL: Increase max_symbols to avoid truncating repetitions
|
| 95 |
+
# # The default is only 10, which is very restrictive!
|
| 96 |
+
# decoding_cfg.greedy.max_symbols = 1000 # Allow much longer sequences
|
| 97 |
+
# decoding_cfg.beam.beam_size = 64
|
| 98 |
+
# decoding_cfg.beam.search_type = "beam"
|
| 99 |
+
# print(f"✓ Set max_symbols to {decoding_cfg.greedy.max_symbols}")
|
| 100 |
+
# print("Updated config:", decoding_cfg)
|
| 101 |
+
|
| 102 |
+
# # Apply configuration
|
| 103 |
+
# asr_model.change_decoding_strategy(decoding_cfg)
|
| 104 |
+
|
| 105 |
+
# # ===== STEP 4: Transcribe =====
|
| 106 |
+
# print("🔍 Transcribing...")
|
| 107 |
+
# transcription = asr_model.transcribe(
|
| 108 |
+
# [OUTPUT_FILE],
|
| 109 |
+
# batch_size=1,
|
| 110 |
+
# num_workers=0
|
| 111 |
+
# )
|
| 112 |
+
|
| 113 |
+
# print("📝 Literal Transcription:", transcription[0])
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
import sounddevice as sd
|
| 117 |
+
import scipy.io.wavfile as wav
|
| 118 |
+
import nemo.collections.asr as nemo_asr
|
| 119 |
+
from omegaconf import OmegaConf
|
| 120 |
+
|
| 121 |
+
# ===== SETTINGS =====
SAMPLE_RATE = 16000  # sampling rate handed to sd.rec() and wav.write()
DURATION = 10  # SAMPLE_RATE * DURATION frames are recorded
OUTPUT_FILE = "arabic_recording.wav"
# Location of the fine-tuned .nemo checkpoint restored below.
MODEL_PATH = "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo"

# ===== STEP 2: Load ASR model =====
# The model is loaded before recording (STEP 1 further down), so the
# recording prompt only appears once the model is ready.
print("📥 Loading Arabic ASR model...")
asr_model = nemo_asr.models.EncDecHybridRNNTCTCModel.restore_from(MODEL_PATH)

# Dump the greedy decoding section so the available keys can be inspected.
print("Available greedy parameters:")
print(OmegaConf.to_yaml(asr_model.cfg.decoding.greedy))

# ===== STEP 3: Configure for LITERAL transcription =====
print("🔍 Configuring greedy decoding for literal output...")

# Disable struct mode temporarily so keys that are not already present in
# the config can be assigned.
OmegaConf.set_struct(asr_model.cfg.decoding, False)
OmegaConf.set_struct(asr_model.cfg.decoding.greedy, False)

decoding_cfg = asr_model.cfg.decoding
# NOTE(review): the messages above talk about greedy decoding, but the
# strategy selected here is "maes" -- confirm which one is intended.
decoding_cfg.strategy = "maes"

# Best-effort: this key may not be accepted by every config version.
# Only ordinary errors are tolerated; Ctrl-C / SystemExit still propagate.
try:
    decoding_cfg.greedy.max_symbols_per_step = 300
    print(f"✓ max_symbols_per_step: {decoding_cfg.greedy.max_symbols_per_step}")
except Exception as exc:
    print(f"⚠ Could not set max_symbols_per_step: {exc}")

decoding_cfg.greedy.max_symbols = 500
decoding_cfg.greedy.loop_labels = True
decoding_cfg.greedy.preserve_alignments = True
decoding_cfg.preserve_alignments = True
decoding_cfg.compute_timestamps = True
decoding_cfg.temperature = 1.3

decoding_cfg.beam.beam_size = 64
decoding_cfg.beam.softmax_temperature = 1.3
# NOTE(review): confirm "beam" is an accepted search_type for this model.
decoding_cfg.beam.search_type = "beam"
print(f"✓ max_symbols: {decoding_cfg.greedy.max_symbols}")
print(f"✓ loop_labels: {decoding_cfg.greedy.loop_labels}")
print(f"✓ temperature: {decoding_cfg.temperature}")

# Re-enable struct mode now that all edits are done.
OmegaConf.set_struct(asr_model.cfg.decoding, True)
OmegaConf.set_struct(asr_model.cfg.decoding.greedy, True)

# Apply configuration
asr_model.change_decoding_strategy(decoding_cfg)

# ===== STEP 1: Record audio =====
print("🎙️ Recording... Speak Arabic now!")
audio = sd.rec(int(SAMPLE_RATE * DURATION), samplerate=SAMPLE_RATE, channels=1, dtype='int16')
sd.wait()  # block until the recording has finished
wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
print(f"✅ Recording finished. Saved as {OUTPUT_FILE}")

# ===== STEP 4: Transcribe =====
print("🔍 Transcribing...")
transcription = asr_model.transcribe(
    [OUTPUT_FILE],
    batch_size=1,
    num_workers=0
)

print("📝 Literal Transcription:", transcription[0])
|
testing_main_v2.py
ADDED
|
@@ -0,0 +1,473 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# import sounddevice as sd
|
| 2 |
+
# import scipy.io.wavfile as wav
|
| 3 |
+
# import nemo.collections.asr as nemo_asr
|
| 4 |
+
# import torch
|
| 5 |
+
# import numpy as np
|
| 6 |
+
# from typing import List, Tuple
|
| 7 |
+
|
| 8 |
+
# # ===== SETTINGS =====
|
| 9 |
+
# SAMPLE_RATE = 16000
|
| 10 |
+
# DURATION = 10 # seconds
|
| 11 |
+
# OUTPUT_FILE = "arabic_recording.wav"
|
| 12 |
+
|
| 13 |
+
# class RepetitionAwareTranscriber:
|
| 14 |
+
# def __init__(self, model_path: str):
|
| 15 |
+
# """Initialize ASR model with repetition-aware configuration"""
|
| 16 |
+
# print("📥 Loading Arabic ASR model...")
|
| 17 |
+
# self.asr_model = nemo_asr.models.EncDecCTCModel.restore_from(model_path)
|
| 18 |
+
# self._configure_decoding()
|
| 19 |
+
|
| 20 |
+
# def _configure_decoding(self):
|
| 21 |
+
# """Configure advanced decoding strategy"""
|
| 22 |
+
# decoding_cfg = self.asr_model.cfg.decoding
|
| 23 |
+
|
| 24 |
+
# # Use beam search for better sequence modeling
|
| 25 |
+
# decoding_cfg.strategy = "beam"
|
| 26 |
+
# decoding_cfg.beam.beam_size = 128 # Larger beam for more candidates
|
| 27 |
+
# decoding_cfg.beam.return_best_hypothesis = False # Get multiple hypotheses
|
| 28 |
+
|
| 29 |
+
# # Language model parameters (if available)
|
| 30 |
+
# if hasattr(decoding_cfg.beam, 'beam_alpha'):
|
| 31 |
+
# decoding_cfg.beam.beam_alpha = 0.3 # LM weight (lower = less LM influence)
|
| 32 |
+
# if hasattr(decoding_cfg.beam, 'beam_beta'):
|
| 33 |
+
# decoding_cfg.beam.beam_beta = 0.5 # Word insertion bonus
|
| 34 |
+
|
| 35 |
+
# self.asr_model.change_decoding_strategy(decoding_cfg)
|
| 36 |
+
|
| 37 |
+
# def transcribe_with_logprobs(self, audio_file: str, temperature: float = 1.0):
|
| 38 |
+
# """
|
| 39 |
+
# Transcribe with log probabilities and temperature scaling
|
| 40 |
+
|
| 41 |
+
# Args:
|
| 42 |
+
# audio_file: Path to audio file
|
| 43 |
+
# temperature: Controls randomness (lower = more conservative, higher = more diverse)
|
| 44 |
+
# 0.5 = more deterministic
|
| 45 |
+
# 1.0 = standard
|
| 46 |
+
# 1.5 = more exploratory
|
| 47 |
+
# """
|
| 48 |
+
# print(f"🔍 Transcribing with temperature={temperature}...")
|
| 49 |
+
|
| 50 |
+
# # Update temperature in decoding config
|
| 51 |
+
# if hasattr(self.asr_model.cfg.decoding, 'temperature'):
|
| 52 |
+
# self.asr_model.cfg.decoding.temperature = temperature
|
| 53 |
+
# if hasattr(self.asr_model.cfg.decoding.beam, 'softmax_temperature'):
|
| 54 |
+
# self.asr_model.cfg.decoding.beam.softmax_temperature = temperature
|
| 55 |
+
|
| 56 |
+
# self.asr_model.change_decoding_strategy(self.asr_model.cfg.decoding)
|
| 57 |
+
|
| 58 |
+
# # Get multiple hypotheses with their scores
|
| 59 |
+
# hypotheses = self.asr_model.transcribe(
|
| 60 |
+
# [audio_file],
|
| 61 |
+
# batch_size=1,
|
| 62 |
+
# return_hypotheses=True,
|
| 63 |
+
# num_workers=0
|
| 64 |
+
# )
|
| 65 |
+
|
| 66 |
+
# # Handle different return types
|
| 67 |
+
# if isinstance(hypotheses, list) and len(hypotheses) > 0:
|
| 68 |
+
# hyp = hypotheses[0]
|
| 69 |
+
|
| 70 |
+
# # Check if it's a Hypothesis object or a list
|
| 71 |
+
# if isinstance(hyp, list):
|
| 72 |
+
# # It's already a list of transcriptions
|
| 73 |
+
# best_text = hyp[0] if len(hyp) > 0 else ""
|
| 74 |
+
# print(f"\n📊 Top hypothesis: {best_text}")
|
| 75 |
+
# return best_text
|
| 76 |
+
# elif hasattr(hyp, 'text'):
|
| 77 |
+
# # It's a Hypothesis object
|
| 78 |
+
# text = hyp.text
|
| 79 |
+
|
| 80 |
+
# # Check for nbest hypotheses
|
| 81 |
+
# if hasattr(hyp, 'nbest') and len(hyp.nbest) > 1:
|
| 82 |
+
# print(f"\n📊 Top {min(5, len(hyp.nbest))} hypotheses:")
|
| 83 |
+
# for i, nbest_hyp in enumerate(hyp.nbest[:5]):
|
| 84 |
+
# score = nbest_hyp.score if hasattr(nbest_hyp, 'score') else 'N/A'
|
| 85 |
+
# hyp_text = nbest_hyp.text if hasattr(nbest_hyp, 'text') else str(nbest_hyp)
|
| 86 |
+
# print(f" {i+1}. [{score}] {hyp_text}")
|
| 87 |
+
|
| 88 |
+
# return text
|
| 89 |
+
# else:
|
| 90 |
+
# # Fallback: convert to string
|
| 91 |
+
# return str(hyp)
|
| 92 |
+
|
| 93 |
+
# return ""
|
| 94 |
+
|
| 95 |
+
# def transcribe_with_frame_analysis(self, audio_file: str):
|
| 96 |
+
# """
|
| 97 |
+
# Analyze frame-level predictions to detect repetitions
|
| 98 |
+
# This examines the raw CTC outputs before collapsing
|
| 99 |
+
# """
|
| 100 |
+
# print("🔍 Performing frame-level analysis...")
|
| 101 |
+
|
| 102 |
+
# # Get log probabilities at frame level
|
| 103 |
+
# log_probs = self.asr_model.transcribe(
|
| 104 |
+
# [audio_file],
|
| 105 |
+
# batch_size=1,
|
| 106 |
+
# logprobs=True
|
| 107 |
+
# )
|
| 108 |
+
|
| 109 |
+
# # Standard transcription
|
| 110 |
+
# transcription = self.asr_model.transcribe([audio_file])
|
| 111 |
+
|
| 112 |
+
# return transcription[0], log_probs
|
| 113 |
+
|
| 114 |
+
# def transcribe_with_all_methods(self, audio_file: str):
|
| 115 |
+
# """Try multiple decoding strategies and return all results"""
|
| 116 |
+
# results = {}
|
| 117 |
+
|
| 118 |
+
# # Method 1: Standard beam search
|
| 119 |
+
# print("\n--- Method 1: Standard Beam Search ---")
|
| 120 |
+
# results['beam_standard'] = self.transcribe_with_logprobs(audio_file, temperature=1.0)
|
| 121 |
+
|
| 122 |
+
# # Method 2: Lower temperature (more conservative)
|
| 123 |
+
# print("\n--- Method 2: Conservative (temp=0.5) ---")
|
| 124 |
+
# results['beam_conservative'] = self.transcribe_with_logprobs(audio_file, temperature=0.5)
|
| 125 |
+
|
| 126 |
+
# # Method 3: Higher temperature (more exploratory)
|
| 127 |
+
# print("\n--- Method 3: Exploratory (temp=1.5) ---")
|
| 128 |
+
# results['beam_exploratory'] = self.transcribe_with_logprobs(audio_file, temperature=1.5)
|
| 129 |
+
|
| 130 |
+
# # Method 4: Frame-level analysis
|
| 131 |
+
# print("\n--- Method 4: Frame-level Analysis ---")
|
| 132 |
+
# results['frame_analysis'], _ = self.transcribe_with_frame_analysis(audio_file)
|
| 133 |
+
|
| 134 |
+
# return results
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
# def post_process_repetitions(text: str, audio_duration: float, expected_word_count: int = None) -> str:
|
| 138 |
+
# """
|
| 139 |
+
# Heuristic post-processing to restore repetitions
|
| 140 |
+
|
| 141 |
+
# Args:
|
| 142 |
+
# text: Transcribed text
|
| 143 |
+
# audio_duration: Duration of audio in seconds
|
| 144 |
+
# expected_word_count: Expected number of words (if known)
|
| 145 |
+
# """
|
| 146 |
+
# words = text.split()
|
| 147 |
+
|
| 148 |
+
# # Calculate speaking rate (words per second)
|
| 149 |
+
# speaking_rate = len(words) / audio_duration
|
| 150 |
+
|
| 151 |
+
# # Normal Arabic speaking rate is 2-3 words per second
|
| 152 |
+
# # For numbers, it's often slower (1-2 words per second)
|
| 153 |
+
# # If rate is too high, likely missing repetitions
|
| 154 |
+
|
| 155 |
+
# if speaking_rate > 3.0 and expected_word_count:
|
| 156 |
+
# print(f"⚠️ Speaking rate unusually high ({speaking_rate:.1f} w/s)")
|
| 157 |
+
# print(f" Expected ~{expected_word_count} words, got {len(words)}")
|
| 158 |
+
# print(" Possible missing repetitions detected")
|
| 159 |
+
|
| 160 |
+
# return text
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
# def detect_number_patterns(text: str) -> List[str]:
|
| 164 |
+
# """Detect if text contains Arabic number words"""
|
| 165 |
+
# arabic_numbers = [
|
| 166 |
+
# 'صفر', 'زيرو', 'واحد', 'اثنين', 'ثلاثة', 'أربعة',
|
| 167 |
+
# 'خمسة', 'ستة', 'سبعة', 'ثمانية', 'تسعة'
|
| 168 |
+
# ]
|
| 169 |
+
|
| 170 |
+
# words = text.split()
|
| 171 |
+
# detected = [w for w in words if w in arabic_numbers]
|
| 172 |
+
|
| 173 |
+
# if detected:
|
| 174 |
+
# print(f"🔢 Detected number words: {' '.join(detected)}")
|
| 175 |
+
|
| 176 |
+
# return detected
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
# # ===== MAIN EXECUTION =====
|
| 180 |
+
# if __name__ == "__main__":
|
| 181 |
+
# # ===== STEP 1: Record audio =====
|
| 182 |
+
# print("🎙️ Recording... Speak Arabic now!")
|
| 183 |
+
# print("💡 TIP: For repeated numbers, pause slightly between each repetition")
|
| 184 |
+
# print(" Example: 'زيرو [pause] زيرو [pause] واحد [pause] واحد'\n")
|
| 185 |
+
|
| 186 |
+
# audio = sd.rec(int(SAMPLE_RATE * DURATION), samplerate=SAMPLE_RATE, channels=1, dtype='int16')
|
| 187 |
+
# sd.wait()
|
| 188 |
+
# wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
|
| 189 |
+
# print(f"✅ Recording finished. Saved as {OUTPUT_FILE}\n")
|
| 190 |
+
|
| 191 |
+
# # ===== STEP 2: Initialize transcriber =====
|
| 192 |
+
# model_path = "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo"
|
| 193 |
+
# transcriber = RepetitionAwareTranscriber(model_path)
|
| 194 |
+
|
| 195 |
+
# # ===== STEP 3: Transcribe with all methods =====
|
| 196 |
+
# results = transcriber.transcribe_with_all_methods(OUTPUT_FILE)
|
| 197 |
+
|
| 198 |
+
# # ===== STEP 4: Display all results =====
|
| 199 |
+
# print("\n" + "="*60)
|
| 200 |
+
# print("📝 FINAL RESULTS:")
|
| 201 |
+
# print("="*60)
|
| 202 |
+
|
| 203 |
+
# for method, transcription in results.items():
|
| 204 |
+
# print(f"\n{method.upper()}:")
|
| 205 |
+
# print(f" {transcription}")
|
| 206 |
+
# detect_number_patterns(transcription)
|
| 207 |
+
|
| 208 |
+
# # ===== STEP 5: Post-processing analysis =====
|
| 209 |
+
# print("\n" + "="*60)
|
| 210 |
+
# print("🔍 POST-PROCESSING ANALYSIS:")
|
| 211 |
+
# print("="*60)
|
| 212 |
+
|
| 213 |
+
# best_transcription = results['beam_standard']
|
| 214 |
+
# processed = post_process_repetitions(best_transcription, DURATION)
|
| 215 |
+
|
| 216 |
+
# print(f"\nBest transcription: {best_transcription}")
|
| 217 |
+
# print(f"Word count: {len(best_transcription.split())}")
|
| 218 |
+
# print(f"Speaking rate: {len(best_transcription.split()) / DURATION:.2f} words/sec")
|
| 219 |
+
|
| 220 |
+
# # ===== STEP 6: Recommendations =====
|
| 221 |
+
# print("\n" + "="*60)
|
| 222 |
+
# print("💡 RECOMMENDATIONS:")
|
| 223 |
+
# print("="*60)
|
| 224 |
+
# print("1. Compare all method outputs above")
|
| 225 |
+
# print("2. If all methods miss repetitions, the issue is in the trained model")
|
| 226 |
+
# print("3. Consider retraining with more repetitive sequences in training data")
|
| 227 |
+
# print("4. When speaking, add slight pauses between repeated words")
|
| 228 |
+
# print("5. If transcribing phone numbers, use digit-by-digit model instead")
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
import sounddevice as sd
|
| 232 |
+
import scipy.io.wavfile as wav
|
| 233 |
+
import nemo.collections.asr as nemo_asr
|
| 234 |
+
import torch
|
| 235 |
+
import numpy as np
|
| 236 |
+
from typing import List, Tuple
|
| 237 |
+
|
| 238 |
+
# ===== SETTINGS =====
|
| 239 |
+
SAMPLE_RATE = 16000
|
| 240 |
+
DURATION = 10 # seconds
|
| 241 |
+
OUTPUT_FILE = "arabic_recording.wav"
|
| 242 |
+
|
| 243 |
+
class RepetitionAwareTranscriber:
|
| 244 |
+
def __init__(self, model_path: str):
|
| 245 |
+
"""Initialize ASR model with repetition-aware configuration"""
|
| 246 |
+
print("📥 Loading Arabic ASR model...")
|
| 247 |
+
# Try to load as Hybrid RNNT-CTC first (better for repetitions!)
|
| 248 |
+
try:
|
| 249 |
+
self.asr_model = nemo_asr.models.EncDecHybridRNNTCTCModel.restore_from(model_path)
|
| 250 |
+
self.model_type = "hybrid_rnnt_ctc"
|
| 251 |
+
print("✅ Loaded as Hybrid RNNT-CTC model (excellent for repetitions!)")
|
| 252 |
+
except:
|
| 253 |
+
try:
|
| 254 |
+
self.asr_model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(model_path)
|
| 255 |
+
self.model_type = "rnnt"
|
| 256 |
+
print("✅ Loaded as RNNT model")
|
| 257 |
+
except:
|
| 258 |
+
self.asr_model = nemo_asr.models.EncDecCTCModel.restore_from(model_path)
|
| 259 |
+
self.model_type = "ctc"
|
| 260 |
+
print("✅ Loaded as CTC model")
|
| 261 |
+
|
| 262 |
+
self._configure_decoding()
|
| 263 |
+
|
| 264 |
+
def _configure_decoding(self):
|
| 265 |
+
"""Configure advanced decoding strategy"""
|
| 266 |
+
decoding_cfg = self.asr_model.cfg.decoding
|
| 267 |
+
|
| 268 |
+
# Use beam search for better sequence modeling
|
| 269 |
+
decoding_cfg.strategy = "beam"
|
| 270 |
+
decoding_cfg.beam.beam_size = 128 # Larger beam for more candidates
|
| 271 |
+
decoding_cfg.beam.return_best_hypothesis = False # Get multiple hypotheses
|
| 272 |
+
|
| 273 |
+
# Language model parameters (if available)
|
| 274 |
+
if hasattr(decoding_cfg.beam, 'beam_alpha'):
|
| 275 |
+
decoding_cfg.beam.beam_alpha = 0.3 # LM weight (lower = less LM influence)
|
| 276 |
+
if hasattr(decoding_cfg.beam, 'beam_beta'):
|
| 277 |
+
decoding_cfg.beam.beam_beta = 0.5 # Word insertion bonus
|
| 278 |
+
|
| 279 |
+
self.asr_model.change_decoding_strategy(decoding_cfg)
|
| 280 |
+
|
| 281 |
+
def transcribe_with_logprobs(self, audio_file: str, temperature: float = 1.0):
|
| 282 |
+
"""
|
| 283 |
+
Transcribe with log probabilities and temperature scaling
|
| 284 |
+
|
| 285 |
+
Args:
|
| 286 |
+
audio_file: Path to audio file
|
| 287 |
+
temperature: Controls randomness (lower = more conservative, higher = more diverse)
|
| 288 |
+
0.5 = more deterministic
|
| 289 |
+
1.0 = standard
|
| 290 |
+
1.5 = more exploratory
|
| 291 |
+
"""
|
| 292 |
+
print(f"🔍 Transcribing with temperature={temperature}...")
|
| 293 |
+
|
| 294 |
+
# Update temperature in decoding config
|
| 295 |
+
if hasattr(self.asr_model.cfg.decoding, 'temperature'):
|
| 296 |
+
self.asr_model.cfg.decoding.temperature = temperature
|
| 297 |
+
if hasattr(self.asr_model.cfg.decoding.beam, 'softmax_temperature'):
|
| 298 |
+
self.asr_model.cfg.decoding.beam.softmax_temperature = temperature
|
| 299 |
+
|
| 300 |
+
self.asr_model.change_decoding_strategy(self.asr_model.cfg.decoding)
|
| 301 |
+
|
| 302 |
+
# Get multiple hypotheses with their scores
|
| 303 |
+
hypotheses = self.asr_model.transcribe(
|
| 304 |
+
[audio_file],
|
| 305 |
+
batch_size=1,
|
| 306 |
+
return_hypotheses=True,
|
| 307 |
+
num_workers=0
|
| 308 |
+
)
|
| 309 |
+
print(hypotheses)
|
| 310 |
+
# Handle different return types
|
| 311 |
+
if isinstance(hypotheses, list) and len(hypotheses) > 0:
|
| 312 |
+
hyp = hypotheses[0]
|
| 313 |
+
|
| 314 |
+
# Check if it's a Hypothesis object or a list
|
| 315 |
+
if isinstance(hyp, list):
|
| 316 |
+
# It's already a list of transcriptions
|
| 317 |
+
best_text = hyp[0] if len(hyp) > 0 else ""
|
| 318 |
+
print(f"\n📊 Top hypothesis: {best_text}")
|
| 319 |
+
return best_text
|
| 320 |
+
elif hasattr(hyp, 'text'):
|
| 321 |
+
# It's a Hypothesis object
|
| 322 |
+
text = hyp.text
|
| 323 |
+
|
| 324 |
+
# Check for nbest hypotheses
|
| 325 |
+
if hasattr(hyp, 'nbest') and len(hyp.nbest) > 1:
|
| 326 |
+
print(f"\n📊 Top {min(5, len(hyp.nbest))} hypotheses:")
|
| 327 |
+
for i, nbest_hyp in enumerate(hyp.nbest[:5]):
|
| 328 |
+
score = nbest_hyp.score if hasattr(nbest_hyp, 'score') else 'N/A'
|
| 329 |
+
hyp_text = nbest_hyp.text if hasattr(nbest_hyp, 'text') else str(nbest_hyp)
|
| 330 |
+
print(f" {i+1}. [{score}] {hyp_text}")
|
| 331 |
+
|
| 332 |
+
return text
|
| 333 |
+
else:
|
| 334 |
+
# Fallback: convert to string
|
| 335 |
+
return str(hyp)
|
| 336 |
+
|
| 337 |
+
return ""
|
| 338 |
+
|
| 339 |
+
def transcribe_with_frame_analysis(self, audio_file: str):
    """
    Run the model twice on one file: once asking for log probabilities,
    once for the ordinary transcription.

    No repetition detection happens here; the raw outputs are handed back
    so the caller can inspect them.

    Args:
        audio_file: Path of the audio file to analyze.

    Returns:
        A ``(text, frame_output)`` tuple, where ``text`` is the first entry
        of the plain ``transcribe`` result and ``frame_output`` is whatever
        ``transcribe`` returned when called with ``logprobs=True``.
    """
    print("🔍 Performing frame-level analysis...")

    model = self.asr_model

    # Pass 1: request frame-level log probabilities.
    # NOTE(review): assumes the installed model's transcribe() accepts a
    # `logprobs` keyword - confirm against the library version in use.
    frame_output = model.transcribe([audio_file], batch_size=1, logprobs=True)

    # Pass 2: ordinary decoding with the model's default arguments.
    plain_output = model.transcribe([audio_file])

    return plain_output[0], frame_output
|
| 357 |
+
|
| 358 |
+
def transcribe_with_all_methods(self, audio_file: str):
    """
    Decode the same file at several temperatures and collect every result.

    Args:
        audio_file: Path of the audio file to transcribe.

    Returns:
        Dict with the keys ``'beam_standard'``, ``'beam_conservative'`` and
        ``'beam_exploratory'`` (in that order), each mapped to the value
        returned by ``transcribe_with_logprobs`` for that temperature.
    """
    # (result key, banner printed before decoding, temperature)
    strategies = (
        ('beam_standard', "\n--- Method 1: Standard Beam Search ---", 1.0),
        ('beam_conservative', "\n--- Method 2: Conservative (temp=0.5) ---", 0.5),
        ('beam_exploratory', "\n--- Method 3: Exploratory (temp=1.5) ---", 1.5),
    )

    results = {}
    for key, banner, temp in strategies:
        print(banner)
        outcome = self.transcribe_with_logprobs(audio_file, temperature=temp)
        results[key] = outcome
        print(f"Results with Temp {temp} : {outcome}")

    # Frame-level analysis (transcribe_with_frame_analysis) is not part of
    # this comparison.
    return results
|
| 380 |
+
|
| 381 |
+
|
| 382 |
+
def post_process_repetitions(text: str, audio_duration: float, expected_word_count: int = None) -> str:
    """
    Heuristic speaking-rate check for transcriptions that may have lost
    repeated words.

    The text is never modified. The function only prints a warning when the
    measured rate exceeds 3 words per second and an expected word count was
    supplied.

    Args:
        text: Transcribed text
        audio_duration: Duration of audio in seconds
        expected_word_count: Expected number of words (if known)

    Returns:
        ``text`` unchanged.
    """
    # A missing, zero or negative duration has no meaningful speaking rate;
    # skip the analysis instead of raising ZeroDivisionError.
    if not audio_duration or audio_duration <= 0:
        return text

    word_count = len(text.split())

    # Speaking rate in words per second.
    speaking_rate = word_count / audio_duration

    # Normal Arabic speaking rate is 2-3 words per second; for numbers it is
    # often slower (1-2 words per second). A rate above that range is
    # treated as suspicious.
    if speaking_rate > 3.0 and expected_word_count:
        print(f"⚠️ Speaking rate unusually high ({speaking_rate:.1f} w/s)")
        print(f" Expected ~{expected_word_count} words, got {word_count}")
        print(" Possible missing repetitions detected")

    return text
|
| 406 |
+
|
| 407 |
+
|
| 408 |
+
# Arabic digit words (zero through nine, including the loanword for zero).
# Kept as a frozenset so membership tests are O(1) and the collection is
# built once rather than on every call.
_ARABIC_NUMBER_WORDS = frozenset({
    'صفر', 'زيرو', 'واحد', 'اثنين', 'ثلاثة', 'أربعة',
    'خمسة', 'ستة', 'سبعة', 'ثمانية', 'تسعة',
})

# Punctuation (Latin and Arabic) that may be attached to a token.
_TOKEN_PUNCTUATION = ".,!?;:\"'()[]{}،؛؟"


def detect_number_patterns(text: str) -> List[str]:
    """
    Return the Arabic digit words found in ``text``, in order of appearance.

    Tokens are split on whitespace and stripped of surrounding punctuation
    before matching, so a word followed by a comma or full stop is still
    recognized. When at least one word is found, a summary line is printed.

    Args:
        text: Transcribed text to scan.

    Returns:
        The matched digit words (duplicates preserved); empty if none.
    """
    tokens = (raw.strip(_TOKEN_PUNCTUATION) for raw in text.split())
    detected = [token for token in tokens if token in _ARABIC_NUMBER_WORDS]

    if detected:
        print(f"🔢 Detected number words: {' '.join(detected)}")

    return detected
|
| 422 |
+
|
| 423 |
+
|
| 424 |
+
# ===== MAIN EXECUTION =====
|
| 425 |
+
if __name__ == "__main__":
    # Script entry point: record a clip, transcribe it with every decoding
    # strategy, then print the results and some simple diagnostics.
    # Optional usage: pass the path of a .nemo checkpoint as the first
    # command-line argument to override the default model location.
    import sys

    # ===== STEP 1: Record audio =====
    print("🎙️ Recording... Speak Arabic now!")
    print("💡 TIP: For repeated numbers, pause slightly between each repetition")
    print(" Example: 'زيرو [pause] زيرو [pause] واحد [pause] واحد'\n")

    audio = sd.rec(int(SAMPLE_RATE * DURATION), samplerate=SAMPLE_RATE, channels=1, dtype='int16')
    sd.wait()  # block until the recording buffer is full
    wav.write(OUTPUT_FILE, SAMPLE_RATE, audio)
    print(f"✅ Recording finished. Saved as {OUTPUT_FILE}\n")

    # ===== STEP 2: Initialize transcriber =====
    # The default points at the original author's local checkpoint; it is
    # kept for backward compatibility but can be overridden from the CLI so
    # the script runs on other machines.
    default_model_path = "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Arabic_Contextual_ASR/PreparingDatasetStreamlitApp/4_Finetuning_Nemo_ASR_arabic_names_and_complaints_for_phones/output_finetuned/finetuned_model_best.nemo"
    model_path = sys.argv[1] if len(sys.argv) > 1 else default_model_path
    transcriber = RepetitionAwareTranscriber(model_path)

    # ===== STEP 3: Transcribe with all methods =====
    results = transcriber.transcribe_with_all_methods(OUTPUT_FILE)

    # ===== STEP 4: Display all results =====
    print("\n" + "="*60)
    print("📝 FINAL RESULTS:")
    print("="*60)

    for method, transcription in results.items():
        print(f"\n{method.upper()}:")
        print(f" {transcription}")
        detect_number_patterns(transcription)

    # ===== STEP 5: Post-processing analysis =====
    print("\n" + "="*60)
    print("🔍 POST-PROCESSING ANALYSIS:")
    print("="*60)

    best_transcription = results['beam_standard']
    # Called for its diagnostic output only; the returned text is the input
    # unchanged, so it is not bound to a name.
    post_process_repetitions(best_transcription, DURATION)

    word_count = len(best_transcription.split())
    print(f"\nBest transcription: {best_transcription}")
    print(f"Word count: {word_count}")
    print(f"Speaking rate: {word_count / DURATION:.2f} words/sec")

    # ===== STEP 6: Recommendations =====
    print("\n" + "="*60)
    print("💡 RECOMMENDATIONS:")
    print("="*60)
    print("1. Compare all method outputs above")
    print("2. If all methods miss repetitions, the issue is in the trained model")
    print("3. Consider retraining with more repetitive sequences in training data")
    print("4. When speaking, add slight pauses between repeated words")
    print("5. If transcribing phone numbers, use digit-by-digit model instead")
|
train_manifest.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
train_split.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|