"""
Run this script FIRST to extract the tokenizer from the .nemo file
This creates the tokenizer folder that the training script needs
"""
import os
import tarfile
import zipfile
import shutil
MODEL_PATH = "stt_ar_fastconformer_hybrid_large_pcd_v1.0.nemo"
OUTPUT_DIR = "tokenizer"
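# Added guard (a small sketch, not in the original flow): fail fast with a clear
# message if the .nemo file is missing, instead of surfacing four format errors below.
if not os.path.isfile(MODEL_PATH):
    raise FileNotFoundError(f"Model file not found: {MODEL_PATH}")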
print("🔹 Detecting .nemo file format...")
def try_extract_tokenizer():
    """Try different methods to extract the tokenizer, in order of likelihood."""
    # Method 1: Try as a regular (uncompressed) tar
    try:
        print("Trying: Regular tar format...")
        with tarfile.open(MODEL_PATH, 'r:') as tar:
            return extract_from_tar(tar)
    except Exception as e:
        print(f" ✗ Not a regular tar: {e}")
    # Method 2: Try as a gzipped tar
    try:
        print("Trying: Gzipped tar format...")
        with tarfile.open(MODEL_PATH, 'r:gz') as tar:
            return extract_from_tar(tar)
    except Exception as e:
        print(f" ✗ Not a gzipped tar: {e}")
    # Method 3: Try as a ZIP file
    try:
        print("Trying: ZIP format...")
        with zipfile.ZipFile(MODEL_PATH, 'r') as zf:
            return extract_from_zip(zf)
    except Exception as e:
        print(f" ✗ Not a ZIP file: {e}")
    # Method 4: Let tarfile auto-detect the compression
    try:
        print("Trying: Auto-detect format...")
        with tarfile.open(MODEL_PATH, 'r:*') as tar:
            return extract_from_tar(tar)
    except Exception as e:
        print(f" ✗ Auto-detect failed: {e}")
    return False
def extract_from_tar(tar):
    """Extract tokenizer files from a tar archive."""
    tokenizer_files = [m for m in tar.getmembers() if 'tokenizer' in m.name.lower()]
    if not tokenizer_files:
        print("\n📋 Available files in archive:")
        for member in tar.getmembers()[:20]:  # Show the first 20 entries
            print(f" - {member.name}")
        if len(tar.getmembers()) > 20:
            print(f" ... and {len(tar.getmembers()) - 20} more files")
        return False
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    for member in tokenizer_files:
        # Extract to a temporary directory first
        tar.extract(member, path="temp_extract")
        # Then copy the file into the tokenizer directory
        src = os.path.join("temp_extract", member.name)
        if os.path.isfile(src):
            dst = os.path.join(OUTPUT_DIR, os.path.basename(member.name))
            shutil.copy2(src, dst)
            print(f"✅ Extracted: {os.path.basename(member.name)}")
    # Clean up the temporary directory
    if os.path.exists("temp_extract"):
        shutil.rmtree("temp_extract")
    return True
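# Note (hedged): on Python 3.12+ (and recent 3.8-3.11 security releases), tarfile
# warns unless an extraction filter is set; the documented 'data' filter is the
# safe choice where available:
#     tar.extract(member, path="temp_extract", filter="data")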
def extract_from_zip(zf):
    """Extract tokenizer files from a zip archive."""
    tokenizer_files = [n for n in zf.namelist() if 'tokenizer' in n.lower()]
    if not tokenizer_files:
        print("\n📋 Available files in archive:")
        for name in zf.namelist()[:20]:  # Show the first 20 entries
            print(f" - {name}")
        if len(zf.namelist()) > 20:
            print(f" ... and {len(zf.namelist()) - 20} more files")
        return False
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    for name in tokenizer_files:
        # Extract to a temporary directory first
        zf.extract(name, path="temp_extract")
        # Then copy the file into the tokenizer directory
        src = os.path.join("temp_extract", name)
        if os.path.isfile(src):
            dst = os.path.join(OUTPUT_DIR, os.path.basename(name))
            shutil.copy2(src, dst)
            print(f"✅ Extracted: {os.path.basename(name)}")
    # Clean up the temporary directory
    if os.path.exists("temp_extract"):
        shutil.rmtree("temp_extract")
    return True
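def verify_tokenizer(tokenizer_dir=OUTPUT_DIR):
    """Optional sanity check (a sketch, not part of the original flow): confirm the
    extracted SentencePiece model loads. Assumes the archive contains a SentencePiece
    'tokenizer.model' file and that the 'sentencepiece' package is installed; call it
    by hand after a successful run if desired."""
    try:
        import sentencepiece as spm
    except ImportError:
        print(" (sentencepiece not installed - skipping verification)")
        return
    model_file = os.path.join(tokenizer_dir, "tokenizer.model")
    if os.path.isfile(model_file):
        sp = spm.SentencePieceProcessor(model_file=model_file)
        print(f"✅ Tokenizer loads - vocab size: {sp.vocab_size()}")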
# Try extraction
success = try_extract_tokenizer()
if success:
    print(f"\n✅ Tokenizer extracted to: {OUTPUT_DIR}")
    print("\n📁 Tokenizer files:")
    for file in os.listdir(OUTPUT_DIR):
        print(f" - {file}")
    print("\n✅ Now you can run the training script!")
else:
    print("\n❌ Could not extract tokenizer from .nemo file")
    print("\n🔧 Alternative solution: the training script will use the embedded tokenizer")
    print(" No action needed - proceed with training!")