|
|
"""
|
|
|
Run this script FIRST to extract the tokenizer from the .nemo file
|
|
|
This creates the tokenizer folder that the training script needs
|
|
|
"""
|
|
|
import os
|
|
|
import tarfile
|
|
|
import zipfile
|
|
|
import shutil
|
|
|
|
|
|
# Archive the tokenizer is read from; a relative path is resolved against the
# current working directory.
MODEL_PATH = "stt_ar_fastconformer_hybrid_large_pcd_v1.0.nemo"

# Folder that receives the extracted tokenizer files.
OUTPUT_DIR = "tokenizer"

print("🔹 Detecting .nemo file format...")
|
|
|
|
|
|
def try_extract_tokenizer(model_path=None):
    """Try each supported archive format until the tokenizer is extracted.

    The file is opened, in order, as a plain tar, a gzipped tar, a ZIP file,
    and finally with tarfile's transparent compression detection. The first
    format that opens hands the archive to ``extract_from_tar`` or
    ``extract_from_zip`` and that helper's result is returned as-is (so a
    readable archive without tokenizer files ends the search with ``False``).

    Args:
        model_path: Path of the archive to read. Defaults to the module-level
            ``MODEL_PATH`` when omitted.

    Returns:
        The extraction helper's result for the first format that works, or
        ``False`` when the file does not exist or no format succeeds.
    """
    if model_path is None:
        model_path = MODEL_PATH

    # Without this guard a missing file is reported four times as a
    # "wrong format" error, which hides the real problem.
    if not os.path.isfile(model_path):
        print(f"❌ Model file not found: {model_path}")
        return False

    def _from_tar(mode):
        with tarfile.open(model_path, mode) as archive:
            return extract_from_tar(archive)

    def _from_zip():
        with zipfile.ZipFile(model_path, 'r') as archive:
            return extract_from_zip(archive)

    # (progress message, failure label, attempt) in the order they are tried.
    attempts = (
        ("Trying: Regular tar format...", " ✗ Not a regular tar",
         lambda: _from_tar('r:')),
        ("Trying: Gzipped tar format...", " ✗ Not gzipped tar",
         lambda: _from_tar('r:gz')),
        ("Trying: ZIP format...", " ✗ Not a ZIP file", _from_zip),
        ("Trying: Auto-detect format...", " ✗ Auto-detect failed",
         lambda: _from_tar('r:*')),
    )

    for banner, failure_label, attempt in attempts:
        print(banner)
        try:
            return attempt()
        except Exception as exc:
            # Deliberately broad: extraction is best-effort, and any failure
            # simply moves on to the next candidate format.
            print(f"{failure_label}: {exc}")

    return False
|
|
|
|
|
|
def extract_from_tar(tar, output_dir=None):
    """Copy the tokenizer files of an open tar archive into a flat folder.

    A member counts as a tokenizer file when it is a regular file whose
    archive path contains ``tokenizer`` (case-insensitive). Each match is
    streamed straight from the archive to ``output_dir/<basename>``, so no
    temporary directory is created and a member path such as ``../x`` cannot
    write outside ``output_dir``. Members sharing a basename overwrite each
    other, the last one winning.

    Args:
        tar: An open, readable ``tarfile.TarFile``.
        output_dir: Destination folder, created if needed. Defaults to the
            module-level ``OUTPUT_DIR`` when omitted.

    Returns:
        ``True`` when at least one tokenizer file was copied. ``False`` when
        the archive holds none; in that case up to 20 member names are
        printed to help diagnose the archive layout and nothing is written.
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR

    members = tar.getmembers()
    # Only regular files: a matching directory entry has nothing to copy.
    tokenizer_files = [
        member for member in members
        if member.isfile() and 'tokenizer' in member.name.lower()
    ]

    if not tokenizer_files:
        print("\n📋 Available files in archive:")
        for member in members[:20]:
            print(f" - {member.name}")
        if len(members) > 20:
            print(f" ... and {len(members) - 20} more files")
        return False

    os.makedirs(output_dir, exist_ok=True)

    for member in tokenizer_files:
        filename = os.path.basename(member.name)
        dst = os.path.join(output_dir, filename)
        with tar.extractfile(member) as src, open(dst, "wb") as out:
            shutil.copyfileobj(src, out)
        print(f"✅ Extracted: {filename}")

    return True
|
|
|
|
|
|
def extract_from_zip(zf, output_dir=None):
    """Copy the tokenizer files of an open ZIP archive into a flat folder.

    An entry counts as a tokenizer file when it is not a directory and its
    archive path contains ``tokenizer`` (case-insensitive). Each match is
    streamed straight from the archive to ``output_dir/<basename>``, so no
    temporary directory is created and an entry path such as ``../x`` cannot
    write outside ``output_dir``. Entries sharing a basename overwrite each
    other, the last one winning.

    Args:
        zf: An open, readable ``zipfile.ZipFile``.
        output_dir: Destination folder, created if needed. Defaults to the
            module-level ``OUTPUT_DIR`` when omitted.

    Returns:
        ``True`` when at least one tokenizer file was copied. ``False`` when
        the archive holds none; in that case up to 20 entry names are
        printed to help diagnose the archive layout and nothing is written.
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR

    entries = zf.infolist()
    # Skip directory entries: they match by name but have nothing to copy.
    tokenizer_files = [
        entry for entry in entries
        if not entry.is_dir() and 'tokenizer' in entry.filename.lower()
    ]

    if not tokenizer_files:
        print("\n📋 Available files in archive:")
        for entry in entries[:20]:
            print(f" - {entry.filename}")
        if len(entries) > 20:
            print(f" ... and {len(entries) - 20} more files")
        return False

    os.makedirs(output_dir, exist_ok=True)

    for entry in tokenizer_files:
        filename = os.path.basename(entry.filename)
        dst = os.path.join(output_dir, filename)
        with zf.open(entry) as src, open(dst, "wb") as out:
            shutil.copyfileobj(src, out)
        print(f"✅ Extracted: {filename}")

    return True
|
|
|
|
|
|
|
|
|
# Run the extraction at import/execution time and report the outcome.
success = try_extract_tokenizer()

if success:
    print(f"\n✅ Tokenizer extracted to: {OUTPUT_DIR}")
    print("\n📁 Tokenizer files:")
    # Sorted so the listing is stable across platforms and runs.
    for filename in sorted(os.listdir(OUTPUT_DIR)):
        print(f" - {filename}")
    print("\n✅ Now you can run the training script!")
else:
    print("\n❌ Could not extract tokenizer from .nemo file")
    print("\n🔧 Alternative solution: The training script will use the embedded tokenizer")
    print(" No action needed - proceed with training!")