"""
Run this script FIRST to extract the tokenizer from the .nemo file.

This creates the tokenizer folder that the training script needs.
"""

import os
import tarfile
import zipfile
import shutil

MODEL_PATH = "stt_ar_fastconformer_hybrid_large_pcd_v1.0.nemo"
OUTPUT_DIR = "tokenizer"

# Maximum number of archive entries listed when no tokenizer file is found.
_MAX_LISTED = 20


def _print_archive_listing(names):
    """Print the first ``_MAX_LISTED`` entry names and a count of the rest."""
    print("\n📋 Available files in archive:")
    for name in names[:_MAX_LISTED]:
        print(f" - {name}")
    if len(names) > _MAX_LISTED:
        print(f" ... and {len(names) - _MAX_LISTED} more files")


def _save_stream(stream, member_name):
    """Copy an open binary stream to ``OUTPUT_DIR/<basename of member_name>``.

    Only the basename is used, so a path stored in the archive (absolute or
    containing ``..``) can never steer the write outside ``OUTPUT_DIR``.
    Members that share a basename overwrite each other; the last one wins.
    """
    base = os.path.basename(member_name)
    with open(os.path.join(OUTPUT_DIR, base), "wb") as out:
        shutil.copyfileobj(stream, out)
    print(f"✅ Extracted: {base}")


def try_extract_tokenizer():
    """Try each supported container format in turn to extract the tokenizer.

    The formats are attempted in this order: uncompressed tar, gzipped tar,
    zip, then tarfile's transparent-compression auto-detect.

    Returns:
        bool: the result of the first extractor whose archive opened and
        processed without raising; ``False`` if ``MODEL_PATH`` does not exist
        or every attempt raised.
    """
    if not os.path.isfile(MODEL_PATH):
        # Without this check every attempt below fails with the same
        # "No such file" error, reported as if the format were wrong.
        print(f" ✗ Model file not found: {MODEL_PATH}")
        return False

    attempts = (
        ("Regular tar format", "Not a regular tar",
         lambda: tarfile.open(MODEL_PATH, 'r:'), extract_from_tar),
        ("Gzipped tar format", "Not gzipped tar",
         lambda: tarfile.open(MODEL_PATH, 'r:gz'), extract_from_tar),
        ("ZIP format", "Not a ZIP file",
         lambda: zipfile.ZipFile(MODEL_PATH, 'r'), extract_from_zip),
        ("Auto-detect format", "Auto-detect failed",
         lambda: tarfile.open(MODEL_PATH, 'r:*'), extract_from_tar),
    )

    for label, failure, open_archive, extract in attempts:
        try:
            print(f"Trying: {label}...")
            with open_archive() as archive:
                return extract(archive)
        except Exception as e:
            # Deliberately broad: this is a best-effort fallback chain, so
            # any failure simply means "move on to the next format".
            print(f" ✗ {failure}: {e}")

    return False


def extract_from_tar(tar):
    """Extract tokenizer files from an open tar archive into ``OUTPUT_DIR``.

    A member is selected when it is a regular file and its path contains
    "tokenizer" (case-insensitive). Directories, links and special files are
    skipped. Data is streamed directly to the destination, so no scratch
    directory is created and archive-supplied paths are never written to.

    Args:
        tar: an open ``tarfile.TarFile``.

    Returns:
        bool: ``True`` if at least one file was written. ``False`` if no
        regular file matched, in which case the archive contents are listed.
    """
    members = tar.getmembers()
    tokenizer_files = [
        m for m in members if m.isfile() and 'tokenizer' in m.name.lower()
    ]

    if not tokenizer_files:
        _print_archive_listing([m.name for m in members])
        return False

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    for member in tokenizer_files:
        with tar.extractfile(member) as stream:
            _save_stream(stream, member.name)
    return True


def extract_from_zip(zf):
    """Extract tokenizer files from an open zip archive into ``OUTPUT_DIR``.

    An entry is selected when it is not a directory and its path contains
    "tokenizer" (case-insensitive). Data is streamed directly to the
    destination, so no scratch directory is created.

    Args:
        zf: an open ``zipfile.ZipFile``.

    Returns:
        bool: ``True`` if at least one file was written. ``False`` if no
        file entry matched, in which case the archive contents are listed.
    """
    entries = zf.infolist()
    tokenizer_files = [
        info for info in entries
        if not info.is_dir() and 'tokenizer' in info.filename.lower()
    ]

    if not tokenizer_files:
        _print_archive_listing([info.filename for info in entries])
        return False

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    for info in tokenizer_files:
        with zf.open(info) as stream:
            _save_stream(stream, info.filename)
    return True


def main():
    """Run the extraction and report the outcome on stdout."""
    print("🔹 Detecting .nemo file format...")

    success = try_extract_tokenizer()

    if success:
        print(f"\n✅ Tokenizer extracted to: {OUTPUT_DIR}")
        print("\n📁 Tokenizer files:")
        for entry in sorted(os.listdir(OUTPUT_DIR)):
            print(f" - {entry}")
        print("\n✅ Now you can run the training script!")
    else:
        print("\n❌ Could not extract tokenizer from .nemo file")
        print("\n🔧 Alternative solution: The training script will use the embedded tokenizer")
        print(" No action needed - proceed with training!")


if __name__ == "__main__":
    main()