Sarthak committed on
Commit
729d700
·
1 Parent(s): 93151b9

refactor(distiller): improve beam distillation and tokenlearn integration

Browse files

This commit introduces separate Beam functions for distillation and training, enabling more modular and controllable workflows. It also enhances tokenlearn integration by using persistent directories for caching and checkpointing, and improves error handling for training failures.

The changes also include validation of the model to check that the vocabulary and embedding sizes match, which can highlight issues in downstream usage.

Files changed (2) hide show
  1. patches/tokenlearn.patch +25 -0
  2. src/distiller/distill.py +381 -256
patches/tokenlearn.patch ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ --- a/tokenlearn/pretrain.py
2
+ +++ b/tokenlearn/pretrain.py
3
+ @@ -38,7 +38,10 @@ class FinetunableStaticModel(nn.Module):
4
+ """Run the model using input IDs."""
5
+ input_ids = input_ids.view(-1)
6
+ input_ids = input_ids[input_ids != self.pad_token_id]
7
+ - w = self.w[input_ids]
8
+ + # Fix for index out of bounds issue
9
+ + # Clamp input_ids to valid range to prevent IndexError during training
10
+ + valid_input_ids = torch.clamp(input_ids, 0, self.w.shape[0] - 1)
11
+ + w = self.w[valid_input_ids]
12
+ return self.sub_forward(w)
13
+
14
+ def forward(self, x):
15
+ @@ -46,7 +49,10 @@ class FinetunableStaticModel(nn.Module):
16
+ # Add a small epsilon to avoid division by zero
17
+ length = zeros.sum(1) + 1e-16
18
+ - embedded = self.embeddings(input_ids)
19
+ + # Fix for embedding index out of bounds issue
20
+ + # Clamp input_ids to valid embedding range
21
+ + valid_input_ids = torch.clamp(input_ids, 0, self.embeddings.num_embeddings - 1)
22
+ + embedded = self.embeddings(valid_input_ids)
23
+ # Zero out the padding
24
+ embedded = torch.bmm(w[:, None, :], embedded).squeeze(1)
25
+ # Simulate actual mean
src/distiller/distill.py CHANGED
@@ -49,6 +49,7 @@ from .config import (
49
  directories,
50
  distillation_config,
51
  get_distillation_function_kwargs,
 
52
  get_volume_config,
53
  languages_config,
54
  )
@@ -358,6 +359,21 @@ def simple_distillation(
358
 
359
  logger.info("✅ Core distillation completed successfully")
360
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
  # Save the model
362
  model.save_pretrained(str(output_path))
363
  logger.info(f"💾 Model saved to {output_path}")
@@ -772,7 +788,11 @@ def apply_post_training_regularization(
772
  logger.info(f"🔄 Applying PCA with {pca_dims} dimensions...")
773
 
774
  # Get current embeddings
775
- embeddings = model.embedding.cpu().numpy().astype(np.float64)
 
 
 
 
776
  original_shape = embeddings.shape
777
  logger.info(f"Original embedding shape: {original_shape}")
778
 
@@ -846,229 +866,288 @@ def tokenlearn_training(
846
  4. Post-training re-regularization (PCA + SIF weighting)
847
  """
848
  import subprocess
849
- import tempfile
850
  from pathlib import Path
851
 
852
  logger.info("🧪 Starting tokenlearn training (POTION approach)...")
853
 
854
- # Create temporary directories for tokenlearn workflow
855
- with tempfile.TemporaryDirectory() as temp_dir:
856
- temp_path = Path(temp_dir)
857
- features_dir = temp_path / "features"
858
- model_dir = temp_path / "base_model"
859
- trained_dir = temp_path / "trained_model"
860
-
861
- features_dir.mkdir(exist_ok=True)
862
- model_dir.mkdir(exist_ok=True)
863
- trained_dir.mkdir(exist_ok=True)
864
-
865
- # Save the base distilled model for tokenlearn
866
- student_model.save_pretrained(str(model_dir))
867
- logger.info(f"💾 Saved base model to {model_dir}")
868
-
869
- # Step 2: Create features using sentence transformer
870
- logger.info("🔍 Step 2: Creating features using sentence transformer...")
871
-
872
- # Get teacher model name/path for tokenlearn
873
- teacher_model_name = getattr(teacher_model, "model_name", None)
874
- if not teacher_model_name and hasattr(teacher_model, "_modules") and len(teacher_model._modules) > 0: # noqa: SLF001
875
- # Try to extract from the first module if it's a SentenceTransformer
876
- # _modules is a dict-like container, get the first module by iterating
877
- first_module = next(iter(teacher_model._modules.values())) # noqa: SLF001
878
- if hasattr(first_module, "auto_model") and hasattr(first_module.auto_model, "name_or_path"):
879
- teacher_model_name = first_module.auto_model.name_or_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
880
 
881
  logger.info(f"📊 Using teacher model: {teacher_model_name}")
882
 
883
- # Check if featurization already completed (checkpoint detection)
884
- featurization_complete_marker = features_dir / ".featurization_complete"
885
- if featurization_complete_marker.exists() and verify_featurization_output(features_dir):
886
- logger.info("✅ Found existing featurization checkpoint with valid output files")
887
- logger.info(f"📂 Using cached features from: {features_dir}")
888
-
889
- # Verify marker is still valid
890
- output_files = list(features_dir.glob("*.npy")) + list(features_dir.glob("*.json"))
891
- logger.info(f"📁 Found {len(output_files)} cached feature files")
892
- else:
893
- if featurization_complete_marker.exists():
894
- logger.warning("⚠️ Featurization marker exists but output files are missing - re-running featurization")
895
- featurization_complete_marker.unlink()
896
- logger.info("🔄 No valid featurization checkpoint found - starting featurization...")
 
 
 
 
 
 
 
897
 
898
- if not teacher_model_name:
899
- logger.warning("⚠️ Could not determine teacher model name, using fallback")
900
- teacher_model_name = "BAAI/bge-base-en-v1.5" # Fallback to a common model
 
 
 
 
 
 
 
 
 
 
 
901
 
902
- logger.info(f"📊 Using teacher model: {teacher_model_name}")
 
 
 
 
903
 
904
- try:
905
- # Use configured dataset for code specialization
906
- featurize_cmd = [
907
- "python",
908
- "-m",
909
- "tokenlearn.featurize",
910
- "--model-name",
911
- str(teacher_model_name),
912
- "--output-dir",
913
- str(features_dir),
914
- "--dataset-path",
915
- str(distillation_config.tokenlearn_dataset),
916
- "--dataset-name",
917
- str(distillation_config.tokenlearn_dataset_name),
918
- "--dataset-split",
919
- "train",
920
- "--key",
921
- str(distillation_config.tokenlearn_text_key), # Use configured text field
922
- "--batch-size",
923
- "1024", # Optimized batch size for A100-40G
924
- ]
925
-
926
- logger.info("🔄 Running tokenlearn featurization...")
927
- logger.info(
928
- f"📊 Dataset: {distillation_config.tokenlearn_dataset} (config: {distillation_config.tokenlearn_dataset_name})"
929
- )
930
- logger.info(f"📝 Text field: {distillation_config.tokenlearn_text_key}")
931
- logger.info(f"Command: {' '.join(featurize_cmd)}")
932
- print(f"\n🔄 Executing: {' '.join(featurize_cmd)}\n")
933
-
934
- result = subprocess.run( # noqa: S603
935
- featurize_cmd,
936
- text=True,
937
- timeout=distillation_config.tokenlearn_timeout_featurize,
938
- check=False,
939
- )
940
 
941
- if result.returncode != 0:
942
- logger.error(f"❌ Featurization failed with return code: {result.returncode}")
943
- logger.error("💥 Tokenlearn featurization is required for training - cannot proceed")
944
- msg = f"Tokenlearn featurization failed with return code: {result.returncode}"
945
- raise RuntimeError(msg)
946
 
947
- logger.info("✅ Featurization completed successfully")
 
 
948
 
949
- # Create checkpoint marker to indicate featurization is complete
950
- featurization_complete_marker.touch()
951
- logger.info(f"💾 Created featurization checkpoint: {featurization_complete_marker}")
 
 
952
 
953
- # Generate token frequencies for post-training re-regularization
954
- logger.info("📊 Computing token frequencies for SIF weighting...")
955
- compute_token_frequencies_for_sif(teacher_model, features_dir)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
956
 
957
- except Exception as e:
958
- logger.exception("💥 Tokenlearn featurization failed")
959
- logger.exception("💥 Tokenlearn featurization is required for training - cannot proceed")
960
- msg = f"Tokenlearn featurization failed: {e}"
961
- raise RuntimeError(msg) from e
962
-
963
- # Step 3: Train using tokenlearn-train
964
- logger.info("🎓 Step 3: Training using tokenlearn...")
965
-
966
- # Check if training already completed (checkpoint detection)
967
- training_complete_marker = trained_dir / ".training_complete"
968
- if training_complete_marker.exists() and verify_training_output(trained_dir):
969
- logger.info("✅ Found existing training checkpoint with valid model files")
970
- logger.info(f"📂 Using cached trained model from: {trained_dir}")
971
-
972
- # Show available model files
973
- model_files = []
974
- for pattern in ["*.json", "*.safetensors", "*.bin"]:
975
- model_files.extend(list(trained_dir.glob(pattern)))
976
- for subdir in ["model", "model_weighted"]:
977
- subdir_path = trained_dir / subdir
978
- if subdir_path.exists():
979
- model_files.extend(list(subdir_path.glob(pattern)))
980
- logger.info(f"📁 Found {len(model_files)} cached model files")
981
- else:
982
- if training_complete_marker.exists():
983
- logger.warning("⚠️ Training marker exists but model files are missing - re-running training")
984
- training_complete_marker.unlink()
985
- logger.info("🔄 No valid training checkpoint found - starting training...")
986
 
987
- try:
988
- train_cmd = [
989
- "python",
990
- "-m",
991
- "tokenlearn.train",
992
- "--model-name",
993
- str(teacher_model_name),
994
- "--data-path",
995
- str(features_dir),
996
- "--save-path",
997
- str(trained_dir),
998
- ]
999
-
1000
- logger.info("🔄 Running tokenlearn training...")
1001
- logger.info(f"Command: {' '.join(train_cmd)}")
1002
- print(f"\n🎓 Executing: {' '.join(train_cmd)}\n")
1003
-
1004
- result = subprocess.run( # noqa: S603
1005
- train_cmd,
1006
- text=True,
1007
- timeout=distillation_config.tokenlearn_timeout_train,
1008
- check=False,
1009
- )
1010
 
1011
- if result.returncode != 0:
1012
- logger.error(f"❌ Tokenlearn training failed with return code: {result.returncode}")
1013
- logger.error("💥 Tokenlearn training is required - cannot proceed")
1014
- msg = f"Tokenlearn training failed with return code: {result.returncode}"
1015
- raise RuntimeError(msg)
 
 
1016
 
1017
- logger.info("✅ Tokenlearn training completed successfully")
 
1018
 
1019
- # Create checkpoint marker to indicate training is complete
1020
- training_complete_marker.touch()
1021
- logger.info(f"💾 Created training checkpoint: {training_complete_marker}")
 
 
1022
 
1023
- except Exception as e:
1024
- logger.exception("💥 Tokenlearn training failed")
1025
- logger.exception("💥 Tokenlearn training is required - cannot proceed")
1026
- msg = f"Tokenlearn training failed: {e}"
1027
- raise RuntimeError(msg) from e
1028
 
1029
- # Step 4: Load the trained model and apply post-training re-regularization
1030
- logger.info("📦 Step 4: Loading trained model and applying post-training re-regularization...")
 
1031
 
1032
- try:
1033
- from model2vec.model import StaticModel
1034
-
1035
- # Load the trained model from tokenlearn
1036
- trained_model_path = trained_dir / "model"
1037
- if not trained_model_path.exists():
1038
- # Try alternative paths
1039
- possible_paths = [
1040
- trained_dir / "model_weighted",
1041
- trained_dir,
1042
- ]
1043
-
1044
- for path in possible_paths:
1045
- if path.exists() and any(path.glob("*.json")):
1046
- trained_model_path = path
1047
- break
1048
- else:
1049
- logger.error(f"❌ Could not find trained model in {trained_dir}")
1050
- msg = f"Tokenlearn training failed - no model found in {trained_dir}"
1051
  raise RuntimeError(msg)
 
 
 
 
1052
 
1053
- # Load the model before re-regularization
1054
- logger.info("🔄 Loading model from tokenlearn training...")
1055
- trained_model = StaticModel.from_pretrained(str(trained_model_path))
1056
 
1057
- # Apply post-training re-regularization (POTION Step 4)
1058
- logger.info("🔧 Applying post-training re-regularization (PCA + SIF weighting)...")
1059
- final_model = apply_post_training_regularization(
1060
- trained_model, features_dir, pca_dims=distillation_config.optimal_pca_dims
1061
- )
1062
 
1063
- logger.info("✅ Tokenlearn training pipeline with post-training re-regularization completed successfully")
 
1064
 
1065
- return final_model
 
 
 
 
 
 
1066
 
1067
- except Exception as e:
1068
- logger.exception("💥 Failed to load tokenlearn trained model")
1069
- logger.exception("💥 Cannot load trained model - training failed")
1070
- msg = f"Failed to load tokenlearn trained model: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1071
  raise RuntimeError(msg) from e
 
 
 
 
 
 
 
 
1072
 
1073
 
1074
  def distill_single_teacher(
@@ -1118,7 +1197,6 @@ def distill_single_teacher(
1118
 
1119
  # Initialize Beam utilities if requested
1120
  checkpoint_mgr = None
1121
- model_mgr = None
1122
  if use_beam_utilities:
1123
  try:
1124
  _, checkpoint_mgr, model_mgr, _ = create_beam_utilities(VOLUME_CONFIG.name, VOLUME_CONFIG.mount_path)
@@ -1197,44 +1275,65 @@ def distill_single_teacher(
1197
 
1198
  existing_base = str(base_dir)
1199
 
1200
- # Step 3: Handle final model creation
1201
- if enable_training and base_model is not None:
1202
- # Perform tokenlearn training (POTION approach)
1203
- logger.info(f"🧪 Starting tokenlearn training for {teacher_name}")
1204
-
1205
- # Load teacher model for training
1206
- device = "cuda" if torch.cuda.is_available() else "cpu"
1207
- teacher_st_model = load_model_with_flash_attention(teacher_model, device)
1208
-
1209
- # Perform tokenlearn training (POTION approach)
1210
- final_model = tokenlearn_training(base_model, teacher_st_model, checkpoint_mgr)
1211
-
1212
- # Save final model
1213
- final_dir.mkdir(parents=True, exist_ok=True)
1214
- final_model.save_pretrained(str(final_dir))
1215
-
1216
- # Sync final model and training checkpoints to Beam
1217
- if use_beam_utilities:
1218
- sync_model_to_beam(f"{teacher_name}_final", str(final_dir), use_beam_utilities)
1219
- if checkpoint_mgr:
1220
- sync_checkpoints_to_beam(VOLUME_CONFIG.name, f"training_{teacher_name}", directories.checkpoints)
1221
 
1222
- del teacher_st_model
1223
- if torch.cuda.is_available():
1224
- torch.cuda.empty_cache()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1225
 
1226
- else:
1227
- # Copy base to final (no training)
1228
- logger.info(f"📁 Copying base to final for {teacher_name}")
1229
- if not copy_base_to_final(teacher_name, enable_training):
1230
- return {
1231
- "teacher_model": teacher_model,
1232
- "teacher_name": teacher_name,
1233
- "status": "failed_copy_to_final",
1234
- "error": "Failed to copy base to final",
1235
- }
1236
 
1237
- total_time = time.time() - start_time
1238
 
1239
  return {
1240
  "teacher_model": teacher_model,
@@ -1318,6 +1417,9 @@ def run_local_distillation(
1318
 
1319
  if result["status"] == "success" or result["status"].startswith("skipped"):
1320
  successful_models.append(teacher_name)
 
 
 
1321
 
1322
  # Summary
1323
  logger.info("\n🏆 DISTILLATION WORKFLOW COMPLETE!")
@@ -1349,16 +1451,13 @@ def run_local_distillation(
1349
  return results_summary
1350
 
1351
 
1352
- @function(**get_distillation_function_kwargs())
1353
- def _beam_distill_models(
1354
  teacher_models: list[str] | None = None,
1355
  enable_training: bool = False,
1356
  pca_dims: int | None = None,
1357
  clear_cache: bool = False,
1358
  ) -> dict[str, Any]:
1359
- """Internal Beam function for distillation."""
1360
- logger.info("☁️ Running distillation on Beam")
1361
-
1362
  # Apply patches
1363
  patch_success = apply_local_patches()
1364
  if patch_success:
@@ -1404,6 +1503,9 @@ def _beam_distill_models(
1404
 
1405
  if result["status"] == "success" or result["status"].startswith("skipped"):
1406
  successful_models.append(teacher_name)
 
 
 
1407
 
1408
  # Summary
1409
  logger.info("\n🏆 BEAM DISTILLATION WORKFLOW COMPLETE!")
@@ -1429,6 +1531,30 @@ def _beam_distill_models(
1429
  return results_summary
1430
 
1431
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1432
  def run_beam_distillation(
1433
  teacher_models: list[str] | None = None,
1434
  enable_training: bool = False,
@@ -1439,8 +1565,11 @@ def run_beam_distillation(
1439
  logger.info("☁️ Running distillation on Beam with local sync")
1440
 
1441
  try:
 
 
 
1442
  # Run distillation on Beam
1443
- results = _beam_distill_models.remote(teacher_models, enable_training, pca_dims, clear_cache)
1444
 
1445
  # Check if Beam execution was successful
1446
  if not results:
@@ -1529,27 +1658,23 @@ def main(
1529
 
1530
  # Clear tokenlearn checkpoints if requested (for training mode)
1531
  if clear_checkpoints and train:
1532
- import tempfile
1533
-
1534
  logger.info("🧹 Clearing tokenlearn checkpoints to force fresh featurization and training...")
1535
  for teacher_model in models_to_distill:
1536
- teacher_name = teacher_model.split("/")[-1].replace("-", "_")
1537
-
1538
- # Construct checkpoint paths using secure temporary directory
1539
- temp_dir = Path(tempfile.gettempdir()) / f"tokenlearn_{teacher_name}"
1540
- features_dir = temp_dir / "features"
1541
- trained_dir = temp_dir / "trained"
1542
-
1543
- # Also check local paths
1544
- local_temp = Path("temp") / f"tokenlearn_{teacher_name}"
1545
- local_features = local_temp / "features"
1546
- local_trained = local_temp / "trained"
1547
-
1548
- # Clear checkpoints for all possible paths
1549
- for feat_dir, train_dir in [(features_dir, trained_dir), (local_features, local_trained)]:
1550
- if feat_dir.exists() or train_dir.exists():
1551
- clear_tokenlearn_checkpoints(feat_dir, train_dir)
1552
- logger.info(f"🗑️ Cleared checkpoints for {teacher_model}")
1553
  elif clear_checkpoints and not train:
1554
  logger.warning("⚠️ --clear-checkpoints flag is only relevant when training is enabled (--train)")
1555
 
 
49
  directories,
50
  distillation_config,
51
  get_distillation_function_kwargs,
52
+ get_training_function_kwargs,
53
  get_volume_config,
54
  languages_config,
55
  )
 
359
 
360
  logger.info("✅ Core distillation completed successfully")
361
 
362
+ # Validate model before saving
363
+ if hasattr(model, "tokenizer") and hasattr(model, "embedding"):
364
+ vocab_size = len(model.tokenizer.get_vocab())
365
+ embedding_size = model.embedding.shape[0]
366
+
367
+ logger.info("📊 Model validation:")
368
+ logger.info(f" - Vocabulary size: {vocab_size}")
369
+ logger.info(f" - Embedding matrix size: {embedding_size}")
370
+
371
+ if vocab_size != embedding_size:
372
+ logger.warning(f"⚠️ Vocabulary size mismatch: vocab={vocab_size}, embeddings={embedding_size}")
373
+ logger.warning("⚠️ This may cause issues in downstream usage")
374
+ else:
375
+ logger.info("✅ Vocabulary and embedding sizes match")
376
+
377
  # Save the model
378
  model.save_pretrained(str(output_path))
379
  logger.info(f"💾 Model saved to {output_path}")
 
788
  logger.info(f"🔄 Applying PCA with {pca_dims} dimensions...")
789
 
790
  # Get current embeddings
791
+ # Handle both torch tensors and numpy arrays
792
+ if hasattr(model.embedding, "cpu"):
793
+ embeddings = model.embedding.cpu().numpy().astype(np.float64)
794
+ else:
795
+ embeddings = model.embedding.astype(np.float64)
796
  original_shape = embeddings.shape
797
  logger.info(f"Original embedding shape: {original_shape}")
798
 
 
866
  4. Post-training re-regularization (PCA + SIF weighting)
867
  """
868
  import subprocess
 
869
  from pathlib import Path
870
 
871
  logger.info("🧪 Starting tokenlearn training (POTION approach)...")
872
 
873
+ # Create persistent directories for tokenlearn workflow (for checkpoint preservation)
874
+ teacher_model_name = getattr(teacher_model, "model_name", None)
875
+ if not teacher_model_name and hasattr(teacher_model, "_modules") and len(teacher_model._modules) > 0: # noqa: SLF001
876
+ # Try to extract from the first module if it's a SentenceTransformer
877
+ first_module = next(iter(teacher_model._modules.values())) # noqa: SLF001
878
+ if hasattr(first_module, "auto_model") and hasattr(first_module.auto_model, "name_or_path"):
879
+ teacher_model_name = first_module.auto_model.name_or_path
880
+
881
+ if not teacher_model_name:
882
+ teacher_model_name = "unknown_teacher"
883
+
884
+ # Use persistent directory for tokenlearn checkpoints
885
+ teacher_slug = teacher_model_name.replace("/", "_").replace("-", "_")
886
+ persistent_tokenlearn_dir = Path(directories.base).parent / "tokenlearn_cache" / teacher_slug
887
+
888
+ features_dir = persistent_tokenlearn_dir / "features"
889
+ model_dir = persistent_tokenlearn_dir / "base_model"
890
+ trained_dir = persistent_tokenlearn_dir / "trained_model"
891
+
892
+ features_dir.mkdir(parents=True, exist_ok=True)
893
+ model_dir.mkdir(parents=True, exist_ok=True)
894
+ trained_dir.mkdir(parents=True, exist_ok=True)
895
+
896
+ logger.info(f"📁 Using persistent tokenlearn directory: {persistent_tokenlearn_dir}")
897
+
898
+ # Save the base distilled model for tokenlearn
899
+ student_model.save_pretrained(str(model_dir))
900
+ logger.info(f"💾 Saved base model to {model_dir}")
901
+
902
+ # Step 2: Create features using sentence transformer
903
+ logger.info("🔍 Step 2: Creating features using sentence transformer...")
904
+
905
+ # Get teacher model name/path for tokenlearn
906
+ teacher_model_name = getattr(teacher_model, "model_name", None)
907
+ if not teacher_model_name and hasattr(teacher_model, "_modules") and len(teacher_model._modules) > 0: # noqa: SLF001
908
+ # Try to extract from the first module if it's a SentenceTransformer
909
+ # _modules is a dict-like container, get the first module by iterating
910
+ first_module = next(iter(teacher_model._modules.values())) # noqa: SLF001
911
+ if hasattr(first_module, "auto_model") and hasattr(first_module.auto_model, "name_or_path"):
912
+ teacher_model_name = first_module.auto_model.name_or_path
913
+
914
+ logger.info(f"📊 Using teacher model: {teacher_model_name}")
915
+
916
+ # Check if featurization already completed (checkpoint detection)
917
+ featurization_complete_marker = features_dir / ".featurization_complete"
918
+ if featurization_complete_marker.exists() and verify_featurization_output(features_dir):
919
+ logger.info("✅ Found existing featurization checkpoint with valid output files")
920
+ logger.info(f"📂 Using cached features from: {features_dir}")
921
+
922
+ # Verify marker is still valid
923
+ output_files = list(features_dir.glob("*.npy")) + list(features_dir.glob("*.json"))
924
+ logger.info(f"📁 Found {len(output_files)} cached feature files")
925
+ else:
926
+ if featurization_complete_marker.exists():
927
+ logger.warning("⚠️ Featurization marker exists but output files are missing - re-running featurization")
928
+ featurization_complete_marker.unlink()
929
+ logger.info("🔄 No valid featurization checkpoint found - starting featurization...")
930
+
931
+ if not teacher_model_name:
932
+ logger.warning("⚠️ Could not determine teacher model name, using fallback")
933
+ teacher_model_name = "BAAI/bge-base-en-v1.5" # Fallback to a common model
934
 
935
  logger.info(f"📊 Using teacher model: {teacher_model_name}")
936
 
937
+ try:
938
+ # Use configured dataset for code specialization
939
+ featurize_cmd = [
940
+ "python",
941
+ "-m",
942
+ "tokenlearn.featurize",
943
+ "--model-name",
944
+ str(teacher_model_name),
945
+ "--output-dir",
946
+ str(features_dir),
947
+ "--dataset-path",
948
+ str(distillation_config.tokenlearn_dataset),
949
+ "--dataset-name",
950
+ str(distillation_config.tokenlearn_dataset_name),
951
+ "--dataset-split",
952
+ "train",
953
+ "--key",
954
+ str(distillation_config.tokenlearn_text_key), # Use configured text field
955
+ "--batch-size",
956
+ "1024", # Optimized batch size for A100-40G
957
+ ]
958
 
959
+ logger.info("🔄 Running tokenlearn featurization...")
960
+ logger.info(
961
+ f"📊 Dataset: {distillation_config.tokenlearn_dataset} (config: {distillation_config.tokenlearn_dataset_name})"
962
+ )
963
+ logger.info(f"📝 Text field: {distillation_config.tokenlearn_text_key}")
964
+ logger.info(f"Command: {' '.join(featurize_cmd)}")
965
+ print(f"\n🔄 Executing: {' '.join(featurize_cmd)}\n")
966
+
967
+ result = subprocess.run( # noqa: S603
968
+ featurize_cmd,
969
+ text=True,
970
+ timeout=distillation_config.tokenlearn_timeout_featurize,
971
+ check=False,
972
+ )
973
 
974
+ if result.returncode != 0:
975
+ logger.error(f"❌ Featurization failed with return code: {result.returncode}")
976
+ logger.error("💥 Tokenlearn featurization is required for training - cannot proceed")
977
+ msg = f"Tokenlearn featurization failed with return code: {result.returncode}"
978
+ raise RuntimeError(msg)
979
 
980
+ logger.info("✅ Featurization completed successfully")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
981
 
982
+ # Create checkpoint marker to indicate featurization is complete
983
+ featurization_complete_marker.touch()
984
+ logger.info(f"💾 Created featurization checkpoint: {featurization_complete_marker}")
 
 
985
 
986
+ # Generate token frequencies for post-training re-regularization
987
+ logger.info("📊 Computing token frequencies for SIF weighting...")
988
+ compute_token_frequencies_for_sif(teacher_model, features_dir)
989
 
990
+ except Exception as e:
991
+ logger.exception("💥 Tokenlearn featurization failed")
992
+ logger.exception("💥 Tokenlearn featurization is required for training - cannot proceed")
993
+ msg = f"Tokenlearn featurization failed: {e}"
994
+ raise RuntimeError(msg) from e
995
 
996
+ # Step 3: Train using tokenlearn-train
997
+ logger.info("🎓 Step 3: Training using tokenlearn...")
998
+
999
+ # Check if training already completed (checkpoint detection)
1000
+ training_complete_marker = trained_dir / ".training_complete"
1001
+ training_fallback_marker = trained_dir / ".training_fallback"
1002
+
1003
+ if training_complete_marker.exists() and verify_training_output(trained_dir):
1004
+ logger.info("✅ Found existing training checkpoint with valid model files")
1005
+ logger.info(f"📂 Using cached trained model from: {trained_dir}")
1006
+
1007
+ # Show available model files
1008
+ model_files = []
1009
+ for pattern in ["*.json", "*.safetensors", "*.bin"]:
1010
+ model_files.extend(list(trained_dir.glob(pattern)))
1011
+ for subdir in ["model", "model_weighted"]:
1012
+ subdir_path = trained_dir / subdir
1013
+ if subdir_path.exists():
1014
+ model_files.extend(list(subdir_path.glob(pattern)))
1015
+ logger.info(f"📁 Found {len(model_files)} cached model files")
1016
+ elif training_fallback_marker.exists():
1017
+ logger.warning("⚠️ Training fallback marker found - tokenlearn failed previously")
1018
+ logger.info("🔄 Proceeding with fallback to base model (simple distillation)")
1019
+ # Skip training and proceed to model loading (will fallback to base model)
1020
+ else:
1021
+ if training_complete_marker.exists():
1022
+ logger.warning("⚠️ Training marker exists but model files are missing - re-running training")
1023
+ training_complete_marker.unlink()
1024
+ logger.info("🔄 No valid training checkpoint found - starting training...")
1025
 
1026
+ try:
1027
+ train_cmd = [
1028
+ "python",
1029
+ "-m",
1030
+ "tokenlearn.train",
1031
+ "--model-name",
1032
+ str(teacher_model_name),
1033
+ "--data-path",
1034
+ str(features_dir),
1035
+ "--save-path",
1036
+ str(trained_dir),
1037
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1038
 
1039
+ logger.info("🔄 Running tokenlearn training...")
1040
+ logger.info(f"Command: {' '.join(train_cmd)}")
1041
+ print(f"\n🎓 Executing: {' '.join(train_cmd)}\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1042
 
1043
+ result = subprocess.run( # noqa: S603
1044
+ train_cmd,
1045
+ text=True,
1046
+ capture_output=True, # Capture stdout and stderr
1047
+ timeout=distillation_config.tokenlearn_timeout_train,
1048
+ check=False,
1049
+ )
1050
 
1051
+ if result.returncode != 0:
1052
+ logger.error(f"❌ Tokenlearn training failed with return code: {result.returncode}")
1053
 
1054
+ # Log the actual error output for debugging
1055
+ if result.stderr:
1056
+ logger.error(f"stderr: {result.stderr}")
1057
+ if result.stdout:
1058
+ logger.info(f"stdout: {result.stdout}")
1059
 
1060
+ # Check if it's the token-vector mismatch issue
1061
+ error_output = str(result.stderr) + str(result.stdout)
1062
+ if "Number of tokens" in error_output and "does not match number of vectors" in error_output:
1063
+ logger.error("🔧 Token-vector mismatch detected in tokenlearn")
1064
+ logger.error("💥 This is a known issue with tokenlearn/Model2Vec integration")
1065
 
1066
+ # Create training marker to indicate we tried but failed
1067
+ training_fallback_marker = trained_dir / ".training_fallback"
1068
+ training_fallback_marker.touch()
1069
 
1070
+ logger.error("❌ Tokenlearn training failed due to token-vector mismatch")
1071
+ msg = f"Tokenlearn training failed with token-vector mismatch: {error_output}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1072
  raise RuntimeError(msg)
1073
+ logger.error("💥 Tokenlearn training failed with different error")
1074
+ msg = f"Tokenlearn training failed with return code: {result.returncode}"
1075
+ raise RuntimeError(msg)
1076
+ logger.info("✅ Tokenlearn training completed successfully")
1077
 
1078
+ # Create checkpoint marker to indicate training is complete
1079
+ training_complete_marker.touch()
1080
+ logger.info(f"💾 Created training checkpoint: {training_complete_marker}")
1081
 
1082
+ except Exception as e:
1083
+ logger.exception("💥 Tokenlearn training failed")
1084
+ logger.exception("💥 Tokenlearn training is required - cannot proceed")
1085
+ msg = f"Tokenlearn training failed: {e}"
1086
+ raise RuntimeError(msg) from e
1087
 
1088
+ # Step 4: Load the trained model and apply post-training re-regularization
1089
+ logger.info("📦 Step 4: Loading trained model and applying post-training re-regularization...")
1090
 
1091
+ # Check if we need to use fallback due to tokenlearn failure
1092
+ training_fallback_marker = trained_dir / ".training_fallback"
1093
+ if training_fallback_marker.exists():
1094
+ logger.error("❌ Tokenlearn training failed previously - cannot return trained model")
1095
+ logger.error("💥 Training was requested but failed - this would be misleading to return base model")
1096
+ msg = "Tokenlearn training failed - cannot proceed with training pipeline"
1097
+ raise RuntimeError(msg)
1098
 
1099
+ try:
1100
+ from model2vec.model import StaticModel
1101
+
1102
+ # Load the trained model from tokenlearn
1103
+ trained_model_path = trained_dir / "model"
1104
+ if not trained_model_path.exists():
1105
+ # Try alternative paths
1106
+ possible_paths = [
1107
+ trained_dir / "model_weighted",
1108
+ trained_dir,
1109
+ ]
1110
+
1111
+ for path in possible_paths:
1112
+ if path.exists() and any(path.glob("*.json")):
1113
+ trained_model_path = path
1114
+ break
1115
+ else:
1116
+ logger.error(f"❌ Could not find trained model in {trained_dir}")
1117
+ logger.error("💥 Training was requested but no trained model found - cannot proceed")
1118
+ msg = f"Trained model not found in {trained_dir} - training pipeline failed"
1119
+ raise RuntimeError(msg)
1120
+
1121
+ # Load the model before re-regularization
1122
+ logger.info("🔄 Loading model from tokenlearn training...")
1123
+ trained_model = StaticModel.from_pretrained(str(trained_model_path))
1124
+
1125
+ # Apply post-training re-regularization (POTION Step 4)
1126
+ logger.info("🔧 Applying post-training re-regularization (PCA + SIF weighting)...")
1127
+ final_model = apply_post_training_regularization(
1128
+ trained_model, features_dir, pca_dims=distillation_config.optimal_pca_dims
1129
+ )
1130
+
1131
+ logger.info("✅ Tokenlearn training pipeline with post-training re-regularization completed successfully")
1132
+
1133
+ return final_model
1134
+
1135
+ except ValueError as e:
1136
+ if "Number of tokens" in str(e) and "does not match number of vectors" in str(e):
1137
+ logger.exception("💥 Token-vector mismatch in tokenlearn training")
1138
+ logger.exception("Error details")
1139
+ logger.exception("🔧 This is a known issue with tokenlearn/Model2Vec integration")
1140
+ logger.exception("💥 Training was requested but failed due to token-vector mismatch")
1141
+ msg = f"Tokenlearn training failed due to token-vector mismatch: {e}"
1142
  raise RuntimeError(msg) from e
1143
+ logger.exception("💥 Failed to load tokenlearn trained model")
1144
+ msg = f"Failed to load tokenlearn trained model: {e}"
1145
+ raise RuntimeError(msg) from e
1146
+ except Exception as e:
1147
+ logger.exception("💥 Failed to load tokenlearn trained model")
1148
+ logger.exception("💥 Cannot load trained model - training failed")
1149
+ msg = f"Failed to load tokenlearn trained model: {e}"
1150
+ raise RuntimeError(msg) from e
1151
 
1152
 
1153
  def distill_single_teacher(
 
1197
 
1198
  # Initialize Beam utilities if requested
1199
  checkpoint_mgr = None
 
1200
  if use_beam_utilities:
1201
  try:
1202
  _, checkpoint_mgr, model_mgr, _ = create_beam_utilities(VOLUME_CONFIG.name, VOLUME_CONFIG.mount_path)
 
1275
 
1276
  existing_base = str(base_dir)
1277
 
1278
+ # Step 3: Handle final model creation
1279
+ if enable_training and base_model is not None:
1280
+ # Perform tokenlearn training (POTION approach)
1281
+ logger.info(f"🧪 Starting tokenlearn training for {teacher_name}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1282
 
1283
+ try:
1284
+ # Load teacher model for training
1285
+ device = "cuda" if torch.cuda.is_available() else "cpu"
1286
+ teacher_st_model = load_model_with_flash_attention(teacher_model, device)
1287
+
1288
+ # Perform tokenlearn training (POTION approach)
1289
+ final_model = tokenlearn_training(base_model, teacher_st_model, checkpoint_mgr)
1290
+
1291
+ # Save final model
1292
+ final_dir.mkdir(parents=True, exist_ok=True)
1293
+ final_model.save_pretrained(str(final_dir))
1294
+
1295
+ # Sync final model and training checkpoints to Beam
1296
+ if use_beam_utilities:
1297
+ sync_model_to_beam(f"{teacher_name}_final", str(final_dir), use_beam_utilities)
1298
+ if checkpoint_mgr:
1299
+ sync_checkpoints_to_beam(
1300
+ VOLUME_CONFIG.name, f"training_{teacher_name}", directories.checkpoints
1301
+ )
1302
+
1303
+ del teacher_st_model
1304
+ if torch.cuda.is_available():
1305
+ torch.cuda.empty_cache()
1306
+
1307
+ except RuntimeError as e:
1308
+ # Training failed - clean up and return failure
1309
+ logger.exception(f"❌ Training failed for {teacher_name}")
1310
+
1311
+ # Clean up teacher model if it was loaded
1312
+ if "teacher_st_model" in locals():
1313
+ del teacher_st_model
1314
+ if torch.cuda.is_available():
1315
+ torch.cuda.empty_cache()
1316
+
1317
+ return {
1318
+ "teacher_model": teacher_model,
1319
+ "teacher_name": teacher_name,
1320
+ "status": "failed_training",
1321
+ "error": f"Training failed: {e!s}",
1322
+ "base_path": existing_base, # Base model was created successfully
1323
+ }
1324
 
1325
+ else:
1326
+ # Copy base to final (no training)
1327
+ logger.info(f"📁 Copying base to final for {teacher_name}")
1328
+ if not copy_base_to_final(teacher_name, enable_training):
1329
+ return {
1330
+ "teacher_model": teacher_model,
1331
+ "teacher_name": teacher_name,
1332
+ "status": "failed_copy_to_final",
1333
+ "error": "Failed to copy base to final",
1334
+ }
1335
 
1336
+ total_time = time.time() - start_time
1337
 
1338
  return {
1339
  "teacher_model": teacher_model,
 
1417
 
1418
  if result["status"] == "success" or result["status"].startswith("skipped"):
1419
  successful_models.append(teacher_name)
1420
+ elif result["status"] == "failed_training":
1421
+ # Note: Training failed but base model may still be available
1422
+ logger.warning(f"⚠️ Training failed for {teacher_name}, but base distillation may have succeeded")
1423
 
1424
  # Summary
1425
  logger.info("\n🏆 DISTILLATION WORKFLOW COMPLETE!")
 
1451
  return results_summary
1452
 
1453
 
1454
+ def _beam_distill_internal(
 
1455
  teacher_models: list[str] | None = None,
1456
  enable_training: bool = False,
1457
  pca_dims: int | None = None,
1458
  clear_cache: bool = False,
1459
  ) -> dict[str, Any]:
1460
+ """Shared internal implementation for beam distillation."""
 
 
1461
  # Apply patches
1462
  patch_success = apply_local_patches()
1463
  if patch_success:
 
1503
 
1504
  if result["status"] == "success" or result["status"].startswith("skipped"):
1505
  successful_models.append(teacher_name)
1506
+ elif result["status"] == "failed_training":
1507
+ # Note: Training failed but base model may still be available
1508
+ logger.warning(f"⚠️ Training failed for {teacher_name}, but base distillation may have succeeded")
1509
 
1510
  # Summary
1511
  logger.info("\n🏆 BEAM DISTILLATION WORKFLOW COMPLETE!")
 
1531
  return results_summary
1532
 
1533
 
1534
@function(**get_training_function_kwargs())
def _beam_train_models(
    teacher_models: list[str] | None = None,
    enable_training: bool = True,
    pca_dims: int | None = None,
    clear_cache: bool = False,
) -> dict[str, Any]:
    """Run the full training pipeline (distillation + tokenlearn) on Beam.

    Thin Beam entry point: requests training-sized resources via
    ``get_training_function_kwargs()`` and delegates all of the actual
    work to the shared ``_beam_distill_internal`` implementation.
    """
    logger.info("☁️ Running training on Beam")
    results = _beam_distill_internal(
        teacher_models,
        enable_training,
        pca_dims,
        clear_cache,
    )
    return results
1544
+
1545
+
1546
@function(**get_distillation_function_kwargs())
def _beam_distill_models(
    teacher_models: list[str] | None = None,
    enable_training: bool = False,
    pca_dims: int | None = None,
    clear_cache: bool = False,
) -> dict[str, Any]:
    """Run basic distillation (no tokenlearn training) on Beam.

    Thin Beam entry point: requests distillation-sized resources via
    ``get_distillation_function_kwargs()`` and delegates all of the
    actual work to the shared ``_beam_distill_internal`` implementation.
    """
    logger.info("☁️ Running distillation on Beam")
    results = _beam_distill_internal(
        teacher_models,
        enable_training,
        pca_dims,
        clear_cache,
    )
    return results
1556
+
1557
+
1558
  def run_beam_distillation(
1559
  teacher_models: list[str] | None = None,
1560
  enable_training: bool = False,
 
1565
  logger.info("☁️ Running distillation on Beam with local sync")
1566
 
1567
  try:
1568
+ # Choose appropriate beam function based on training flag
1569
+ beam_function = _beam_train_models if enable_training else _beam_distill_models
1570
+
1571
  # Run distillation on Beam
1572
+ results = beam_function.remote(teacher_models, enable_training, pca_dims, clear_cache)
1573
 
1574
  # Check if Beam execution was successful
1575
  if not results:
 
1658
 
1659
  # Clear tokenlearn checkpoints if requested (for training mode)
1660
  if clear_checkpoints and train:
 
 
1661
  logger.info("🧹 Clearing tokenlearn checkpoints to force fresh featurization and training...")
1662
  for teacher_model in models_to_distill:
1663
+ teacher_model.split("/")[-1].replace("-", "_")
1664
+
1665
+ # Use the same persistent directory structure as the training function
1666
+ teacher_slug = teacher_model.replace("/", "_").replace("-", "_")
1667
+ persistent_tokenlearn_dir = Path(LOCAL_BASE_DIR).parent / "tokenlearn_cache" / teacher_slug
1668
+
1669
+ features_dir = persistent_tokenlearn_dir / "features"
1670
+ trained_dir = persistent_tokenlearn_dir / "trained_model"
1671
+
1672
+ # Clear persistent tokenlearn checkpoints
1673
+ if features_dir.exists() or trained_dir.exists():
1674
+ clear_tokenlearn_checkpoints(features_dir, trained_dir)
1675
+ logger.info(f"🗑️ Cleared persistent tokenlearn checkpoints for {teacher_model}")
1676
+ else:
1677
+ logger.info(f"ℹ️ No tokenlearn checkpoints found for {teacher_model}")
 
 
1678
  elif clear_checkpoints and not train:
1679
  logger.warning("⚠️ --clear-checkpoints flag is only relevant when training is enabled (--train)")
1680