Sarthak commited on
Commit ·
72121b3
1
Parent(s): fba41e9
feat(distiller): add option to skip post-training re-regularization
Browse files

This change introduces an option to skip the post-training re-regularization step in the tokenlearn training pipeline. This can be useful for debugging or experimentation, or when re-regularization is not desired.
- src/distiller/config.py +3 -0
- src/distiller/distill.py +26 -8
src/distiller/config.py
CHANGED
|
@@ -216,6 +216,9 @@ class DistillationConfig(BaseModel):
|
|
| 216 |
tokenlearn_timeout_featurize: int = 21600 # 6 hour timeout for featurization (dataset needs ~5 hours)
|
| 217 |
tokenlearn_timeout_train: int = 7200 # 2 hour timeout for training
|
| 218 |
|
|
|
|
|
|
|
|
|
|
| 219 |
|
| 220 |
distillation_config = DistillationConfig()
|
| 221 |
|
|
|
|
| 216 |
tokenlearn_timeout_featurize: int = 21600 # 6 hour timeout for featurization (dataset needs ~5 hours)
|
| 217 |
tokenlearn_timeout_train: int = 7200 # 2 hour timeout for training
|
| 218 |
|
| 219 |
+
# Post-training configuration
|
| 220 |
+
skip_post_training_regularization: bool = False # Skip PCA + SIF re-regularization step
|
| 221 |
+
|
| 222 |
|
| 223 |
distillation_config = DistillationConfig()
|
| 224 |
|
src/distiller/distill.py
CHANGED
|
@@ -855,6 +855,7 @@ def tokenlearn_training(
|
|
| 855 |
student_model: Any,
|
| 856 |
teacher_model: SentenceTransformer,
|
| 857 |
checkpoint_manager: BeamCheckpointManager | None = None, # noqa: ARG001
|
|
|
|
| 858 |
) -> Any:
|
| 859 |
"""
|
| 860 |
Perform tokenlearn training following the official POTION approach.
|
|
@@ -1122,13 +1123,17 @@ def tokenlearn_training(
|
|
| 1122 |
logger.info("π Loading model from tokenlearn training...")
|
| 1123 |
trained_model = StaticModel.from_pretrained(str(trained_model_path))
|
| 1124 |
|
| 1125 |
-
# Apply post-training re-regularization (POTION Step 4)
|
| 1126 |
-
|
| 1127 |
-
|
| 1128 |
-
|
| 1129 |
-
|
| 1130 |
-
|
| 1131 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1132 |
|
| 1133 |
return final_model
|
| 1134 |
|
|
@@ -1286,7 +1291,12 @@ def distill_single_teacher(
|
|
| 1286 |
teacher_st_model = load_model_with_flash_attention(teacher_model, device)
|
| 1287 |
|
| 1288 |
# Perform tokenlearn training (POTION approach)
|
| 1289 |
-
final_model = tokenlearn_training(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1290 |
|
| 1291 |
# Save final model
|
| 1292 |
final_dir.mkdir(parents=True, exist_ok=True)
|
|
@@ -1634,10 +1644,18 @@ def main(
|
|
| 1634 |
clear_checkpoints: Annotated[
|
| 1635 |
bool, typer.Option(help="Clear tokenlearn checkpoints to force fresh featurization and training")
|
| 1636 |
] = False,
|
|
|
|
|
|
|
|
|
|
| 1637 |
) -> None:
|
| 1638 |
"""Unified distillation command with optional training."""
|
| 1639 |
logger.info("π Starting unified Model2Vec distillation workflow")
|
| 1640 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1641 |
logger.info(f"π Training mode: {'Tokenlearn (POTION) training' if train else 'Basic distillation only'}")
|
| 1642 |
logger.info(f"βοΈ Execution: {'Beam' if use_beam else 'Local'}")
|
| 1643 |
|
|
|
|
| 855 |
student_model: Any,
|
| 856 |
teacher_model: SentenceTransformer,
|
| 857 |
checkpoint_manager: BeamCheckpointManager | None = None, # noqa: ARG001
|
| 858 |
+
skip_post_training_regularization: bool = False,
|
| 859 |
) -> Any:
|
| 860 |
"""
|
| 861 |
Perform tokenlearn training following the official POTION approach.
|
|
|
|
| 1123 |
logger.info("π Loading model from tokenlearn training...")
|
| 1124 |
trained_model = StaticModel.from_pretrained(str(trained_model_path))
|
| 1125 |
|
| 1126 |
+
# Apply post-training re-regularization (POTION Step 4) unless skipped
|
| 1127 |
+
if skip_post_training_regularization:
|
| 1128 |
+
logger.info("βοΈ Skipping post-training re-regularization (PCA + SIF weighting) as requested")
|
| 1129 |
+
final_model = trained_model
|
| 1130 |
+
logger.info("✅ Tokenlearn training pipeline completed successfully (without re-regularization)")
|
| 1131 |
+
else:
|
| 1132 |
+
logger.info("π§ Applying post-training re-regularization (PCA + SIF weighting)...")
|
| 1133 |
+
final_model = apply_post_training_regularization(
|
| 1134 |
+
trained_model, features_dir, pca_dims=distillation_config.optimal_pca_dims
|
| 1135 |
+
)
|
| 1136 |
+
logger.info("✅ Tokenlearn training pipeline with post-training re-regularization completed successfully")
|
| 1137 |
|
| 1138 |
return final_model
|
| 1139 |
|
|
|
|
| 1291 |
teacher_st_model = load_model_with_flash_attention(teacher_model, device)
|
| 1292 |
|
| 1293 |
# Perform tokenlearn training (POTION approach)
|
| 1294 |
+
final_model = tokenlearn_training(
|
| 1295 |
+
base_model,
|
| 1296 |
+
teacher_st_model,
|
| 1297 |
+
checkpoint_mgr,
|
| 1298 |
+
skip_post_training_regularization=distillation_config.skip_post_training_regularization,
|
| 1299 |
+
)
|
| 1300 |
|
| 1301 |
# Save final model
|
| 1302 |
final_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
| 1644 |
clear_checkpoints: Annotated[
|
| 1645 |
bool, typer.Option(help="Clear tokenlearn checkpoints to force fresh featurization and training")
|
| 1646 |
] = False,
|
| 1647 |
+
skip_ptr: Annotated[
|
| 1648 |
+
bool, typer.Option("--skip-ptr", help="Skip post-training re-regularization (PCA + SIF weighting) step")
|
| 1649 |
+
] = False,
|
| 1650 |
) -> None:
|
| 1651 |
"""Unified distillation command with optional training."""
|
| 1652 |
logger.info("π Starting unified Model2Vec distillation workflow")
|
| 1653 |
|
| 1654 |
+
# Set post-training regularization flag in config
|
| 1655 |
+
distillation_config.skip_post_training_regularization = skip_ptr
|
| 1656 |
+
if skip_ptr and train:
|
| 1657 |
+
logger.info("βοΈ Post-training re-regularization will be skipped (PCA + SIF weighting disabled)")
|
| 1658 |
+
|
| 1659 |
logger.info(f"π Training mode: {'Tokenlearn (POTION) training' if train else 'Basic distillation only'}")
|
| 1660 |
logger.info(f"βοΈ Execution: {'Beam' if use_beam else 'Local'}")
|
| 1661 |
|