Sarthak committed on
Commit
8083c06
Β·
1 Parent(s): ff551a2

feat(distiller): add checkpointing and refactor analyze.py

Browse files

This change introduces checkpointing to the tokenlearn featurization and training processes. This allows the processes to resume where they left off after an interruption, and to skip steps that have already completed. It also adds a --clear-checkpoints flag to force fresh featurization and training.

Additionally, minor refactoring was done to use a list comprehension in analyze.py.

Files changed (2) hide show
  1. src/distiller/analyze.py +6 -5
  2. src/distiller/distill.py +211 -86
src/distiller/analyze.py CHANGED
@@ -496,10 +496,11 @@ class CodeSearchNetAnalyzer:
496
  return
497
 
498
  # Find all our model directories
499
- our_model_dirs = []
500
- for model_dir in final_models_dir.iterdir():
501
- if model_dir.is_dir() and "code_model2vec" in model_dir.name:
502
- our_model_dirs.append(model_dir)
 
503
 
504
  logger.info(f"πŸ“ Found {len(our_model_dirs)} distilled model directories")
505
 
@@ -1567,7 +1568,7 @@ This report presents a comprehensive analysis of Model2Vec distillation experime
1567
  if self.model_specs:
1568
  successful_specs = {k: v for k, v in self.model_specs.items() if v.get("analysis_successful", False)}
1569
  if successful_specs:
1570
- report += f"""
1571
 
1572
  ### πŸ“Š Model Specifications Analysis
1573
 
 
496
  return
497
 
498
  # Find all our model directories
499
+ our_model_dirs = [
500
+ model_dir
501
+ for model_dir in final_models_dir.iterdir()
502
+ if model_dir.is_dir() and "code_model2vec" in model_dir.name
503
+ ]
504
 
505
  logger.info(f"πŸ“ Found {len(our_model_dirs)} distilled model directories")
506
 
 
1568
  if self.model_specs:
1569
  successful_specs = {k: v for k, v in self.model_specs.items() if v.get("analysis_successful", False)}
1570
  if successful_specs:
1571
+ report += """
1572
 
1573
  ### πŸ“Š Model Specifications Analysis
1574
 
src/distiller/distill.py CHANGED
@@ -866,7 +866,7 @@ def tokenlearn_training(
866
  student_model.save_pretrained(str(model_dir))
867
  logger.info(f"πŸ’Ύ Saved base model to {model_dir}")
868
 
869
- # Step 2: Create features using tokenlearn-featurize
870
  logger.info("πŸ” Step 2: Creating features using sentence transformer...")
871
 
872
  # Get teacher model name/path for tokenlearn
@@ -878,107 +878,153 @@ def tokenlearn_training(
878
  if hasattr(first_module, "auto_model") and hasattr(first_module.auto_model, "name_or_path"):
879
  teacher_model_name = first_module.auto_model.name_or_path
880
 
881
- if not teacher_model_name:
882
- logger.warning("⚠️ Could not determine teacher model name, using fallback")
883
- teacher_model_name = "BAAI/bge-base-en-v1.5" # Fallback to a common model
884
-
885
  logger.info(f"πŸ“Š Using teacher model: {teacher_model_name}")
886
 
887
- try:
888
- # Use configured dataset for code specialization
889
- featurize_cmd = [
890
- "python",
891
- "-m",
892
- "tokenlearn.featurize",
893
- "--model-name",
894
- str(teacher_model_name),
895
- "--output-dir",
896
- str(features_dir),
897
- "--dataset-path",
898
- str(distillation_config.tokenlearn_dataset),
899
- "--dataset-name",
900
- str(distillation_config.tokenlearn_dataset_name),
901
- "--dataset-split",
902
- "train",
903
- "--key",
904
- str(distillation_config.tokenlearn_text_key), # Use configured text field
905
- "--batch-size",
906
- "1024", # Optimized batch size for A100-40G
907
- ]
908
 
909
- logger.info("πŸ”„ Running tokenlearn featurization...")
910
- logger.info(
911
- f"πŸ“Š Dataset: {distillation_config.tokenlearn_dataset} (config: {distillation_config.tokenlearn_dataset_name})"
912
- )
913
- logger.info(f"πŸ“ Text field: {distillation_config.tokenlearn_text_key}")
914
- logger.info(f"Command: {' '.join(featurize_cmd)}")
915
- print(f"\nπŸ”„ Executing: {' '.join(featurize_cmd)}\n")
916
-
917
- result = subprocess.run( # noqa: S603
918
- featurize_cmd,
919
- text=True,
920
- timeout=distillation_config.tokenlearn_timeout_featurize,
921
- check=False,
922
- )
923
 
924
- if result.returncode != 0:
925
- logger.error(f"❌ Featurization failed with return code: {result.returncode}")
926
- logger.error("πŸ’₯ Tokenlearn featurization is required for training - cannot proceed")
927
- msg = f"Tokenlearn featurization failed with return code: {result.returncode}"
928
- raise RuntimeError(msg)
929
 
930
- logger.info("βœ… Featurization completed successfully")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
931
 
932
- # Generate token frequencies for post-training re-regularization
933
- logger.info("πŸ“Š Computing token frequencies for SIF weighting...")
934
- compute_token_frequencies_for_sif(teacher_model, features_dir)
 
 
 
 
 
 
 
 
 
 
 
935
 
936
- except Exception as e:
937
- logger.exception("πŸ’₯ Tokenlearn featurization failed")
938
- logger.exception("πŸ’₯ Tokenlearn featurization is required for training - cannot proceed")
939
- msg = f"Tokenlearn featurization failed: {e}"
940
- raise RuntimeError(msg) from e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
941
 
942
  # Step 3: Train using tokenlearn-train
943
  logger.info("πŸŽ“ Step 3: Training using tokenlearn...")
944
 
945
- try:
946
- train_cmd = [
947
- "python",
948
- "-m",
949
- "tokenlearn.train",
950
- "--model-name",
951
- str(teacher_model_name),
952
- "--data-path",
953
- str(features_dir),
954
- "--save-path",
955
- str(trained_dir),
956
- ]
 
 
 
 
 
 
 
 
957
 
958
- logger.info("πŸ”„ Running tokenlearn training...")
959
- logger.info(f"Command: {' '.join(train_cmd)}")
960
- print(f"\nπŸŽ“ Executing: {' '.join(train_cmd)}\n")
 
 
 
 
 
 
 
 
 
961
 
962
- result = subprocess.run( # noqa: S603
963
- train_cmd,
964
- text=True,
965
- timeout=distillation_config.tokenlearn_timeout_train,
966
- check=False,
967
- )
968
 
969
- if result.returncode != 0:
970
- logger.error(f"❌ Tokenlearn training failed with return code: {result.returncode}")
971
- logger.error("πŸ’₯ Tokenlearn training is required - cannot proceed")
972
- msg = f"Tokenlearn training failed with return code: {result.returncode}"
973
- raise RuntimeError(msg)
 
974
 
975
- logger.info("βœ… Tokenlearn training completed successfully")
 
 
 
 
976
 
977
- except Exception as e:
978
- logger.exception("πŸ’₯ Tokenlearn training failed")
979
- logger.exception("πŸ’₯ Tokenlearn training is required - cannot proceed")
980
- msg = f"Tokenlearn training failed: {e}"
981
- raise RuntimeError(msg) from e
 
 
 
 
 
 
982
 
983
  # Step 4: Load the trained model and apply post-training re-regularization
984
  logger.info("πŸ“¦ Step 4: Loading trained model and applying post-training re-regularization...")
@@ -1256,6 +1302,9 @@ def run_local_distillation(
1256
  if model in models_to_distill:
1257
  clear_model_cache(model)
1258
 
 
 
 
1259
  for teacher_model in models_to_distill:
1260
  result = distill_single_teacher(
1261
  teacher_model=teacher_model,
@@ -1453,6 +1502,9 @@ def main(
1453
  clear_cache: Annotated[
1454
  bool, typer.Option(help="Clear HuggingFace cache for problematic models before distillation")
1455
  ] = False,
 
 
 
1456
  ) -> None:
1457
  """Unified distillation command with optional training."""
1458
  logger.info("πŸš€ Starting unified Model2Vec distillation workflow")
@@ -1475,6 +1527,32 @@ def main(
1475
  if model in models_to_distill:
1476
  clear_model_cache(model)
1477
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1478
  # Run distillation workflow
1479
  if use_beam:
1480
  results = run_beam_distillation(
@@ -1822,5 +1900,52 @@ def baai_bge_model_distillation(
1822
  return None
1823
 
1824
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1825
  if __name__ == "__main__":
1826
  typer.run(main)
 
866
  student_model.save_pretrained(str(model_dir))
867
  logger.info(f"πŸ’Ύ Saved base model to {model_dir}")
868
 
869
+ # Step 2: Create features using sentence transformer
870
  logger.info("πŸ” Step 2: Creating features using sentence transformer...")
871
 
872
  # Get teacher model name/path for tokenlearn
 
878
  if hasattr(first_module, "auto_model") and hasattr(first_module.auto_model, "name_or_path"):
879
  teacher_model_name = first_module.auto_model.name_or_path
880
 
 
 
 
 
881
  logger.info(f"πŸ“Š Using teacher model: {teacher_model_name}")
882
 
883
+ # Check if featurization already completed (checkpoint detection)
884
+ featurization_complete_marker = features_dir / ".featurization_complete"
885
+ if featurization_complete_marker.exists() and verify_featurization_output(features_dir):
886
+ logger.info("βœ… Found existing featurization checkpoint with valid output files")
887
+ logger.info(f"πŸ“‚ Using cached features from: {features_dir}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
888
 
889
+ # Verify marker is still valid
890
+ output_files = list(features_dir.glob("*.npy")) + list(features_dir.glob("*.json"))
891
+ logger.info(f"πŸ“ Found {len(output_files)} cached feature files")
892
+ else:
893
+ if featurization_complete_marker.exists():
894
+ logger.warning("⚠️ Featurization marker exists but output files are missing - re-running featurization")
895
+ featurization_complete_marker.unlink()
896
+ logger.info("πŸ”„ No valid featurization checkpoint found - starting featurization...")
897
+
898
+ if not teacher_model_name:
899
+ logger.warning("⚠️ Could not determine teacher model name, using fallback")
900
+ teacher_model_name = "BAAI/bge-base-en-v1.5" # Fallback to a common model
 
 
901
 
902
+ logger.info(f"πŸ“Š Using teacher model: {teacher_model_name}")
 
 
 
 
903
 
904
+ try:
905
+ # Use configured dataset for code specialization
906
+ featurize_cmd = [
907
+ "python",
908
+ "-m",
909
+ "tokenlearn.featurize",
910
+ "--model-name",
911
+ str(teacher_model_name),
912
+ "--output-dir",
913
+ str(features_dir),
914
+ "--dataset-path",
915
+ str(distillation_config.tokenlearn_dataset),
916
+ "--dataset-name",
917
+ str(distillation_config.tokenlearn_dataset_name),
918
+ "--dataset-split",
919
+ "train",
920
+ "--key",
921
+ str(distillation_config.tokenlearn_text_key), # Use configured text field
922
+ "--batch-size",
923
+ "1024", # Optimized batch size for A100-40G
924
+ ]
925
 
926
+ logger.info("πŸ”„ Running tokenlearn featurization...")
927
+ logger.info(
928
+ f"πŸ“Š Dataset: {distillation_config.tokenlearn_dataset} (config: {distillation_config.tokenlearn_dataset_name})"
929
+ )
930
+ logger.info(f"πŸ“ Text field: {distillation_config.tokenlearn_text_key}")
931
+ logger.info(f"Command: {' '.join(featurize_cmd)}")
932
+ print(f"\nπŸ”„ Executing: {' '.join(featurize_cmd)}\n")
933
+
934
+ result = subprocess.run( # noqa: S603
935
+ featurize_cmd,
936
+ text=True,
937
+ timeout=distillation_config.tokenlearn_timeout_featurize,
938
+ check=False,
939
+ )
940
 
941
+ if result.returncode != 0:
942
+ logger.error(f"❌ Featurization failed with return code: {result.returncode}")
943
+ logger.error("πŸ’₯ Tokenlearn featurization is required for training - cannot proceed")
944
+ msg = f"Tokenlearn featurization failed with return code: {result.returncode}"
945
+ raise RuntimeError(msg)
946
+
947
+ logger.info("βœ… Featurization completed successfully")
948
+
949
+ # Create checkpoint marker to indicate featurization is complete
950
+ featurization_complete_marker.touch()
951
+ logger.info(f"πŸ’Ύ Created featurization checkpoint: {featurization_complete_marker}")
952
+
953
+ # Generate token frequencies for post-training re-regularization
954
+ logger.info("πŸ“Š Computing token frequencies for SIF weighting...")
955
+ compute_token_frequencies_for_sif(teacher_model, features_dir)
956
+
957
+ except Exception as e:
958
+ logger.exception("πŸ’₯ Tokenlearn featurization failed")
959
+ logger.exception("πŸ’₯ Tokenlearn featurization is required for training - cannot proceed")
960
+ msg = f"Tokenlearn featurization failed: {e}"
961
+ raise RuntimeError(msg) from e
962
 
963
  # Step 3: Train using tokenlearn-train
964
  logger.info("πŸŽ“ Step 3: Training using tokenlearn...")
965
 
966
+ # Check if training already completed (checkpoint detection)
967
+ training_complete_marker = trained_dir / ".training_complete"
968
+ if training_complete_marker.exists() and verify_training_output(trained_dir):
969
+ logger.info("βœ… Found existing training checkpoint with valid model files")
970
+ logger.info(f"πŸ“‚ Using cached trained model from: {trained_dir}")
971
+
972
+ # Show available model files
973
+ model_files = []
974
+ for pattern in ["*.json", "*.safetensors", "*.bin"]:
975
+ model_files.extend(list(trained_dir.glob(pattern)))
976
+ for subdir in ["model", "model_weighted"]:
977
+ subdir_path = trained_dir / subdir
978
+ if subdir_path.exists():
979
+ model_files.extend(list(subdir_path.glob(pattern)))
980
+ logger.info(f"πŸ“ Found {len(model_files)} cached model files")
981
+ else:
982
+ if training_complete_marker.exists():
983
+ logger.warning("⚠️ Training marker exists but model files are missing - re-running training")
984
+ training_complete_marker.unlink()
985
+ logger.info("πŸ”„ No valid training checkpoint found - starting training...")
986
 
987
+ try:
988
+ train_cmd = [
989
+ "python",
990
+ "-m",
991
+ "tokenlearn.train",
992
+ "--model-name",
993
+ str(teacher_model_name),
994
+ "--data-path",
995
+ str(features_dir),
996
+ "--save-path",
997
+ str(trained_dir),
998
+ ]
999
 
1000
+ logger.info("πŸ”„ Running tokenlearn training...")
1001
+ logger.info(f"Command: {' '.join(train_cmd)}")
1002
+ print(f"\nπŸŽ“ Executing: {' '.join(train_cmd)}\n")
 
 
 
1003
 
1004
+ result = subprocess.run( # noqa: S603
1005
+ train_cmd,
1006
+ text=True,
1007
+ timeout=distillation_config.tokenlearn_timeout_train,
1008
+ check=False,
1009
+ )
1010
 
1011
+ if result.returncode != 0:
1012
+ logger.error(f"❌ Tokenlearn training failed with return code: {result.returncode}")
1013
+ logger.error("πŸ’₯ Tokenlearn training is required - cannot proceed")
1014
+ msg = f"Tokenlearn training failed with return code: {result.returncode}"
1015
+ raise RuntimeError(msg)
1016
 
1017
+ logger.info("βœ… Tokenlearn training completed successfully")
1018
+
1019
+ # Create checkpoint marker to indicate training is complete
1020
+ training_complete_marker.touch()
1021
+ logger.info(f"πŸ’Ύ Created training checkpoint: {training_complete_marker}")
1022
+
1023
+ except Exception as e:
1024
+ logger.exception("πŸ’₯ Tokenlearn training failed")
1025
+ logger.exception("πŸ’₯ Tokenlearn training is required - cannot proceed")
1026
+ msg = f"Tokenlearn training failed: {e}"
1027
+ raise RuntimeError(msg) from e
1028
 
1029
  # Step 4: Load the trained model and apply post-training re-regularization
1030
  logger.info("πŸ“¦ Step 4: Loading trained model and applying post-training re-regularization...")
 
1302
  if model in models_to_distill:
1303
  clear_model_cache(model)
1304
 
1305
+ # Clear tokenlearn checkpoints if requested (for training mode)
1306
+ # Note: Checkpoint clearing is handled at the main function level
1307
+ # Run distillation workflow
1308
  for teacher_model in models_to_distill:
1309
  result = distill_single_teacher(
1310
  teacher_model=teacher_model,
 
1502
  clear_cache: Annotated[
1503
  bool, typer.Option(help="Clear HuggingFace cache for problematic models before distillation")
1504
  ] = False,
1505
+ clear_checkpoints: Annotated[
1506
+ bool, typer.Option(help="Clear tokenlearn checkpoints to force fresh featurization and training")
1507
+ ] = False,
1508
  ) -> None:
1509
  """Unified distillation command with optional training."""
1510
  logger.info("πŸš€ Starting unified Model2Vec distillation workflow")
 
1527
  if model in models_to_distill:
1528
  clear_model_cache(model)
1529
 
1530
+ # Clear tokenlearn checkpoints if requested (for training mode)
1531
+ if clear_checkpoints and train:
1532
+ import tempfile
1533
+
1534
+ logger.info("🧹 Clearing tokenlearn checkpoints to force fresh featurization and training...")
1535
+ for teacher_model in models_to_distill:
1536
+ teacher_name = teacher_model.split("/")[-1].replace("-", "_")
1537
+
1538
+ # Construct checkpoint paths using secure temporary directory
1539
+ temp_dir = Path(tempfile.gettempdir()) / f"tokenlearn_{teacher_name}"
1540
+ features_dir = temp_dir / "features"
1541
+ trained_dir = temp_dir / "trained"
1542
+
1543
+ # Also check local paths
1544
+ local_temp = Path("temp") / f"tokenlearn_{teacher_name}"
1545
+ local_features = local_temp / "features"
1546
+ local_trained = local_temp / "trained"
1547
+
1548
+ # Clear checkpoints for all possible paths
1549
+ for feat_dir, train_dir in [(features_dir, trained_dir), (local_features, local_trained)]:
1550
+ if feat_dir.exists() or train_dir.exists():
1551
+ clear_tokenlearn_checkpoints(feat_dir, train_dir)
1552
+ logger.info(f"πŸ—‘οΈ Cleared checkpoints for {teacher_model}")
1553
+ elif clear_checkpoints and not train:
1554
+ logger.warning("⚠️ --clear-checkpoints flag is only relevant when training is enabled (--train)")
1555
+
1556
  # Run distillation workflow
1557
  if use_beam:
1558
  results = run_beam_distillation(
 
1900
  return None
1901
 
1902
 
1903
def clear_tokenlearn_checkpoints(features_dir: Path, trained_dir: Path) -> None:
    """Clear tokenlearn checkpoint markers to force re-execution of steps.

    Removes the ``.featurization_complete`` marker under *features_dir* and
    the ``.training_complete`` marker under *trained_dir* when they exist, so
    the next run re-executes featurization and training instead of reusing
    cached results. Missing markers are ignored.
    """
    featurization_marker = features_dir / ".featurization_complete"
    training_marker = trained_dir / ".training_complete"

    if featurization_marker.exists():
        # missing_ok guards against a concurrent run deleting the marker
        # between the exists() check and the unlink (TOCTOU race).
        featurization_marker.unlink(missing_ok=True)
        logger.info(f"πŸ—‘οΈ Cleared featurization checkpoint: {featurization_marker}")

    if training_marker.exists():
        training_marker.unlink(missing_ok=True)
        logger.info(f"πŸ—‘οΈ Cleared training checkpoint: {training_marker}")
1915
+
1916
+
1917
def verify_featurization_output(features_dir: Path) -> bool:
    """Verify that featurization output files actually exist.

    Returns ``True`` when *features_dir* exists and contains at least one file
    matching the expected tokenlearn output patterns
    (``*.npy``, ``*.json``, ``*.pt``, ``*.pkl``).
    """
    if not features_dir.exists():
        return False

    # Stop at the first match instead of materializing a full list per
    # pattern — feature directories can hold many shard files.
    patterns = ("*.npy", "*.json", "*.pt", "*.pkl")
    return any(next(features_dir.glob(pattern), None) is not None for pattern in patterns)
1926
+
1927
+
1928
def verify_training_output(trained_dir: Path) -> bool:
    """Verify that training output model files actually exist.

    Checks *trained_dir* itself plus the alternative ``model`` and
    ``model_weighted`` subdirectories tokenlearn may write to, and returns
    ``True`` when any of the expected model artifacts is present.
    """
    if not trained_dir.exists():
        return False

    model_files = ("config.json", "model.safetensors", "modules.json", "tokenizer.json")
    # tokenlearn may save flat in trained_dir or under a subdirectory; a
    # missing subdirectory simply yields no existing candidate paths.
    candidate_dirs = (trained_dir, trained_dir / "model", trained_dir / "model_weighted")
    return any((candidate / name).exists() for candidate in candidate_dirs for name in model_files)
1948
+
1949
+
1950
  if __name__ == "__main__":
1951
  typer.run(main)