Sarthak committed on
Commit
ea0b2a0
·
1 Parent(s): 0b74f1f

feat: created a cli to manage the complete generation process

Browse files
patches/model2vec.patch ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ --- a/model2vec/train/base.py
2
+ +++ b/model2vec/train/base.py
3
+ @@ -35,7 +35,7 @@ class FinetunableStaticModel(nn.Module):
4
+ )
5
+ self.vectors = vectors.float()
6
+
7
+ - self.embeddings = nn.Embedding.from_pretrained(vectors.clone(), freeze=False, padding_idx=pad_id)
8
+ + self.embeddings = nn.Embedding.from_pretrained(self.vectors.clone(), freeze=False, padding_idx=pad_id)
9
+ self.head = self.construct_head()
10
+ self.w = self.construct_weights()
11
+ self.tokenizer = tokenizer
12
+ --- a/model2vec/distill/distillation.py
13
+ +++ b/model2vec/distill/distillation.py
14
+ @@ -137,7 +137,10 @@ def distill_from_model(
15
+ # Get the language from the model card.
16
+ try:
17
+ info = model_info(model_name)
18
+ - language = info.cardData.get("language", None)
19
+ + if info is not None and hasattr(info, 'cardData') and info.cardData is not None:
20
+ + language = info.cardData.get("language", None)
21
+ + else:
22
+ + language = None
23
+ except RepositoryNotFoundError:
24
+ logger.info("No model info found for the model. Setting language to None.")
25
+ language = None
26
+ --- a/model2vec/distill/inference.py
27
+ +++ b/model2vec/distill/inference.py
28
+ @@ -109,5 +109,12 @@ def create_embeddings(
29
+ out_tokens.extend([Token(x, False) for x in tokens])
30
+ out_weights = np.stack(intermediate_weights)
31
+
32
+ + # Validate token-vector consistency to prevent failures
33
+ + if len(out_tokens) != out_weights.shape[0]:
34
+ + logger.warning(f"Token-vector mismatch: {len(out_tokens)} tokens vs {out_weights.shape[0]} vectors. Truncating to prevent failure.")
35
+ + min_count = min(len(out_tokens), out_weights.shape[0])
36
+ + out_tokens = out_tokens[:min_count]
37
+ + out_weights = out_weights[:min_count]
38
+ +
39
+ return out_tokens, out_weights
src/distiller/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ """Model2Vec Distillation Pipeline for gte-Qwen2-7B-instruct."""
2
+
3
+ __version__ = "0.1.0"
4
+
5
+ from .distill import beam_code_distillation, code_specialized_distillation
6
+
7
+ __all__ = ["beam_code_distillation", "code_specialized_distillation"]
src/distiller/__main__.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Main entry point for the distiller package."""
2
+
3
+ import argparse
4
+ import sys
5
+
6
+
7
+ def main() -> None:
8
+ """Main entry point for the distiller package."""
9
+ parser = argparse.ArgumentParser(description="Model2Vec Code-Specialized Distillation Pipeline")
10
+ subparsers = parser.add_subparsers(dest="command", help="Available commands")
11
+
12
+ # Distillation command
13
+ distill_parser = subparsers.add_parser("distill", help="Run code-specialized model distillation")
14
+ distill_parser.add_argument("--model", default="Alibaba-NLP/gte-Qwen2-7B-instruct", help="Model to distill")
15
+ distill_parser.add_argument("--output-dir", default="gte_qwen2_m2v_code", help="Output directory")
16
+ distill_parser.add_argument("--pca-dims", type=int, default=512, help="PCA dimensions")
17
+ distill_parser.add_argument("--max-samples", type=int, default=50000, help="Max CodeSearchNet samples")
18
+ distill_parser.add_argument("--use-beam", action="store_true", help="Use Beam for cloud GPU distillation")
19
+
20
+ # Simplified distillation command
21
+ simple_parser = subparsers.add_parser("distill-simple", help="Run simplified Model2Vec distillation (local)")
22
+ simple_parser.add_argument(
23
+ "--teacher", default="sentence-transformers/all-MiniLM-L6-v2", help="Teacher model to distill from"
24
+ )
25
+ simple_parser.add_argument("--output-dir", default="gte_qwen2_m2v_code_simplified", help="Output directory")
26
+ simple_parser.add_argument("--pca-dims", type=int, default=256, help="PCA dimensions")
27
+
28
+ # CodeSearchNet evaluation command
29
+ evaluate_parser = subparsers.add_parser("evaluate", help="Run CodeSearchNet evaluation on all default models")
30
+ evaluate_parser.add_argument("--use-beam", action="store_true", help="Use Beam for cloud evaluation")
31
+
32
+ # CodeSearchNet evaluation command (simplified models only)
33
+ evaluate_simple_parser = subparsers.add_parser(
34
+ "evaluate-simple", help="Run CodeSearchNet evaluation on simplified models only"
35
+ )
36
+ evaluate_simple_parser.add_argument("--use-beam", action="store_true", help="Use Beam for cloud evaluation")
37
+
38
+ # Analysis command
39
+ analysis_parser = subparsers.add_parser("analyze", help="Generate CodeSearchNet analysis report")
40
+ analysis_parser.add_argument("--results-dir", default="code_evaluation_results", help="Results directory")
41
+ analysis_parser.add_argument("--results-file", help="Single results file to analyze")
42
+ analysis_parser.add_argument("--model-name", default="gte_qwen2_m2v_code", help="Model name for report")
43
+ analysis_parser.add_argument("--output", default="README.md", help="Output report file")
44
+ analysis_parser.add_argument("--export-csv", help="Export comparison results to CSV")
45
+ analysis_parser.add_argument("--use-beam", action="store_true", help="Use Beam for cloud analysis")
46
+
47
+ # Sync command
48
+ sync_parser = subparsers.add_parser("sync", help="Download files from Beam volume to local directory")
49
+ sync_parser.add_argument("--model-files", action="store_true", help="Download final model files")
50
+ sync_parser.add_argument(
51
+ "--analysis-files",
52
+ action="store_true",
53
+ help="Download analysis reports and charts",
54
+ )
55
+ sync_parser.add_argument("--all", action="store_true", help="Download all generated files")
56
+ sync_parser.add_argument("--output-dir", default=".", help="Local output directory")
57
+
58
+ # Benchmark command
59
+ benchmark_parser = subparsers.add_parser("benchmark", help="Run performance benchmarking on all default models")
60
+ benchmark_parser.add_argument("--use-beam", action="store_true", help="Use Beam for cloud benchmarking")
61
+
62
+ # Benchmark command (simplified models only)
63
+ benchmark_simple_parser = subparsers.add_parser(
64
+ "benchmark-simple", help="Run performance benchmarking on simplified models only"
65
+ )
66
+ benchmark_simple_parser.add_argument("--use-beam", action="store_true", help="Use Beam for cloud benchmarking")
67
+
68
+ args = parser.parse_args()
69
+
70
+ if args.command == "distill":
71
+ from .distill_simplified import run_local_distillation, beam_distill_all_teachers
72
+
73
+ if args.use_beam:
74
+ # Run on Beam
75
+ print("Running comprehensive teacher model distillation on Beam...")
76
+ results = beam_distill_all_teachers()
77
+ else:
78
+ # Run locally
79
+ print("Running comprehensive teacher model distillation locally...")
80
+ results = run_local_distillation()
81
+
82
+ print(f"✅ Distillation complete! Created {results['total_successful']} models")
83
+ print("📁 Models location: ./code_model2vec/final/")
84
+ print("\n✅ Created models:")
85
+ for model_name in results["successful_models"]:
86
+ model_info = results["all_results"][model_name]
87
+ print(f" • {model_name} (from {model_info['teacher_model']})")
88
+
89
+ elif args.command == "distill-simple":
90
+ from .distill_simplified import run_local_distillation
91
+
92
+ # Run simplified distillation for all teacher models locally
93
+ print("Running comprehensive teacher model distillation locally...")
94
+ results = run_local_distillation()
95
+ print(f"✅ Distillation complete! Created {results['total_successful']} models")
96
+ print("📁 Models location: ./code_model2vec/final/")
97
+ print("\n✅ Created models:")
98
+ for model_name in results["successful_models"]:
99
+ model_info = results["all_results"][model_name]
100
+ print(f" • {model_name} (from {model_info['teacher_model']})")
101
+
102
+ elif args.command == "evaluate":
103
+ from .evaluate import main as evaluate_main, run_local_evaluation
104
+
105
+ if args.use_beam:
106
+ # Run on Beam with all default models
107
+ print("Running comprehensive evaluation on Beam...")
108
+ evaluate_main()
109
+ else:
110
+ # Run locally with all default models
111
+ print("Running comprehensive evaluation locally...")
112
+ run_local_evaluation()
113
+
114
+ elif args.command == "evaluate-simple":
115
+ from .evaluate import evaluate_simplified_only, run_local_evaluation_simplified
116
+
117
+ if args.use_beam:
118
+ # Run on Beam with simplified models only
119
+ print("Running simplified model evaluation on Beam...")
120
+ evaluate_simplified_only()
121
+ else:
122
+ # Run locally with simplified models only
123
+ print("Running simplified model evaluation locally...")
124
+ run_local_evaluation_simplified()
125
+
126
+ elif args.command == "analyze":
127
+ from .analyze import main as analyze_main
128
+
129
+ # Run locally - Override sys.argv to pass arguments to the analyze script
130
+ sys.argv = ["analyze.py"]
131
+ if args.results_dir != "code_evaluation_results":
132
+ sys.argv.extend(["--results-dir", args.results_dir])
133
+ if args.results_file:
134
+ sys.argv.extend(["--results-file", args.results_file])
135
+ if args.model_name != "gte_qwen2_m2v_code":
136
+ sys.argv.extend(["--model-name", args.model_name])
137
+ if args.output != "README.md":
138
+ sys.argv.extend(["--output", args.output])
139
+ if args.export_csv:
140
+ sys.argv.extend(["--export-csv", args.export_csv])
141
+ analyze_main()
142
+
143
+ elif args.command == "sync":
144
+ from .sync import sync_files
145
+
146
+ # Run locally
147
+ sync_files(
148
+ model_files=args.model_files,
149
+ analysis_files=args.analysis_files,
150
+ all_files=args.all,
151
+ output_dir=args.output_dir,
152
+ )
153
+
154
+ elif args.command == "benchmark":
155
+ from .benchmark import main as benchmark_main, run_local_benchmark
156
+
157
+ if args.use_beam:
158
+ # Run on Beam with all default models
159
+ print("Running comprehensive benchmarking on Beam...")
160
+ benchmark_main()
161
+ else:
162
+ # Run locally with all default models
163
+ print("Running comprehensive benchmarking locally...")
164
+ run_local_benchmark()
165
+
166
+ elif args.command == "benchmark-simple":
167
+ from .benchmark import benchmark_simplified_only, run_local_benchmark_simplified
168
+
169
+ if args.use_beam:
170
+ # Run on Beam with simplified models only
171
+ print("Running simplified model benchmarking on Beam...")
172
+ benchmark_simplified_only()
173
+ else:
174
+ # Run locally with simplified models only
175
+ print("Running simplified model benchmarking locally...")
176
+ run_local_benchmark_simplified()
177
+
178
+ else:
179
+ parser.print_help()
180
+
181
+
182
+ if __name__ == "__main__":
183
+ main()
src/distiller/analyze.py ADDED
@@ -0,0 +1,1495 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Comprehensive CodeSearchNet Analysis and Reporting Script.
3
+
4
+ This script provides a complete CodeSearchNet evaluation pipeline that includes:
5
+ 1. Model evaluation results analysis
6
+ 2. Peer model comparison analysis
7
+ 3. Advanced visualizations and charts
8
+ 4. Leaderboard comparison and ranking analysis
9
+ 5. Comprehensive README report generation
10
+ 6. Performance efficiency analysis
11
+ 7. Language-specific performance analysis
12
+
13
+ Features:
14
+ - CodeSearchNet-style scoring (NDCG@10, MRR, Recall metrics)
15
+ - Comparison with peer code-specialized models
16
+ - Model efficiency metrics (performance per parameter)
17
+ - Interactive visualizations with Plotly and Matplotlib
18
+ - Professional charts for README integration
19
+ - Statistical analysis of results across programming languages
20
+
21
+ Usage:
22
+ python analyze.py --results-dir results/ --model-name my_model
23
+ distiller analyze --results-dir evaluation_results
24
+ """
25
+
26
+ import argparse
27
+ import json
28
+ import logging
29
+ import time
30
+ from pathlib import Path
31
+ from typing import Any
32
+
33
+ import matplotlib.pyplot as plt
34
+ import numpy as np
35
+ import pandas as pd
36
+ import seaborn as sns
37
+
38
+ # Optional Plotly import with fallback
39
+ PLOTLY_AVAILABLE = True
40
+ try:
41
+ import plotly.graph_objects as go
42
+ except ImportError:
43
+ PLOTLY_AVAILABLE = False
44
+
45
+ # Set plotting style
46
+ try:
47
+ plt.style.use("seaborn-v0_8")
48
+ except OSError:
49
+ plt.style.use("seaborn") # Fallback for older matplotlib versions
50
+ sns.set_palette("husl")
51
+
52
+ # =============================================================================
53
+ # CONFIGURATION
54
+ # =============================================================================
55
+
56
+ # Constants
57
+ MIN_SCORES_FOR_STATS = 2
58
+ HIGH_PERFORMANCE_THRESHOLD = 0.3
59
+ MEDIUM_PERFORMANCE_THRESHOLD = 0.2
60
+
61
+ # Model Configuration
62
+ MODEL_NAME = "code_model2vec_analysis" # Generic name for multi-model analysis
63
+ ORIGINAL_MODEL_NAME = "Alibaba-NLP/gte-Qwen2-7B-instruct"
64
+ OUTPUT_DIR = Path("analysis_results")
65
+ IMAGES_DIR = Path("analysis_charts")
66
+ REPORT_FILE = Path("REPORT.md") # Changed from README.md
67
+
68
+ # Local directories for results - updated for new structure
69
+ DEFAULT_EVALUATION_DIR = "code_model2vec/evaluation_results"
70
+ DEFAULT_BENCHMARK_DIR = "code_model2vec/benchmark_results"
71
+
72
+ # CodeSearchNet Languages
73
+ CODE_LANGUAGES = ["python", "javascript", "java", "php", "ruby", "go"]
74
+
75
+ # Model name mapping from the default models in evaluate.py and benchmark.py
76
+ MODEL_NAME_MAPPING = {
77
+ # File names to display names
78
+ "gte_qwen2_m2v_code": "gte_qwen2_m2v_code (Ours)",
79
+ "all-MiniLM-L6-v2": "sentence-transformers/all-MiniLM-L6-v2",
80
+ "codebert-base": "microsoft/codebert-base",
81
+ "graphcodebert-base": "microsoft/graphcodebert-base",
82
+ "CodeBERTa-small-v1": "huggingface/CodeBERTa-small-v1",
83
+ "all-mpnet-base-v2": "sentence-transformers/all-mpnet-base-v2",
84
+ "all-MiniLM-L12-v2": "sentence-transformers/all-MiniLM-L12-v2",
85
+ "potion-base-8M": "minishlab/potion-base-8M",
86
+ "potion-retrieval-32M": "minishlab/potion-retrieval-32M",
87
+ "codet5-base": "Salesforce/codet5-base",
88
+ }
89
+
90
+ # Reverse mapping for lookups
91
+ DISPLAY_NAME_TO_FILE = {v: k for k, v in MODEL_NAME_MAPPING.items()}
92
+
93
+ # Peer models for comparison (code-specialized models)
94
+ PEER_MODELS = {
95
+ "sentence-transformers/all-MiniLM-L6-v2": {"overall_ndcg": 0.25, "type": "General"},
96
+ "microsoft/codebert-base": {"overall_ndcg": 0.32, "type": "Code-Specific"},
97
+ "microsoft/graphcodebert-base": {"overall_ndcg": 0.35, "type": "Code-Specific"},
98
+ "huggingface/CodeBERTa-small-v1": {"overall_ndcg": 0.28, "type": "Code-Specific"},
99
+ "sentence-transformers/all-mpnet-base-v2": {"overall_ndcg": 0.27, "type": "General"},
100
+ }
101
+
102
+ # Model specifications for efficiency analysis
103
+ MODEL_SPECS = {
104
+ "sentence-transformers/all-MiniLM-L6-v2": {"parameters": 22.7, "size_mb": 90},
105
+ "microsoft/codebert-base": {"parameters": 125.0, "size_mb": 500},
106
+ "microsoft/graphcodebert-base": {"parameters": 125.0, "size_mb": 500},
107
+ "huggingface/CodeBERTa-small-v1": {"parameters": 84.0, "size_mb": 340},
108
+ "sentence-transformers/all-mpnet-base-v2": {"parameters": 109.0, "size_mb": 440},
109
+ "Alibaba-NLP/gte-Qwen2-7B-instruct": {"parameters": 7000.0, "size_mb": 13000},
110
+ }
111
+
112
+ # Distilled model specifications
113
+ DISTILLED_MODEL_SPECS = {
114
+ "parameters": 39.0, # Model2Vec parameters
115
+ "size_mb": 149.0, # Actual model size
116
+ "dimensions": 256, # Model2Vec dimensions
117
+ "original_dimensions": 3584,
118
+ "distillation_method": "Model2Vec",
119
+ "training_dataset": "CodeSearchNet",
120
+ }
121
+
122
+ # =============================================================================
123
+ # UTILITY FUNCTIONS
124
+ # =============================================================================
125
+
126
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
127
+ logger = logging.getLogger(__name__)
128
+
129
+
130
+ def setup_directories(base_path: Path | None = None) -> tuple[Path, Path, Path]:
131
+ """Create necessary directories and return their paths."""
132
+ if base_path:
133
+ output_dir = base_path / "analysis_results"
134
+ images_dir = base_path / "analysis_results" / "charts"
135
+ reports_dir = base_path / "analysis_results" / "reports"
136
+ else:
137
+ output_dir = OUTPUT_DIR
138
+ images_dir = IMAGES_DIR
139
+ reports_dir = OUTPUT_DIR / "reports"
140
+
141
+ output_dir.mkdir(parents=True, exist_ok=True)
142
+ images_dir.mkdir(parents=True, exist_ok=True)
143
+ reports_dir.mkdir(parents=True, exist_ok=True)
144
+
145
+ return output_dir, images_dir, reports_dir
146
+
147
+
148
+ def extract_model_name_from_filename(filename: str) -> str:
149
+ """Extract and map model name from filename."""
150
+ # Remove prefixes and extensions
151
+ name = filename.replace("codesearchnet_eval_", "").replace("benchmark_", "").replace(".json", "")
152
+
153
+ # Check if it's in our mapping
154
+ if name in MODEL_NAME_MAPPING:
155
+ return MODEL_NAME_MAPPING[name]
156
+
157
+ # Try to find partial matches
158
+ for file_key, display_name in MODEL_NAME_MAPPING.items():
159
+ if file_key in name or name in file_key:
160
+ return display_name
161
+
162
+ # If no mapping found, return the cleaned name
163
+ return name
164
+
165
+
166
+ class CodeSearchNetAnalyzer:
167
+ """Analyzer for CodeSearchNet evaluation results and performance benchmarks."""
168
+
169
+ def __init__(
170
+ self,
171
+ results_dir: str | None = None,
172
+ benchmark_dir: str | None = None,
173
+ images_dir: Path | None = None,
174
+ ) -> None:
175
+ """Initialize analyzer with results directories."""
176
+ self.results_dir = Path(results_dir) if results_dir else Path(DEFAULT_EVALUATION_DIR)
177
+ self.benchmark_dir = Path(benchmark_dir) if benchmark_dir else Path(DEFAULT_BENCHMARK_DIR)
178
+ self.images_dir = images_dir or IMAGES_DIR
179
+ self.results: list[dict[str, Any]] = []
180
+ self.benchmark_results: list[dict[str, Any]] = []
181
+ self.comparison_df: pd.DataFrame | None = None
182
+ self.benchmark_df: pd.DataFrame | None = None
183
+
184
+ def load_benchmark_results(self) -> None:
185
+ """Load benchmark results from local directory."""
186
+ logger.info("📊 Loading benchmark results...")
187
+
188
+ if not self.benchmark_dir.exists():
189
+ logger.warning(f"Benchmark directory not found: {self.benchmark_dir}")
190
+ return
191
+
192
+ logger.info(f"🔍 Searching for benchmark files in: {self.benchmark_dir}")
193
+ benchmark_files = list(self.benchmark_dir.glob("benchmark_*.json"))
194
+ logger.info(f"📁 Found {len(benchmark_files)} benchmark files")
195
+
196
+ for benchmark_file_path in benchmark_files:
197
+ try:
198
+ logger.info(f"📖 Loading: {benchmark_file_path.name}")
199
+ with benchmark_file_path.open() as f:
200
+ data = json.load(f)
201
+ if data is not None:
202
+ # Update model name with proper mapping
203
+ original_name = data.get("model_name", "Unknown")
204
+ mapped_name = extract_model_name_from_filename(benchmark_file_path.stem)
205
+ data["model_name"] = mapped_name
206
+ data["original_model_name"] = original_name
207
+
208
+ self.benchmark_results.append(data)
209
+ logger.info(f"✅ Successfully loaded: {mapped_name}")
210
+ except (json.JSONDecodeError, KeyError) as e:
211
+ logger.warning(f"❌ Failed to load {benchmark_file_path}: {e}")
212
+
213
+ logger.info(f"📊 Total benchmark results loaded: {len(self.benchmark_results)}")
214
+ if self.benchmark_results:
215
+ model_names = [r.get("model_name", "Unknown") for r in self.benchmark_results]
216
+ logger.info(f"🎯 Benchmark models found: {', '.join(model_names)}")
217
+
218
+ self._create_benchmark_dataframe()
219
+
220
+ def _create_benchmark_dataframe(self) -> None:
221
+ """Create benchmark comparison DataFrame from results."""
222
+ if not self.benchmark_results:
223
+ return
224
+
225
+ benchmark_data = []
226
+ for result in self.benchmark_results:
227
+ model_name = result.get("model_name", "Unknown")
228
+ size_metrics = result.get("size_metrics", {})
229
+ speed_benchmarks = result.get("speed_benchmarks", {})
230
+ memory_benchmarks = result.get("memory_benchmarks", {})
231
+ cpu_vs_gpu = result.get("cpu_vs_gpu", {})
232
+
233
+ # Extract key metrics
234
+ row = {
235
+ "Model": model_name,
236
+ "Disk_Size_MB": size_metrics.get("disk_size_mb", 0),
237
+ "Parameters_M": size_metrics.get("parameters_millions", 0),
238
+ "Embedding_Dim": size_metrics.get("embedding_dim", 0),
239
+ "RAM_Usage_MB": size_metrics.get("ram_usage_mb", 0),
240
+ "GPU_Memory_MB": size_metrics.get("gpu_memory_mb", 0),
241
+ }
242
+
243
+ # Speed metrics (medium texts, batch 32)
244
+ if "medium" in speed_benchmarks and "batch_32" in speed_benchmarks["medium"]:
245
+ batch_32 = speed_benchmarks["medium"]["batch_32"]
246
+ row.update(
247
+ {
248
+ "Throughput_TextsPerSec": batch_32.get("texts_per_second", 0),
249
+ "Latency_MsPerText": batch_32.get("time_per_text_ms", 0),
250
+ "TokenSpeed_TokensPerSec": batch_32.get("tokens_per_second", 0),
251
+ }
252
+ )
253
+
254
+ # Memory scaling (batch 32)
255
+ if "batch_32" in memory_benchmarks:
256
+ batch_32_mem = memory_benchmarks["batch_32"]
257
+ if not batch_32_mem.get("oom", False) and "error" not in batch_32_mem:
258
+ row.update(
259
+ {
260
+ "Memory_Used_MB": batch_32_mem.get("memory_used_mb", 0),
261
+ "Memory_Per_Text_MB": batch_32_mem.get("memory_per_text_mb", 0),
262
+ }
263
+ )
264
+
265
+ # CPU vs GPU comparison
266
+ for device in ["cpu", "cuda"]:
267
+ if device in cpu_vs_gpu and "error" not in cpu_vs_gpu[device]:
268
+ device_key = f"{device.upper()}_TextsPerSec"
269
+ row[device_key] = cpu_vs_gpu[device].get("texts_per_second", 0)
270
+
271
+ benchmark_data.append(row)
272
+
273
+ self.benchmark_df = pd.DataFrame(benchmark_data)
274
+
275
+ def load_results(self) -> None:
276
+ """Load evaluation results from local directory."""
277
+ logger.info("🔍 Loading evaluation results...")
278
+
279
+ if not self.results_dir.exists():
280
+ logger.warning(f"Evaluation directory not found: {self.results_dir}")
281
+ return
282
+
283
+ logger.info(f"🔍 Searching for evaluation files in: {self.results_dir}")
284
+ json_files = list(self.results_dir.glob("codesearchnet_eval_*.json"))
285
+ logger.info(f"📁 Found {len(json_files)} evaluation files")
286
+
287
+ for json_file in json_files:
288
+ try:
289
+ logger.info(f"📖 Loading: {json_file.name}")
290
+ with json_file.open() as f:
291
+ data = json.load(f)
292
+ if data is not None:
293
+ # Update model name with proper mapping
294
+ original_name = data.get("model_name", "Unknown")
295
+ mapped_name = extract_model_name_from_filename(json_file.stem)
296
+ data["model_name"] = mapped_name
297
+ data["original_model_name"] = original_name
298
+
299
+ self.results.append(data)
300
+ logger.info(f"✅ Successfully loaded: {mapped_name}")
301
+ except (json.JSONDecodeError, KeyError) as e:
302
+ logger.warning(f"❌ Failed to load {json_file}: {e}")
303
+
304
+ logger.info(f"📊 Total loaded: {len(self.results)} model results")
305
+ if self.results:
306
+ model_names = [r.get("model_name", "Unknown") for r in self.results]
307
+ logger.info(f"🎯 Models found: {', '.join(model_names)}")
308
+
309
+ self._create_comparison_dataframe()
310
+
311
+ # Also load benchmark results
312
+ self.load_benchmark_results()
313
+
314
+ def _create_comparison_dataframe(self) -> None:
315
+ """Create comparison DataFrame from results."""
316
+ if not self.results:
317
+ return
318
+
319
+ comparison_data = []
320
+ for result in self.results:
321
+ overall = result.get("overall", {})
322
+ row = {
323
+ "Model": result["model_name"],
324
+ "MRR": overall.get("mrr", 0),
325
+ "NDCG@1": overall.get("ndcg@1", 0),
326
+ "NDCG@5": overall.get("ndcg@5", 0),
327
+ "NDCG@10": overall.get("ndcg@10", 0),
328
+ "Recall@1": overall.get("recall@1", 0),
329
+ "Recall@5": overall.get("recall@5", 0),
330
+ "Recall@10": overall.get("recall@10", 0),
331
+ "Mean_Rank": overall.get("mean_rank", 0),
332
+ "Median_Rank": overall.get("median_rank", 0),
333
+ }
334
+ comparison_data.append(row)
335
+
336
+ self.comparison_df = pd.DataFrame(comparison_data)
337
+ if not self.comparison_df.empty:
338
+ self.comparison_df = self.comparison_df.sort_values("NDCG@10", ascending=False)
339
+
340
+ def print_summary(self) -> None:
341
+ """Print summary of results."""
342
+ if not self.results:
343
+ logger.warning("No results to summarize")
344
+ return
345
+
346
+ print(f"\n{'=' * 60}")
347
+ print("CodeSearchNet Evaluation Summary")
348
+ print(f"{'=' * 60}")
349
+ print(f"Total models evaluated: {len(self.results)}")
350
+
351
+ if self.comparison_df is not None and not self.comparison_df.empty:
352
+ print(f"\nTop performing model: {self.comparison_df.iloc[0]['Model']}")
353
+ print(f"Best NDCG@10: {self.comparison_df.iloc[0]['NDCG@10']:.4f}")
354
+ print(f"Best MRR: {self.comparison_df['MRR'].max():.4f}")
355
+
356
+ print(f"\nEvaluated languages: {', '.join(CODE_LANGUAGES)}")
357
+
358
+ # Also print benchmark summary if available
359
+ if self.benchmark_results:
360
+ print(f"\n{'=' * 60}")
361
+ print("Performance Benchmark Summary")
362
+ print(f"{'=' * 60}")
363
+ print(f"Total models benchmarked: {len(self.benchmark_results)}")
364
+
365
+ if self.benchmark_df is not None and not self.benchmark_df.empty:
366
+ # Safely get fastest and smallest models
367
+ fastest_model = "N/A"
368
+ smallest_model = "N/A"
369
+
370
+ if "Throughput_TextsPerSec" in self.benchmark_df.columns:
371
+ fastest_idx = self.benchmark_df["Throughput_TextsPerSec"].idxmax()
372
+ fastest_model = str(self.benchmark_df.loc[fastest_idx, "Model"])
373
+
374
+ if "Disk_Size_MB" in self.benchmark_df.columns:
375
+ smallest_idx = self.benchmark_df["Disk_Size_MB"].idxmin()
376
+ smallest_model = str(self.benchmark_df.loc[smallest_idx, "Model"])
377
+
378
+ print(f"\nFastest model: {fastest_model}")
379
+ print(f"Smallest model: {smallest_model}")
380
+
381
+ def analyze_language_performance(self) -> None:
382
+ """Analyze performance across programming languages."""
383
+ if not self.results:
384
+ return
385
+
386
+ print(f"\n{'=' * 60}")
387
+ print("Language-Specific Performance Analysis")
388
+ print(f"{'=' * 60}")
389
+
390
+ for result in self.results:
391
+ model_name = result["model_name"]
392
+ print(f"\nModel: {model_name}")
393
+ print("-" * 40)
394
+
395
+ languages = result.get("languages", {})
396
+ lang_data = []
397
+
398
+ for lang, lang_results in languages.items():
399
+ metrics = lang_results.get("metrics", {})
400
+ lang_data.append(
401
+ {
402
+ "Language": lang,
403
+ "NDCG@10": metrics.get("ndcg@10", 0),
404
+ "MRR": metrics.get("mrr", 0),
405
+ "Recall@5": metrics.get("recall@5", 0),
406
+ "Queries": lang_results.get("num_queries", 0),
407
+ }
408
+ )
409
+
410
+ if lang_data:
411
+ lang_df = pd.DataFrame(lang_data)
412
+ print(lang_df.to_string(index=False, float_format="%.4f"))
413
+ print(f"\nBest language: {lang_df.loc[lang_df['NDCG@10'].idxmax(), 'Language']}")
414
+ print(f"Average NDCG@10: {lang_df['NDCG@10'].mean():.4f}")
415
+ print(f"Average queries per language: {lang_df['Queries'].mean():.0f}")
416
+
417
    def analyze_benchmark_performance(self) -> None:
        """Print a human-readable summary of the collected benchmark results.

        Iterates over ``self.benchmark_results`` (one dict per model) and
        prints, for each model, its size, speed, CPU-vs-GPU and memory
        metrics when present.  Missing sections are silently skipped; a
        warning is logged when there are no results at all.
        """
        if not self.benchmark_results:
            logger.warning("No benchmark results to analyze")
            return

        print(f"\n{'=' * 60}")
        print("Performance Benchmark Analysis")
        print(f"{'=' * 60}")

        for result in self.benchmark_results:
            model_name = result.get("model_name", "Unknown")
            print(f"\nModel: {model_name}")
            print("-" * 40)

            # Size metrics
            size_metrics = result.get("size_metrics", {})
            if size_metrics:
                print("📏 Model Size:")
                print(f"  Disk Size: {size_metrics.get('disk_size_mb', 0):.1f} MB")
                if "parameters_millions" in size_metrics:
                    print(f"  Parameters: {size_metrics['parameters_millions']:.1f}M")
                if "embedding_dim" in size_metrics:
                    print(f"  Embedding Dimension: {size_metrics['embedding_dim']}")

            # Speed metrics — only the canonical batch-32 / medium-text
            # configuration is reported, to keep the summary short.
            speed_benchmarks = result.get("speed_benchmarks", {})
            if "medium" in speed_benchmarks and "batch_32" in speed_benchmarks["medium"]:
                batch_32 = speed_benchmarks["medium"]["batch_32"]
                print("⚡ Performance (Batch 32, Medium Texts):")
                print(f"  Throughput: {batch_32.get('texts_per_second', 0):.1f} texts/sec")
                print(f"  Latency: {batch_32.get('time_per_text_ms', 0):.1f} ms/text")
                print(f"  Token Speed: {batch_32.get('tokens_per_second', 0):.0f} tokens/sec")

            # CPU vs GPU — device entries that recorded an "error" are skipped.
            cpu_vs_gpu = result.get("cpu_vs_gpu", {})
            if cpu_vs_gpu:
                print("🖥️ CPU vs GPU:")
                for device, metrics in cpu_vs_gpu.items():
                    if "error" not in metrics:
                        print(f"  {device.upper()}: {metrics.get('texts_per_second', 0):.1f} texts/sec")

            # Memory efficiency — only shown when the batch-32 run neither
            # went out-of-memory nor errored.
            memory_benchmarks = result.get("memory_benchmarks", {})
            if "batch_32" in memory_benchmarks:
                batch_32_mem = memory_benchmarks["batch_32"]
                if not batch_32_mem.get("oom", False) and "error" not in batch_32_mem:
                    print("💾 Memory Usage (Batch 32):")
                    print(f"  Total: {batch_32_mem.get('memory_used_mb', 0):.1f} MB")
                    print(f"  Per Text: {batch_32_mem.get('memory_per_text_mb', 0):.2f} MB")
467
+
468
+ def create_performance_radar_chart(self, model_name: str, language_scores: dict[str, float]) -> str:
469
+ """Create radar chart showing performance across languages."""
470
+ if not PLOTLY_AVAILABLE:
471
+ logger.warning("Plotly not available, skipping radar chart")
472
+ return ""
473
+
474
+ languages = list(language_scores.keys())
475
+ scores = list(language_scores.values())
476
+
477
+ if not languages:
478
+ return ""
479
+
480
+ # Close the radar chart
481
+ languages_closed = [*languages, languages[0]]
482
+ scores_closed = [*scores, scores[0]]
483
+
484
+ fig = go.Figure()
485
+
486
+ fig.add_trace(
487
+ go.Scatterpolar(
488
+ r=scores_closed,
489
+ theta=languages_closed,
490
+ fill="toself",
491
+ name=model_name,
492
+ line_color="rgb(67, 147, 195)",
493
+ fillcolor="rgba(67, 147, 195, 0.3)",
494
+ )
495
+ )
496
+
497
+ fig.update_layout(
498
+ polar={"radialaxis": {"visible": True, "range": [0, max(scores) * 1.1]}},
499
+ showlegend=True,
500
+ title=f"CodeSearchNet Performance by Language: {model_name}",
501
+ width=800,
502
+ height=600,
503
+ )
504
+
505
+ static_path = self.images_dir / "code_performance_radar.png"
506
+ try:
507
+ fig.write_image(str(static_path), width=800, height=600, scale=2)
508
+ return str(static_path)
509
+ except Exception as e:
510
+ logger.warning(f"Could not create static image: {e}")
511
+ return ""
512
+
513
    def create_comparative_radar_chart(self, simplified_models: list, peer_models: list) -> str:
        """Create comparative radar chart between best distilled model and top peer models.

        Overlays the best simplified (distilled) model — solid, filled trace —
        with the top three peer models (dashed traces), ranked by overall
        NDCG@10, on a single polar chart.  Returns the written PNG path, or
        "" when plotly is unavailable, there are no simplified models, or the
        static image export fails.
        """
        if not PLOTLY_AVAILABLE:
            logger.warning("Plotly not available, skipping comparative radar chart")
            return ""

        if not simplified_models:
            return ""

        # Get the best simplified model
        best_simplified = max(simplified_models, key=lambda x: x.get("overall", {}).get("ndcg@10", 0))

        # Get top 3 peer models by performance
        peer_models_sorted = sorted(peer_models, key=lambda x: x.get("overall", {}).get("ndcg@10", 0), reverse=True)
        top_peers = peer_models_sorted[:3]

        # Index 0 is always the best simplified model; it gets the emphasized styling.
        models_to_compare = [best_simplified, *top_peers]

        fig = go.Figure()

        # Define colors for each model
        colors = ["rgb(255, 99, 132)", "rgb(54, 162, 235)", "rgb(255, 205, 86)", "rgb(75, 192, 192)"]

        for i, model_result in enumerate(models_to_compare):
            model_name = model_result["model_name"]
            languages = model_result.get("languages", {})

            # Calculate per-language NDCG@10 scores (title-cased axis labels)
            language_scores = {}
            for lang, lang_data in languages.items():
                metrics = lang_data.get("metrics", {})
                language_scores[lang.title()] = metrics.get("ndcg@10", 0)

            if language_scores:
                languages_list = list(language_scores.keys())
                scores_list = list(language_scores.values())

                # Close the radar chart by repeating the first vertex
                languages_closed = [*languages_list, languages_list[0]]
                scores_closed = [*scores_list, scores_list[0]]

                # Determine line style - solid for best distilled, dash for peers
                line_dash = "solid" if i == 0 else "dash"
                line_width = 3 if i == 0 else 2

                fig.add_trace(
                    go.Scatterpolar(
                        r=scores_closed,
                        theta=languages_closed,
                        fill="toself" if i == 0 else "none",
                        name=model_name,
                        line={"color": colors[i % len(colors)], "dash": line_dash, "width": line_width},
                        # Slice trick: '[3:-1]' turns "rgb(r, g, b)" into "(r, g, b",
                        # which is then rebuilt as "rgba(r, g, b, 0.2)".
                        fillcolor=f"rgba{colors[i % len(colors)][3:-1]}, 0.2)" if i == 0 else None,
                    )
                )

        fig.update_layout(
            polar={"radialaxis": {"visible": True, "range": [0, 0.5]}},  # Adjust max range as needed
            showlegend=True,
            title="Model Comparison: Best Distilled vs Top Peer Models",
            width=900,
            height=700,
        )

        static_path = self.images_dir / "comparative_radar.png"
        try:
            fig.write_image(str(static_path), width=900, height=700, scale=2)
            return str(static_path)
        except Exception as e:
            logger.warning(f"Could not create comparative radar chart: {e}")
            return ""
584
+
585
+ def create_individual_radar_charts(self, simplified_models: list) -> dict[str, str]:
586
+ """Create individual radar charts for all simplified models."""
587
+ radar_charts = {}
588
+
589
+ for result in simplified_models:
590
+ model_name = result["model_name"]
591
+ model_languages = result.get("languages", {})
592
+ model_language_scores = {}
593
+ for lang, lang_data in model_languages.items():
594
+ metrics = lang_data.get("metrics", {})
595
+ model_language_scores[lang.title()] = metrics.get("ndcg@10", 0)
596
+
597
+ if model_language_scores:
598
+ # Create unique filename for each model
599
+ safe_model_name = "".join(c for c in model_name if c.isalnum() or c in ("-", "_")).rstrip()
600
+ radar_chart_path = self.create_performance_radar_chart_individual(
601
+ model_name, model_language_scores, safe_model_name
602
+ )
603
+ if radar_chart_path:
604
+ radar_charts[model_name] = radar_chart_path
605
+
606
+ return radar_charts
607
+
608
+ def create_performance_radar_chart_individual(
609
+ self, model_name: str, language_scores: dict[str, float], filename_suffix: str
610
+ ) -> str:
611
+ """Create radar chart for individual model with unique filename."""
612
+ if not PLOTLY_AVAILABLE:
613
+ logger.warning("Plotly not available, skipping radar chart")
614
+ return ""
615
+
616
+ languages = list(language_scores.keys())
617
+ scores = list(language_scores.values())
618
+
619
+ if not languages:
620
+ return ""
621
+
622
+ # Close the radar chart
623
+ languages_closed = [*languages, languages[0]]
624
+ scores_closed = [*scores, scores[0]]
625
+
626
+ fig = go.Figure()
627
+
628
+ fig.add_trace(
629
+ go.Scatterpolar(
630
+ r=scores_closed,
631
+ theta=languages_closed,
632
+ fill="toself",
633
+ name=model_name,
634
+ line_color="rgb(67, 147, 195)",
635
+ fillcolor="rgba(67, 147, 195, 0.3)",
636
+ )
637
+ )
638
+
639
+ fig.update_layout(
640
+ polar={"radialaxis": {"visible": True, "range": [0, max(scores) * 1.1]}},
641
+ showlegend=True,
642
+ title=f"CodeSearchNet Performance by Language: {model_name}",
643
+ width=800,
644
+ height=600,
645
+ )
646
+
647
+ static_path = self.images_dir / f"radar_{filename_suffix}.png"
648
+ try:
649
+ fig.write_image(str(static_path), width=800, height=600, scale=2)
650
+ return str(static_path)
651
+ except Exception as e:
652
+ logger.warning(f"Could not create static image for {model_name}: {e}")
653
+ return ""
654
+
655
+ def plot_model_comparison(self, save_path: str | None = None) -> str:
656
+ """Create comparison plots for models."""
657
+ if self.comparison_df is None or self.comparison_df.empty:
658
+ logger.warning("No comparison data available for plotting")
659
+ return ""
660
+
661
+ fig, axes = plt.subplots(2, 2, figsize=(15, 12))
662
+ fig.suptitle("CodeSearchNet Model Comparison", fontsize=16, fontweight="bold")
663
+
664
+ # NDCG@10 comparison
665
+ axes[0, 0].barh(self.comparison_df["Model"], self.comparison_df["NDCG@10"])
666
+ axes[0, 0].set_title("NDCG@10 Comparison")
667
+ axes[0, 0].set_xlabel("NDCG@10")
668
+
669
+ # MRR comparison
670
+ axes[0, 1].barh(self.comparison_df["Model"], self.comparison_df["MRR"])
671
+ axes[0, 1].set_title("Mean Reciprocal Rank (MRR)")
672
+ axes[0, 1].set_xlabel("MRR")
673
+
674
+ # Recall@5 comparison
675
+ axes[1, 0].barh(self.comparison_df["Model"], self.comparison_df["Recall@5"])
676
+ axes[1, 0].set_title("Recall@5")
677
+ axes[1, 0].set_xlabel("Recall@5")
678
+
679
+ # Mean Rank comparison (lower is better)
680
+ axes[1, 1].barh(self.comparison_df["Model"], self.comparison_df["Mean_Rank"])
681
+ axes[1, 1].set_title("Mean Rank (lower is better)")
682
+ axes[1, 1].set_xlabel("Mean Rank")
683
+
684
+ plt.tight_layout()
685
+
686
+ output_path = save_path or str(self.images_dir / "model_comparison.png")
687
+ plt.savefig(output_path, dpi=300, bbox_inches="tight")
688
+ plt.close()
689
+
690
+ return output_path
691
+
692
    def plot_language_heatmap(self, save_path: str | None = None) -> str:
        """Create a heatmap of performance across languages.

        Builds a model x language matrix of NDCG@10 scores from
        ``self.results`` and renders it with seaborn.  Languages missing for
        a model are charted as 0.  Returns the saved PNG path, or "" when
        there are no results.
        """
        if not self.results:
            return ""

        # Prepare data for heatmap: one row per model, one column per
        # language in CODE_LANGUAGES.
        heatmap_data = []
        for result in self.results:
            model_name = result["model_name"]
            languages = result.get("languages", {})

            row = {"Model": model_name}
            for lang in CODE_LANGUAGES:
                if lang in languages:
                    metrics = languages[lang].get("metrics", {})
                    row[lang.title()] = metrics.get("ndcg@10", 0)
                else:
                    # Missing language: plot as zero rather than NaN.
                    row[lang.title()] = 0
            heatmap_data.append(row)

        if not heatmap_data:
            return ""

        df = pd.DataFrame(heatmap_data).set_index("Model")

        plt.figure(figsize=(12, 8))
        sns.heatmap(
            df,
            annot=True,
            fmt=".3f",
            cmap="RdYlBu_r",
            center=0.2,
            vmin=0,
            vmax=df.to_numpy().max(),
            cbar_kws={"label": "NDCG@10 Score"},
        )

        plt.title(
            "CodeSearchNet Performance Heatmap by Language",
            fontsize=16,
            fontweight="bold",
        )
        plt.xlabel("Programming Language", fontsize=12)
        plt.ylabel("Model", fontsize=12)
        plt.tight_layout()

        output_path = save_path or str(self.images_dir / "language_heatmap.png")
        plt.savefig(output_path, dpi=300, bbox_inches="tight")
        plt.close()

        return output_path
743
+
744
    def plot_benchmark_performance(self, save_path: str | None = None) -> str:
        """Create comprehensive benchmark performance plots.

        Renders up to six panels (disk size, throughput, memory, latency,
        CPU vs GPU, parameter efficiency) from ``self.benchmark_df``.
        Panels whose columns are absent are simply left empty.  Returns the
        saved PNG path, or "" when no benchmark results are loaded.
        """
        if not self.benchmark_results:
            logger.warning("No benchmark data available for plotting")
            return ""

        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        fig.suptitle("Performance Benchmark Analysis", fontsize=16, fontweight="bold")

        # 1. Model Size Comparison
        if self.benchmark_df is not None and "Disk_Size_MB" in self.benchmark_df.columns:
            axes[0, 0].barh(self.benchmark_df["Model"], self.benchmark_df["Disk_Size_MB"])
            axes[0, 0].set_title("Model Size (MB)")
            axes[0, 0].set_xlabel("Size (MB)")

        # 2. Inference Throughput
        if self.benchmark_df is not None and "Throughput_TextsPerSec" in self.benchmark_df.columns:
            axes[0, 1].barh(self.benchmark_df["Model"], self.benchmark_df["Throughput_TextsPerSec"])
            axes[0, 1].set_title("Inference Throughput")
            axes[0, 1].set_xlabel("Texts/Second")

        # 3. Memory Usage
        if self.benchmark_df is not None and "Memory_Used_MB" in self.benchmark_df.columns:
            axes[0, 2].barh(self.benchmark_df["Model"], self.benchmark_df["Memory_Used_MB"])
            axes[0, 2].set_title("Memory Usage (Batch 32)")
            axes[0, 2].set_xlabel("Memory (MB)")

        # 4. Latency Comparison
        if self.benchmark_df is not None and "Latency_MsPerText" in self.benchmark_df.columns:
            axes[1, 0].barh(self.benchmark_df["Model"], self.benchmark_df["Latency_MsPerText"])
            axes[1, 0].set_title("Inference Latency")
            axes[1, 0].set_xlabel("Milliseconds/Text")

        # 5. CPU vs GPU Performance — grouped bars, one CPU/GPU pair per model.
        if self.benchmark_df is not None:
            cpu_col = "CPU_TextsPerSec"
            gpu_col = "CUDA_TextsPerSec"
            if cpu_col in self.benchmark_df.columns and gpu_col in self.benchmark_df.columns:
                x = np.arange(len(self.benchmark_df))
                width = 0.35
                axes[1, 1].bar(x - width / 2, self.benchmark_df[cpu_col], width, label="CPU", alpha=0.7)
                axes[1, 1].bar(x + width / 2, self.benchmark_df[gpu_col], width, label="GPU", alpha=0.7)
                axes[1, 1].set_title("CPU vs GPU Performance")
                axes[1, 1].set_ylabel("Texts/Second")
                axes[1, 1].set_xticks(x)
                axes[1, 1].set_xticklabels(self.benchmark_df["Model"], rotation=45, ha="right")
                axes[1, 1].legend()

        # 6. Parameter Efficiency
        if (
            self.benchmark_df is not None
            and "Parameters_M" in self.benchmark_df.columns
            and "Throughput_TextsPerSec" in self.benchmark_df.columns
        ):
            # Efficiency = Throughput / Parameters (higher is better);
            # the epsilon guards against division by zero.
            efficiency = self.benchmark_df["Throughput_TextsPerSec"] / (self.benchmark_df["Parameters_M"] + 1e-6)
            axes[1, 2].barh(self.benchmark_df["Model"], efficiency)
            axes[1, 2].set_title("Parameter Efficiency")
            axes[1, 2].set_xlabel("Texts/Sec per Million Parameters")

        plt.tight_layout()

        output_path = save_path or str(self.images_dir / "benchmark_performance.png")
        plt.savefig(output_path, dpi=300, bbox_inches="tight")
        plt.close()

        return output_path
811
+
812
+ def plot_batch_size_scaling(self, save_path: str | None = None) -> str:
813
+ """Create batch size scaling analysis plot."""
814
+ if not self.benchmark_results:
815
+ return ""
816
+
817
+ plt.figure(figsize=(12, 8))
818
+
819
+ for result in self.benchmark_results:
820
+ model_name = result.get("model_name", "Unknown")
821
+ speed_benchmarks = result.get("speed_benchmarks", {})
822
+
823
+ # Extract batch size performance for medium texts
824
+ if "medium" in speed_benchmarks:
825
+ batch_sizes = []
826
+ throughputs = []
827
+
828
+ for batch_key, metrics in speed_benchmarks["medium"].items():
829
+ if batch_key.startswith("batch_"):
830
+ batch_size = int(batch_key.split("_")[1])
831
+ throughput = metrics.get("texts_per_second", 0)
832
+ batch_sizes.append(batch_size)
833
+ throughputs.append(throughput)
834
+
835
+ if batch_sizes:
836
+ plt.plot(batch_sizes, throughputs, marker="o", label=model_name, linewidth=2)
837
+
838
+ plt.xlabel("Batch Size", fontsize=12)
839
+ plt.ylabel("Throughput (Texts/Second)", fontsize=12)
840
+ plt.title("Batch Size Scaling Performance", fontsize=16, fontweight="bold")
841
+ plt.legend()
842
+ plt.grid(visible=True, alpha=0.3)
843
+ plt.xscale("log", base=2)
844
+
845
+ output_path = save_path or str(self.images_dir / "batch_size_scaling.png")
846
+ plt.savefig(output_path, dpi=300, bbox_inches="tight")
847
+ plt.close()
848
+
849
+ return output_path
850
+
851
+ def plot_memory_scaling(self, save_path: str | None = None) -> str:
852
+ """Create memory scaling analysis plot."""
853
+ if not self.benchmark_results:
854
+ return ""
855
+
856
+ plt.figure(figsize=(12, 8))
857
+
858
+ for result in self.benchmark_results:
859
+ model_name = result.get("model_name", "Unknown")
860
+ memory_benchmarks = result.get("memory_benchmarks", {})
861
+
862
+ batch_sizes = []
863
+ memory_usage = []
864
+
865
+ for batch_key, metrics in memory_benchmarks.items():
866
+ if batch_key.startswith("batch_") and not metrics.get("oom", False) and "error" not in metrics:
867
+ batch_size = int(batch_key.split("_")[1])
868
+ memory_mb = metrics.get("memory_used_mb", 0)
869
+ batch_sizes.append(batch_size)
870
+ memory_usage.append(memory_mb)
871
+
872
+ if batch_sizes:
873
+ plt.plot(batch_sizes, memory_usage, marker="s", label=model_name, linewidth=2)
874
+
875
+ plt.xlabel("Batch Size", fontsize=12)
876
+ plt.ylabel("Memory Usage (MB)", fontsize=12)
877
+ plt.title("Memory Scaling by Batch Size", fontsize=16, fontweight="bold")
878
+ plt.legend()
879
+ plt.grid(visible=True, alpha=0.3)
880
+ plt.xscale("log", base=2)
881
+
882
+ output_path = save_path or str(self.images_dir / "memory_scaling.png")
883
+ plt.savefig(output_path, dpi=300, bbox_inches="tight")
884
+ plt.close()
885
+
886
+ return output_path
887
+
888
    def create_peer_comparison_chart(self, model_name: str) -> str:
        """Create comparison chart using actual evaluation results.

        Draws a horizontal bar chart of NDCG@10 for every model in
        ``self.comparison_df`` (ascending order), highlighting the user's
        model in red with a black outline.  Returns the saved PNG path, or
        "" when no comparison data exists.
        """
        if self.comparison_df is None or self.comparison_df.empty:
            logger.warning("No comparison data available for peer comparison chart")
            return ""

        # Use actual evaluation results instead of hardcoded scores
        df_sorted = self.comparison_df.sort_values("NDCG@10", ascending=True)

        plt.figure(figsize=(12, 8))

        # Color models differently - highlight the user's model.
        # NOTE(review): the "gte_qwen2_m2v_code" substring looks like a
        # hard-coded fallback for one specific distilled model — confirm.
        colors = []
        for model in df_sorted["Model"]:
            if model_name.lower() in model.lower() or "gte_qwen2_m2v_code" in model.lower():
                colors.append("red")  # User's model
            else:
                colors.append("skyblue")  # Peer models

        bars = plt.barh(df_sorted["Model"], df_sorted["NDCG@10"], color=colors)

        # Highlight current model with special formatting (outline + alpha)
        for i, model in enumerate(df_sorted["Model"]):
            if model_name.lower() in model.lower() or "gte_qwen2_m2v_code" in model.lower():
                bars[i].set_alpha(0.8)
                bars[i].set_edgecolor("black")
                bars[i].set_linewidth(2)

        plt.xlabel("NDCG@10 Score", fontsize=12)
        plt.title(
            "CodeSearchNet Model Comparison (Actual Results)",
            fontsize=16,
            fontweight="bold",
        )
        plt.grid(axis="x", alpha=0.3)

        # Add score labels just to the right of each bar
        for i, score in enumerate(df_sorted["NDCG@10"]):
            plt.text(score + 0.005, i, f"{score:.3f}", va="center")

        plt.tight_layout()

        output_path = self.images_dir / "peer_comparison.png"
        plt.savefig(output_path, dpi=300, bbox_inches="tight")
        plt.close()

        return str(output_path)
935
+
936
    def create_efficiency_analysis(self, model_name: str) -> str:
        """Create efficiency analysis chart using actual evaluation results.

        Scatter-plots NDCG@10 against parameter count (log-x) for every model
        in ``self.comparison_df``: peer models as blue dots, the user's
        distilled model as a red star.  Parameter counts come from
        ``MODEL_SPECS`` (peers) and ``DISTILLED_MODEL_SPECS`` (user's model);
        peers without a spec entry are dropped.  Returns the saved PNG path,
        or "" when there is no comparison data or no model has a spec.
        """
        if self.comparison_df is None or self.comparison_df.empty:
            logger.warning("No comparison data available for efficiency analysis")
            return ""

        # Parallel lists: one entry per plotted model.
        models = []
        scores = []
        params = []
        is_user_model = []

        # Process all evaluated models
        for _, row in self.comparison_df.iterrows():
            model_display_name = row["Model"]
            current_model_score = row["NDCG@10"]

            # Determine if this is the user's model
            # NOTE(review): "gte_qwen2_m2v_code" looks like a hard-coded
            # fallback for one specific distilled model — confirm.
            is_users = (
                model_name.lower() in model_display_name.lower() or "gte_qwen2_m2v_code" in model_display_name.lower()
            )

            if is_users:
                # User's distilled model
                models.append(model_display_name)
                # Safe conversion to float for pandas values
                score_value = pd.to_numeric(current_model_score, errors="coerce")
                scores.append(float(score_value) if not pd.isna(score_value) else 0.0)
                # Safe conversion for DISTILLED_MODEL_SPECS parameters
                param_value = DISTILLED_MODEL_SPECS.get("parameters", 39)
                params.append(float(param_value) if isinstance(param_value, (int, float)) else 39.0)
                is_user_model.append(True)
            else:
                # Find corresponding peer model specs by matching the short
                # (post-slash) spec name as a substring of the display name.
                model_key = None
                for peer_key in MODEL_SPECS:
                    peer_short_name = peer_key.split("/")[-1].lower()
                    if peer_short_name in model_display_name.lower():
                        model_key = peer_key
                        break

                if model_key and model_key in MODEL_SPECS:
                    models.append(model_display_name.split("/")[-1])  # Short name
                    # Safe conversion to float for pandas values
                    score_value = pd.to_numeric(current_model_score, errors="coerce")
                    scores.append(float(score_value) if not pd.isna(score_value) else 0.0)
                    params.append(float(MODEL_SPECS[model_key].get("parameters", 100)))
                    is_user_model.append(False)

        if not models:
            logger.warning("No models with parameter specifications found")
            return ""

        plt.figure(figsize=(12, 8))

        # Plot peer models
        peer_models = [m for i, m in enumerate(models) if not is_user_model[i]]
        peer_params = [p for i, p in enumerate(params) if not is_user_model[i]]
        peer_scores = [s for i, s in enumerate(scores) if not is_user_model[i]]

        if peer_models:
            plt.scatter(
                peer_params,
                peer_scores,
                s=100,
                alpha=0.6,
                label="Peer Models",
                color="skyblue",
            )

        # Plot user's model
        user_models = [m for i, m in enumerate(models) if is_user_model[i]]
        user_params = [p for i, p in enumerate(params) if is_user_model[i]]
        user_scores = [s for i, s in enumerate(scores) if is_user_model[i]]

        if user_models:
            plt.scatter(
                user_params,
                user_scores,
                s=200,
                color="red",
                alpha=0.8,
                label=f"{user_models[0]} (Distilled)",
                marker="*",
            )

        # Add model labels; the user's model gets bold red emphasis.
        for i, (model, param, score) in enumerate(zip(models, params, scores, strict=False)):
            if is_user_model[i]:
                plt.annotate(
                    model,
                    (param, score),
                    xytext=(10, 10),
                    textcoords="offset points",
                    fontweight="bold",
                    color="red",
                )
            else:
                plt.annotate(
                    model,
                    (param, score),
                    xytext=(5, 5),
                    textcoords="offset points",
                    fontsize=9,
                )

        plt.xlabel("Model Size (Million Parameters)", fontsize=12)
        plt.ylabel("NDCG@10 Score", fontsize=12)
        plt.title(
            "Model Efficiency: Performance vs Size (Actual Results)",
            fontsize=16,
            fontweight="bold",
        )
        plt.legend()
        plt.grid(visible=True, alpha=0.3)
        plt.xscale("log")

        plt.tight_layout()

        output_path = self.images_dir / "efficiency_analysis.png"
        plt.savefig(output_path, dpi=300, bbox_inches="tight")
        plt.close()

        return str(output_path)
1059
+
1060
+ def generate_comprehensive_report(self, model_name: str = "Simplified Distillation Models") -> str:
1061
+ """Generate comprehensive markdown report for all evaluated models."""
1062
+ if not self.results:
1063
+ logger.error("No results to analyze")
1064
+ return ""
1065
+
1066
+ # Find all simplified distillation models
1067
+ simplified_models = []
1068
+ peer_models = []
1069
+
1070
+ for result in self.results:
1071
+ result_model_name = result["model_name"]
1072
+ if (
1073
+ "code_model2vec" in result_model_name.lower()
1074
+ or "distilled" in result_model_name.lower()
1075
+ or "(ours)" in result_model_name.lower()
1076
+ ):
1077
+ simplified_models.append(result)
1078
+ else:
1079
+ peer_models.append(result)
1080
+
1081
+ # Get the best performing simplified model for main analysis
1082
+ if simplified_models:
1083
+ main_result = max(simplified_models, key=lambda x: x.get("overall", {}).get("ndcg@10", 0))
1084
+ main_model_name = main_result["model_name"]
1085
+ else:
1086
+ # Fallback to first result if no simplified models found
1087
+ main_result = self.results[0]
1088
+ main_model_name = main_result["model_name"]
1089
+
1090
+ overall = main_result.get("overall", {})
1091
+ languages = main_result.get("languages", {})
1092
+
1093
+ # Calculate language scores for radar chart
1094
+ language_scores = {}
1095
+ for lang, lang_data in languages.items():
1096
+ metrics = lang_data.get("metrics", {})
1097
+ language_scores[lang.title()] = metrics.get("ndcg@10", 0)
1098
+
1099
+ # Create visualizations
1100
+ logger.info("Generating visualizations...")
1101
+ setup_directories()
1102
+
1103
+ self.create_performance_radar_chart(main_model_name, language_scores)
1104
+ comparison_chart = self.plot_model_comparison()
1105
+ heatmap_chart = self.plot_language_heatmap()
1106
+ peer_chart = self.create_peer_comparison_chart(main_model_name)
1107
+ efficiency_chart = self.create_efficiency_analysis(main_model_name)
1108
+
1109
+ # Generate individual radar charts for all simplified models
1110
+ individual_radar_charts = self.create_individual_radar_charts(simplified_models)
1111
+
1112
+ # Create comparative radar chart (best distilled vs top peer models)
1113
+ comparative_radar_chart = self.create_comparative_radar_chart(simplified_models, peer_models)
1114
+
1115
+ # Create benchmark visualizations
1116
+ benchmark_chart = ""
1117
+ batch_scaling_chart = ""
1118
+ memory_scaling_chart = ""
1119
+ if self.benchmark_results:
1120
+ benchmark_chart = self.plot_benchmark_performance()
1121
+ batch_scaling_chart = self.plot_batch_size_scaling()
1122
+ memory_scaling_chart = self.plot_memory_scaling()
1123
+
1124
+ # Generate report
1125
+ report = f"""# Code-Specialized Model2Vec Distillation Analysis
1126
+
1127
+ ## 🎯 Executive Summary
1128
+
1129
+ This report presents a comprehensive analysis of Model2Vec distillation experiments using different teacher models for code-specialized embedding generation.
1130
+
1131
+ ### Evaluated Models Overview
1132
+
1133
+ **Simplified Distillation Models:** {len(simplified_models)}
1134
+ **Peer Comparison Models:** {len(peer_models)}
1135
+ **Total Models Analyzed:** {len(self.results)}
1136
+
1137
+ ### Best Performing Simplified Model: {main_model_name}
1138
+
1139
+ **Overall CodeSearchNet Performance:**
1140
+ - **NDCG@10**: {overall.get("ndcg@10", 0):.4f}
1141
+ - **Mean Reciprocal Rank (MRR)**: {overall.get("mrr", 0):.4f}
1142
+ - **Recall@5**: {overall.get("recall@5", 0):.4f}
1143
+ - **Mean Rank**: {overall.get("mean_rank", 0):.1f}
1144
+
1145
+ ## πŸ“Š Comprehensive Model Comparison
1146
+
1147
+ ### All Simplified Distillation Models Performance
1148
+
1149
+ """
1150
+
1151
+ # Add table of all simplified models
1152
+ if simplified_models:
1153
+ report += "| Model | Teacher | NDCG@10 | MRR | Recall@5 | Status |\n"
1154
+ report += "|-------|---------|---------|-----|----------|--------|\n"
1155
+
1156
+ # Sort by performance
1157
+ simplified_models_sorted = sorted(
1158
+ simplified_models, key=lambda x: x.get("overall", {}).get("ndcg@10", 0), reverse=True
1159
+ )
1160
+
1161
+ for rank, result in enumerate(simplified_models_sorted, 1):
1162
+ model_display = result["model_name"]
1163
+ overall_metrics = result.get("overall", {})
1164
+
1165
+ # Extract teacher model name from model name
1166
+ teacher = "Unknown"
1167
+ if "all_MiniLM_L6_v2" in model_display:
1168
+ teacher = "all-MiniLM-L6-v2"
1169
+ elif "codebert_base" in model_display:
1170
+ teacher = "codebert-base"
1171
+ elif "graphcodebert_base" in model_display:
1172
+ teacher = "graphcodebert-base"
1173
+ elif "gte_Qwen2_7B_instruct" in model_display:
1174
+ teacher = "gte-Qwen2-7B-instruct"
1175
+ elif "all_mpnet_base_v2" in model_display:
1176
+ teacher = "all-mpnet-base-v2"
1177
+
1178
+ status = "πŸ₯‡ Best" if rank == 1 else "πŸ₯ˆ 2nd" if rank == 2 else "πŸ₯‰ 3rd" if rank == 3 else f"#{rank}"
1179
+
1180
+ report += f"| {model_display} | {teacher} | {overall_metrics.get('ndcg@10', 0):.4f} | {overall_metrics.get('mrr', 0):.4f} | {overall_metrics.get('recall@5', 0):.4f} | {status} |\n"
1181
+
1182
+ report += """
1183
+
1184
+ ### Key Findings
1185
+
1186
+ """
1187
+
1188
+ if simplified_models and len(simplified_models) > 1:
1189
+ best_model = simplified_models_sorted[0]
1190
+ worst_model = simplified_models_sorted[-1]
1191
+ best_score = best_model.get("overall", {}).get("ndcg@10", 0)
1192
+ worst_score = worst_model.get("overall", {}).get("ndcg@10", 0)
1193
+
1194
+ report += f"""
1195
+ - **Best Teacher Model**: {best_model["model_name"]} (NDCG@10: {best_score:.4f})
1196
+ - **Least Effective Teacher**: {worst_model["model_name"]} (NDCG@10: {worst_score:.4f})
1197
+ - **Performance Range**: {((best_score - worst_score) / best_score * 100):.1f}% difference between best and worst
1198
+ - **Average Performance**: {sum(r.get("overall", {}).get("ndcg@10", 0) for r in simplified_models) / len(simplified_models):.4f} NDCG@10
1199
+ """
1200
+
1201
+ # Add radar charts section
1202
+ report += """
1203
+
1204
+ ## 🎯 Language Performance Radar Charts
1205
+
1206
+ ### Best Model vs Peer Models Comparison
1207
+
1208
+ """
1209
+ if comparative_radar_chart:
1210
+ report += f"![Comparative Radar Chart]({comparative_radar_chart})\n\n"
1211
+ report += "*Comparative view showing how the best simplified distillation model performs against top peer models across programming languages.*\n\n"
1212
+
1213
+ # Add individual radar charts for all simplified models
1214
+ if individual_radar_charts:
1215
+ report += "### Individual Model Performance by Language\n\n"
1216
+ for chart_model_name, chart_path in individual_radar_charts.items():
1217
+ # Extract teacher name for cleaner display
1218
+ teacher = "Unknown"
1219
+ if "all_MiniLM_L6_v2" in chart_model_name:
1220
+ teacher = "all-MiniLM-L6-v2"
1221
+ elif "codebert_base" in chart_model_name:
1222
+ teacher = "codebert-base"
1223
+ elif "graphcodebert_base" in chart_model_name:
1224
+ teacher = "graphcodebert-base"
1225
+ elif "gte_Qwen2_7B_instruct" in chart_model_name:
1226
+ teacher = "gte-Qwen2-7B-instruct"
1227
+ elif "all_mpnet_base_v2" in chart_model_name:
1228
+ teacher = "all-mpnet-base-v2"
1229
+
1230
+ report += f"#### {chart_model_name} (Teacher: {teacher})\n\n"
1231
+ report += f"![{chart_model_name} Radar Chart]({chart_path})\n\n"
1232
+
1233
+ report += f"""
1234
+
1235
+ ## πŸ† Peer Model Comparison
1236
+
1237
+ ![Peer Comparison]({peer_chart})
1238
+
1239
+ *Comparison with established code-specialized embedding models using actual evaluation results.*
1240
+
1241
+ ### Complete Model Ranking
1242
+
1243
+ """
1244
+
1245
+ # Add comprehensive ranking table
1246
+ if self.comparison_df is not None and len(self.comparison_df) > 0:
1247
+ report += "| Rank | Model | Type | NDCG@10 | MRR | Recall@5 |\n"
1248
+ report += "|------|-------|------|---------|-----|----------|\n"
1249
+
1250
+ for rank in range(len(self.comparison_df)):
1251
+ row_data = self.comparison_df.iloc[rank]
1252
+ model_name_display = str(row_data["Model"])
1253
+
1254
+ # Determine model type
1255
+ if (
1256
+ "code_model2vec" in model_name_display.lower()
1257
+ or "distilled" in model_name_display.lower()
1258
+ or "(ours)" in model_name_display.lower()
1259
+ ):
1260
+ model_type = "**πŸ”₯ Simplified Distillation**"
1261
+ elif any(code_term in model_name_display.lower() for code_term in ["codebert", "graphcode", "codet5"]):
1262
+ model_type = "Code-Specific"
1263
+ elif "potion" in model_name_display.lower():
1264
+ model_type = "Model2Vec"
1265
+ else:
1266
+ model_type = "General"
1267
+
1268
+ report += f"| {rank + 1} | {model_name_display} | {model_type} | {row_data['NDCG@10']:.4f} | {row_data['MRR']:.4f} | {row_data['Recall@5']:.4f} |\n"
1269
+
1270
+ report += f"""
1271
+
1272
+ ## πŸ“ˆ Performance Analysis
1273
+
1274
+ ### Multi-Model Comparison Charts
1275
+
1276
+ ![Model Comparison]({comparison_chart})
1277
+
1278
+ *Comprehensive comparison across all evaluation metrics.*
1279
+
1280
+ ### Language Performance Analysis
1281
+
1282
+ ![Language Heatmap]({heatmap_chart})
1283
+
1284
+ *Performance heatmap showing how different models perform across programming languages.*
1285
+
1286
+ ### Efficiency Analysis
1287
+
1288
+ ![Efficiency Analysis]({efficiency_chart})
1289
+
1290
+ *Performance vs model size analysis showing the efficiency benefits of distillation.*
1291
+
1292
+ """
1293
+
1294
+ # Add benchmark analysis if available
1295
+ if self.benchmark_results:
1296
+ report += f"""
1297
+
1298
+ ## ⚑ Operational Performance Analysis
1299
+
1300
+ ![Benchmark Performance]({benchmark_chart})
1301
+
1302
+ *Comprehensive performance benchmarking across multiple operational metrics.*
1303
+
1304
+ ### Performance Scaling Analysis
1305
+
1306
+ ![Batch Size Scaling]({batch_scaling_chart})
1307
+
1308
+ *How performance scales with different batch sizes for optimal throughput.*
1309
+
1310
+ ![Memory Scaling]({memory_scaling_chart})
1311
+
1312
+ *Memory usage patterns across different batch sizes.*
1313
+
1314
+ """
1315
+
1316
+ # Add detailed language analysis
1317
+ report += """
1318
+
1319
+ ## πŸ” Language-Specific Analysis
1320
+
1321
+ ### Performance by Programming Language
1322
+
1323
+ """
1324
+
1325
+ if language_scores:
1326
+ report += "| Language | Best Model Performance | Average Performance | Language Difficulty |\n"
1327
+ report += "|----------|------------------------|--------------------|--------------------||\n"
1328
+
1329
+ for lang in sorted(language_scores.keys()):
1330
+ # Find best performance for this language across all models
1331
+ lang_performances = []
1332
+ for result in self.results:
1333
+ lang_data = result.get("languages", {}).get(lang.lower(), {})
1334
+ if lang_data:
1335
+ lang_performances.append(lang_data.get("metrics", {}).get("ndcg@10", 0))
1336
+
1337
+ if lang_performances:
1338
+ best_lang_perf = max(lang_performances)
1339
+ avg_lang_perf = sum(lang_performances) / len(lang_performances)
1340
+ difficulty = "Easy" if avg_lang_perf > 0.3 else "Medium" if avg_lang_perf > 0.2 else "Hard"
1341
+
1342
+ report += f"| {lang} | {best_lang_perf:.4f} | {avg_lang_perf:.4f} | {difficulty} |\n"
1343
+
1344
+ report += """
1345
+
1346
+ ## 🎯 Conclusions and Recommendations
1347
+
1348
+ ### Teacher Model Analysis
1349
+
1350
+ Based on the evaluation results across all simplified distillation models:
1351
+
1352
+ """
1353
+
1354
+ if simplified_models and len(simplified_models) > 1:
1355
+ # Analyze which teacher models work best
1356
+ teacher_performance = {}
1357
+ for result in simplified_models:
1358
+ model_name = result["model_name"]
1359
+ score = result.get("overall", {}).get("ndcg@10", 0)
1360
+
1361
+ if "all_MiniLM_L6_v2" in model_name:
1362
+ teacher_performance["all-MiniLM-L6-v2"] = score
1363
+ elif "codebert_base" in model_name:
1364
+ teacher_performance["codebert-base"] = score
1365
+ elif "graphcodebert_base" in model_name:
1366
+ teacher_performance["graphcodebert-base"] = score
1367
+ elif "gte_Qwen2_7B_instruct" in model_name:
1368
+ teacher_performance["gte-Qwen2-7B-instruct"] = score
1369
+ elif "all_mpnet_base_v2" in model_name:
1370
+ teacher_performance["all-mpnet-base-v2"] = score
1371
+
1372
+ if teacher_performance:
1373
+ best_teacher = max(teacher_performance.items(), key=lambda x: x[1])
1374
+ worst_teacher = min(teacher_performance.items(), key=lambda x: x[1])
1375
+
1376
+ report += f"""
1377
+ 1. **Best Teacher Model**: {best_teacher[0]} (NDCG@10: {best_teacher[1]:.4f})
1378
+ 2. **Least Effective Teacher**: {worst_teacher[0]} (NDCG@10: {worst_teacher[1]:.4f})
1379
+ 3. **Teacher Model Impact**: Choice of teacher model affects performance by {((best_teacher[1] - worst_teacher[1]) / best_teacher[1] * 100):.1f}%
1380
+
1381
+ ### Recommendations
1382
+
1383
+ - **For Production**: Use {best_teacher[0]} as teacher model for best performance
1384
+ - **For Efficiency**: Model2Vec distillation provides significant size reduction with competitive performance
1385
+ - **For Code Tasks**: Specialized models consistently outperform general-purpose models
1386
+ """
1387
+
1388
+ report += f"""
1389
+
1390
+ ## πŸ“„ Methodology
1391
+
1392
+ ### Evaluation Protocol
1393
+ - **Dataset**: CodeSearchNet test sets for 6 programming languages
1394
+ - **Metrics**: NDCG@k, MRR, Recall@k following CodeSearchNet methodology
1395
+ - **Query Format**: Natural language documentation strings
1396
+ - **Corpus Format**: Function code strings
1397
+ - **Evaluation**: Retrieval of correct code for each documentation query
1398
+
1399
+ ### Teacher Models Tested
1400
+ - sentence-transformers/all-MiniLM-L6-v2 (proven baseline)
1401
+ - microsoft/codebert-base (code-specialized)
1402
+ - microsoft/graphcodebert-base (graph-aware code model)
1403
+ - Alibaba-NLP/gte-Qwen2-7B-instruct (large instruction model)
1404
+ - sentence-transformers/all-mpnet-base-v2 (general purpose)
1405
+
1406
+ ### Distillation Method
1407
+ - **Technique**: Model2Vec static embedding generation
1408
+ - **Parameters**: PCA dims=256, SIF coefficient=1e-3, Zipf weighting=True
1409
+ - **Training Data**: CodeSearchNet comment-code pairs
1410
+ - **Languages**: Python, JavaScript, Java, PHP, Ruby, Go
1411
+
1412
+ ---
1413
+
1414
+ *Report generated on {time.strftime("%Y-%m-%d %H:%M:%S")} using automated analysis pipeline.*
1415
+ *For questions about methodology or results, please refer to the CodeSearchNet documentation.*
1416
+ """
1417
+
1418
+ return report
1419
+
1420
+ def export_results(self, output_file: str) -> None:
1421
+ """Export results to CSV format."""
1422
+ if self.comparison_df is not None:
1423
+ self.comparison_df.to_csv(output_file, index=False)
1424
+ logger.info(f"Results exported to {output_file}")
1425
+
1426
+
1427
def main() -> None:
    """Run the full CodeSearchNet analysis pipeline from the command line.

    Parses CLI arguments, loads evaluation (and optional benchmark) results,
    prints summaries, generates the comprehensive markdown report, and writes
    optional CSV exports.
    """
    # Imported locally so main() also works when this module is imported and
    # invoked programmatically; previously argparse was only imported under
    # the `if __name__ == "__main__":` guard, so such calls raised NameError.
    import argparse

    parser = argparse.ArgumentParser(description="Analyze CodeSearchNet evaluation results and performance benchmarks")
    parser.add_argument("--results-dir", default=DEFAULT_EVALUATION_DIR, help="Evaluation results directory")
    parser.add_argument("--benchmark-dir", default=DEFAULT_BENCHMARK_DIR, help="Benchmark results directory")
    parser.add_argument("--model-name", default="gte_qwen2_m2v_code (Ours)", help="Model name for report")
    parser.add_argument("--output", default="REPORT.md", help="Output report file")
    parser.add_argument("--export-csv", help="Export comparison results to CSV")

    args = parser.parse_args()

    logger.info("Starting CodeSearchNet Analysis with Benchmark Integration")
    logger.info("=" * 60)

    # Setup output directories
    output_dir, images_dir, reports_dir = setup_directories()

    # Initialize analyzer with local directories
    analyzer = CodeSearchNetAnalyzer(
        results_dir=args.results_dir,
        benchmark_dir=args.benchmark_dir,
        images_dir=images_dir,
    )

    # Load results (this will also load benchmark results)
    analyzer.load_results()

    if not analyzer.results:
        logger.error("No evaluation results found! Please run evaluation first.")
        return

    # Print summary (includes both evaluation and benchmark summaries)
    analyzer.print_summary()
    analyzer.analyze_language_performance()

    # Analyze benchmark performance if available
    if analyzer.benchmark_results:
        analyzer.analyze_benchmark_performance()
    else:
        logger.warning("No benchmark results found. Run benchmark.py first for complete analysis.")

    # Generate comprehensive report with benchmark integration
    logger.info("Generating comprehensive report with benchmark data...")
    report = analyzer.generate_comprehensive_report(args.model_name)

    # Save report
    report_path = Path(args.output)
    with report_path.open("w") as f:
        f.write(report)

    # Export CSV if requested
    if args.export_csv:
        analyzer.export_results(args.export_csv)

    # Export benchmark CSV if available
    if analyzer.benchmark_df is not None and not analyzer.benchmark_df.empty:
        benchmark_csv = report_path.parent / f"{args.model_name}_benchmark_comparison.csv"
        analyzer.benchmark_df.to_csv(benchmark_csv, index=False)
        logger.info(f"πŸ“Š Benchmark comparison saved to: {benchmark_csv}")

    logger.info("βœ… CodeSearchNet analysis with benchmarks complete!")
    logger.info(f"πŸ“Š Report saved to: {report_path}")
    logger.info(f"πŸ–ΌοΈ Charts saved to: {images_dir}")
1491
+
1492
if __name__ == "__main__":
    # CLI-only dependency: keep argparse out of module import time.
    import argparse

    main()
src/distiller/beam_utils.py ADDED
@@ -0,0 +1,753 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Beam Cloud Utilities for Model Distillation and Evaluation.
3
+
4
+ This module provides comprehensive utilities for managing Beam volumes, checkpoints,
5
+ and file operations across distillation, evaluation, and analysis workflows.
6
+
7
+ Features:
8
+ - Volume management (direct file operations when mounted)
9
+ - Checkpoint operations (save, load, cleanup, resume)
10
+ - File transfer utilities (copy, move, sync)
11
+ - Evaluation result management
12
+ - Model artifact handling
13
+ - Distributed storage optimization
14
+ """
15
+
16
+ import json
17
+ import logging
18
+ import shutil
19
+ import time
20
+ from pathlib import Path
21
+ from typing import Any
22
+
23
+ # Configure logging
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
+ class BeamVolumeManager:
28
+ """Manager for Beam distributed storage volumes using direct file operations."""
29
+
30
+ def __init__(self, volume_name: str, mount_path: str = "./data") -> None:
31
+ """
32
+ Initialize Beam Volume Manager.
33
+
34
+ Args:
35
+ volume_name: Name of the Beam volume
36
+ mount_path: Local mount path for the volume (should match Beam function mount path)
37
+ """
38
+ self.volume_name = volume_name
39
+ self.mount_path = Path(mount_path)
40
+ self.mount_path.mkdir(parents=True, exist_ok=True)
41
+
42
+ def exists(self) -> bool:
43
+ """Check if the volume mount path exists."""
44
+ return self.mount_path.exists()
45
+
46
+ def list_contents(self, subpath: str = "") -> list[dict[str, Any]]:
47
+ """List contents of the volume directory."""
48
+ try:
49
+ target_path = self.mount_path / subpath if subpath else self.mount_path
50
+ if not target_path.exists():
51
+ logger.warning(f"⚠️ Path does not exist: {target_path}")
52
+ return []
53
+
54
+ contents: list[dict[str, Any]] = []
55
+ for item in target_path.iterdir():
56
+ stat = item.stat()
57
+ contents.append(
58
+ {
59
+ "name": item.name,
60
+ "size": f"{stat.st_size / (1024 * 1024):.2f}MB" if item.is_file() else "0MB",
61
+ "modified": time.ctime(stat.st_mtime),
62
+ "is_dir": item.is_dir(),
63
+ "path": str(item.relative_to(self.mount_path)),
64
+ }
65
+ )
66
+
67
+ return sorted(contents, key=lambda x: (not x["is_dir"], x["name"]))
68
+
69
+ except Exception:
70
+ logger.exception("❌ Error listing contents")
71
+ return []
72
+
73
+ def copy_file(self, src: str | Path, dst: str | Path) -> bool:
74
+ """Copy a file within the volume."""
75
+ try:
76
+ src_path = self.mount_path / src if not Path(src).is_absolute() else Path(src)
77
+ dst_path = self.mount_path / dst if not Path(dst).is_absolute() else Path(dst)
78
+
79
+ dst_path.parent.mkdir(parents=True, exist_ok=True)
80
+ shutil.copy2(src_path, dst_path)
81
+
82
+ logger.info(f"πŸ“„ Copied {src_path.name} to {dst_path}")
83
+ return True
84
+
85
+ except Exception:
86
+ logger.exception("❌ Error copying file")
87
+ return False
88
+
89
+ def copy_directory(self, src: str | Path, dst: str | Path) -> bool:
90
+ """Copy a directory within the volume."""
91
+ try:
92
+ src_path = self.mount_path / src if not Path(src).is_absolute() else Path(src)
93
+ dst_path = self.mount_path / dst if not Path(dst).is_absolute() else Path(dst)
94
+
95
+ if dst_path.exists():
96
+ shutil.rmtree(dst_path)
97
+
98
+ shutil.copytree(src_path, dst_path)
99
+
100
+ logger.info(f"πŸ“ Copied directory {src_path.name} to {dst_path}")
101
+ return True
102
+
103
+ except Exception:
104
+ logger.exception("❌ Error copying directory")
105
+ return False
106
+
107
+ def move_file(self, src: str | Path, dst: str | Path) -> bool:
108
+ """Move a file within the volume."""
109
+ try:
110
+ src_path = self.mount_path / src if not Path(src).is_absolute() else Path(src)
111
+ dst_path = self.mount_path / dst if not Path(dst).is_absolute() else Path(dst)
112
+
113
+ dst_path.parent.mkdir(parents=True, exist_ok=True)
114
+ shutil.move(str(src_path), str(dst_path))
115
+
116
+ logger.info(f"➑️ Moved {src_path.name} to {dst_path}")
117
+ return True
118
+
119
+ except Exception:
120
+ logger.exception("❌ Error moving file")
121
+ return False
122
+
123
+ def remove_file(self, file_path: str | Path) -> bool:
124
+ """Remove a file from the volume."""
125
+ try:
126
+ target_path = self.mount_path / file_path if not Path(file_path).is_absolute() else Path(file_path)
127
+
128
+ if target_path.exists():
129
+ if target_path.is_file():
130
+ target_path.unlink()
131
+ logger.info(f"πŸ—‘οΈ Removed file: {target_path.name}")
132
+ else:
133
+ logger.warning(f"⚠️ Path is not a file: {target_path}")
134
+ return False
135
+ return True
136
+ logger.warning(f"⚠️ File does not exist: {target_path}")
137
+ return False
138
+
139
+ except Exception:
140
+ logger.exception("❌ Error removing file")
141
+ return False
142
+
143
+ def remove_directory(self, dir_path: str | Path) -> bool:
144
+ """Remove a directory from the volume."""
145
+ try:
146
+ target_path = self.mount_path / dir_path if not Path(dir_path).is_absolute() else Path(dir_path)
147
+
148
+ if target_path.exists() and target_path.is_dir():
149
+ shutil.rmtree(target_path)
150
+ logger.info(f"πŸ—‘οΈ Removed directory: {target_path.name}")
151
+ return True
152
+ logger.warning(f"⚠️ Directory does not exist: {target_path}")
153
+ return False
154
+
155
+ except Exception:
156
+ logger.exception("❌ Error removing directory")
157
+ return False
158
+
159
+ def cleanup_old_files(self, pattern: str = "*", older_than_days: int = 7, subpath: str = "") -> list[str]:
160
+ """Clean up old files in the volume based on age and pattern."""
161
+ try:
162
+ target_path = self.mount_path / subpath if subpath else self.mount_path
163
+ if not target_path.exists():
164
+ return []
165
+
166
+ cutoff_time = time.time() - (older_than_days * 24 * 3600)
167
+ removed_files: list[str] = []
168
+
169
+ for item in target_path.rglob(pattern):
170
+ if item.is_file() and item.stat().st_mtime < cutoff_time:
171
+ try:
172
+ item.unlink()
173
+ removed_files.append(str(item.relative_to(self.mount_path)))
174
+ logger.info(f"🧹 Removed old file: {item.name}")
175
+ except Exception as e:
176
+ logger.warning(f"⚠️ Could not remove {item.name}: {e}")
177
+
178
+ if removed_files:
179
+ logger.info(f"🧹 Cleaned up {len(removed_files)} old files")
180
+
181
+ return removed_files
182
+
183
+ except Exception:
184
+ logger.exception("❌ Error during cleanup")
185
+ return []
186
+
187
+ def get_size(self, subpath: str = "") -> int:
188
+ """Get total size of volume or subpath in bytes."""
189
+ try:
190
+ target_path = self.mount_path / subpath if subpath else self.mount_path
191
+ if not target_path.exists():
192
+ return 0
193
+
194
+ total_size = 0
195
+ for item in target_path.rglob("*"):
196
+ if item.is_file():
197
+ total_size += item.stat().st_size
198
+
199
+ return total_size
200
+
201
+ except Exception:
202
+ logger.exception("❌ Error calculating size")
203
+ return 0
204
+
205
+
206
class BeamCheckpointManager:
    """Manager for checkpoint operations on Beam volumes.

    Checkpoints are JSON files named
    ``{checkpoint_prefix}_{stage}_step_{step}.json`` under the checkpoint
    directory on the mounted volume.
    """

    def __init__(self, volume_manager: BeamVolumeManager, checkpoint_prefix: str = "checkpoints") -> None:
        """
        Initialize Checkpoint Manager.

        Args:
            volume_manager: BeamVolumeManager instance
            checkpoint_prefix: Prefix for checkpoint files
        """
        self.volume = volume_manager
        self.checkpoint_prefix = checkpoint_prefix
        self.checkpoint_dir = self.volume.mount_path / checkpoint_prefix
        self.checkpoint_dir.mkdir(parents=True, exist_ok=True)

    def _checkpoint_path(self, stage: str, step: int) -> Path:
        """Build the canonical path for a checkpoint file."""
        return self.checkpoint_dir / f"{self.checkpoint_prefix}_{stage}_step_{step}.json"

    def _parse_checkpoint_name(self, checkpoint_file: Path) -> tuple[str, int] | None:
        """Parse ``(stage, step)`` from a checkpoint filename.

        Fix: the previous ad-hoc ``split("_")`` / ``str.replace`` parsing
        mis-handled stage names (or prefixes) containing underscores; this
        anchors on the prefix and the last ``_step_`` marker instead.
        Returns None for filenames not following the naming scheme.
        """
        stem = checkpoint_file.stem
        head = f"{self.checkpoint_prefix}_"
        if not stem.startswith(head):
            return None
        stage, sep, step_str = stem[len(head):].rpartition("_step_")
        if not sep:
            return None
        try:
            return stage, int(step_str)
        except ValueError:
            return None

    def _stage_checkpoints(self, stage: str) -> list[tuple[int, Path]]:
        """Collect (step, path) pairs for all checkpoints of *stage*."""
        pattern = f"{self.checkpoint_prefix}_{stage}_step_*.json"
        checkpoints: list[tuple[int, Path]] = []
        for checkpoint_file in self.checkpoint_dir.glob(pattern):
            parsed = self._parse_checkpoint_name(checkpoint_file)
            if parsed is not None and parsed[0] == stage:
                checkpoints.append((parsed[1], checkpoint_file))
        return checkpoints

    def save_checkpoint(self, stage: str, data: dict[str, Any], step: int = 0) -> bool:
        """Save checkpoint to volume. Returns True on success."""
        try:
            checkpoint_path = self._checkpoint_path(stage, step)

            with checkpoint_path.open("w") as f:
                # default=str keeps non-JSON-native values serializable.
                json.dump(data, f, indent=2, default=str)

            logger.info(f"πŸ’Ύ Saved checkpoint: {stage} step {step}")
            return True

        except Exception:
            logger.exception("❌ Error saving checkpoint")
            return False

    def load_checkpoint(self, stage: str, step: int = 0) -> dict[str, Any] | None:
        """Load checkpoint from volume, or None when missing or on error."""
        try:
            checkpoint_path = self._checkpoint_path(stage, step)

            if checkpoint_path.exists():
                with checkpoint_path.open("r") as f:
                    data = json.load(f)
                logger.info(f"πŸ“‚ Loaded checkpoint: {stage} step {step}")
                return data

            logger.info(f"Info: No checkpoint found: {stage} step {step}")
            return None

        except Exception:
            logger.exception("❌ Error loading checkpoint")
            return None

    def get_latest_checkpoint(self, stage: str) -> tuple[int, dict[str, Any]] | None:
        """Get the latest (highest-step) checkpoint for a stage."""
        try:
            stage_checkpoints = self._stage_checkpoints(stage)

            if not stage_checkpoints:
                logger.info(f"Info: No checkpoints found for stage: {stage}")
                return None

            latest_step, latest_file = max(stage_checkpoints, key=lambda x: x[0])

            with latest_file.open("r") as f:
                data = json.load(f)
            logger.info(f"πŸ“‚ Found latest checkpoint: {stage} step {latest_step}")
            return latest_step, data

        except Exception:
            logger.exception("❌ Error finding latest checkpoint")
            return None

    def cleanup_old_checkpoints(self, stage: str, keep_latest: int = 3) -> list[str]:
        """Clean up old checkpoints, keeping only the latest N.

        Returns the filenames that were removed.
        """
        try:
            stage_checkpoints = self._stage_checkpoints(stage)
            # Newest first, so everything past keep_latest is stale.
            stage_checkpoints.sort(key=lambda x: x[0], reverse=True)

            removed_files: list[str] = []
            for _step, checkpoint_file in stage_checkpoints[keep_latest:]:
                try:
                    checkpoint_file.unlink()
                    removed_files.append(checkpoint_file.name)
                    logger.info(f"🧹 Removed old checkpoint: {checkpoint_file.name}")
                except Exception as e:
                    logger.warning(f"⚠️ Could not remove {checkpoint_file.name}: {e}")

            if removed_files:
                logger.info(f"🧹 Cleaned up {len(removed_files)} old checkpoints for {stage}")

            return removed_files

        except Exception:
            logger.exception("❌ Error cleaning up checkpoints")
            return []

    def list_checkpoints(self, stage: str | None = None) -> list[dict[str, Any]]:
        """List all checkpoints, optionally filtered by stage.

        Malformed filenames (previously reported with a bogus step of 0 and
        possibly the wrong stage) are now skipped entirely.
        """
        try:
            checkpoints: list[dict[str, Any]] = []
            pattern = f"{self.checkpoint_prefix}_*.json"

            for checkpoint_file in self.checkpoint_dir.glob(pattern):
                parsed = self._parse_checkpoint_name(checkpoint_file)
                if parsed is None:
                    continue
                checkpoint_stage, step = parsed

                if stage is None or checkpoint_stage == stage:
                    stat = checkpoint_file.stat()
                    checkpoints.append(
                        {
                            "stage": checkpoint_stage,
                            "step": step,
                            "filename": checkpoint_file.name,
                            "size": f"{stat.st_size / 1024:.1f}KB",
                            "modified": time.ctime(stat.st_mtime),
                        }
                    )

            return sorted(checkpoints, key=lambda x: (x["stage"], x["step"]))

        except Exception:
            logger.exception("❌ Error listing checkpoints")
            return []
361
+
362
+
363
class BeamModelManager:
    """Manager for model artifacts on Beam volumes.

    Models live as subdirectories of ``{mount_path}/{model_prefix}``.
    """

    def __init__(self, volume_manager: BeamVolumeManager, model_prefix: str = "models") -> None:
        """
        Initialize Model Manager.

        Args:
            volume_manager: BeamVolumeManager instance
            model_prefix: Prefix for model files
        """
        self.volume = volume_manager
        self.model_prefix = model_prefix
        self.model_dir = self.volume.mount_path / model_prefix
        self.model_dir.mkdir(parents=True, exist_ok=True)

    def save_model(self, model_name: str, local_model_path: str | Path) -> bool:
        """Copy a local model file or directory into the volume under *model_name*."""
        try:
            source = Path(local_model_path)
            if not source.exists():
                logger.error(f"❌ Model path does not exist: {source}")
                return False

            destination = self.model_dir / model_name

            if source.is_dir():
                # Replace any existing copy wholesale.
                if destination.exists():
                    shutil.rmtree(destination)
                shutil.copytree(source, destination)
                logger.info(f"πŸ’Ύ Saved model directory {model_name}")
                return True

            # Single file: place it inside a directory named after the model.
            destination.mkdir(exist_ok=True)
            shutil.copy2(source, destination / source.name)
            logger.info(f"πŸ’Ύ Saved model file {model_name}")
            return True

        except Exception:
            logger.exception("❌ Error saving model")
            return False

    def load_model(self, model_name: str, local_model_path: str | Path = "./models") -> bool:
        """Copy a stored model out of the volume into *local_model_path*."""
        try:
            target_root = Path(local_model_path)
            target_root.mkdir(parents=True, exist_ok=True)

            source = self.model_dir / model_name
            destination = target_root / model_name

            if not source.exists():
                logger.error(f"❌ Model does not exist: {model_name}")
                return False

            # Clear whatever currently occupies the destination.
            if destination.is_dir():
                shutil.rmtree(destination)
            elif destination.exists():
                destination.unlink()

            if source.is_dir():
                shutil.copytree(source, destination)
            else:
                shutil.copy2(source, destination)

            logger.info(f"πŸ“₯ Loaded model {model_name}")
            return True

        except Exception:
            logger.exception("❌ Error loading model")
            return False

    def list_models(self) -> list[dict[str, str]]:
        """List all models in the volume, sorted by name."""
        try:
            if not self.model_dir.exists():
                return []

            models = [
                {
                    "name": entry.name,
                    # Directory size is the sum of all contained file sizes.
                    "size": f"{sum(f.stat().st_size for f in entry.rglob('*') if f.is_file()) / (1024 * 1024):.1f}MB",
                    "modified": time.ctime(entry.stat().st_mtime),
                }
                for entry in self.model_dir.iterdir()
                if entry.is_dir()
            ]

            return sorted(models, key=lambda x: x["name"])

        except Exception:
            logger.exception("❌ Error listing models")
            return []

    def remove_model(self, model_name: str) -> bool:
        """Delete a stored model (file or directory) from the volume."""
        try:
            model_path = self.model_dir / model_name

            if not model_path.exists():
                logger.warning(f"⚠️ Model does not exist: {model_name}")
                return False

            if model_path.is_dir():
                shutil.rmtree(model_path)
            else:
                model_path.unlink()
            logger.info(f"πŸ—‘οΈ Removed model: {model_name}")
            return True

        except Exception:
            logger.exception("❌ Error removing model")
            return False
484
+
485
+
486
class BeamEvaluationManager:
    """Manager for evaluation results on Beam volumes.

    Results are JSON files named ``{eval_type}_eval_{model_name}.json``
    (with "/" in model names replaced by "_") under the results directory
    on the mounted volume.
    """

    def __init__(
        self,
        volume_manager: BeamVolumeManager,
        results_prefix: str = "evaluation_results",
    ) -> None:
        """
        Initialize Evaluation Manager.

        Args:
            volume_manager: BeamVolumeManager instance
            results_prefix: Prefix for evaluation result files
        """
        self.volume = volume_manager
        self.results_prefix = results_prefix
        self.results_dir = self.volume.mount_path / results_prefix
        self.results_dir.mkdir(parents=True, exist_ok=True)

    def _results_path(self, model_name: str, eval_type: str = "codesearchnet") -> Path:
        """Canonical result-file path for *model_name*/*eval_type*.

        Shared by save/load/remove so the naming scheme lives in one place.
        """
        return self.results_dir / f"{eval_type}_eval_{model_name.replace('/', '_')}.json"

    def save_evaluation_results(
        self, model_name: str, results: dict[str, Any], eval_type: str = "codesearchnet"
    ) -> bool:
        """Save evaluation results to Beam volume."""
        try:
            results_path = self._results_path(model_name, eval_type)

            with results_path.open("w") as f:
                json.dump(results, f, indent=2, default=str)

            logger.info(f"πŸ’Ύ Saved evaluation results for {model_name}")
            return True

        except Exception:
            logger.exception("❌ Error saving evaluation results")
            return False

    def load_evaluation_results(self, model_name: str, eval_type: str = "codesearchnet") -> dict[str, Any] | None:
        """Load evaluation results from Beam volume, or None when missing."""
        try:
            results_path = self._results_path(model_name, eval_type)

            if results_path.exists():
                with results_path.open("r") as f:
                    results = json.load(f)
                logger.info(f"πŸ“‚ Loaded evaluation results for {model_name}")
                return results

            logger.info(f"Info: No evaluation results found for {model_name}")
            return None

        except Exception:
            logger.exception("❌ Error loading evaluation results")
            return None

    def list_evaluation_results(self, eval_type: str | None = None) -> list[dict[str, str]]:
        """List all evaluation results, optionally filtered by eval type.

        Fix: model names are now recovered by splitting on the first
        "_eval_" marker instead of chained ``str.replace`` calls, which
        clobbered model names containing the eval type or "_eval_" as a
        substring. Files not matching the naming scheme are skipped.
        """
        try:
            results: list[dict[str, str]] = []

            if not self.results_dir.exists():
                return results

            for item in self.results_dir.glob("*.json"):
                if eval_type is not None and not item.name.startswith(f"{eval_type}_eval_"):
                    continue

                prefix, marker, model_name = item.stem.partition("_eval_")
                if not marker:
                    # Not produced by save_evaluation_results; skip quietly.
                    continue
                if eval_type is None:
                    # Unfiltered listings keep the eval-type prefix, matching
                    # the historical output format.
                    model_name = f"{prefix}_{model_name}"

                stat = item.stat()
                results.append(
                    {
                        "model_name": model_name,
                        "filename": item.name,
                        "size": f"{stat.st_size / 1024:.1f}KB",
                        "modified": time.ctime(stat.st_mtime),
                    }
                )

            return sorted(results, key=lambda x: x["model_name"])

        except Exception:
            logger.exception("❌ Error listing evaluation results")
            return []

    def remove_evaluation_results(self, model_name: str, eval_type: str = "codesearchnet") -> bool:
        """Remove evaluation results from volume."""
        try:
            results_path = self._results_path(model_name, eval_type)

            if results_path.exists():
                results_path.unlink()
                logger.info(f"πŸ—‘οΈ Removed evaluation results for {model_name}")
                return True
            logger.warning(f"⚠️ Evaluation results do not exist for {model_name}")
            return False

        except Exception:
            logger.exception("❌ Error removing evaluation results")
            return False
591
+
592
+
593
def create_beam_utilities(
    volume_name: str, mount_path: str = "./data"
) -> tuple[BeamVolumeManager, BeamCheckpointManager, BeamModelManager, BeamEvaluationManager]:
    """
    Create a complete set of Beam utilities.

    All three higher-level managers share one BeamVolumeManager instance.

    Args:
        volume_name: Name of the Beam volume
        mount_path: Local mount path for the volume

    Returns:
        Tuple of (volume_manager, checkpoint_manager, model_manager, evaluation_manager)
    """
    volume = BeamVolumeManager(volume_name, mount_path)
    return (
        volume,
        BeamCheckpointManager(volume),
        BeamModelManager(volume),
        BeamEvaluationManager(volume),
    )
612
+
613
+
614
def cleanup_beam_workspace(volume_name: str, mount_path: str = "./data", confirm: bool = False) -> bool:
    """
    Clean up entire Beam workspace including all data in the mounted volume.

    Args:
        volume_name: Name of the volume to clean up
        mount_path: Mount path of the volume
        confirm: If True, skip confirmation prompt

    Returns:
        True if cleanup successful, False otherwise
    """
    if not confirm:
        prompt = f"⚠️ This will delete all data in volume '{volume_name}' at '{mount_path}'. Continue? (y/N): "
        if input(prompt).lower() != "y":
            logger.info("Cleanup cancelled")
            return False

    try:
        volume_manager = BeamVolumeManager(volume_name, mount_path)

        if not volume_manager.exists():
            logger.info(f"Volume mount path does not exist: {mount_path}")
            return True

        # Report what is about to be deleted.
        contents = volume_manager.list_contents()
        logger.info(f"πŸ—‘οΈ Will delete {len(contents)} items from volume '{volume_name}'")

        # Best effort: one undeletable entry does not abort the rest.
        for entry in volume_manager.mount_path.iterdir():
            try:
                if entry.is_dir():
                    shutil.rmtree(entry)
                    logger.info(f"πŸ—‘οΈ Removed directory: {entry.name}")
                else:
                    entry.unlink()
                    logger.info(f"πŸ—‘οΈ Removed file: {entry.name}")
            except Exception as e:
                logger.warning(f"⚠️ Could not remove {entry.name}: {e}")

        logger.info(f"βœ… Successfully cleaned up Beam workspace: {volume_name}")
        return True

    except Exception:
        logger.exception("❌ Error during cleanup")
        return False
661
+
662
+
663
def get_workspace_info(volume_name: str, mount_path: str = "./data") -> dict[str, Any]:
    """
    Get information about the Beam workspace.

    Args:
        volume_name: Name of the volume
        mount_path: Mount path of the volume

    Returns:
        Dictionary with workspace information
    """
    try:
        manager = BeamVolumeManager(volume_name, mount_path)

        if not manager.exists():
            return {
                "volume_name": volume_name,
                "mount_path": mount_path,
                "exists": False,
                "size": 0,
                "contents": [],
            }

        entries = manager.list_contents()
        size_bytes = manager.get_size()

        return {
            "volume_name": volume_name,
            "mount_path": str(manager.mount_path),
            "exists": True,
            "size": size_bytes,
            "size_mb": f"{size_bytes / (1024 * 1024):.1f}MB",
            "num_items": len(entries),
            "contents": entries[:10],  # First 10 items
        }

    except Exception:
        logger.exception("❌ Error getting workspace info")
        return {
            "volume_name": volume_name,
            "mount_path": mount_path,
            "error": "Error occurred",
        }
706
+
707
+
708
+ # Example usage functions
709
def example_distillation_workflow() -> None:
    """Example of using Beam utilities for distillation workflow."""
    volume_name = "gte_qwen2_m2v_code"
    mount_path = "./gte_qwen2_m2v_code"  # Should match Beam function mount path

    # Build the full utility set (model manager unused in this example).
    volume_mgr, checkpoint_mgr, _model_mgr, eval_mgr = create_beam_utilities(volume_name, mount_path)

    # Bail out early if the volume is not mounted.
    if not volume_mgr.exists():
        logger.warning(f"Volume {volume_name} not found at {mount_path}")
        return
    logger.info(f"Volume {volume_name} is mounted at {mount_path}")

    # Save a checkpoint
    checkpoint_mgr.save_checkpoint(
        "training",
        {
            "epoch": 1,
            "loss": 0.25,
            "model_state": "dummy_state",
            "timestamp": time.time(),
        },
        step=1000,
    )

    # List checkpoints
    training_checkpoints = checkpoint_mgr.list_checkpoints("training")
    logger.info(f"Found {len(training_checkpoints)} training checkpoints")

    # Save evaluation results
    eval_mgr.save_evaluation_results(
        "gte_qwen2_m2v_code",
        {
            "model_name": "gte_qwen2_m2v_code",
            "overall": {"ndcg@10": 0.35, "mrr": 0.42},
            "timestamp": time.time(),
        },
    )

    # Get workspace info
    info = get_workspace_info(volume_name, mount_path)
    logger.info(f"Workspace info: {info}")
748
+
749
+
750
if __name__ == "__main__":
    # Demonstration entry point: run the example workflow with INFO logging.
    logging.basicConfig(level=logging.INFO)
    example_distillation_workflow()
src/distiller/benchmark.py ADDED
@@ -0,0 +1,1181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Operational Performance Benchmarking for Embedding Models.
3
+
4
+ This module benchmarks embedding models on operational metrics like:
5
+ - Inference speed (latency and throughput)
6
+ - Memory efficiency (RAM and GPU usage)
7
+ - Model size and storage requirements
8
+ - Scalability with batch size
9
+ - CPU vs GPU performance
10
+ """
11
+
12
+ import gc
13
+ import json
14
+ import logging
15
+ import os
16
+ import time
17
+ from pathlib import Path
18
+ from typing import Any
19
+
20
+ import pandas as pd
21
+ import psutil
22
+ import torch
23
+ from beam import GpuType, Image, Volume, function
24
+ from sentence_transformers import SentenceTransformer
25
+
26
+ from .beam_utils import (
27
+ BeamCheckpointManager,
28
+ BeamEvaluationManager,
29
+ create_beam_utilities,
30
+ )
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+ # =============================================================================
35
+ # BEAM CONFIGURATION
36
+ # =============================================================================
37
+
38
+ GPU_NAME = GpuType.A100_40
39
+ VOLUME_NAME = "gte_qwen2_m2v_code" # Same volume as distill.py and evaluate.py
40
+ VOLUME_PATH = "./gte_qwen2_m2v_code" # Same mount path as distill.py and evaluate.py
41
+ BENCHMARK_RESULTS_DIR = "benchmark_results" # Subdirectory within volume
42
+ BENCHMARK_CACHE_DIR = "benchmark_cache" # Cache for models
43
+
44
+ IMAGE = Image(python_version="python3.12").add_python_packages(
45
+ [
46
+ "torch>=2.7.0",
47
+ "transformers>=4.40.0",
48
+ "datasets>=3.2.0",
49
+ "sentence-transformers>=4.1.0",
50
+ "model2vec[train]>=0.5.0",
51
+ "numpy>=1.26.4",
52
+ "scikit-learn>=1.6.1",
53
+ "pandas>=2.0.0",
54
+ "tqdm>=4.65.0",
55
+ "psutil>=5.9.0",
56
+ ]
57
+ )
58
+
59
+ # =============================================================================
60
+ # CONFIGURATION
61
+ # =============================================================================
62
+
63
+ DEFAULT_OUTPUT_DIR = "benchmark_results" # Local fallback directory
64
+
65
+ # Default models to benchmark (can be overridden via command line)
66
+ DEFAULT_BENCHMARK_MODELS = [
67
+ # Your distilled model (local files in Beam volume root)
68
+ "gte_qwen2_m2v_code", # This will be resolved to VOLUME_PATH in Beam
69
+ # Established Code Models
70
+ "sentence-transformers/all-MiniLM-L6-v2",
71
+ "microsoft/codebert-base",
72
+ "microsoft/graphcodebert-base",
73
+ "huggingface/CodeBERTa-small-v1",
74
+ "sentence-transformers/all-mpnet-base-v2",
75
+ "sentence-transformers/all-MiniLM-L12-v2",
76
+ # Model2Vec & Efficiency Models (Direct Competitors)
77
+ "minishlab/potion-base-8M",
78
+ "minishlab/potion-retrieval-32M",
79
+ # Small Transformer-Based Code Models
80
+ "Salesforce/codet5-base",
81
+ ]
82
+
83
+ # =============================================================================
84
+ # CHECKPOINT CONFIGURATION
85
+ # =============================================================================
86
+
87
+ # Prevent conflicts with other modules by using unique prefixes
88
+ BENCHMARK_CHECKPOINT_PREFIX = "benchmark_checkpoints"
89
+ MODEL_CACHE_PREFIX = "model_cache"
90
+
91
+ # Sample texts for benchmarking (various lengths)
92
+ BENCHMARK_TEXTS = {
93
+ "short": [
94
+ "def add(a, b): return a + b",
95
+ "function multiply(x, y) { return x * y; }",
96
+ "class Calculator { public int subtract(int a, int b) { return a - b; } }",
97
+ ]
98
+ * 100, # 300 short texts
99
+ "medium": [
100
+ "def fibonacci(n):\n if n <= 1:\n return n\n return fibonacci(n-1) + fibonacci(n-2)",
101
+ "function quickSort(arr) {\n if (arr.length <= 1) return arr;\n const pivot = arr[arr.length - 1];\n const left = [], right = [];\n for (let i = 0; i < arr.length - 1; i++) {\n if (arr[i] < pivot) left.push(arr[i]);\n else right.push(arr[i]);\n }\n return [...quickSort(left), pivot, ...quickSort(right)];\n}",
102
+ ]
103
+ * 50, # 100 medium texts
104
+ "long": [
105
+ """
106
+ def complex_algorithm(data, config):
107
+ '''
108
+ Complex data processing algorithm with multiple steps.
109
+
110
+ Args:
111
+ data: Input data structure
112
+ config: Configuration parameters
113
+
114
+ Returns:
115
+ Processed results
116
+ '''
117
+ results = []
118
+
119
+ # Step 1: Data validation
120
+ if not isinstance(data, (list, tuple)):
121
+ raise ValueError("Data must be list or tuple")
122
+
123
+ # Step 2: Preprocessing
124
+ processed_data = []
125
+ for item in data:
126
+ if config.get('normalize', False):
127
+ item = normalize_item(item)
128
+ if config.get('filter', False):
129
+ if not filter_item(item, config['filter_criteria']):
130
+ continue
131
+ processed_data.append(item)
132
+
133
+ # Step 3: Main processing
134
+ for item in processed_data:
135
+ result = process_item(item, config)
136
+ if result is not None:
137
+ results.append(result)
138
+
139
+ # Step 4: Post-processing
140
+ if config.get('sort', False):
141
+ results.sort(key=lambda x: x.get('score', 0), reverse=True)
142
+
143
+ return results
144
+ """.strip(),
145
+ ]
146
+ * 20, # 20 long texts
147
+ }
148
+
149
+
150
class PerformanceBenchmark:
    """Comprehensive performance benchmarking for embedding models.

    Covers model size on disk and in memory, inference latency/throughput
    across batch sizes and text lengths, GPU memory scaling, and a
    CPU-vs-GPU throughput comparison.

    Fixes vs. the original implementation:
    - ``benchmark_cpu_vs_gpu`` now passes ``trust_remote_code=True`` so it is
      consistent with ``load_model`` (remote-code models no longer fail only
      in the device comparison).
    - ``benchmark_memory_scaling`` resets the CUDA peak-memory counter before
      each encode so ``max_memory_allocated`` reflects only the current batch,
      and a dead ``torch.cuda.memory_allocated()`` call (result discarded)
      was removed.
    """

    def __init__(
        self,
        model_path: str,
        model_name: str | None = None,
        checkpoint_manager: BeamCheckpointManager | None = None,
        eval_manager: BeamEvaluationManager | None = None,
    ) -> None:
        """Initialize benchmarker with model and optional Beam utilities.

        Args:
            model_path: Local directory/file or HuggingFace model id.
            model_name: Display name; defaults to the final path component.
            checkpoint_manager: Optional Beam checkpoint manager.
            eval_manager: Optional Beam evaluation manager.
        """
        self.model_path = model_path
        self.model_name = model_name or Path(model_path).name
        self.model: SentenceTransformer | None = None
        # Prefer GPU when available; individual benchmarks may override this.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.results: dict[str, Any] = {}
        self.checkpoint_manager = checkpoint_manager
        self.eval_manager = eval_manager

    def load_model(self) -> None:
        """Load the embedding model and record the load time in ``results``.

        Raises:
            Exception: Re-raised after logging if the model cannot be loaded.
        """
        logger.info(f"Loading model from {self.model_path}")
        start_time = time.time()

        try:
            self.model = SentenceTransformer(self.model_path, device=self.device, trust_remote_code=True)
            load_time = time.time() - start_time

            logger.info(f"βœ… Model loaded in {load_time:.2f}s on {self.device}")
            self.results["model_load_time"] = load_time

        except Exception:
            logger.exception("❌ Failed to load model")
            raise

    def measure_model_size(self) -> dict[str, float]:
        """Measure model size metrics (disk, parameters, embedding dim, memory).

        Returns:
            Dict of size metrics; also stored as ``results["size_metrics"]``.
        """
        logger.info("πŸ“ Measuring model size...")

        size_metrics = {}

        # Disk size - handle both local paths and HuggingFace model ids
        try:
            if Path(self.model_path).is_dir():
                # Local directory - only count model/config files, not caches
                model_extensions = {".safetensors", ".bin", ".json", ".txt", ".tokenizer"}
                total_size = 0
                model_dir = Path(self.model_path)

                for file_path in model_dir.rglob("*"):
                    if file_path.is_file() and (
                        file_path.suffix.lower() in model_extensions
                        or file_path.name.lower() in {"config.json", "tokenizer.json", "modules.json", "README.md"}
                    ):
                        total_size += file_path.stat().st_size

                size_metrics["disk_size_mb"] = total_size / (1024 * 1024)
            elif Path(self.model_path).is_file():
                # Single file
                size_metrics["disk_size_mb"] = Path(self.model_path).stat().st_size / (1024 * 1024)
            else:
                # HuggingFace model id - no local files, so estimate from config
                from transformers import AutoConfig

                try:
                    config = AutoConfig.from_pretrained(self.model_path)
                    # Very rough estimate from architecture hyperparameters
                    if hasattr(config, "hidden_size") and hasattr(config, "num_hidden_layers"):
                        estimated_params = config.hidden_size * config.num_hidden_layers * 1000  # Very rough
                        size_metrics["disk_size_mb"] = estimated_params * 4 / (1024 * 1024)  # 4 bytes per float32
                    else:
                        size_metrics["disk_size_mb"] = 0  # Unknown
                except Exception:
                    logger.warning(f"Could not determine disk size for HuggingFace model: {self.model_path}")
                    size_metrics["disk_size_mb"] = 0  # Unknown
        except Exception as e:
            logger.warning(f"Could not determine disk size: {e}")
            size_metrics["disk_size_mb"] = 0

        # Model parameters (if accessible)
        try:
            if self.model is not None and hasattr(self.model, "modules"):
                total_params = sum(p.numel() for p in self.model.parameters())
                size_metrics["parameters_millions"] = total_params / 1_000_000

                # Try to get the embedding dimension from the underlying HF config
                try:
                    # Use the public modules() method instead of private _modules
                    modules = list(self.model.modules())
                    if len(modules) > 1:  # modules[0] is the whole model; modules[1] is the first submodule
                        first_module = modules[1]
                        if hasattr(first_module, "auto_model") and hasattr(first_module.auto_model, "config"):
                            config = first_module.auto_model.config
                            if hasattr(config, "hidden_size"):
                                size_metrics["embedding_dim"] = config.hidden_size
                            elif hasattr(config, "model_dim"):
                                size_metrics["embedding_dim"] = config.model_dim
                except Exception as e:
                    # Silently continue if this method fails
                    logger.debug(f"Could not extract embedding dimension from model config: {e}")

            # For Model2Vec static models
            elif self.model is not None and hasattr(self.model, "embedding"):
                # Handle both tensor and numpy array embeddings
                embedding = self.model.embedding
                if hasattr(embedding, "shape"):
                    vocab_size, embedding_dim = embedding.shape  # type: ignore[misc]
                    size_metrics["parameters_millions"] = (vocab_size * embedding_dim) / 1_000_000
                    size_metrics["vocab_size"] = vocab_size
                    size_metrics["embedding_dim"] = embedding_dim
                else:
                    logger.warning("Could not determine embedding shape for Model2Vec model")

            # Fallback: infer embedding dimension from a test encoding
            if "embedding_dim" not in size_metrics and self.model is not None:
                try:
                    test_embedding = self.model.encode(["test"], convert_to_tensor=False)
                    if hasattr(test_embedding, "shape") and len(test_embedding.shape) >= 2:
                        size_metrics["embedding_dim"] = test_embedding.shape[1]
                    elif (
                        isinstance(test_embedding, (list, tuple))
                        and len(test_embedding) > 0
                        and hasattr(test_embedding[0], "__len__")
                    ):
                        size_metrics["embedding_dim"] = len(test_embedding[0])
                except Exception as e:
                    logger.warning(f"Could not determine embedding dimension: {e}")

        except Exception as e:
            logger.warning(f"Could not determine parameter count: {e}")

        # Memory footprint (GPU allocation after cache clear)
        if self.device == "cuda" and torch.cuda.is_available():
            torch.cuda.empty_cache()
            size_metrics["gpu_memory_mb"] = torch.cuda.memory_allocated() / (1024 * 1024)

        # RAM usage (approximate: whole-process RSS, not model-only)
        process = psutil.Process(os.getpid())
        size_metrics["ram_usage_mb"] = process.memory_info().rss / (1024 * 1024)

        self.results["size_metrics"] = size_metrics
        return size_metrics

    def benchmark_inference_speed(self, batch_sizes: list[int] | None = None) -> dict[str, Any]:
        """Benchmark inference speed across batch sizes and text lengths.

        Args:
            batch_sizes: Batch sizes to test; defaults to [1, 8, 16, 32, 64, 128].

        Returns:
            Nested dict keyed by text length, then ``batch_<n>``.
        """
        if batch_sizes is None:
            batch_sizes = [1, 8, 16, 32, 64, 128]
        logger.info("⚑ Benchmarking inference speed...")

        if self.model is None:
            self.load_model()

        if self.model is None:
            msg = "Failed to load model"
            raise RuntimeError(msg)

        speed_results = {}
        text_lengths = ["short", "medium", "long"]

        for text_length in text_lengths:
            logger.info(f"  πŸ“ Testing {text_length} texts...")
            texts = BENCHMARK_TEXTS[text_length]

            length_results = {}

            for batch_size in batch_sizes:
                if batch_size > len(texts):
                    continue

                logger.info(f"    πŸ”„ Batch size: {batch_size}")

                batch_texts = texts[:batch_size]

                # Warmup run so CUDA kernel launches don't skew the timing
                if self.device == "cuda":
                    torch.cuda.synchronize()
                _ = self.model.encode(batch_texts[: min(2, batch_size)], convert_to_tensor=False)

                # Clear cache before the timed run
                if self.device == "cuda":
                    torch.cuda.empty_cache()
                    torch.cuda.synchronize()

                # Timed run (synchronize so pending GPU work is included)
                start_time = time.perf_counter()

                embeddings = self.model.encode(batch_texts, convert_to_tensor=False, show_progress_bar=False)

                if self.device == "cuda":
                    torch.cuda.synchronize()

                end_time = time.perf_counter()

                # Derived metrics
                total_time = end_time - start_time
                time_per_text = total_time / batch_size
                texts_per_second = batch_size / total_time

                # Rough token estimate: whitespace-split word count
                avg_tokens = sum(len(text.split()) for text in batch_texts) / batch_size
                total_tokens = avg_tokens * batch_size
                tokens_per_second = total_tokens / total_time

                length_results[f"batch_{batch_size}"] = {
                    "total_time_ms": total_time * 1000,
                    "time_per_text_ms": time_per_text * 1000,
                    "texts_per_second": texts_per_second,
                    "tokens_per_second": tokens_per_second,
                    "avg_tokens_per_text": avg_tokens,
                    "embedding_shape": embeddings.shape
                    if hasattr(embeddings, "shape")
                    else f"({len(embeddings)}, {len(embeddings[0]) if embeddings else 0})",
                }

            speed_results[text_length] = length_results

        self.results["speed_benchmarks"] = speed_results
        return speed_results

    def benchmark_memory_scaling(self, batch_sizes: list[int] | None = None) -> dict[str, Any]:
        """Benchmark GPU memory usage across batch sizes (medium-length texts).

        Args:
            batch_sizes: Batch sizes to test; defaults to [1, 8, ..., 256].

        Returns:
            Dict keyed by ``batch_<n>`` with memory metrics, or ``oom``/``error``
            markers when a batch fails.
        """
        if batch_sizes is None:
            batch_sizes = [1, 8, 16, 32, 64, 128, 256]
        logger.info("πŸ’Ύ Benchmarking memory scaling...")

        if self.model is None:
            self.load_model()

        if self.model is None:
            msg = "Failed to load model"
            raise RuntimeError(msg)

        memory_results: dict[str, Any] = {}
        texts = BENCHMARK_TEXTS["medium"]

        # Baseline: allocation with the model loaded but nothing in flight.
        baseline_memory = 0
        if self.device == "cuda":
            torch.cuda.empty_cache()
            baseline_memory = torch.cuda.memory_allocated()

        for batch_size in batch_sizes:
            if batch_size > len(texts):
                continue

            logger.info(f"  πŸ“Š Testing batch size: {batch_size}")

            # Clear cache and reset the peak counter so max_memory_allocated()
            # below reflects only the upcoming encode (fix: the original never
            # reset before the first batch, so its "peak" included load spikes).
            if self.device == "cuda":
                torch.cuda.empty_cache()
                torch.cuda.reset_peak_memory_stats()
                gc.collect()

            batch_texts = texts[:batch_size]

            # Run inference
            try:
                embeddings = self.model.encode(
                    batch_texts,
                    convert_to_tensor=self.device == "cuda",
                    show_progress_bar=False,
                )

                # Peak allocation reached during this batch's encode
                memory_after = 0
                if self.device == "cuda":
                    memory_after = torch.cuda.max_memory_allocated()

                memory_used_mb = (memory_after - baseline_memory) / (1024 * 1024)
                memory_per_text_mb = memory_used_mb / batch_size if batch_size > 0 else 0

                memory_results[f"batch_{batch_size}"] = {
                    "memory_used_mb": memory_used_mb,
                    "memory_per_text_mb": memory_per_text_mb,
                    "baseline_memory_mb": baseline_memory / (1024 * 1024),
                    "peak_memory_mb": memory_after / (1024 * 1024),
                }

                # Free this batch's output before the next (larger) run
                del embeddings

            except torch.cuda.OutOfMemoryError:
                # Larger batches would also OOM, so stop scaling here.
                logger.warning(f"❌ OOM at batch size {batch_size}")
                memory_results[f"batch_{batch_size}"] = {"oom": True}
                break
            except Exception as e:
                logger.warning(f"❌ Error at batch size {batch_size}: {e}")
                memory_results[f"batch_{batch_size}"] = {"error": str(e)}

        self.results["memory_benchmarks"] = memory_results
        return memory_results

    def benchmark_cpu_vs_gpu(self) -> dict[str, Any]:
        """Compare throughput on CPU vs GPU using a fixed 32-text medium batch.

        Returns:
            Dict keyed by device name with timing metrics or an ``error`` entry.
        """
        logger.info("πŸ–₯️ Benchmarking CPU vs GPU performance...")

        comparison_results = {}
        test_texts = BENCHMARK_TEXTS["medium"][:32]  # Fixed batch size

        devices = ["cpu"]
        if torch.cuda.is_available():
            devices.append("cuda")

        for device in devices:
            logger.info(f"  πŸ”„ Testing on {device}")

            # Load a fresh model on the target device. trust_remote_code=True
            # matches load_model (fix: original omitted it here, so remote-code
            # models failed only in this comparison).
            try:
                model = SentenceTransformer(self.model_path, device=device, trust_remote_code=True)

                # Warmup
                _ = model.encode(test_texts[:2], convert_to_tensor=False)

                # Benchmark
                start_time = time.perf_counter()
                embeddings = model.encode(test_texts, convert_to_tensor=False, show_progress_bar=False)
                end_time = time.perf_counter()

                total_time = end_time - start_time

                comparison_results[device] = {
                    "total_time_ms": total_time * 1000,
                    "texts_per_second": len(test_texts) / total_time,
                    "time_per_text_ms": (total_time / len(test_texts)) * 1000,
                    "embedding_shape": embeddings.shape
                    if hasattr(embeddings, "shape")
                    else f"({len(embeddings)}, {len(embeddings[0]) if embeddings else 0})",
                }

                del model
                if device == "cuda":
                    torch.cuda.empty_cache()

            except Exception as e:
                logger.warning(f"❌ Failed on {device}: {e}")
                comparison_results[device] = {"error": str(e)}

        self.results["cpu_vs_gpu"] = comparison_results
        return comparison_results

    def run_comprehensive_benchmark(self) -> dict[str, Any]:
        """Run all benchmarks and attach model/system metadata to results.

        Returns:
            The accumulated ``results`` dict.
        """
        logger.info(f"πŸš€ Starting comprehensive benchmark for {self.model_name}")

        # Load model
        self.load_model()

        # Run all benchmarks
        self.measure_model_size()
        self.benchmark_inference_speed()
        self.benchmark_memory_scaling()
        self.benchmark_cpu_vs_gpu()

        # Metadata for later comparison/reporting
        self.results["model_name"] = self.model_name
        self.results["model_path"] = self.model_path
        self.results["device"] = self.device
        self.results["torch_version"] = torch.__version__
        self.results["cuda_available"] = torch.cuda.is_available()

        if torch.cuda.is_available():
            self.results["gpu_name"] = torch.cuda.get_device_name(0)
            self.results["gpu_memory_gb"] = torch.cuda.get_device_properties(0).total_memory / (1024**3)

        # System info
        self.results["cpu_count"] = psutil.cpu_count()
        self.results["ram_gb"] = psutil.virtual_memory().total / (1024**3)

        logger.info("βœ… Comprehensive benchmark completed!")
        return self.results

    def save_results(self, output_file: str) -> None:
        """Save benchmark results to a JSON file, creating parent directories.

        Args:
            output_file: Destination path for the JSON dump.
        """
        output_path = Path(output_file)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # default=str handles non-JSON types (e.g. tensor shapes) by stringifying
        with output_path.open("w") as f:
            json.dump(self.results, f, indent=2, default=str)

        logger.info(f"πŸ“„ Results saved to {output_path}")

    def print_summary(self) -> None:
        """Print a human-readable summary of collected benchmark results."""
        if not self.results:
            logger.warning("No results to summarize")
            return

        print(f"\n{'=' * 60}")
        print(f"Performance Benchmark Summary: {self.model_name}")
        print(f"{'=' * 60}")

        # Model size
        if "size_metrics" in self.results:
            size = self.results["size_metrics"]
            print("\nπŸ“ Model Size:")
            print(f"  Disk Size: {size.get('disk_size_mb', 0):.1f} MB")
            if "parameters_millions" in size:
                print(f"  Parameters: {size['parameters_millions']:.1f}M")
            if "embedding_dim" in size:
                print(f"  Embedding Dim: {size['embedding_dim']}")

        # Speed summary
        if "speed_benchmarks" in self.results:
            speed = self.results["speed_benchmarks"]
            print("\n⚑ Speed (medium texts, batch 32):")
            if "medium" in speed and "batch_32" in speed["medium"]:
                batch_32 = speed["medium"]["batch_32"]
                print(f"  Throughput: {batch_32['texts_per_second']:.1f} texts/sec")
                print(f"  Latency: {batch_32['time_per_text_ms']:.1f} ms/text")
                print(f"  Token Speed: {batch_32['tokens_per_second']:.0f} tokens/sec")

        # CPU vs GPU
        if "cpu_vs_gpu" in self.results:
            comparison = self.results["cpu_vs_gpu"]
            print("\nπŸ–₯️ CPU vs GPU:")
            for device, metrics in comparison.items():
                if "error" not in metrics:
                    print(f"  {device.upper()}: {metrics['texts_per_second']:.1f} texts/sec")

        print()
578
+
579
+
580
def run_benchmark(
    model_path: str | list[str],
    model_name: str | None = None,
    output: str = "benchmark_results.json",
    quick: bool = False,
    compare_models: list[str] | None = None,
) -> None:
    """Run benchmarks for one or more models and write per-model + comparison JSON.

    Args:
        model_path: A single model path/id, or a list of them.
        model_name: Optional display name for the first model; other models
            use their path's final component.
        output: Output file or directory; a path with a suffix is treated as a
            file (its parent becomes the results directory), otherwise as a
            directory.
        quick: If True, run only the fast subset (size + small-batch speed).
        compare_models: Extra models to benchmark alongside for comparison.
    """
    # Copy the caller's list: the original aliased it and then extend() below
    # mutated the caller's argument in place (fix).
    models_to_benchmark = [model_path] if isinstance(model_path, str) else list(model_path)

    if compare_models:
        models_to_benchmark.extend(compare_models)

    all_results = []

    for i, model in enumerate(models_to_benchmark):
        # Fall back to the path's final component when no explicit name was
        # given (fix: the original printed "None" for the first model then).
        current_model_name = (model_name if i == 0 else None) or Path(model).name

        print(f"\n{'=' * 60}")
        print(f"Benchmarking Model {i + 1}/{len(models_to_benchmark)}: {current_model_name}")
        print(f"{'=' * 60}")

        try:
            benchmarker = PerformanceBenchmark(model, current_model_name)

            if quick:
                # Quick benchmark: size metrics plus a few small batch sizes
                benchmarker.load_model()
                benchmarker.measure_model_size()
                benchmarker.benchmark_inference_speed([1, 16, 32])
            else:
                # Comprehensive benchmark
                benchmarker.run_comprehensive_benchmark()

            all_results.append(benchmarker.results)
            benchmarker.print_summary()

        except Exception:
            logger.exception(f"❌ Failed to benchmark {current_model_name}")
            continue

    # Treat `output` as a file path when it has a suffix, else as a directory.
    output_dir = Path(output).parent if Path(output).suffix else Path(output)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save individual results with filesystem-safe names.
    for results in all_results:
        model_name_safe = "".join(c for c in results["model_name"] if c.isalnum() or c in ("-", "_", "."))
        output_path = output_dir / f"benchmark_{model_name_safe}.json"

        with output_path.open("w") as f:
            json.dump(results, f, indent=2, default=str)

        logger.info(f"πŸ“„ Results saved to {output_path}")

    # Create comparison report only when there is something to compare.
    if len(all_results) > 1:
        create_benchmark_comparison(all_results, str(output_dir / "benchmark_comparison.json"))

    print(f"\nβœ… Benchmark complete! Results saved to {output_dir}")
640
+
641
+
642
def create_benchmark_comparison(all_results: list[dict[str, Any]], output_path: str) -> None:
    """Build, print, and persist a cross-model comparison of benchmark results.

    Args:
        all_results: One result dict per model, as produced by
            ``PerformanceBenchmark``.
        output_path: Destination JSON file for the comparison summary.
    """
    banner = "=" * 80
    print(f"\n{banner}")
    print("Performance Benchmark Comparison")
    print(f"{banner}")

    rows: list[dict[str, Any]] = []

    for result in all_results:
        sizes = result.get("size_metrics", {})
        speeds = result.get("speed_benchmarks", {})
        device_runs = result.get("cpu_vs_gpu", {})

        # Core size columns are always present.
        row: dict[str, Any] = {
            "Model": result.get("model_name", "Unknown"),
            "Disk Size (MB)": sizes.get("disk_size_mb", 0),
            "Parameters (M)": sizes.get("parameters_millions", 0),
            "Embedding Dim": sizes.get("embedding_dim", 0),
        }

        # Headline speed columns: medium-length texts at batch size 32.
        batch_metrics = speeds.get("medium", {}).get("batch_32")
        if batch_metrics is not None:
            row["Throughput (texts/sec)"] = batch_metrics.get("texts_per_second", 0)
            row["Latency (ms/text)"] = batch_metrics.get("time_per_text_ms", 0)
            row["Token Speed (tokens/sec)"] = batch_metrics.get("tokens_per_second", 0)

        # Per-device throughput, skipping failed runs.
        for device in ("cpu", "cuda"):
            metrics = device_runs.get(device)
            if metrics is not None and "error" not in metrics:
                row[f"{device.upper()} Speed (texts/sec)"] = metrics.get("texts_per_second", 0)

        rows.append(row)

    table = pd.DataFrame(rows)

    # Fastest models first, when throughput was measured.
    if "Throughput (texts/sec)" in table.columns:
        table = table.sort_values("Throughput (texts/sec)", ascending=False)

    print(table.to_string(index=False, float_format="%.2f"))

    has_rows = len(table) > 0
    comparison_summary = {
        "comparison_table": table.to_dict(orient="records"),
        "summary": {
            "fastest_model": table.iloc[0]["Model"] if has_rows else None,
            "smallest_model": table.loc[table["Disk Size (MB)"].idxmin()]["Model"] if has_rows else None,
            "most_efficient": (
                table.loc[table["Throughput (texts/sec)"].idxmax()]["Model"]
                if "Throughput (texts/sec)" in table.columns and has_rows
                else None
            ),
        },
        "timestamp": time.time(),
    }

    with Path(output_path).open("w") as f:
        json.dump(comparison_summary, f, indent=2, default=str)

    print(f"\nπŸ“Š Comparison saved to {output_path}")
709
+
710
+
711
def save_benchmark_results(
    results: dict[str, Any],
    output_dir: str,
    model_name: str,
    volume_results_dir: Path | None = None,
) -> None:
    """Persist benchmark results as JSON to the Beam volume (if given) and locally.

    Args:
        results: Benchmark results to serialize.
        output_dir: Local directory for the backup copy (created if missing).
        model_name: Model identifier used to build output filenames.
        volume_results_dir: Optional Beam-volume directory for the primary copy.
    """
    # Primary copy on the Beam volume; failures here are non-fatal.
    if volume_results_dir:
        volume_output_path = volume_results_dir / f"benchmark_{model_name}.json"
        try:
            with volume_output_path.open("w") as f:
                json.dump(results, f, indent=2, default=str)
            logger.info(f"πŸ’Ύ Results saved to Beam volume: {volume_output_path}")
        except Exception as e:
            logger.warning(f"⚠️ Failed to save to Beam volume: {e}")

    # Local backup is always written, with a filesystem-safe filename.
    local_dir = Path(output_dir)
    local_dir.mkdir(parents=True, exist_ok=True)

    safe_name = "".join(c for c in model_name if c.isalnum() or c in ("-", "_", "."))
    filepath = local_dir / f"benchmark_{safe_name}.json"

    with filepath.open("w") as f:
        json.dump(results, f, indent=2, default=str)

    logger.info(f"πŸ“„ Local backup saved to {filepath}")
741
+
742
+
743
+ def beam_benchmark_models(
744
+ models: list[str],
745
+ quick: bool = False,
746
+ output_dir: str = DEFAULT_OUTPUT_DIR,
747
+ volume_name: str = VOLUME_NAME,
748
+ mount_path: str = VOLUME_PATH,
749
+ ) -> list[dict[str, Any]]:
750
+ """Main benchmarking function for Beam execution with checkpoint support."""
751
+ logger.info("πŸš€ Starting Beam-powered performance benchmarking")
752
+ logger.info(f"πŸ“Š Benchmarking {len(models)} models")
753
+
754
+ # Initialize Beam utilities
755
+ volume_mgr, checkpoint_mgr, model_mgr, eval_mgr = create_beam_utilities(volume_name, mount_path)
756
+
757
+ # Create benchmark results directory in volume
758
+ results_dir = Path(mount_path) / BENCHMARK_RESULTS_DIR
759
+ results_dir.mkdir(parents=True, exist_ok=True)
760
+
761
+ logger.info(f"πŸ“ Using Beam volume: {volume_name} at {mount_path}")
762
+ logger.info(f"πŸ’Ύ Benchmark results directory: {results_dir}")
763
+
764
+ all_results = []
765
+ skipped_models = []
766
+
767
+ for model_path in models:
768
+ model_name = Path(model_path).name if model_path != str(Path(mount_path)) else "gte_qwen2_m2v_code"
769
+
770
+ # Check if this model has already been benchmarked (except for trained model)
771
+ is_trained_model = model_path == str(Path(mount_path)) or model_name == "gte_qwen2_m2v_code"
772
+
773
+ if not is_trained_model:
774
+ # Check for existing benchmark results
775
+ existing_result_file = results_dir / f"benchmark_{model_name}.json"
776
+ if existing_result_file.exists():
777
+ logger.info(f"βœ… Model {model_name} already benchmarked - loading existing results")
778
+ try:
779
+ with existing_result_file.open("r") as f:
780
+ existing_results = json.load(f)
781
+ all_results.append(existing_results)
782
+ skipped_models.append(model_name)
783
+ continue
784
+ except Exception as e:
785
+ logger.warning(f"⚠️ Failed to load existing results for {model_name}: {e}")
786
+ # Continue with benchmarking if loading fails
787
+
788
+ logger.info(f"\n{'=' * 60}")
789
+ logger.info(f"πŸ” Benchmarking model: {model_name}")
790
+ logger.info(f"πŸ“‚ Path: {model_path}")
791
+ if is_trained_model:
792
+ logger.info("🎯 Trained model - always re-benchmark")
793
+ logger.info(f"{'=' * 60}")
794
+
795
+ try:
796
+ # Distinguish between local paths and HuggingFace model names
797
+ is_huggingface_model = (
798
+ "/" in model_path and not model_path.startswith("/") and not Path(model_path).exists()
799
+ )
800
+
801
+ if is_huggingface_model:
802
+ # This is a HuggingFace model name - pass directly to benchmarker
803
+ logger.info(f"πŸ“₯ Loading HuggingFace model: {model_path}")
804
+ benchmarker = PerformanceBenchmark(
805
+ model_path,
806
+ model_name,
807
+ checkpoint_manager=checkpoint_mgr,
808
+ eval_manager=eval_mgr,
809
+ )
810
+ else:
811
+ # This is a local path - check if it exists in Beam volume
812
+ actual_model_path = model_path # Default to original path
813
+ if not Path(model_path).exists() and not model_path.startswith("/"):
814
+ # Try to load from Beam volume
815
+ local_model_path = Path(mount_path) / model_name
816
+ logger.info(f"πŸ” Trying to load {model_name} from Beam volume: {local_model_path}")
817
+ if local_model_path.exists():
818
+ actual_model_path = str(local_model_path)
819
+ logger.info(f"βœ… Found model in Beam volume: {actual_model_path}")
820
+ else:
821
+ # Try in root of volume (for your trained model)
822
+ root_model_path = Path(mount_path)
823
+ if (root_model_path / "config.json").exists():
824
+ actual_model_path = str(root_model_path)
825
+ logger.info(f"βœ… Found model in Beam volume root: {actual_model_path}")
826
+ else:
827
+ logger.warning(f"⚠️ Model not found locally or in Beam volume: {model_name}")
828
+ continue
829
+
830
+ benchmarker = PerformanceBenchmark(
831
+ actual_model_path,
832
+ model_name,
833
+ checkpoint_manager=checkpoint_mgr,
834
+ eval_manager=eval_mgr,
835
+ )
836
+
837
+ # Run benchmarking
838
+ if quick:
839
+ # Quick benchmark
840
+ benchmarker.load_model()
841
+ benchmarker.measure_model_size()
842
+ benchmarker.benchmark_inference_speed([1, 16, 32])
843
+ else:
844
+ # Comprehensive benchmark
845
+ benchmarker.run_comprehensive_benchmark()
846
+
847
+ # Save results with Beam support
848
+ save_benchmark_results(benchmarker.results, output_dir, model_name, results_dir)
849
+
850
+ # Print summary
851
+ benchmarker.print_summary()
852
+
853
+ all_results.append(benchmarker.results)
854
+
855
+ except Exception:
856
+ logger.exception(f"❌ Failed to benchmark {model_name}")
857
+ continue
858
+
859
+ # Create comparison report in Beam volume
860
+ if len(all_results) > 1:
861
+ comparison_dir = results_dir / "comparisons"
862
+ comparison_dir.mkdir(parents=True, exist_ok=True)
863
+ create_benchmark_comparison(all_results, str(comparison_dir / "benchmark_comparison.json"))
864
+ logger.info(f"πŸ“Š Comparison report saved to Beam volume: {comparison_dir}")
865
+
866
+ # Log summary of what was done
867
+ newly_benchmarked = len(all_results) - len(skipped_models)
868
+ logger.info("\nβœ… Beam benchmarking complete!")
869
+ logger.info(f"πŸ“Š Newly benchmarked: {newly_benchmarked} models")
870
+ logger.info(f"⏭️ Skipped (already done): {len(skipped_models)} models")
871
+ logger.info(f"πŸ“ Total results: {len(all_results)} models")
872
+ logger.info(f"πŸ’Ύ Results available in Beam volume: {volume_name}")
873
+
874
+ if skipped_models:
875
+ logger.info(f"⏭️ Skipped models: {', '.join(skipped_models)}")
876
+
877
+ return all_results
878
+
879
+
880
@function(
    gpu=GPU_NAME,
    volumes=[Volume(name=VOLUME_NAME, mount_path=VOLUME_PATH)],
    image=IMAGE,
    secrets=["HF_ACCESS_TOKEN"],
    env={
        "TOKENIZERS_PARALLELISM": "false",
        "CUDA_LAUNCH_BLOCKING": "0",
    },
    timeout=3600 * 4,  # 4 hours for benchmarking all models
)
def main() -> None:
    """Main benchmarking function - runs all default models on Beam.

    Builds the model list from DEFAULT_BENCHMARK_MODELS (substituting the Beam
    volume root for the "gte_qwen2_m2v_code" placeholder), appends any
    discovered simplified distillation models, then delegates to
    beam_benchmark_models() in quick mode. Results are written to
    BENCHMARK_RESULTS_DIR on the Beam volume.
    """
    logger.info("πŸš€ Starting comprehensive performance benchmarking on Beam")

    # Use default models but replace the local model path with Beam volume path.
    # Copy so the module-level default list is never mutated.
    models = DEFAULT_BENCHMARK_MODELS.copy()

    # Replace "gte_qwen2_m2v_code" with actual Beam volume path
    for i, model in enumerate(models):
        if model == "gte_qwen2_m2v_code":
            models[i] = str(Path(VOLUME_PATH))  # Use the Beam volume root
            logger.info(f"🎯 Using trained model from Beam volume: {models[i]}")

    # Discover simplified distillation models.
    # NOTE(review): searches relative to the container's working directory —
    # confirm the simplified models are shipped into the Beam image/volume.
    logger.info("πŸ” Discovering simplified distillation models...")
    discovered_models = discover_simplified_models(".")

    # Add discovered models
    if discovered_models:
        logger.info(f"βœ… Found {len(discovered_models)} simplified models:")
        for model_path in discovered_models:
            models.append(model_path)
            logger.info(f"  πŸ“ {model_path}")
    else:
        logger.warning("⚠️ No simplified distillation models found")

    logger.info(f"πŸ“Š Benchmarking {len(models)} models:")
    for i, model in enumerate(models, 1):
        logger.info(f"  {i}. {model}")

    logger.info("\nπŸ’‘ Checkpoint Info:")
    logger.info("  - Already benchmarked models will be skipped")
    logger.info("  - Your trained model will always be re-benchmarked")
    logger.info("  - Results are saved persistently to Beam volume")

    # Run comprehensive benchmark using Beam utilities
    results = beam_benchmark_models(
        models=models,
        quick=True,  # Use quick benchmark for efficiency
        output_dir=str(Path(VOLUME_PATH) / BENCHMARK_RESULTS_DIR),
        volume_name=VOLUME_NAME,
        mount_path=VOLUME_PATH,
    )

    # Print final summary (plain print: user-facing CLI output, not log noise)
    print("\n🎯 Benchmarking Summary:")
    print(f"πŸ“Š Total models processed: {len(results)}")
    print(f"πŸ’Ύ Results saved to Beam volume: {VOLUME_NAME}")
    print(f"πŸ“ Directory: {BENCHMARK_RESULTS_DIR}")
    print("\nπŸ” To view analysis:")
    print("  beam run src.distiller.analyze:beam_analysis")
    print("\nπŸ“ˆ To run benchmarks again:")
    print("  distiller benchmark (will skip already completed models)")
946
def discover_simplified_models(base_path: str = ".") -> list[str]:
    """
    Discover all simplified distillation models in the correct directory.

    Looks for directories matching the pattern: ./code_model2vec/final/code_model2vec_*

    Args:
        base_path: Root directory to search under.

    Returns:
        Alphabetically sorted list of model directory paths (as strings).
        Empty when the search root is missing or holds no valid models.
    """
    search_root = Path(base_path) / "code_model2vec" / "final"

    if not search_root.exists():
        logger.warning(f"Models directory not found: {search_root}")
        return []

    # A candidate counts as a model only if it is a directory containing a
    # config.json (the marker written by the distillation step).
    found: list[str] = []
    for candidate in search_root.glob("code_model2vec_*"):
        if candidate.is_dir() and (candidate / "config.json").exists():
            found.append(str(candidate))
            logger.info(f"πŸ” Discovered simplified model: {candidate}")

    # Sort alphabetically for consistent ordering across runs.
    return sorted(found)
972
+
973
+
974
@function(
    gpu=GPU_NAME,
    volumes=[Volume(name=VOLUME_NAME, mount_path=VOLUME_PATH)],
    image=IMAGE,
    secrets=["HF_ACCESS_TOKEN"],
    env={
        "TOKENIZERS_PARALLELISM": "false",
        "CUDA_LAUNCH_BLOCKING": "0",
    },
    timeout=3600 * 3,  # 3 hours for simplified models only
)
def benchmark_simplified_only() -> None:
    """Benchmark only simplified distillation models, skipping 3rd party models.

    Discovers models under ./code_model2vec/final and delegates to
    beam_benchmark_models() in quick mode. Returns early (with an error log)
    when no simplified models exist yet.
    """
    logger.info("πŸš€ Starting simplified distillation models benchmarking on Beam")
    logger.info("⏭️ Skipping 3rd party models - benchmarking only simplified distillation models")

    # Discover simplified distillation models.
    # NOTE(review): searches the container's working directory — confirm the
    # models are present in the Beam image/volume at runtime.
    logger.info("πŸ” Discovering simplified distillation models...")
    discovered_models = discover_simplified_models(".")

    if not discovered_models:
        logger.error("❌ No simplified distillation models found! Run distill-simple first.")
        return

    logger.info(f"βœ… Found {len(discovered_models)} simplified models:")
    for model_path in discovered_models:
        logger.info(f"  πŸ“ {model_path}")

    logger.info("\nπŸ’‘ Checkpoint Info:")
    logger.info("  - Already benchmarked models will be skipped")
    logger.info("  - Results are saved persistently to Beam volume")

    # Run comprehensive benchmark using Beam utilities
    results = beam_benchmark_models(
        models=discovered_models,
        quick=True,  # Use quick benchmark for efficiency
        output_dir=str(Path(VOLUME_PATH) / BENCHMARK_RESULTS_DIR),
        volume_name=VOLUME_NAME,
        mount_path=VOLUME_PATH,
    )

    # Print final summary (plain print: user-facing CLI output)
    print("\n🎯 Simplified Benchmarking Summary:")
    print(f"πŸ“Š Total simplified models processed: {len(results)}")
    print(f"πŸ’Ύ Results saved to Beam volume: {VOLUME_NAME}")
    print(f"πŸ“ Directory: {BENCHMARK_RESULTS_DIR}")
    print("⏭️ 3rd party models were skipped")
    print("\nπŸ” To view analysis:")
    print("  distiller analyze")
    print("\nπŸ“ˆ To run full benchmarks (including 3rd party):")
    print("  distiller benchmark")
1025
+
1026
+
1027
def run_local_benchmark(
    models: list[str] | None = None,
    quick: bool = False,
    output_dir: str = DEFAULT_OUTPUT_DIR,
) -> list[dict[str, Any]]:
    """
    Main benchmarking function for local execution without Beam utilities.

    Args:
        models: Model paths/names to benchmark; defaults to
            DEFAULT_BENCHMARK_MODELS. The caller's list is never mutated.
        quick: If True, run only the fast subset (load, size, inference speed).
        output_dir: Local directory for per-model result JSON files.

    Returns:
        List of result dictionaries (newly benchmarked plus previously cached).
    """
    logger.info("πŸ–₯️ Running performance benchmarking locally")

    # Work on a copy so the caller's list is not mutated, and resolve the
    # trained-model placeholder into a NEW list. (The previous version called
    # models.pop(i) inside `for i, model in enumerate(models)`, which mutates
    # the list being iterated and silently skips the element after the
    # removed one.)
    candidates = DEFAULT_BENCHMARK_MODELS.copy() if models is None else list(models)

    resolved_models: list[str] = []
    for model in candidates:
        if model == "gte_qwen2_m2v_code":
            # Look for the locally trained model in the usual output locations.
            local_model_paths = [
                "./gte_qwen2_m2v_code",
                "./models/gte_qwen2_m2v_code",
                "./output/gte_qwen2_m2v_code",
            ]
            for local_path in local_model_paths:
                if Path(local_path).exists():
                    logger.info(f"🎯 Found local trained model: {local_path}")
                    resolved_models.append(local_path)
                    break
            else:
                logger.warning("⚠️ Local trained model not found, skipping")
        else:
            resolved_models.append(model)
    models = resolved_models

    # Discover simplified distillation models
    logger.info("πŸ” Discovering simplified distillation models...")
    discovered_models = discover_simplified_models(".")

    # Add discovered models
    if discovered_models:
        logger.info(f"βœ… Found {len(discovered_models)} simplified models:")
        for model_path in discovered_models:
            models.append(model_path)
            logger.info(f"  πŸ“ {model_path}")
    else:
        logger.warning("⚠️ No simplified distillation models found")

    logger.info(f"πŸ“Š Benchmarking {len(models)} models")
    logger.info(f"πŸ“ Using local output directory: {output_dir}")

    # Create local output directory
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    all_results = []
    skipped_models = []

    for model_path in models:
        model_name = Path(model_path).name

        # Checkpoint-style resume: skip models that already have a local
        # result file. safe_name mirrors save_benchmark_results' sanitizing.
        safe_name = "".join(c for c in model_name if c.isalnum() or c in ("-", "_", "."))
        result_file = output_path / f"benchmark_{safe_name}.json"

        if result_file.exists():
            logger.info(f"βœ… Model {model_name} already benchmarked - loading existing results")
            try:
                with result_file.open("r") as f:
                    existing_results = json.load(f)
                all_results.append(existing_results)
                skipped_models.append(model_name)
                continue
            except Exception as e:
                # Fall through and re-benchmark when the cached file is unreadable.
                logger.warning(f"⚠️ Failed to load existing results for {model_name}: {e}")

        logger.info(f"\n{'=' * 60}")
        logger.info(f"πŸ” Benchmarking model: {model_name}")
        logger.info(f"πŸ“‚ Path: {model_path}")
        logger.info(f"{'=' * 60}")

        try:
            # Create benchmarker without Beam utilities
            benchmarker = PerformanceBenchmark(
                model_path,
                model_name,
                checkpoint_manager=None,  # No checkpointing for local benchmarking
                eval_manager=None,
            )

            # Run benchmarking
            if quick:
                # Quick benchmark: model load, size, and inference speed only
                benchmarker.load_model()
                benchmarker.measure_model_size()
                benchmarker.benchmark_inference_speed([1, 16, 32])
            else:
                # Comprehensive benchmark
                benchmarker.run_comprehensive_benchmark()

            # Save results locally only (no Beam volume)
            save_benchmark_results(benchmarker.results, output_dir, model_name, volume_results_dir=None)

            # Print summary
            benchmarker.print_summary()

            all_results.append(benchmarker.results)

        except Exception:
            # Best-effort: one broken model must not abort the whole run.
            logger.exception(f"❌ Failed to benchmark {model_name}")
            continue

    # Create comparison report locally
    if len(all_results) > 1:
        create_benchmark_comparison(all_results, str(output_path / "benchmark_comparison.json"))
        logger.info(f"πŸ“Š Comparison report saved locally: {output_dir}")

    # Log summary
    newly_benchmarked = len(all_results) - len(skipped_models)
    logger.info("\nβœ… Local benchmarking complete!")
    logger.info(f"πŸ“Š Newly benchmarked: {newly_benchmarked} models")
    logger.info(f"⏭️ Skipped (already done): {len(skipped_models)} models")
    logger.info(f"πŸ“ Total results: {len(all_results)} models")
    logger.info(f"πŸ’Ύ Results available locally: {output_dir}")

    if skipped_models:
        logger.info(f"⏭️ Skipped models: {', '.join(skipped_models)}")

    return all_results
1152
+
1153
+
1154
def run_local_benchmark_simplified(
    quick: bool = False,
    output_dir: str = DEFAULT_OUTPUT_DIR,
) -> list[dict[str, Any]]:
    """Local benchmarking function for simplified models only.

    Discovers simplified distillation models and forwards them to
    run_local_benchmark(); returns an empty list when none exist.
    """
    logger.info("πŸ–₯️ Running simplified model benchmarking locally")

    # Only simplified distillation models are considered here.
    logger.info("πŸ” Discovering simplified distillation models...")
    simplified = discover_simplified_models(".")

    if not simplified:
        logger.error("❌ No simplified distillation models found! Run 'distiller distiller distill-simple' first.".replace("distiller distiller", "distiller"))
        return []

    logger.info(f"βœ… Found {len(simplified)} simplified models:")
    for model_dir in simplified:
        logger.info(f"  πŸ“ {model_dir}")

    return run_local_benchmark(
        models=simplified,
        quick=quick,
        output_dir=output_dir,
    )
1178
+
1179
+
1180
+ if __name__ == "__main__":
1181
+ main()
src/distiller/distill.py ADDED
@@ -0,0 +1,1306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Code-Specialized Model2Vec Distillation Script with Checkpoint Support.
3
+
4
+ This script implements a focused approach for creating code-specialized embeddings
5
+ using Model2Vec distillation with one additional training round on code-specific tasks.
6
+
7
+ Features:
8
+ - Incremental checkpoint saving
9
+ - Resume from previous progress
10
+ - Persistent storage of embeddings and models
11
+ - Robust error handling and recovery
12
+ - Smart checkpoint validation for parameter compatibility
13
+
14
+ Approach:
15
+ 1. Basic Model2Vec distillation with optimized parameters
16
+ 2. Single code specialization round using sentence-transformers/codesearchnet dataset
17
+ """
18
+
19
+ import json
20
+ import logging
21
+ import os
22
+ import time
23
+ from pathlib import Path
24
+ from typing import Any
25
+
26
+ import numpy as np
27
+ import torch
28
+ from beam import GpuType, Image, Volume, function
29
+ from datasets import load_dataset
30
+ from model2vec.distill import distill
31
+ from model2vec.train.base import FinetunableStaticModel, TextDataset
32
+ from sentence_transformers import SentenceTransformer
33
+ from sklearn.model_selection import train_test_split
34
+ from torch import nn, optim
35
+
36
+ from .beam_utils import (
37
+ BeamCheckpointManager,
38
+ BeamModelManager,
39
+ create_beam_utilities,
40
+ )
41
+
42
+ # =============================================================================
43
+ # CODE-FOCUSED CONFIGURATION
44
+ # =============================================================================
45
+
46
+ # Model Configuration
47
+ MODEL_NAME = "Alibaba-NLP/gte-Qwen2-7B-instruct"
48
+ OUTPUT_DIR = "gte_qwen2_m2v_code"
49
+ CHECKPOINT_DIR = "gte_qwen2_m2v_code/checkpoints"
50
+
51
+ # Code-optimized parameters
52
+ PCA_DIMS = 512 # Higher dims for code complexity
53
+ TRAINING_EPOCHS = 2
54
+ LEARNING_RATE = 1e-4
55
+ BATCH_SIZE = 32
56
+ REGULARIZATION_WEIGHT = 0.01
57
+
58
+ # CodeSearchNet dataset configuration
59
+ CODESEARCHNET_DATASET = "sentence-transformers/codesearchnet"
60
+ MAX_TRAINING_SAMPLES = 50000 # Limit for manageable training time
61
+
62
+ # Checkpoint configuration
63
+ CHECKPOINT_INTERVAL = 1000 # Save every N samples
64
+ EMBEDDINGS_BATCH_SIZE = 100 # Save embeddings in smaller batches
65
+
66
+ # OPTIMIZED TEACHER MODEL CONFIGURATION FOR 40GB VRAM
67
+ TEACHER_MODEL_CONFIG: dict[str, Any] = {
68
+ "batch_size": 12, # Slightly reduced due to float32 memory usage
69
+ "precision": "float32", # Use float32 for quality preservation
70
+ "max_seq_length": 8192, # Reduce from 32k default for better performance
71
+ "device_map": "auto", # Automatic device placement
72
+ "torch_dtype": torch.float32, # Use float32 for quality preservation
73
+ "trust_remote_code": True,
74
+ "use_flash_attention": True, # Try to enable flash attention if available
75
+ "attn_implementation": "flash_attention_2", # Use flash attention 2 if available
76
+ }
77
+
78
+ # =============================================================================
79
+ # BEAM CONFIGURATION
80
+ # =============================================================================
81
+
82
+ GPU_NAME = GpuType.A100_40
83
+ VOLUME_NAME = "gte_qwen2_m2v_code"
84
+ VOLUME_PATH = "./gte_qwen2_m2v_code"
85
+ IMAGE = Image(python_version="python3.12").add_python_packages(
86
+ [
87
+ "torch>=2.7.0", # Install torch first
88
+ "transformers>=4.40.0", # Latest transformers with flash attention support
89
+ "accelerate>=1.7.0",
90
+ "datasets>=3.2.0",
91
+ "model2vec[train]>=0.5.0",
92
+ "numpy>=1.26.4",
93
+ "scikit-learn>=1.6.1",
94
+ "sentence-transformers>=4.1.0",
95
+ ]
96
+ )
97
+
98
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
99
+ logger = logging.getLogger(__name__)
100
+
101
+
102
+ def get_current_config_hash() -> str:
103
+ """Generate a hash of current configuration parameters for checkpoint validation."""
104
+ import hashlib
105
+
106
+ config_params = {
107
+ "model_name": MODEL_NAME,
108
+ "pca_dims": PCA_DIMS,
109
+ "precision": TEACHER_MODEL_CONFIG["precision"],
110
+ "torch_dtype": str(TEACHER_MODEL_CONFIG["torch_dtype"]),
111
+ "max_samples": MAX_TRAINING_SAMPLES,
112
+ "codesearchnet_dataset": CODESEARCHNET_DATASET,
113
+ }
114
+
115
+ config_str = str(sorted(config_params.items()))
116
+ return hashlib.md5(config_str.encode()).hexdigest()[:12] # noqa: S324
117
+
118
+
119
+ def validate_checkpoint_compatibility(checkpoint_data: dict[str, Any]) -> bool:
120
+ """
121
+ Validate if checkpoint is compatible with current configuration.
122
+
123
+ Args:
124
+ checkpoint_data: Checkpoint data dictionary
125
+
126
+ Returns:
127
+ True if compatible, False otherwise
128
+ """
129
+ current_hash = get_current_config_hash()
130
+ checkpoint_hash = checkpoint_data.get("config_hash", "")
131
+
132
+ if checkpoint_hash != current_hash:
133
+ logger.warning(f"Configuration mismatch: current={current_hash}, checkpoint={checkpoint_hash}")
134
+ return False
135
+
136
+ # Additional validation checks
137
+ checkpoint_config = checkpoint_data.get("config", {})
138
+
139
+ # Check critical parameters
140
+ if checkpoint_config.get("pca_dims") != PCA_DIMS:
141
+ logger.warning(f"PCA dimensions mismatch: current={PCA_DIMS}, checkpoint={checkpoint_config.get('pca_dims')}")
142
+ return False
143
+
144
+ if checkpoint_config.get("precision") != TEACHER_MODEL_CONFIG["precision"]:
145
+ logger.warning(
146
+ f"Precision mismatch: current={TEACHER_MODEL_CONFIG['precision']}, checkpoint={checkpoint_config.get('precision')}"
147
+ )
148
+ return False
149
+
150
+ if checkpoint_config.get("max_samples") != MAX_TRAINING_SAMPLES:
151
+ logger.warning(
152
+ f"Max samples mismatch: current={MAX_TRAINING_SAMPLES}, checkpoint={checkpoint_config.get('max_samples')}"
153
+ )
154
+ return False
155
+
156
+ logger.info("βœ… Checkpoint configuration is compatible")
157
+ return True
158
+
159
+
160
+ def create_checkpoint_data(stage: str, data: dict[str, Any], step: int = 0) -> dict[str, Any]:
161
+ """
162
+ Create checkpoint data with configuration metadata.
163
+
164
+ Args:
165
+ stage: Checkpoint stage name
166
+ data: Core checkpoint data
167
+ step: Step number
168
+
169
+ Returns:
170
+ Enhanced checkpoint data with configuration
171
+ """
172
+ return {
173
+ "config_hash": get_current_config_hash(),
174
+ "config": {
175
+ "model_name": MODEL_NAME,
176
+ "pca_dims": PCA_DIMS,
177
+ "precision": TEACHER_MODEL_CONFIG["precision"],
178
+ "torch_dtype": str(TEACHER_MODEL_CONFIG["torch_dtype"]),
179
+ "max_samples": MAX_TRAINING_SAMPLES,
180
+ "codesearchnet_dataset": CODESEARCHNET_DATASET,
181
+ },
182
+ "stage": stage,
183
+ "step": step,
184
+ "timestamp": time.time(),
185
+ "data": data,
186
+ }
187
+
188
+
189
+ def load_codesearchnet_dataset_with_resume(
190
+ max_samples: int = MAX_TRAINING_SAMPLES,
191
+ checkpoint_manager: BeamCheckpointManager | None = None,
192
+ ) -> list[str]:
193
+ """Load and format the sentence-transformers/codesearchnet dataset with resume capability."""
194
+ logger.info(f"Loading CodeSearchNet dataset from {CODESEARCHNET_DATASET}")
195
+ logger.info(f"Limiting to {max_samples} samples for training efficiency")
196
+
197
+ # Check for existing dataset checkpoint with validation
198
+ if checkpoint_manager:
199
+ checkpoint_data = checkpoint_manager.load_checkpoint("dataset", 0)
200
+ if checkpoint_data:
201
+ if validate_checkpoint_compatibility(checkpoint_data):
202
+ texts = checkpoint_data.get("data", {}).get("texts", [])
203
+ if len(texts) >= max_samples:
204
+ logger.info(f"βœ… Resumed dataset loading: {len(texts)} texts from checkpoint")
205
+ return texts[:max_samples]
206
+ logger.info(f"πŸ“‹ Partial dataset found: {len(texts)} texts, continuing from there")
207
+ start_from = len(texts)
208
+ else:
209
+ logger.warning("πŸ”„ Incompatible dataset checkpoint found, starting fresh")
210
+ # Clean up incompatible checkpoint
211
+ checkpoint_manager.cleanup_old_checkpoints("dataset", keep_latest=0)
212
+ texts = []
213
+ start_from = 0
214
+ else:
215
+ texts = []
216
+ start_from = 0
217
+ else:
218
+ texts = []
219
+ start_from = 0
220
+
221
+ try:
222
+ # Load the dataset
223
+ dataset = load_dataset(CODESEARCHNET_DATASET, split="train", streaming=True)
224
+
225
+ # Skip to where we left off
226
+ dataset_iter = iter(dataset)
227
+ for _ in range(start_from):
228
+ try:
229
+ next(dataset_iter)
230
+ except StopIteration:
231
+ break
232
+
233
+ for i, example in enumerate(dataset_iter, start=start_from):
234
+ if len(texts) >= max_samples:
235
+ break
236
+
237
+ comment = example.get("comment", "").strip()
238
+ code = example.get("code", "").strip()
239
+
240
+ if comment and code and len(comment) > 10 and len(code) > 50:
241
+ # Format as comment-code pair for training
242
+ text = f"Comment: {comment}\nCode:\n{code}"
243
+
244
+ # Ensure reasonable length
245
+ if len(text) <= 2048: # Reasonable limit for embedding models
246
+ texts.append(text)
247
+
248
+ # Save checkpoint periodically
249
+ if checkpoint_manager and (i + 1) % CHECKPOINT_INTERVAL == 0:
250
+ checkpoint_data = create_checkpoint_data("dataset", {"texts": texts}, 0)
251
+ checkpoint_manager.save_checkpoint("dataset", checkpoint_data, 0)
252
+ logger.info(f"πŸ’Ύ Saved dataset checkpoint: {len(texts)} texts collected")
253
+
254
+ if (i + 1) % 10000 == 0:
255
+ logger.info(f"Processed {i + 1} examples, collected {len(texts)} valid pairs")
256
+
257
+ # Final checkpoint save
258
+ if checkpoint_manager:
259
+ checkpoint_data = create_checkpoint_data("dataset", {"texts": texts}, 0)
260
+ checkpoint_manager.save_checkpoint("dataset", checkpoint_data, 0)
261
+
262
+ logger.info(f"Successfully loaded {len(texts)} code-comment pairs from CodeSearchNet")
263
+ return texts
264
+
265
+ except Exception:
266
+ logger.exception("Error loading CodeSearchNet dataset")
267
+ return texts # Return what we have so far
268
+
269
+
270
+ def generate_teacher_embeddings_with_checkpoints(
271
+ teacher_model: SentenceTransformer,
272
+ texts: list[str],
273
+ checkpoint_manager: BeamCheckpointManager | None = None,
274
+ ) -> torch.Tensor:
275
+ """Generate teacher embeddings for code training with checkpoint support."""
276
+ logger.info(f"Generating teacher embeddings for {len(texts)} texts...")
277
+
278
+ # Check for existing embeddings checkpoint using torch.save format
279
+ final_embeddings = None
280
+
281
+ if checkpoint_manager:
282
+ # Try to load complete embeddings tensor directly
283
+ embeddings_path = Path(VOLUME_PATH) / "embeddings_cache.pt"
284
+ config_path = Path(VOLUME_PATH) / "embeddings_config.json"
285
+
286
+ if embeddings_path.exists() and config_path.exists():
287
+ try:
288
+ # Load config first to validate compatibility
289
+ with config_path.open("r") as f:
290
+ config_data = json.load(f)
291
+
292
+ # Create a dummy checkpoint data structure for validation
293
+ checkpoint_data = {
294
+ "config_hash": config_data.get("config_hash"),
295
+ "config": config_data.get("config", {}),
296
+ }
297
+
298
+ if validate_checkpoint_compatibility(checkpoint_data):
299
+ # Load the embeddings tensor
300
+ final_embeddings = torch.load(embeddings_path, map_location="cpu")
301
+ num_expected = config_data.get("num_texts", len(texts))
302
+
303
+ if final_embeddings.shape[0] >= num_expected:
304
+ logger.info(
305
+ f"βœ… Loaded complete embeddings from cache ({final_embeddings.shape[0]} embeddings)"
306
+ )
307
+ return final_embeddings[: len(texts)] # Return only the needed amount
308
+ logger.info(
309
+ f"⚠️ Cached embeddings incomplete ({final_embeddings.shape[0]}/{num_expected}), regenerating"
310
+ )
311
+ final_embeddings = None
312
+ else:
313
+ logger.warning("πŸ”„ Incompatible embeddings cache found, regenerating")
314
+ final_embeddings = None
315
+ except Exception as e:
316
+ logger.warning(f"Failed to load embeddings cache: {e}, regenerating...")
317
+ final_embeddings = None
318
+
319
+ # If we have complete embeddings, return them
320
+ if final_embeddings is not None:
321
+ return final_embeddings
322
+
323
+ # Generate embeddings from scratch
324
+ logger.info("Generating fresh teacher embeddings...")
325
+
326
+ # Use optimized batch size for large models with proper type casting
327
+ batch_size_raw = TEACHER_MODEL_CONFIG["batch_size"]
328
+ current_batch_size: int = batch_size_raw if isinstance(batch_size_raw, int) else 16
329
+ logger.info(f"Using optimized batch size: {current_batch_size} for 40GB VRAM (7B model)")
330
+
331
+ embeddings_list = []
332
+
333
+ for i in range(0, len(texts), current_batch_size):
334
+ batch_texts = texts[i : i + current_batch_size]
335
+
336
+ try:
337
+ # Use optimized encoding with convert_to_tensor=True for efficiency
338
+ batch_embeddings = teacher_model.encode(
339
+ batch_texts,
340
+ convert_to_tensor=True,
341
+ batch_size=current_batch_size,
342
+ show_progress_bar=False, # Reduce overhead
343
+ normalize_embeddings=True, # Pre-normalize for efficiency
344
+ )
345
+ embeddings_list.append(batch_embeddings)
346
+
347
+ if i % (current_batch_size * 10) == 0:
348
+ logger.info(f"Generated embeddings for {i + len(batch_texts)}/{len(texts)} texts")
349
+
350
+ except torch.cuda.OutOfMemoryError:
351
+ logger.warning(
352
+ f"GPU OOM with batch size {current_batch_size}, reducing to {max(1, current_batch_size // 2)}"
353
+ )
354
+
355
+ # Clear cache and reduce batch size
356
+ if torch.cuda.is_available():
357
+ torch.cuda.empty_cache()
358
+
359
+ current_batch_size = max(1, current_batch_size // 2)
360
+
361
+ # Retry with smaller batch size
362
+ batch_texts = texts[i : i + current_batch_size]
363
+ batch_embeddings = teacher_model.encode(
364
+ batch_texts,
365
+ convert_to_tensor=True,
366
+ batch_size=current_batch_size,
367
+ show_progress_bar=False,
368
+ normalize_embeddings=True,
369
+ )
370
+ embeddings_list.append(batch_embeddings)
371
+
372
+ logger.info(f"Successfully processed batch with reduced size {current_batch_size}")
373
+
374
+ # Combine all embeddings and force fp32 precision
375
+ teacher_embeddings = torch.cat(embeddings_list, dim=0)
376
+
377
+ # Ensure teacher embeddings are in fp32 for maximum quality
378
+ if teacher_embeddings.dtype != torch.float32:
379
+ logger.info(f"Converting teacher embeddings from {teacher_embeddings.dtype} to fp32")
380
+ teacher_embeddings = teacher_embeddings.to(torch.float32)
381
+
382
+ logger.info(f"Generated {teacher_embeddings.shape[0]} teacher embeddings in {teacher_embeddings.dtype}")
383
+
384
+ # Save embeddings cache using torch.save for future runs
385
+ if checkpoint_manager:
386
+ try:
387
+ embeddings_path = Path(VOLUME_PATH) / "embeddings_cache.pt"
388
+ config_path = Path(VOLUME_PATH) / "embeddings_config.json"
389
+
390
+ # Save embeddings tensor
391
+ torch.save(teacher_embeddings, embeddings_path)
392
+
393
+ # Save configuration
394
+ config_data = {
395
+ "config_hash": get_current_config_hash(),
396
+ "config": {
397
+ "model_name": MODEL_NAME,
398
+ "pca_dims": PCA_DIMS,
399
+ "precision": TEACHER_MODEL_CONFIG["precision"],
400
+ "torch_dtype": str(TEACHER_MODEL_CONFIG["torch_dtype"]),
401
+ "max_samples": MAX_TRAINING_SAMPLES,
402
+ "codesearchnet_dataset": CODESEARCHNET_DATASET,
403
+ },
404
+ "num_texts": len(texts),
405
+ "embedding_shape": list(teacher_embeddings.shape),
406
+ "timestamp": time.time(),
407
+ }
408
+
409
+ with config_path.open("w") as f:
410
+ json.dump(config_data, f, indent=2)
411
+
412
+ logger.info("πŸ’Ύ Saved embeddings cache for future runs")
413
+
414
+ except Exception as e:
415
+ logger.warning(f"Failed to save embeddings cache: {e}")
416
+
417
+ return teacher_embeddings
418
+
419
+
420
+ def refine_with_code_training(
421
+ student_model: Any,
422
+ training_texts: list[str],
423
+ teacher_embeddings: torch.Tensor,
424
+ epochs: int = 2,
425
+ checkpoint_manager: BeamCheckpointManager | None = None,
426
+ model_manager: BeamModelManager | None = None,
427
+ ) -> Any:
428
+ """Refine the student model with code-specific training."""
429
+ logger.info(f"Starting code specialization training for {epochs} epochs...")
430
+
431
+ # Validate input parameters
432
+ if student_model is None:
433
+ logger.error("student_model is None - cannot proceed with code training")
434
+ msg = "student_model cannot be None"
435
+ raise ValueError(msg)
436
+
437
+ if not hasattr(student_model, "embedding"):
438
+ logger.error(f"student_model of type {type(student_model)} does not have 'embedding' attribute")
439
+ msg = f"student_model must have 'embedding' attribute, got {type(student_model)}"
440
+ raise ValueError(msg)
441
+
442
+ logger.info(f"Student model type: {type(student_model)}")
443
+ logger.info(f"Student model embedding shape: {student_model.embedding.shape}")
444
+
445
+ try:
446
+ # Force fp32 precision throughout for maximum quality
447
+ target_dtype = torch.float32
448
+ logger.info("🎯 Enforcing fp32 precision throughout for maximum quality")
449
+
450
+ # Detect student model dtype for logging purposes
451
+ student_dtype = student_model.embedding.dtype
452
+ logger.info(f"Student model original embedding dtype: {student_dtype}")
453
+
454
+ # Force teacher embeddings to fp32 if not already
455
+ if teacher_embeddings.dtype != target_dtype:
456
+ logger.info(f"Converting teacher embeddings from {teacher_embeddings.dtype} to {target_dtype}")
457
+ teacher_embeddings = teacher_embeddings.to(target_dtype)
458
+
459
+ # Get dimensions
460
+ student_embedding_dim = student_model.embedding.shape[1]
461
+ teacher_embedding_dim = teacher_embeddings.shape[1]
462
+
463
+ logger.info(f"Student dims: {student_embedding_dim}, Teacher dims: {teacher_embedding_dim}")
464
+
465
+ # Project teacher embeddings if needed with high-precision PCA
466
+ if teacher_embedding_dim != student_embedding_dim:
467
+ from sklearn.decomposition import PCA
468
+
469
+ logger.info("Performing high-precision PCA projection for quality preservation...")
470
+ pca = PCA(n_components=student_embedding_dim)
471
+
472
+ # Use float64 for PCA computation to maximize precision
473
+ teacher_embeddings_np = teacher_embeddings.cpu().numpy().astype(np.float64)
474
+ teacher_embeddings_projected = pca.fit_transform(teacher_embeddings_np)
475
+
476
+ # Convert back to fp32 (always use fp32, never fp16)
477
+ teacher_embeddings = torch.tensor(
478
+ teacher_embeddings_projected.astype(np.float32),
479
+ dtype=target_dtype,
480
+ )
481
+ logger.info(f"PCA projection completed: {teacher_embeddings.shape} with dtype {target_dtype}")
482
+ logger.info(
483
+ f"PCA preserved variance ratio: {pca.explained_variance_ratio_[:5].sum():.4f} (first 5 components)"
484
+ )
485
+
486
+ # Create trainable model
487
+ trainable_model = FinetunableStaticModel.from_static_model(
488
+ model=student_model,
489
+ out_dim=student_embedding_dim,
490
+ )
491
+
492
+ # Force ALL model parameters to fp32 to ensure no precision loss
493
+ trainable_model = trainable_model.float()
494
+
495
+ # Additional explicit conversion of embedding weights to fp32
496
+ if hasattr(trainable_model, "embeddings") and hasattr(trainable_model.embeddings, "weight"):
497
+ trainable_model.embeddings.weight.data = trainable_model.embeddings.weight.data.to(target_dtype)
498
+
499
+ # Verify final model dtype after model2vec patch fix
500
+ actual_model_dtype = None
501
+ for param in trainable_model.parameters():
502
+ actual_model_dtype = param.dtype
503
+ break
504
+
505
+ logger.info(f"Model parameter dtype: {actual_model_dtype}")
506
+ logger.info(f"Embedding weight dtype: {trainable_model.embeddings.weight.dtype}")
507
+
508
+ # Ensure teacher embeddings are definitely in fp32
509
+ teacher_embeddings = teacher_embeddings.to(target_dtype)
510
+ logger.info(f"Final teacher embeddings dtype: {teacher_embeddings.dtype}")
511
+ logger.info(f"Final model parameter dtype: {actual_model_dtype}")
512
+
513
+ # Verify we're using fp32 throughout
514
+ if teacher_embeddings.dtype != target_dtype:
515
+ logger.warning(f"⚠️ Teacher embeddings not in {target_dtype}: {teacher_embeddings.dtype}")
516
+ if actual_model_dtype != target_dtype:
517
+ logger.warning(f"⚠️ Model parameters not in {target_dtype}: {actual_model_dtype}")
518
+
519
+ logger.info("βœ… Confirmed fp32 precision throughout the training pipeline")
520
+
521
+ # Tokenize texts
522
+ tokenized_texts = []
523
+ for text in training_texts:
524
+ tokens = trainable_model.tokenize([text])
525
+ if tokens.shape[1] > 0:
526
+ tokenized_texts.append(tokens[0].tolist())
527
+
528
+ # Prepare training data with explicit fp32 casting
529
+ targets = teacher_embeddings[: len(tokenized_texts)]
530
+
531
+ # Force targets to fp32 to maintain maximum precision
532
+ targets = targets.to(target_dtype)
533
+ logger.info(f"Cast targets to fp32: {targets.dtype}")
534
+
535
+ train_texts, val_texts, train_targets, val_targets = train_test_split(
536
+ tokenized_texts, targets, test_size=0.2, random_state=42
537
+ )
538
+
539
+ logger.info(f"Train targets dtype: {train_targets.dtype}")
540
+ logger.info(f"Val targets dtype: {val_targets.dtype}")
541
+
542
+ # Training setup
543
+ train_dataset = TextDataset(train_texts, train_targets)
544
+ val_dataset = TextDataset(val_texts, val_targets)
545
+
546
+ optimizer = optim.Adam(trainable_model.parameters(), lr=LEARNING_RATE)
547
+ mse_loss = nn.MSELoss()
548
+
549
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
550
+
551
+ try:
552
+ trainable_model = trainable_model.to(device)
553
+ logger.info(f"Training on {device}")
554
+ except torch.cuda.OutOfMemoryError:
555
+ logger.warning("GPU OOM loading training model, using CPU")
556
+ device = torch.device("cpu")
557
+ trainable_model = trainable_model.to(device)
558
+ if torch.cuda.is_available():
559
+ torch.cuda.empty_cache()
560
+
561
+ # Adaptive batch size for training
562
+ adaptive_batch_size = BATCH_SIZE
563
+
564
+ # Quality monitoring: compute embedding similarity before training
565
+ logger.info("πŸ” Quality monitoring: Computing pre-training teacher-student similarity...")
566
+ trainable_model.eval()
567
+ with torch.no_grad():
568
+ # Take a small sample of texts for quality measurement
569
+ sample_texts = training_texts[: min(5, len(training_texts))]
570
+ sample_tokens = trainable_model.tokenize(sample_texts)
571
+ sample_tokens = sample_tokens.to(device)
572
+
573
+ _, student_embeddings_before = trainable_model(sample_tokens)
574
+ sample_teacher_embeddings = targets[: len(sample_texts)].to(device)
575
+
576
+ # Compute average cosine similarity
577
+ similarities_before = []
578
+ for i in range(len(sample_texts)):
579
+ sim = torch.cosine_similarity(
580
+ student_embeddings_before[i].unsqueeze(0),
581
+ sample_teacher_embeddings[i].unsqueeze(0),
582
+ ).item()
583
+ similarities_before.append(sim)
584
+
585
+ avg_similarity_before = np.mean(similarities_before)
586
+ logger.info(f"πŸ“Š Pre-training average teacher-student similarity: {avg_similarity_before:.4f}")
587
+
588
+ # Training loop with validation
589
+ for epoch in range(epochs):
590
+ # Training phase
591
+ trainable_model.train()
592
+
593
+ # Try with current batch size, reduce if OOM
594
+ train_successful = False
595
+ while not train_successful and adaptive_batch_size >= 1:
596
+ try:
597
+ train_loader = train_dataset.to_dataloader(shuffle=True, batch_size=adaptive_batch_size)
598
+
599
+ epoch_loss = 0.0
600
+ num_batches = 0
601
+
602
+ for batch_idx, (tokens, targets_batch) in enumerate(train_loader):
603
+ batch_tokens = tokens.to(device)
604
+ batch_targets = targets_batch.to(device)
605
+
606
+ optimizer.zero_grad()
607
+ _, student_embeddings = trainable_model(batch_tokens)
608
+
609
+ # Debug dtype information on first batch
610
+ if batch_idx == 0:
611
+ logger.info(
612
+ f"Batch {batch_idx}: tokens shape {batch_tokens.shape}, dtype {batch_tokens.dtype}"
613
+ )
614
+ logger.info(
615
+ f"Batch {batch_idx}: targets shape {batch_targets.shape}, dtype {batch_targets.dtype}"
616
+ )
617
+ logger.info(
618
+ f"Batch {batch_idx}: student_embeddings shape {student_embeddings.shape}, dtype {student_embeddings.dtype}"
619
+ )
620
+
621
+ # Force both tensors to fp32 to avoid any precision loss
622
+ if student_embeddings.dtype != target_dtype:
623
+ logger.warning(
624
+ f"Student embeddings not in fp32: {student_embeddings.dtype}, converting to fp32"
625
+ )
626
+ student_embeddings = student_embeddings.to(target_dtype)
627
+ if batch_targets.dtype != target_dtype:
628
+ logger.info(f"Converting targets from {batch_targets.dtype} to fp32")
629
+ batch_targets = batch_targets.to(target_dtype)
630
+
631
+ try:
632
+ loss = mse_loss(student_embeddings, batch_targets)
633
+ loss.backward()
634
+ optimizer.step()
635
+ except RuntimeError as e:
636
+ if "expected scalar type" in str(e):
637
+ logger.exception("Dtype mismatch error occurred:")
638
+ logger.exception(
639
+ f"student_embeddings: {student_embeddings.shape}, {student_embeddings.dtype}"
640
+ )
641
+ logger.exception(f"batch_targets: {batch_targets.shape}, {batch_targets.dtype}")
642
+ logger.exception(
643
+ f"MSE loss input dtypes: {student_embeddings.dtype} vs {batch_targets.dtype}"
644
+ )
645
+ # Force explicit casting to fp32 for maximum precision
646
+ batch_targets = batch_targets.to(target_dtype)
647
+ student_embeddings = student_embeddings.to(target_dtype)
648
+ logger.info("Emergency dtype fix: forced both to fp32")
649
+ loss = mse_loss(student_embeddings, batch_targets)
650
+ loss.backward()
651
+ optimizer.step()
652
+ else:
653
+ raise
654
+
655
+ epoch_loss += loss.item()
656
+ num_batches += 1
657
+
658
+ # Save training checkpoint periodically
659
+ if checkpoint_manager and batch_idx % 100 == 0:
660
+ training_state = {
661
+ "epoch": epoch,
662
+ "batch": batch_idx,
663
+ "model_state": trainable_model.state_dict(),
664
+ "optimizer_state": optimizer.state_dict(),
665
+ "loss": epoch_loss / max(1, num_batches),
666
+ }
667
+ checkpoint_data = create_checkpoint_data("training", training_state, epoch)
668
+ checkpoint_manager.save_checkpoint("training", checkpoint_data, epoch)
669
+
670
+ train_successful = True
671
+
672
+ except torch.cuda.OutOfMemoryError:
673
+ logger.warning(
674
+ f"Training OOM with batch size {adaptive_batch_size}, reducing to {adaptive_batch_size // 2}"
675
+ )
676
+ adaptive_batch_size = max(1, adaptive_batch_size // 2)
677
+ if torch.cuda.is_available():
678
+ torch.cuda.empty_cache()
679
+
680
+ if not train_successful:
681
+ logger.error("Unable to train even with batch size 1, skipping training")
682
+ break
683
+
684
+ avg_train_loss = epoch_loss / num_batches if num_batches > 0 else 0.0
685
+
686
+ # Validation phase
687
+ trainable_model.eval()
688
+ val_loader = val_dataset.to_dataloader(shuffle=False, batch_size=adaptive_batch_size)
689
+ val_loss = 0.0
690
+ val_batches = 0
691
+
692
+ with torch.no_grad():
693
+ for tokens, targets_batch in val_loader:
694
+ batch_tokens = tokens.to(device)
695
+ batch_targets = targets_batch.to(device)
696
+
697
+ _, student_embeddings = trainable_model(batch_tokens)
698
+
699
+ # Force both tensors to fp32 to avoid any precision loss in validation
700
+ if student_embeddings.dtype != target_dtype:
701
+ student_embeddings = student_embeddings.to(target_dtype)
702
+ if batch_targets.dtype != target_dtype:
703
+ batch_targets = batch_targets.to(target_dtype)
704
+
705
+ loss = mse_loss(student_embeddings, batch_targets)
706
+ val_loss += loss.item()
707
+ val_batches += 1
708
+
709
+ avg_val_loss = val_loss / val_batches if val_batches > 0 else 0.0
710
+
711
+ logger.info(
712
+ f"Epoch {epoch + 1}/{epochs} - Train Loss: {avg_train_loss:.6f}, Val Loss: {avg_val_loss:.6f}, Batch Size: {adaptive_batch_size}"
713
+ )
714
+
715
+ # Save epoch checkpoint
716
+ if checkpoint_manager:
717
+ epoch_state = {
718
+ "epoch": epoch + 1,
719
+ "model_state": trainable_model.state_dict(),
720
+ "optimizer_state": optimizer.state_dict(),
721
+ "train_loss": avg_train_loss,
722
+ "val_loss": avg_val_loss,
723
+ }
724
+ checkpoint_data = create_checkpoint_data("epoch", epoch_state, epoch + 1)
725
+ checkpoint_manager.save_checkpoint("epoch", checkpoint_data, epoch + 1)
726
+
727
+ # Quality monitoring: compute embedding similarity after training
728
+ logger.info("πŸ” Quality monitoring: Computing post-training teacher-student similarity...")
729
+ trainable_model.eval()
730
+ with torch.no_grad():
731
+ # Use the same sample texts as before
732
+ sample_texts = training_texts[: min(5, len(training_texts))]
733
+ sample_tokens = trainable_model.tokenize(sample_texts)
734
+ sample_tokens = sample_tokens.to(device)
735
+
736
+ _, student_embeddings_after = trainable_model(sample_tokens)
737
+ sample_teacher_embeddings = targets[: len(sample_texts)].to(device)
738
+
739
+ # Compute average cosine similarity
740
+ similarities_after = []
741
+ for i in range(len(sample_texts)):
742
+ sim = torch.cosine_similarity(
743
+ student_embeddings_after[i].unsqueeze(0),
744
+ sample_teacher_embeddings[i].unsqueeze(0),
745
+ ).item()
746
+ similarities_after.append(sim)
747
+
748
+ avg_similarity_after = np.mean(similarities_after)
749
+ logger.info(f"πŸ“Š Post-training average teacher-student similarity: {avg_similarity_after:.4f}")
750
+
751
+ # Quality assessment
752
+ quality_change = avg_similarity_after - avg_similarity_before
753
+ logger.info(f"πŸ“ˆ Quality change: {quality_change:+.4f}")
754
+
755
+ if abs(quality_change) < 0.01:
756
+ logger.info("βœ… Quality well preserved during training!")
757
+ elif quality_change > 0:
758
+ logger.info("βœ… Quality improved during training!")
759
+ else:
760
+ logger.warning(f"⚠️ Quality degraded by {abs(quality_change):.4f} during training")
761
+
762
+ # Convert back to static model
763
+ refined_model = trainable_model.to_static_model()
764
+
765
+ # Save final refined model to beam volume
766
+ if model_manager:
767
+ # Save to temporary local directory first
768
+ temp_refined_path = Path("./temp_refined_save")
769
+ temp_refined_path.mkdir(exist_ok=True)
770
+ refined_model.save_pretrained(str(temp_refined_path))
771
+
772
+ # Upload to beam volume
773
+ model_manager.save_model("refined_model", str(temp_refined_path))
774
+
775
+ # Clean up temp directory
776
+ import shutil
777
+
778
+ shutil.rmtree(temp_refined_path, ignore_errors=True)
779
+
780
+ logger.info("πŸ’Ύ Saved refined model to beam volume")
781
+
782
+ logger.info("Code specialization training completed")
783
+ return refined_model
784
+
785
+ except Exception as e:
786
+ logger.warning(f"Code training failed: {e}")
787
+ return student_model
788
+
789
+
790
+ def apply_regularization(model: Any, weight: float = 0.01) -> Any:
791
+ """Apply light regularization with overflow protection."""
792
+ # Validate input
793
+ if model is None:
794
+ logger.error("Cannot apply regularization: model is None")
795
+ msg = "model cannot be None"
796
+ raise ValueError(msg)
797
+
798
+ if not hasattr(model, "embedding"):
799
+ logger.error(f"Cannot apply regularization: model of type {type(model)} does not have 'embedding' attribute")
800
+ msg = f"model must have 'embedding' attribute, got {type(model)}"
801
+ raise ValueError(msg)
802
+
803
+ logger.info(f"Applying regularization to model of type: {type(model)}")
804
+
805
+ try:
806
+ embeddings = model.embedding.copy()
807
+
808
+ # Check for extreme values and clip if necessary
809
+ max_val = np.abs(embeddings).max()
810
+ if max_val > 1e6: # Clip extremely large values
811
+ logger.warning(f"Large embedding values detected (max: {max_val:.2e}), clipping to prevent overflow")
812
+ embeddings = np.clip(embeddings, -1e6, 1e6)
813
+
814
+ # Apply regularization
815
+ regularized_embeddings = embeddings * (1.0 - weight)
816
+
817
+ # Stable normalization to prevent overflow
818
+ norms = np.linalg.norm(regularized_embeddings, axis=1, keepdims=True)
819
+
820
+ # Handle zero norms and potential overflow
821
+ norms = np.where(norms == 0, 1, norms)
822
+ norms = np.where(norms > 1e6, 1e6, norms) # Prevent extremely large norms
823
+
824
+ regularized_embeddings = regularized_embeddings / norms
825
+
826
+ # Create new model
827
+ from model2vec.model import StaticModel
828
+
829
+ regularized_model = StaticModel(
830
+ vectors=regularized_embeddings,
831
+ tokenizer=model.tokenizer,
832
+ config=model.config,
833
+ base_model_name=model.base_model_name,
834
+ language=model.language,
835
+ normalize=True,
836
+ )
837
+
838
+ logger.info("Regularization applied successfully")
839
+ return regularized_model
840
+
841
+ except Exception as e:
842
+ logger.warning(f"Regularization failed: {e}")
843
+ return model
844
+
845
+
846
+ def load_teacher_model_with_cache(
847
+ model_name: str,
848
+ output_dir: str,
849
+ device: str = "cuda",
850
+ resume: bool = True,
851
+ ) -> SentenceTransformer:
852
+ """Load teacher model with local caching to avoid re-downloading."""
853
+ cache_dir = Path(output_dir) / "teacher_model_cache"
854
+
855
+ # Check if cached model exists
856
+ if resume and cache_dir.exists():
857
+ try:
858
+ logger.info(f"Loading cached teacher model from {cache_dir}")
859
+ teacher_model = SentenceTransformer(str(cache_dir), device=device)
860
+
861
+ # Set optimized sequence length
862
+ max_seq_len = TEACHER_MODEL_CONFIG.get("max_seq_length", 8192)
863
+ if isinstance(max_seq_len, int):
864
+ teacher_model.max_seq_length = max_seq_len
865
+
866
+ logger.info("Successfully loaded cached teacher model")
867
+ return teacher_model
868
+ except Exception as e:
869
+ logger.warning(f"Failed to load cached teacher model: {e}")
870
+ logger.info("Will download fresh model")
871
+
872
+ # Download and cache the model
873
+ logger.info(f"Downloading teacher model {model_name} (this may take a while)")
874
+
875
+ # Prepare model kwargs with flash attention
876
+ model_kwargs = {
877
+ "torch_dtype": TEACHER_MODEL_CONFIG["torch_dtype"],
878
+ "device_map": TEACHER_MODEL_CONFIG["device_map"],
879
+ }
880
+
881
+ # Try to add flash attention if available
882
+ if TEACHER_MODEL_CONFIG.get("use_flash_attention", False):
883
+ try:
884
+ model_kwargs["attn_implementation"] = TEACHER_MODEL_CONFIG["attn_implementation"]
885
+ logger.info("Flash Attention 2 enabled")
886
+ except Exception as e:
887
+ logger.warning(f"Flash Attention not available, using default attention: {e}")
888
+
889
+ try:
890
+ teacher_model = SentenceTransformer(
891
+ model_name,
892
+ device=device,
893
+ trust_remote_code=bool(TEACHER_MODEL_CONFIG["trust_remote_code"]),
894
+ model_kwargs=model_kwargs,
895
+ )
896
+ except ImportError as e:
897
+ if "flash_attn" in str(e):
898
+ logger.warning("Flash Attention 2 not available, falling back to default attention")
899
+ # Remove flash attention from model_kwargs and retry
900
+ model_kwargs_fallback = {k: v for k, v in model_kwargs.items() if k != "attn_implementation"}
901
+ teacher_model = SentenceTransformer(
902
+ model_name,
903
+ device=device,
904
+ trust_remote_code=bool(TEACHER_MODEL_CONFIG["trust_remote_code"]),
905
+ model_kwargs=model_kwargs_fallback,
906
+ )
907
+ else:
908
+ raise
909
+
910
+ # Set optimized sequence length
911
+ max_seq_len = TEACHER_MODEL_CONFIG.get("max_seq_length", 8192)
912
+ if isinstance(max_seq_len, int):
913
+ teacher_model.max_seq_length = max_seq_len
914
+ logger.info(f"Set max_seq_length to {max_seq_len} for better performance")
915
+
916
+ # Cache the model for future use
917
+ try:
918
+ cache_dir.mkdir(parents=True, exist_ok=True)
919
+ teacher_model.save(str(cache_dir))
920
+ logger.info(f"Cached teacher model to {cache_dir}")
921
+ except Exception as e:
922
+ logger.warning(f"Failed to cache teacher model: {e}")
923
+ # Continue without caching
924
+
925
+ return teacher_model
926
+
927
+
928
+ def code_specialized_distillation(
929
+ model_name: str = MODEL_NAME,
930
+ output_dir: str = OUTPUT_DIR,
931
+ pca_dims: int = PCA_DIMS,
932
+ max_samples: int = MAX_TRAINING_SAMPLES,
933
+ resume: bool = True,
934
+ ) -> Any:
935
+ """Main code-specialized distillation function using CodeSearchNet dataset with checkpoint support."""
936
+ output_path = Path(output_dir)
937
+ output_path.mkdir(parents=True, exist_ok=True)
938
+
939
+ # Initialize Beam utilities
940
+ volume_mgr, checkpoint_mgr, model_mgr, eval_mgr = create_beam_utilities(VOLUME_NAME, VOLUME_PATH)
941
+
942
+ logger.info(f"Starting code-specialized distillation of {model_name}")
943
+ logger.info(f"Using CodeSearchNet dataset: {CODESEARCHNET_DATASET}")
944
+ logger.info(f"Resume mode: {resume}")
945
+
946
+ # GPU Diagnostics
947
+ logger.info("=== GPU DIAGNOSTICS ===")
948
+ logger.info(f"CUDA available: {torch.cuda.is_available()}")
949
+ if torch.cuda.is_available():
950
+ logger.info(f"CUDA version: {torch.version.cuda}")
951
+ logger.info(f"GPU count: {torch.cuda.device_count()}")
952
+ for i in range(torch.cuda.device_count()):
953
+ gpu_name = torch.cuda.get_device_name(i)
954
+ gpu_memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
955
+ logger.info(f"GPU {i}: {gpu_name} ({gpu_memory:.1f} GB)")
956
+
957
+ # Current GPU memory
958
+ current_device = torch.cuda.current_device()
959
+ allocated = torch.cuda.memory_allocated(current_device) / 1024**3
960
+ total = torch.cuda.get_device_properties(current_device).total_memory / 1024**3
961
+ logger.info(f"Current GPU {current_device}: {allocated:.2f}GB allocated, {total:.1f}GB total")
962
+ else:
963
+ logger.warning("CUDA not available - will use CPU (much slower)")
964
+ logger.info("======================")
965
+
966
+ start_time = time.time()
967
+
968
+ # Step 1: Basic Model2Vec distillation with checkpoint support
969
+ logger.info("Step 1: Basic Model2Vec distillation...")
970
+
971
+ # Check for existing distilled model in beam volume
972
+ m2v_model = None
973
+ if resume:
974
+ # Check if model files exist directly in the volume root
975
+ try:
976
+ # Try to load from the volume root where the model was successfully saved
977
+ volume_root_path = Path(VOLUME_PATH)
978
+ if (volume_root_path / "config.json").exists() and (volume_root_path / "model.safetensors").exists():
979
+ logger.info("βœ… Found existing model files in volume root")
980
+ from model2vec.model import StaticModel
981
+
982
+ m2v_model = StaticModel.from_pretrained(str(volume_root_path))
983
+ logger.info("βœ… Successfully loaded existing distilled model from volume")
984
+ else:
985
+ logger.info("No existing model files found in volume root")
986
+ except Exception as e:
987
+ logger.warning(f"Failed to load existing model from volume: {e}")
988
+ m2v_model = None
989
+
990
+ if m2v_model is None:
991
+ # Clear GPU cache before starting
992
+ if torch.cuda.is_available():
993
+ torch.cuda.empty_cache()
994
+ current_device = torch.cuda.current_device()
995
+ allocated = torch.cuda.memory_allocated(current_device) / 1024**3
996
+ total = torch.cuda.get_device_properties(current_device).total_memory / 1024**3
997
+ logger.info(f"GPU memory before distillation: {allocated:.2f}GB allocated / {total:.1f}GB total")
998
+ else:
999
+ logger.info("Using CPU for distillation")
1000
+
1001
+ try:
1002
+ m2v_model = distill(
1003
+ model_name=model_name,
1004
+ pca_dims=pca_dims,
1005
+ apply_zipf=None,
1006
+ sif_coefficient=1e-4,
1007
+ trust_remote_code=True,
1008
+ )
1009
+ logger.info("Basic distillation completed with preserved precision")
1010
+
1011
+ # Validate the distilled model
1012
+ if m2v_model is None:
1013
+ msg = "Distillation returned None - this should not happen"
1014
+ raise ValueError(msg) from None
1015
+
1016
+ logger.info(f"Distilled model type: {type(m2v_model)}")
1017
+ logger.info(f"Distilled model has embedding attribute: {hasattr(m2v_model, 'embedding')}")
1018
+
1019
+ # Save the base distilled model - DISABLED due to recursive directory bug
1020
+ # model_mgr.save_model("base_distilled_model", str(output_path))
1021
+
1022
+ except torch.cuda.OutOfMemoryError:
1023
+ logger.warning("GPU OOM during distillation, clearing cache and retrying...")
1024
+ torch.cuda.empty_cache()
1025
+
1026
+ # Force CPU-only distillation if GPU fails
1027
+ os.environ["CUDA_VISIBLE_DEVICES"] = ""
1028
+
1029
+ logger.info("Retrying distillation on CPU...")
1030
+ m2v_model = distill(
1031
+ model_name=model_name,
1032
+ pca_dims=pca_dims,
1033
+ apply_zipf=None,
1034
+ sif_coefficient=1e-4,
1035
+ trust_remote_code=True,
1036
+ )
1037
+ logger.info("Basic distillation completed on CPU")
1038
+
1039
+ # Validate the distilled model
1040
+ if m2v_model is None:
1041
+ msg = "CPU distillation returned None - this should not happen"
1042
+ raise ValueError(msg) from None
1043
+
1044
+ logger.info(f"CPU distilled model type: {type(m2v_model)}")
1045
+ logger.info(f"CPU distilled model has embedding attribute: {hasattr(m2v_model, 'embedding')}")
1046
+
1047
+ # Save the base distilled model - DISABLED due to recursive directory bug
1048
+ # model_mgr.save_model("base_distilled_model", str(output_path))
1049
+
1050
+ except Exception:
1051
+ logger.exception("Distillation failed with error")
1052
+ raise
1053
+
1054
+ # Validate m2v_model before proceeding
1055
+ if m2v_model is None:
1056
+ msg = "m2v_model is None after distillation step - cannot proceed"
1057
+ raise ValueError(msg)
1058
+
1059
+ # Step 2: Load CodeSearchNet training data with resume
1060
+ logger.info("Step 2: Loading CodeSearchNet training data...")
1061
+ code_texts = load_codesearchnet_dataset_with_resume(max_samples, checkpoint_mgr)
1062
+
1063
+ if not code_texts:
1064
+ logger.warning("No code training data available, skipping code specialization")
1065
+ else:
1066
+ logger.info("Step 3: Code specialization training...")
1067
+
1068
+ # Check for existing refined model
1069
+ if resume:
1070
+ # Check if refined model exists in beam volume
1071
+ models = model_mgr.list_models()
1072
+ refined_model_exists = any(model["name"] == "refined_model" for model in models)
1073
+
1074
+ if refined_model_exists:
1075
+ # Download model to local path for loading
1076
+ temp_model_path = Path("./temp_refined_model")
1077
+ if model_mgr.load_model("refined_model", temp_model_path):
1078
+ try:
1079
+ from model2vec.model import StaticModel
1080
+
1081
+ refined_model = StaticModel.from_pretrained(str(temp_model_path / "refined_model"))
1082
+ logger.info("βœ… Resumed from existing refined model")
1083
+ m2v_model = refined_model
1084
+ # Clean up temp directory
1085
+ import shutil
1086
+
1087
+ shutil.rmtree(temp_model_path, ignore_errors=True)
1088
+ except Exception as e:
1089
+ logger.warning(f"Failed to load existing refined model: {e}")
1090
+ refined_model = None
1091
+ # Clean up temp directory
1092
+ import shutil
1093
+
1094
+ shutil.rmtree(temp_model_path, ignore_errors=True)
1095
+ else:
1096
+ refined_model = None
1097
+ else:
1098
+ refined_model = None
1099
+
1100
+ if refined_model is None:
1101
+ # Load teacher model with memory management
1102
+ try:
1103
+ device = "cuda" if torch.cuda.is_available() else "cpu"
1104
+ logger.info(f"Loading teacher model on {device} with optimized settings")
1105
+ logger.info(
1106
+ f"Using precision: {TEACHER_MODEL_CONFIG['precision']}, batch_size: {TEACHER_MODEL_CONFIG['batch_size']}"
1107
+ )
1108
+ logger.info("Attempting to enable Flash Attention 2 for maximum performance")
1109
+
1110
+ teacher_model = load_teacher_model_with_cache(model_name, output_dir, device=device, resume=resume)
1111
+
1112
+ # Generate teacher embeddings with checkpoints
1113
+ teacher_embeddings = generate_teacher_embeddings_with_checkpoints(
1114
+ teacher_model, code_texts, checkpoint_mgr
1115
+ )
1116
+
1117
+ # Refine with code training
1118
+ m2v_model = refine_with_code_training(
1119
+ m2v_model,
1120
+ code_texts,
1121
+ teacher_embeddings,
1122
+ epochs=TRAINING_EPOCHS,
1123
+ checkpoint_manager=checkpoint_mgr,
1124
+ model_manager=model_mgr,
1125
+ )
1126
+
1127
+ del teacher_model
1128
+ if torch.cuda.is_available():
1129
+ torch.cuda.empty_cache()
1130
+
1131
+ except torch.cuda.OutOfMemoryError:
1132
+ logger.warning("GPU OOM during code training, falling back to CPU...")
1133
+
1134
+ if torch.cuda.is_available():
1135
+ torch.cuda.empty_cache()
1136
+
1137
+ # Force CPU for teacher model with optimized settings (no flash attention on CPU)
1138
+ try:
1139
+ teacher_model = load_teacher_model_with_cache(
1140
+ model_name, output_dir, device="cpu", resume=resume
1141
+ )
1142
+ except ImportError as e:
1143
+ if "flash_attn" in str(e):
1144
+ logger.warning("Flash Attention 2 not available on CPU, using default attention")
1145
+ # Fallback without any special attention implementation
1146
+ teacher_model = load_teacher_model_with_cache(
1147
+ model_name, output_dir, device="cpu", resume=resume
1148
+ )
1149
+ else:
1150
+ raise
1151
+
1152
+ # Generate teacher embeddings on CPU with checkpoints
1153
+ teacher_embeddings = generate_teacher_embeddings_with_checkpoints(
1154
+ teacher_model, code_texts, checkpoint_mgr
1155
+ )
1156
+
1157
+ # Refine with code training on CPU
1158
+ m2v_model = refine_with_code_training(
1159
+ m2v_model,
1160
+ code_texts,
1161
+ teacher_embeddings,
1162
+ epochs=TRAINING_EPOCHS,
1163
+ checkpoint_manager=checkpoint_mgr,
1164
+ model_manager=model_mgr,
1165
+ )
1166
+
1167
+ del teacher_model
1168
+ else:
1169
+ # Fresh training without resume
1170
+ try:
1171
+ device = "cuda" if torch.cuda.is_available() else "cpu"
1172
+ logger.info(f"Loading teacher model on {device} with optimized settings")
1173
+ logger.info(
1174
+ f"Using precision: {TEACHER_MODEL_CONFIG['precision']}, batch_size: {TEACHER_MODEL_CONFIG['batch_size']}"
1175
+ )
1176
+ logger.info("Attempting to enable Flash Attention 2 for maximum performance")
1177
+
1178
+ teacher_model = load_teacher_model_with_cache(model_name, output_dir, device=device, resume=resume)
1179
+
1180
+ # Generate teacher embeddings with checkpoints
1181
+ teacher_embeddings = generate_teacher_embeddings_with_checkpoints(
1182
+ teacher_model, code_texts, checkpoint_mgr
1183
+ )
1184
+
1185
+ # Refine with code training
1186
+ m2v_model = refine_with_code_training(
1187
+ m2v_model,
1188
+ code_texts,
1189
+ teacher_embeddings,
1190
+ epochs=TRAINING_EPOCHS,
1191
+ checkpoint_manager=checkpoint_mgr,
1192
+ model_manager=model_mgr,
1193
+ )
1194
+
1195
+ del teacher_model
1196
+ if torch.cuda.is_available():
1197
+ torch.cuda.empty_cache()
1198
+
1199
+ except torch.cuda.OutOfMemoryError:
1200
+ logger.warning("GPU OOM during code training, falling back to CPU...")
1201
+
1202
+ if torch.cuda.is_available():
1203
+ torch.cuda.empty_cache()
1204
+
1205
+ # Force CPU for teacher model with optimized settings (no flash attention on CPU)
1206
+ try:
1207
+ teacher_model = load_teacher_model_with_cache(model_name, output_dir, device="cpu", resume=resume)
1208
+ except ImportError as e:
1209
+ if "flash_attn" in str(e):
1210
+ logger.warning("Flash Attention 2 not available on CPU, using default attention")
1211
+ # Fallback without any special attention implementation
1212
+ teacher_model = load_teacher_model_with_cache(
1213
+ model_name, output_dir, device="cpu", resume=resume
1214
+ )
1215
+ else:
1216
+ raise
1217
+
1218
+ # Generate teacher embeddings on CPU with checkpoints
1219
+ teacher_embeddings = generate_teacher_embeddings_with_checkpoints(
1220
+ teacher_model, code_texts, checkpoint_mgr
1221
+ )
1222
+
1223
+ # Refine with code training on CPU
1224
+ m2v_model = refine_with_code_training(
1225
+ m2v_model,
1226
+ code_texts,
1227
+ teacher_embeddings,
1228
+ epochs=TRAINING_EPOCHS,
1229
+ checkpoint_manager=checkpoint_mgr,
1230
+ model_manager=model_mgr,
1231
+ )
1232
+
1233
+ del teacher_model
1234
+
1235
+ # Step 4: Light regularization
1236
+ logger.info("Step 4: Applying regularization...")
1237
+ m2v_model = apply_regularization(m2v_model, REGULARIZATION_WEIGHT)
1238
+
1239
+ # Save final model
1240
+ logger.info("Saving code-specialized model...")
1241
+
1242
+ # Final validation before saving
1243
+ if m2v_model is None:
1244
+ msg = "Cannot save model: m2v_model is None"
1245
+ raise ValueError(msg)
1246
+
1247
+ if not hasattr(m2v_model, "save_pretrained"):
1248
+ msg = f"Cannot save model: m2v_model of type {type(m2v_model)} does not have save_pretrained method"
1249
+ raise ValueError(msg)
1250
+
1251
+ logger.info(f"Final model type: {type(m2v_model)}")
1252
+ logger.info(f"Final model has embedding attribute: {hasattr(m2v_model, 'embedding')}")
1253
+
1254
+ m2v_model.save_pretrained(str(output_path))
1255
+
1256
+ # Save final model to beam volume as well - DISABLED due to recursive directory bug
1257
+ # model_mgr.save_model("final_model", str(output_path))
1258
+
1259
+ total_time = time.time() - start_time
1260
+ logger.info(f"Code-specialized distillation completed in {total_time:.2f} seconds")
1261
+
1262
+ return m2v_model
1263
+
1264
+
1265
+ @function(
1266
+ gpu=GPU_NAME,
1267
+ volumes=[Volume(name=VOLUME_NAME, mount_path=VOLUME_PATH)],
1268
+ image=IMAGE,
1269
+ secrets=["HF_ACCESS_TOKEN"],
1270
+ env={
1271
+ "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True,max_split_size_mb:512",
1272
+ "TOKENIZERS_PARALLELISM": "false",
1273
+ "CUDA_LAUNCH_BLOCKING": "0", # Allow async CUDA operations
1274
+ "TORCH_CUDNN_V8_API_ENABLED": "1", # Enable optimized cuDNN
1275
+ "OMP_NUM_THREADS": "8", # Limit CPU threads for better GPU utilization
1276
+ },
1277
+ timeout=3600 * 12, # 12 hours
1278
+ )
1279
+ def beam_code_distillation(
1280
+ model_name: str = MODEL_NAME,
1281
+ output_dir: str = OUTPUT_DIR,
1282
+ pca_dims: int = PCA_DIMS,
1283
+ max_samples: int = MAX_TRAINING_SAMPLES,
1284
+ resume: bool = True,
1285
+ ) -> Any:
1286
+ # Apply all patches from the patches directory
1287
+ try:
1288
+ from .patch_utils import apply_all_patches
1289
+
1290
+ logger.info("Applying all patches from patches directory...")
1291
+ patches_applied = apply_all_patches()
1292
+ logger.info(f"Successfully applied {patches_applied} patches")
1293
+ except Exception as e:
1294
+ logger.warning(f"Failed to apply patches: {e}. Continuing without patches.")
1295
+
1296
+ return code_specialized_distillation(
1297
+ model_name=model_name,
1298
+ output_dir=output_dir,
1299
+ pca_dims=pca_dims,
1300
+ max_samples=max_samples,
1301
+ resume=resume,
1302
+ )
1303
+
1304
+
1305
+ if __name__ == "__main__":
1306
+ code_specialized_distillation()
src/distiller/distill_simplified.py ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Simplified Code-Specialized Model2Vec Distillation Script.
3
+
4
+ This script implements a focused, simplified approach for creating code-specialized embeddings
5
+ using only the core Model2Vec distillation without additional fine-tuning that may degrade quality.
6
+
7
+ Can run locally or on Beam with the --use-beam flag.
8
+ """
9
+
10
+ import argparse
11
+ import json
12
+ import logging
13
+ import sys
14
+ import time
15
+ from pathlib import Path
16
+ from typing import Any
17
+
18
+ from beam import GpuType, Image, Volume, function
19
+ from model2vec.distill import distill
20
+
21
+ # =============================================================================
22
+ # SIMPLIFIED CONFIGURATION
23
+ # =============================================================================
24
+
25
+ # Use a code-specialized teacher model instead of general instruction model
26
+ # Ordered by success likelihood and performance:
27
+ CODE_TEACHER_MODELS = [
28
+ "sentence-transformers/all-MiniLM-L6-v2",
29
+ "sentence-transformers/all-mpnet-base-v2",
30
+ "microsoft/codebert-base",
31
+ "microsoft/graphcodebert-base",
32
+ "sentence-transformers/paraphrase-MiniLM-L6-v2",
33
+ "Alibaba-NLP/gte-Qwen2-7B-instruct",
34
+ ]
35
+
36
+ OUTPUT_BASE_DIR = "code_model2vec"
37
+
38
+ # Optimal Model2Vec parameters based on successful models
39
+ OPTIMAL_PCA_DIMS = 256 # Match other successful Model2Vec models
40
+ SIF_COEFFICIENT = 1e-3 # Slightly higher than default for code specialization
41
+ APPLY_ZIPF = True # Enable Zipf weighting for better word importance
42
+
43
+ # =============================================================================
44
+ # BEAM CONFIGURATION
45
+ # =============================================================================
46
+
47
+ GPU_NAME = GpuType.A100_40
48
+ VOLUME_NAME = "code_model2vec"
49
+ VOLUME_PATH = "./code_model2vec"
50
+ IMAGE = Image(python_version="python3.12").add_python_packages(
51
+ [
52
+ "torch>=2.7.0", # Install torch first
53
+ "transformers>=4.40.0", # Latest transformers with flash attention support
54
+ "lightning>=2.5.1.post0",
55
+ "model2vec[train]>=0.5.0",
56
+ "numpy>=1.26.4",
57
+ "scikit-learn>=1.6.1",
58
+ "sentence-transformers>=4.1.0",
59
+ "datasets>=3.2.0", # For evaluation
60
+ "pandas>=2.0.0",
61
+ "tqdm>=4.65.0",
62
+ ]
63
+ )
64
+
65
+ # =============================================================================
66
+
67
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
68
+ logger = logging.getLogger(__name__)
69
+
70
+ # Add beam utilities for proper model persistence
71
+ try:
72
+ from .beam_utils import (
73
+ create_beam_utilities,
74
+ )
75
+
76
+ BEAM_UTILS_AVAILABLE = True
77
+ except ImportError:
78
+ print("Beam utilities not available - models will only be saved locally")
79
+ BEAM_UTILS_AVAILABLE = False
80
+
81
+
82
def apply_local_patches() -> bool:
    """Apply patches locally without requiring Beam utilities.

    Returns True only when ``patch_utils`` was importable and its patches
    ran to completion. A missing ``patch_utils`` module and any runtime
    patching failure are both swallowed after logging, so callers can
    proceed unpatched.
    """
    try:
        try:
            from .patch_utils import apply_all_patches

            applied = apply_all_patches()
            logger.info(f"Successfully applied {applied} patches via patch_utils")
            return True
        except ImportError:
            logger.warning("patch_utils not available, trying direct patching")
            return False
    except Exception as e:
        logger.warning(f"Failed to apply patches: {e}")
        return False
100
+
101
+
102
def simplified_code_distillation(
    teacher_model: str,
    output_dir: str,
    pca_dims: int = OPTIMAL_PCA_DIMS,
) -> Any:
    """
    Simplified code-specialized distillation using only core Model2Vec.

    This approach:
    1. Uses a teacher model that already performs well on code tasks
    2. Applies optimal Model2Vec parameters
    3. Avoids additional training that may degrade quality

    Returns the distilled model, or None on failure (including the known
    Model2Vec token/vector-count mismatch for incompatible tokenizers).
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    logger.info(f"Starting simplified distillation from {teacher_model}")
    logger.info(f"Target dimensions: {pca_dims}")
    logger.info(f"SIF coefficient: {SIF_COEFFICIENT}")
    logger.info(f"Zipf weighting: {APPLY_ZIPF}")

    started = time.time()

    try:
        # Core distillation with the tuned Model2Vec parameters.
        distilled = distill(
            model_name=teacher_model,
            pca_dims=pca_dims,
            apply_zipf=APPLY_ZIPF,
            sif_coefficient=SIF_COEFFICIENT,
            trust_remote_code=True,
        )

        logger.info("βœ… Core distillation completed successfully")

        distilled.save_pretrained(str(target_dir))
        logger.info(f"πŸ’Ύ Model saved to {target_dir}")

        # Log model info for debugging downstream loading issues.
        logger.info(f"Model type: {type(distilled)}")
        if hasattr(distilled, "embedding"):
            logger.info(f"Embedding shape: {distilled.embedding.shape}")
            logger.info(f"Embedding dtype: {distilled.embedding.dtype}")

        elapsed = time.time() - started
        logger.info(f"πŸŽ‰ Simplified distillation completed in {elapsed:.2f} seconds")
        return distilled

    except ValueError as e:
        # Known Model2Vec library issue with some tokenizers; treat as a
        # skippable failure rather than aborting the whole run.
        if "Number of tokens" in str(e) and "does not match number of vectors" in str(e):
            logger.warning(f"⚠️ Token-vector mismatch with {teacher_model} - this is a Model2Vec library issue")
            logger.warning(f"Error details: {e}")
            logger.warning("πŸ’‘ This model has incompatible tokenization. Skipping...")
            return None
        raise
    except Exception:
        logger.exception("❌ Distillation failed")
        return None
161
+
162
+
163
def core_distill_all_teachers(use_beam_utilities: bool = False) -> dict[str, Any]:
    """
    Core logic for distilling all teacher models.

    Iterates over CODE_TEACHER_MODELS, skipping teachers whose distilled
    model already exists on disk. Every attempted teacher ends up with a
    status entry in the results ("success", "skipped_existing", or
    "failed") so the summary accounts for all of them.

    Args:
        use_beam_utilities: Whether to use Beam utilities for persistence

    Returns:
        Dictionary with distillation results

    Raises:
        RuntimeError: If no teacher model could be distilled.
    """
    # Apply patches (best-effort; some teacher models are known to need them)
    logger.info("Applying all patches...")
    patch_success = apply_local_patches()
    if patch_success:
        logger.info("Successfully applied patches")
    else:
        logger.warning("Failed to apply patches - Microsoft models may fail")

    # Initialize Beam utilities if requested and available
    model_mgr = None
    if use_beam_utilities and BEAM_UTILS_AVAILABLE:
        try:
            _, _, model_mgr, _ = create_beam_utilities(VOLUME_NAME, VOLUME_PATH)
            logger.info("βœ… Beam utilities initialized for model persistence")
        except Exception as e:
            logger.warning(f"Failed to initialize Beam utilities: {e}")
            model_mgr = None

    results = {}
    successful_models = []

    logger.info("πŸš€ Starting comprehensive teacher model distillation")
    logger.info(f"πŸ“Š Processing {len(CODE_TEACHER_MODELS)} teacher models")

    # Models go to the Beam volume when persistence is on, else locally.
    base_output_path = VOLUME_PATH if use_beam_utilities else OUTPUT_BASE_DIR

    for teacher_model in CODE_TEACHER_MODELS:
        try:
            # Create output directory name based on teacher model
            teacher_name = teacher_model.split("/")[-1].replace("-", "_")
            output_dir = f"{base_output_path}/final/code_model2vec_{teacher_name}"

            logger.info(f"\n{'=' * 60}")
            logger.info(f"πŸ”„ Processing teacher model: {teacher_model}")
            logger.info(f"πŸ“ Output directory: {output_dir}")
            logger.info(f"{'=' * 60}")

            # Skip distillation when a complete model is already on disk.
            output_path = Path(output_dir)
            if output_path.exists():
                has_config = (output_path / "config.json").exists()
                has_model_file = any(
                    [
                        (output_path / "model.safetensors").exists(),
                        (output_path / "model.bin").exists(),
                        (output_path / "pytorch_model.bin").exists(),
                    ]
                )

                if has_config and has_model_file:
                    logger.info(f"βœ… Model {teacher_name} already exists - skipping distillation")
                    results[teacher_name] = {
                        "teacher_model": teacher_model,
                        "output_dir": output_dir,
                        "teacher_name": teacher_name,
                        "distillation_time": 0.0,
                        "status": "skipped_existing",
                    }
                    successful_models.append(teacher_name)
                    logger.info(f"πŸ“ Using existing model at: {output_dir}")
                    continue

            # Perform distillation
            start_time = time.time()
            model = simplified_code_distillation(
                teacher_model=teacher_model,
                output_dir=output_dir,
            )
            distill_time = time.time() - start_time

            if model is not None:
                logger.info(f"βœ… Distillation successful for {teacher_model}")

                # Save to Beam volume for persistence if available
                if model_mgr:
                    try:
                        beam_model_name = f"{teacher_name}_model"
                        model_mgr.save_model(beam_model_name, output_dir)
                        logger.info(f"πŸ’Ύ Saved {teacher_name} to Beam volume as {beam_model_name}")
                    except Exception as e:
                        logger.warning(f"Failed to save {teacher_name} to Beam volume: {e}")

                results[teacher_name] = {
                    "teacher_model": teacher_model,
                    "output_dir": output_dir,
                    "teacher_name": teacher_name,
                    "distillation_time": distill_time,
                    "status": "success",
                }
                successful_models.append(teacher_name)
                logger.info(f"πŸ’Ύ Model saved to: {output_dir}")
            else:
                # Fix: a graceful failure (None return, e.g. token-vector
                # mismatch) previously left no record at all, so the teacher
                # silently vanished from the summary.
                logger.warning(f"⚠️ Distillation returned no model for {teacher_model}")
                results[teacher_name] = {
                    "teacher_model": teacher_model,
                    "output_dir": output_dir,
                    "teacher_name": teacher_name,
                    "distillation_time": distill_time,
                    "status": "failed",
                    "error": "distillation returned no model",
                }

        except Exception as e:
            logger.exception(f"❌ Failed with {teacher_model}")
            results[teacher_model.split("/")[-1]] = {
                "teacher_model": teacher_model,
                "status": "failed",
                "error": str(e),
            }
            continue

    # Summary
    if successful_models:
        logger.info("\nπŸ† DISTILLATION COMPLETE!")
        logger.info(f"πŸ“Š Successful models: {len(successful_models)}")

        for model_name in successful_models:
            model_info = results[model_name]
            logger.info(f"βœ… {model_name}: {model_info['teacher_model']}")

        # Save comprehensive results
        results_summary = {
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "successful_models": successful_models,
            "all_results": results,
            "total_successful": len(successful_models),
            "total_attempted": len(CODE_TEACHER_MODELS),
        }

        # Persist the run summary next to the models.
        results_file = Path(f"{base_output_path}/distillation_results.json")
        results_file.parent.mkdir(parents=True, exist_ok=True)
        with results_file.open("w") as f:
            json.dump(results_summary, f, indent=2)

        logger.info(f"πŸ“Š Results summary saved to: {results_file}")

        return results_summary

    logger.error("❌ No models succeeded")
    msg = "All teacher models failed distillation"
    raise RuntimeError(msg)
317
+
318
+
319
def run_local_distillation() -> dict[str, Any]:
    """Run the teacher-model distillation pipeline on the local machine (no Beam persistence)."""
    logger.info("πŸ–₯️ Running simplified distillation locally")
    summary = core_distill_all_teachers(use_beam_utilities=False)
    return summary
323
+
324
+
325
@function(
    gpu=GPU_NAME,
    volumes=[Volume(name=VOLUME_NAME, mount_path=VOLUME_PATH)],
    image=IMAGE,
    secrets=["HF_ACCESS_TOKEN"],
    env={
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True,max_split_size_mb:512",
        "TOKENIZERS_PARALLELISM": "false",
        "CUDA_LAUNCH_BLOCKING": "0",  # Allow async CUDA operations
        "TORCH_CUDNN_V8_API_ENABLED": "1",  # Enable optimized cuDNN
    },
    timeout=3600 * 12,  # 12 hours
)
def beam_distill_all_teachers() -> dict[str, Any]:
    """
    Beam version: Try all teacher models and create distilled models from each.

    Returns information about all models that were successfully created.
    Runs the same core pipeline as the local path, but with Beam utilities
    enabled so models and result summaries persist to the mounted volume.
    """
    logger.info("☁️ Running simplified distillation on Beam")
    return core_distill_all_teachers(use_beam_utilities=True)
346
+
347
+
348
def main() -> None:
    """CLI entry point: parse arguments and dispatch to local or Beam execution."""
    global OUTPUT_BASE_DIR  # Declare global at the top  # noqa: PLW0603

    arg_parser = argparse.ArgumentParser(
        description="Simplified Code-Specialized Model2Vec Distillation",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python -m src.distiller.distill_simplified            # Run locally
  python -m src.distiller.distill_simplified --use-beam # Run on Beam
  distiller distill-simple                              # CLI shortcut (runs on Beam)
        """,
    )
    arg_parser.add_argument(
        "--use-beam",
        action="store_true",
        help="Run on Beam instead of locally",
    )
    arg_parser.add_argument(
        "--output-dir",
        type=str,
        default=OUTPUT_BASE_DIR,
        help=f"Output directory for models (default: {OUTPUT_BASE_DIR})",
    )
    opts = arg_parser.parse_args()

    # Update output directory if specified on the command line.
    if opts.output_dir != OUTPUT_BASE_DIR:
        OUTPUT_BASE_DIR = opts.output_dir

    try:
        if opts.use_beam:
            logger.info("πŸš€ Starting Beam execution...")
            summary = beam_distill_all_teachers()
        else:
            logger.info("πŸ–₯️ Starting local execution...")
            summary = run_local_distillation()

        # Print final summary
        print("\nπŸŽ‰ Distillation complete!")
        print(f"πŸ“Š Successfully created {summary['total_successful']} models")

        location = VOLUME_PATH if opts.use_beam else OUTPUT_BASE_DIR
        print(f"πŸ“ Models location: {location}/final/")

        print("\nβœ… Created models:")
        for created_name in summary["successful_models"]:
            created_info = summary["all_results"][created_name]
            print(f" β€’ {created_name} (from {created_info['teacher_model']})")

    except KeyboardInterrupt:
        logger.info("πŸ›‘ Distillation interrupted by user")
        sys.exit(1)
    except Exception:
        logger.exception("❌ Distillation failed with error")
        sys.exit(1)


if __name__ == "__main__":
    main()
src/distiller/evaluate.py ADDED
@@ -0,0 +1,839 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CodeSearchNet Evaluation Script for Code-Specialized Embedding Models.
3
+
4
+ This script evaluates embedding models on code search tasks using the CodeSearchNet
5
+ dataset and methodology. It implements the same evaluation approach as the original
6
+ CodeSearchNet challenge, including NDCG and other information retrieval metrics.
7
+
8
+ Usage:
9
+ distiller evaluate # Run evaluation on all default models with Beam
10
+ """
11
+
12
+ import json
13
+ import logging
14
+ import time
15
+ from pathlib import Path
16
+ from typing import Any
17
+
18
+ import numpy as np
19
+ import pandas as pd
20
+ from beam import GpuType, Image, Volume, function
21
+ from datasets import Dataset, load_dataset
22
+ from sentence_transformers import SentenceTransformer
23
+ from sklearn.metrics.pairwise import cosine_similarity
24
+ from tqdm import tqdm
25
+
26
+ from .beam_utils import (
27
+ BeamCheckpointManager,
28
+ BeamEvaluationManager,
29
+ create_beam_utilities,
30
+ )
31
+
32
+ # Configure logging
33
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
34
+ logger = logging.getLogger(__name__)
35
+
36
+ # =============================================================================
37
+ # BEAM CONFIGURATION
38
+ # =============================================================================
39
+
40
+ GPU_NAME = GpuType.A100_40
41
+ VOLUME_NAME = "code_model2vec" # Same volume as distill_simplified.py
42
+ VOLUME_PATH = "./code_model2vec" # Same mount path as distill_simplified.py
43
+ EVALUATION_RESULTS_DIR = "evaluation_results" # Subdirectory within volume
44
+ EVALUATION_CACHE_DIR = "evaluation_cache" # Cache for datasets and models
45
+
46
+ IMAGE = Image(python_version="python3.12").add_python_packages(
47
+ [
48
+ "torch>=2.7.0",
49
+ "transformers>=4.40.0",
50
+ "datasets>=3.2.0",
51
+ "sentence-transformers>=4.1.0",
52
+ "model2vec[train]>=0.5.0",
53
+ "numpy>=1.26.4",
54
+ "scikit-learn>=1.6.1",
55
+ "pandas>=2.0.0",
56
+ "tqdm>=4.65.0",
57
+ ]
58
+ )
59
+
60
+ # =============================================================================
61
+ # CONFIGURATION
62
+ # =============================================================================
63
+
64
+ CODESEARCHNET_EVAL_DATASET = "code_search_net"
65
+ BATCH_SIZE = 32
66
+ DEFAULT_OUTPUT_DIR = "code_evaluation_results" # Local fallback directory
67
+ EVALUATION_LANGUAGES = ["python", "javascript", "java", "php", "ruby", "go"]
68
+
69
+ # Default models to evaluate (can be overridden via command line)
70
+ DEFAULT_EVALUATION_MODELS = [
71
+ # Established Code Models
72
+ "sentence-transformers/all-MiniLM-L6-v2",
73
+ "microsoft/codebert-base",
74
+ "microsoft/graphcodebert-base",
75
+ "huggingface/CodeBERTa-small-v1",
76
+ "sentence-transformers/all-mpnet-base-v2",
77
+ "sentence-transformers/all-MiniLM-L12-v2",
78
+ # Model2Vec & Efficiency Models (Direct Competitors)
79
+ "minishlab/potion-base-8M",
80
+ "minishlab/potion-retrieval-32M",
81
+ # Small Transformer-Based Code Models
82
+ "Salesforce/codet5-base",
83
+ ]
84
+
85
+ # =============================================================================
86
+ # CHECKPOINT CONFIGURATION
87
+ # =============================================================================
88
+
89
+ # Prevent conflicts with distill.py checkpoints by using different prefixes
90
+ EVAL_CHECKPOINT_PREFIX = "evaluation_checkpoints"
91
+ DATASET_CHECKPOINT_PREFIX = "dataset_cache"
92
+ MODEL_CACHE_PREFIX = "model_cache"
93
+
94
+ # =============================================================================
95
+ # CORE EVALUATION CLASSES
96
+ # =============================================================================
97
+
98
+
99
class CodeSearchNetEvaluator:
    """Evaluator for CodeSearchNet-style code search tasks.

    Wraps a SentenceTransformer model and measures query->code retrieval
    quality (MRR, NDCG@k, Recall@k) per language, with optional Beam-backed
    checkpointing and result caching.
    """

    def __init__(
        self,
        model_path: str,
        model_name: str | None = None,
        checkpoint_manager: BeamCheckpointManager | None = None,
        eval_manager: BeamEvaluationManager | None = None,
    ) -> None:
        """Initialize the evaluator with a model and optional Beam utilities."""
        self.model_path = model_path
        self.model_name = model_name or Path(model_path).name
        self.model: SentenceTransformer | None = None
        self.checkpoint_manager = checkpoint_manager
        self.eval_manager = eval_manager
        self._load_model()

    def _load_model(self) -> None:
        """Load the embedding model with caching support."""
        logger.info(f"Loading model from {self.model_path}")

        # Check if we have a cached evaluation result for this model
        if self.eval_manager:
            cached_result = self.eval_manager.load_evaluation_results(self.model_name)
            if cached_result:
                logger.info(f"βœ… Found cached evaluation results for {self.model_name}")
                # Note: We still need to load the model for new evaluations

        try:
            self.model = SentenceTransformer(self.model_path, trust_remote_code=True)
            logger.info(f"Successfully loaded model: {self.model_name}")
        except Exception:
            logger.exception(f"Failed to load model from {self.model_path}")
            raise

    def encode_texts(self, texts: list[str], desc: str = "Encoding") -> np.ndarray:
        """Encode texts into normalized embeddings, batched by BATCH_SIZE."""
        if self.model is None:
            msg = "Model not loaded"
            raise RuntimeError(msg)

        embeddings = []

        for i in tqdm(range(0, len(texts), BATCH_SIZE), desc=desc):
            batch = texts[i : i + BATCH_SIZE]
            batch_embeddings = self.model.encode(batch, convert_to_tensor=False, normalize_embeddings=True)
            embeddings.append(batch_embeddings)

        return np.vstack(embeddings)

    def evaluate_language(self, language: str, max_queries: int = 1000) -> dict[str, Any]:
        """Evaluate on a specific programming language with checkpoint support.

        Returns an empty dict when the dataset is unusable or no valid
        query-code pairs are found (logged, not raised).
        """
        logger.info(f"Evaluating on {language} language (max {max_queries} queries)")

        # Check for existing evaluation checkpoint
        if self.checkpoint_manager:
            cached_result = self.checkpoint_manager.load_checkpoint(f"{EVAL_CHECKPOINT_PREFIX}_{language}", 0)
            if cached_result and cached_result.get("data", {}).get("model_name") == self.model_name:
                logger.info(f"βœ… Resuming from cached {language} evaluation")
                return cached_result.get("data", {})

        try:
            # Load test split for the language
            dataset = load_dataset(
                CODESEARCHNET_EVAL_DATASET,
                language,
                split="test",
                trust_remote_code=True,
            )

            # Ensure we have a Dataset object
            if not isinstance(dataset, Dataset):
                logger.error(f"Unexpected dataset type for {language}: {type(dataset)}")
                return {}

            # Sample queries for evaluation (to make it manageable)
            if len(dataset) > max_queries:
                rng = np.random.default_rng(42)  # Use seeded generator for reproducibility
                indices = rng.choice(len(dataset), max_queries, replace=False)
                dataset = dataset.select(indices)

            queries = []
            codes = []
            query_ids = []

            for i, example in enumerate(dataset):
                doc_string = example.get("func_documentation_string", "").strip()
                code_string = example.get("func_code_string", "").strip()

                # Require a minimally descriptive docstring (>= 3 words).
                if doc_string and code_string and len(doc_string.split()) >= 3:
                    queries.append(doc_string)
                    codes.append(code_string)
                    query_ids.append(f"{language}_{i}")

            if len(queries) == 0:
                logger.warning(f"No valid query-code pairs found for {language}")
                return {}

            logger.info(f"Found {len(queries)} valid query-code pairs for {language}")

            # Encode queries and codes
            query_embeddings = self.encode_texts(queries, f"Encoding {language} queries")
            code_embeddings = self.encode_texts(codes, f"Encoding {language} codes")

            # Compute similarities
            similarities = cosine_similarity(query_embeddings, code_embeddings)

            # Evaluate retrieval metrics
            metrics = self._compute_retrieval_metrics(similarities)

            result = {
                "language": language,
                "num_queries": len(queries),
                "metrics": metrics,
                "model_name": self.model_name,
            }

            # Save checkpoint
            if self.checkpoint_manager:
                checkpoint_data = {
                    "data": result,
                    "timestamp": time.time(),
                    "config": {
                        "language": language,
                        "max_queries": max_queries,
                        "model_name": self.model_name,
                    },
                }
                self.checkpoint_manager.save_checkpoint(f"{EVAL_CHECKPOINT_PREFIX}_{language}", checkpoint_data, 0)
                logger.info(f"πŸ’Ύ Saved {language} evaluation checkpoint")

            return result

        except Exception:
            logger.exception(f"Error evaluating {language}")
            return {}

    def _compute_retrieval_metrics(self, similarities: np.ndarray) -> dict[str, float]:
        """Compute retrieval metrics (MRR, NDCG@k, Recall@k, rank stats).

        For query i the correct code sits at column i, so the gold item is
        on the diagonal of the similarity matrix.

        Fix: the candidate ranking is now computed once per query and reused
        for every NDCG cutoff; the previous version re-sorted each query's
        scores two extra times (for NDCG@1 and NDCG@5), tripling the
        O(Q * N log N) sorting work while producing identical values.
        """
        num_queries = similarities.shape[0]

        ranks = []
        reciprocal_ranks = []
        ndcg_at_1 = []
        ndcg_at_5 = []
        ndcg_at_10 = []

        for i in range(num_queries):
            # Get similarity scores for query i
            scores = similarities[i]

            # Rank all codes by similarity to query i (descending order)
            ranked_indices = np.argsort(scores)[::-1]

            # Find rank of the correct code (index i)
            correct_rank = np.where(ranked_indices == i)[0][0] + 1  # 1-indexed
            ranks.append(correct_rank)
            reciprocal_ranks.append(1.0 / correct_rank)

            # Reuse the single ranking for all NDCG cutoffs.
            ndcg_at_1.append(self._compute_ndcg(ranked_indices, i, k=1))
            ndcg_at_5.append(self._compute_ndcg(ranked_indices, i, k=5))
            ndcg_at_10.append(self._compute_ndcg(ranked_indices, i, k=10))

        return {
            "mrr": float(np.mean(reciprocal_ranks)),
            "ndcg@1": float(np.mean(ndcg_at_1)),
            "ndcg@5": float(np.mean(ndcg_at_5)),
            "ndcg@10": float(np.mean(ndcg_at_10)),
            "recall@1": float(np.mean([1.0 if rank == 1 else 0.0 for rank in ranks])),
            "recall@5": float(np.mean([1.0 if rank <= 5 else 0.0 for rank in ranks])),
            "recall@10": float(np.mean([1.0 if rank <= 10 else 0.0 for rank in ranks])),
            "mean_rank": float(np.mean(ranks)),
            "median_rank": float(np.median(ranks)),
        }

    def _compute_ndcg(self, ranked_indices: np.ndarray, correct_idx: int, k: int) -> float:
        """Compute NDCG@k for a single query (one relevant item -> 1/log2(pos+2))."""
        if k == 0:
            return 0.0

        # Find position of correct item in top-k
        top_k = ranked_indices[:k]
        if correct_idx in top_k:
            position = np.where(top_k == correct_idx)[0][0]
            return 1.0 / np.log2(position + 2)  # +2 because log2(1) is 0
        return 0.0

    def evaluate_all_languages(
        self, max_queries_per_lang: int = 1000, languages: list[str] | None = None
    ) -> dict[str, Any]:
        """Evaluate on all supported programming languages with comprehensive result saving."""
        if languages is None:
            languages = EVALUATION_LANGUAGES

        logger.info(f"Starting evaluation on all languages for model: {self.model_name}")

        # Check for existing comprehensive evaluation results
        if self.eval_manager:
            cached_comprehensive = self.eval_manager.load_evaluation_results(self.model_name)
            if cached_comprehensive:
                logger.info(f"βœ… Found comprehensive cached evaluation for {self.model_name}")
                return cached_comprehensive

        start_time = time.time()

        results: dict[str, Any] = {
            "model_name": self.model_name,
            "model_path": self.model_path,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "languages": {},
            "overall": {},
        }

        all_metrics = []

        for language in languages:
            logger.info(f"Evaluating {language}...")
            lang_results = self.evaluate_language(language, max_queries_per_lang)

            if lang_results:
                results["languages"][language] = lang_results
                all_metrics.append(lang_results["metrics"])
            else:
                logger.warning(f"Skipping {language} due to evaluation error")

        # Compute overall metrics (average across languages)
        if all_metrics:
            overall_metrics = {}
            for metric_name in all_metrics[0]:
                values = [m[metric_name] for m in all_metrics if metric_name in m]
                overall_metrics[metric_name] = np.mean(values)

            results["overall"] = overall_metrics

        total_time = time.time() - start_time
        results["evaluation_time_seconds"] = total_time

        # Save comprehensive results to Beam volume
        if self.eval_manager:
            self.eval_manager.save_evaluation_results(self.model_name, results)
            logger.info("πŸ’Ύ Saved comprehensive evaluation results to Beam volume")

        logger.info(f"Evaluation completed in {total_time:.2f} seconds")
        return results
346
+
347
+
348
def load_peer_models(peers_file: str) -> list[tuple[str, str]]:
    """Load peer models from a CSV file.

    Accepts either ``model_name``/``model_path`` or legacy ``Model``/``Path``
    column names; ``model_path`` falls back to the model name when absent.

    Returns:
        A list of ``(model_name, model_path)`` tuples; an unreadable or
        malformed file yields an empty list (logged, not raised).
    """
    try:
        df = pd.read_csv(peers_file)
        models = []
        for _, row in df.iterrows():
            model_name = row.get("model_name", row.get("Model", ""))
            model_path = row.get("model_path", row.get("Path", model_name))
            if model_name:
                models.append((model_name, model_path))
        logger.info(f"Loaded {len(models)} peer models from {peers_file}")
        return models
    except Exception:
        # Bug fix: the message was a plain string containing a literal
        # "{peers_file}" (missing f-prefix); use lazy %-style args instead.
        logger.exception("Error loading peer models from %s", peers_file)
        return []
363
+
364
+
365
def save_results(
    results: dict[str, Any],
    output_dir: str,
    model_name: str,
    eval_manager: BeamEvaluationManager | None = None,
    volume_results_dir: Path | None = None,
) -> None:
    """Save evaluation results to JSON file with Beam volume support.

    Writes up to three copies: the Beam volume (best-effort), the
    eval_manager store (best-effort), and always a local JSON backup.
    """
    # 1) Beam volume copy, if a volume directory was supplied.
    if volume_results_dir:
        volume_file = volume_results_dir / f"codesearchnet_eval_{model_name}.json"
        try:
            with volume_file.open("w") as f:
                json.dump(results, f, indent=2, default=str)
            logger.info(f"πŸ’Ύ Results saved to Beam volume: {volume_file}")
        except Exception as e:
            logger.warning(f"⚠️ Failed to save to Beam volume: {e}")

    # 2) eval_manager copy (kept for compatibility with older tooling).
    if eval_manager:
        if eval_manager.save_evaluation_results(model_name, results):
            logger.info(f"πŸ’Ύ Results also saved via eval_manager for {model_name}")
        else:
            logger.warning(f"⚠️ Failed to save via eval_manager for {model_name}")

    # 3) Local backup, always written.
    backup_dir = Path(output_dir)
    backup_dir.mkdir(parents=True, exist_ok=True)

    # Strip characters that are unsafe in filenames.
    sanitized = "".join(c for c in model_name if c.isalnum() or c in ("-", "_", "."))
    backup_file = backup_dir / f"codesearchnet_eval_{sanitized}.json"

    with backup_file.open("w") as f:
        json.dump(results, f, indent=2, default=str)

    logger.info(f"πŸ“„ Local backup saved to {backup_file}")
404
+
405
+
406
def print_results_summary(results: dict[str, Any]) -> None:
    """Print a human-readable summary of one model's evaluation results.

    Expects ``results`` to carry a "model_name" key plus optional
    "overall" metrics and per-language "languages" sections.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"CodeSearchNet Evaluation Results: {results['model_name']}")
    print(f"{banner}")

    overall = results.get("overall", {})
    if overall:
        print("\nOverall Metrics (averaged across languages):")
        # Label/key pairs printed in a fixed, stable order.
        overall_rows = [
            ("MRR", "mrr"),
            ("NDCG@1", "ndcg@1"),
            ("NDCG@5", "ndcg@5"),
            ("NDCG@10", "ndcg@10"),
            ("Recall@1", "recall@1"),
            ("Recall@5", "recall@5"),
            ("Recall@10", "recall@10"),
        ]
        for label, key in overall_rows:
            print(f" {label}: {overall.get(key, 0):.4f}")

    print("\nPer-Language Results:")
    for lang, lang_results in results.get("languages", {}).items():
        metrics = lang_results.get("metrics", {})
        parts = ", ".join(
            [
                f"MRR={metrics.get('mrr', 0):.3f}",
                f"NDCG@10={metrics.get('ndcg@10', 0):.3f}",
                f"Recall@5={metrics.get('recall@5', 0):.3f}",
            ]
        )
        print(f" {lang:12s}: {parts}")
433
+
434
+
435
def create_comparison_report(all_results: list[dict[str, Any]], output_dir: str) -> None:
    """Create a comparison report (CSV file + printed table) across all evaluated models.

    Args:
        all_results: One results dict per model, each with an "overall" metrics section.
        output_dir: Directory where ``codesearchnet_comparison.csv`` is written.
    """
    if not all_results:
        return

    output_path = Path(output_dir)
    # Robustness fix: ensure the target directory exists before writing the
    # CSV, so this function does not depend on callers pre-creating it.
    output_path.mkdir(parents=True, exist_ok=True)

    # Flatten each model's overall metrics into one comparison row.
    comparison_data = []
    for results in all_results:
        overall = results.get("overall", {})
        row = {
            "Model": results["model_name"],
            "MRR": overall.get("mrr", 0),
            "NDCG@1": overall.get("ndcg@1", 0),
            "NDCG@5": overall.get("ndcg@5", 0),
            "NDCG@10": overall.get("ndcg@10", 0),
            "Recall@1": overall.get("recall@1", 0),
            "Recall@5": overall.get("recall@5", 0),
            "Recall@10": overall.get("recall@10", 0),
            "Mean Rank": overall.get("mean_rank", 0),
        }
        comparison_data.append(row)

    df = pd.DataFrame(comparison_data)
    df = df.sort_values("NDCG@10", ascending=False)  # Best NDCG@10 first

    # Save to CSV
    csv_path = output_path / "codesearchnet_comparison.csv"
    df.to_csv(csv_path, index=False, float_format="%.4f")
    logger.info(f"Comparison report saved to {csv_path}")

    # Print comparison table
    print(f"\n{'=' * 80}")
    print("CodeSearchNet Model Comparison")
    print(f"{'=' * 80}")
    print(df.to_string(index=False, float_format="%.4f"))
472
+
473
+
474
def beam_evaluate_models(
    models: list[str],
    max_queries: int = 1000,
    languages: list[str] | None = None,
    output_dir: str = DEFAULT_OUTPUT_DIR,
    volume_name: str = VOLUME_NAME,
    mount_path: str = VOLUME_PATH,
) -> list[dict[str, Any]]:
    """Main evaluation function for Beam execution with checkpoint support.

    Evaluates each entry of ``models`` on CodeSearchNet, skipping any model
    whose results JSON already exists on the Beam volume, and saves per-model
    results plus (for 2+ models) a cross-model comparison report.

    Args:
        models: Local paths or HuggingFace-style model names to evaluate.
        max_queries: Maximum queries per language, forwarded to the evaluator.
        languages: Languages to evaluate; None falls back to EVALUATION_LANGUAGES.
        output_dir: Directory for local backup copies of results.
        volume_name: Beam volume name (used for utilities setup and logging).
        mount_path: Mount point of the Beam volume.

    Returns:
        One results dict per processed model (previously saved results are
        loaded and included for skipped models).
    """
    logger.info("πŸš€ Starting Beam-powered CodeSearchNet evaluation")
    logger.info(f"πŸ“Š Evaluating {len(models)} models on {len(languages or EVALUATION_LANGUAGES)} languages")

    # Initialize Beam utilities
    # NOTE(review): volume_mgr is never used below — confirm create_beam_utilities
    # has no required side effects tied to it before removing.
    volume_mgr, checkpoint_mgr, model_mgr, eval_mgr = create_beam_utilities(volume_name, mount_path)

    # Create evaluation results directory in volume
    results_dir = Path(mount_path) / EVALUATION_RESULTS_DIR
    results_dir.mkdir(parents=True, exist_ok=True)

    logger.info(f"πŸ“ Using Beam volume: {volume_name} at {mount_path}")
    logger.info(f"πŸ’Ύ Evaluation results directory: {results_dir}")

    all_results = []
    skipped_models = []

    for model_path in models:
        # The last path component doubles as the model's identifier.
        model_name = Path(model_path).name

        # Checkpointing: reuse existing evaluation results when present.
        existing_result_file = results_dir / f"codesearchnet_eval_{model_name}.json"
        if existing_result_file.exists():
            logger.info(f"βœ… Model {model_name} already evaluated - loading existing results")
            try:
                with existing_result_file.open("r") as f:
                    existing_results = json.load(f)
                all_results.append(existing_results)
                skipped_models.append(model_name)
                continue
            except Exception as e:
                logger.warning(f"⚠️ Failed to load existing results for {model_name}: {e}")
                # Continue with evaluation if loading fails

        logger.info(f"\n{'=' * 60}")
        logger.info(f"πŸ” Evaluating model: {model_name}")
        logger.info(f"πŸ“‚ Path: {model_path}")
        logger.info(f"{'=' * 60}")

        try:
            # Heuristic: an "org/name"-style string with no matching local
            # path is treated as a HuggingFace hub id.
            is_huggingface_model = (
                "/" in model_path and not model_path.startswith("/") and not Path(model_path).exists()
            )

            if is_huggingface_model:
                # This is a HuggingFace model name - pass directly to evaluator
                logger.info(f"πŸ“₯ Loading HuggingFace model: {model_path}")
                evaluator = CodeSearchNetEvaluator(
                    model_path,
                    model_name,
                    checkpoint_manager=checkpoint_mgr,
                    eval_manager=eval_mgr,
                )
            else:
                # This is a local path - check if it exists in Beam volume
                actual_model_path = model_path  # Default to original path
                if not Path(model_path).exists() and not model_path.startswith("/"):
                    # Try to load from Beam volume
                    local_model_path = Path(mount_path) / MODEL_CACHE_PREFIX / model_name
                    logger.info(f"πŸ” Trying to load {model_name} from Beam volume: {local_model_path}")
                    if model_mgr.load_model(model_name, local_model_path.parent):
                        actual_model_path = str(local_model_path)
                        logger.info(f"βœ… Loaded model from Beam volume: {actual_model_path}")
                    else:
                        # Nothing to evaluate for this entry; move on.
                        logger.warning(f"⚠️ Model not found locally or in Beam volume: {model_name}")
                        continue

                evaluator = CodeSearchNetEvaluator(
                    actual_model_path,
                    model_name,
                    checkpoint_manager=checkpoint_mgr,
                    eval_manager=eval_mgr,
                )

            results = evaluator.evaluate_all_languages(max_queries, languages)

            # Save results with Beam support
            save_results(results, output_dir, model_name, eval_mgr, results_dir)

            # Print summary
            print_results_summary(results)

            all_results.append(results)

        except Exception:
            # One failing model must not abort the whole batch.
            logger.exception(f"❌ Failed to evaluate {model_name}")
            continue

    # Create comparison report in Beam volume (only meaningful for 2+ models)
    if len(all_results) > 1:
        comparison_dir = Path(mount_path) / EVALUATION_RESULTS_DIR / "comparisons"
        comparison_dir.mkdir(parents=True, exist_ok=True)
        create_comparison_report(all_results, str(comparison_dir))
        logger.info(f"πŸ“Š Comparison report saved to Beam volume: {comparison_dir}")

    # Log summary of what was done
    newly_evaluated = len(all_results) - len(skipped_models)
    logger.info("\nβœ… Beam evaluation complete!")
    logger.info(f"πŸ“Š Newly evaluated: {newly_evaluated} models")
    logger.info(f"⏭️ Skipped (already done): {len(skipped_models)} models")
    logger.info(f"πŸ“ Total results: {len(all_results)} models")
    logger.info(f"πŸ’Ύ Results available in Beam volume: {volume_name}")

    if skipped_models:
        logger.info(f"⏭️ Skipped models: {', '.join(skipped_models)}")

    return all_results
590
+
591
+
592
@function(
    gpu=GPU_NAME,
    volumes=[Volume(name=VOLUME_NAME, mount_path=VOLUME_PATH)],
    image=IMAGE,
    secrets=["HF_ACCESS_TOKEN"],
    env={
        "TOKENIZERS_PARALLELISM": "false",
        "CUDA_LAUNCH_BLOCKING": "0",
    },
    timeout=3600 * 6,  # 6 hours for evaluation
)
def main(skip_third_party: bool = False) -> None:
    """Main evaluation function - runs all default models on Beam.

    Builds the model list (default 3rd-party peers unless ``skip_third_party``
    is True, plus any discovered simplified models) and delegates to
    beam_evaluate_models with the standard volume configuration.

    Args:
        skip_third_party: When True, evaluate only simplified distillation
            models and omit the 3rd-party peer models.
    """
    logger.info("πŸš€ Starting comprehensive CodeSearchNet evaluation on Beam")

    # Use default models or skip them based on flag
    if skip_third_party:
        logger.info("⏭️ Skipping 3rd party models - evaluating only simplified distillation models")
        models = []
    else:
        logger.info("πŸ“Š Including 3rd party peer models for comparison")
        models = DEFAULT_EVALUATION_MODELS.copy()

    # Discover simplified distillation models in the current directory
    logger.info("πŸ” Discovering simplified distillation models...")
    discovered_models = discover_simplified_models(".")

    # Add discovered models (they're already sorted alphabetically)
    if discovered_models:
        logger.info(f"βœ… Found {len(discovered_models)} simplified models:")
        for model_path in discovered_models:
            models.append(model_path)
            logger.info(f" πŸ“ {model_path}")
    else:
        logger.warning("⚠️ No simplified distillation models found")
        if skip_third_party:
            # With 3rd-party models skipped there is nothing left to run.
            logger.error("❌ No models to evaluate! Either create simplified models or include 3rd party models.")
            return

    logger.info(f"πŸ“Š Evaluating {len(models)} models:")
    for i, model in enumerate(models, 1):
        logger.info(f" {i}. {model}")

    logger.info("\nπŸ’‘ Checkpoint Info:")
    logger.info(" - Already evaluated models will be skipped")
    logger.info(" - Results are saved persistently to Beam volume")

    # Run comprehensive evaluation using Beam utilities
    results = beam_evaluate_models(
        models=models,
        max_queries=1000,
        languages=EVALUATION_LANGUAGES,
        output_dir=str(Path(VOLUME_PATH) / EVALUATION_RESULTS_DIR),
        volume_name=VOLUME_NAME,
        mount_path=VOLUME_PATH,
    )

    # Print final summary
    print("\n🎯 Evaluation Summary:")
    print(f"πŸ“Š Total models processed: {len(results)}")
    print(f"πŸ’Ύ Results saved to Beam volume: {VOLUME_NAME}")
    print(f"πŸ“ Directory: {EVALUATION_RESULTS_DIR}")
    if skip_third_party:
        print("⏭️ 3rd party models were skipped")
    print("\nπŸ” To view analysis:")
    print(" beam run src.distiller.analyze:beam_analysis")
    print("\nπŸ“ˆ To run evaluations again:")
    print(" distiller evaluate (will skip already completed models)")
    print(" distiller evaluate --skip-third-party (evaluate only simplified models)")
661
+
662
+
663
def discover_simplified_models(base_path: str = ".") -> list[str]:
    """
    Find simplified distillation model directories under ``base_path``.

    Scans ``<base_path>/code_model2vec/final`` for directories named
    ``code_model2vec_*`` that contain a ``config.json``.

    Returns:
        Alphabetically sorted list of model directory paths (as strings).
    """
    final_dir = Path(base_path) / "code_model2vec" / "final"

    if not final_dir.exists():
        logger.warning(f"Models directory not found: {final_dir}")
        return []

    found: list[str] = []
    # A directory only counts as a model when its config.json is present.
    for candidate in final_dir.glob("code_model2vec_*"):
        if candidate.is_dir() and (candidate / "config.json").exists():
            found.append(str(candidate))
            logger.info(f"πŸ” Discovered simplified model: {candidate}")

    # Sorted for deterministic ordering across filesystems.
    return sorted(found)
689
+
690
+
691
@function(
    gpu=GPU_NAME,
    volumes=[Volume(name=VOLUME_NAME, mount_path=VOLUME_PATH)],
    image=IMAGE,
    secrets=["HF_ACCESS_TOKEN"],
    env={
        "TOKENIZERS_PARALLELISM": "false",
        "CUDA_LAUNCH_BLOCKING": "0",
    },
    timeout=3600 * 6,  # 6 hours for evaluation
)
def evaluate_simplified_only() -> None:
    """Evaluate only simplified distillation models, skipping 3rd party models.

    Thin Beam entry point: delegates to main() with skip_third_party=True so
    both remote functions share a single code path.
    """
    main(skip_third_party=True)
705
+
706
+
707
def run_local_evaluation(
    models: list[str] | None = None,
    max_queries: int = 1000,
    languages: list[str] | None = None,
    output_dir: str = DEFAULT_OUTPUT_DIR,
) -> list[dict[str, Any]]:
    """Main evaluation function for local execution without Beam utilities.

    Mirrors beam_evaluate_models but stores everything under ``output_dir``
    and creates evaluators without checkpoint/eval managers.

    Args:
        models: Models to evaluate; None defaults to DEFAULT_EVALUATION_MODELS.
        max_queries: Maximum queries per language, forwarded to the evaluator.
        languages: Languages to evaluate; None falls back to EVALUATION_LANGUAGES.
        output_dir: Local directory for results and the comparison report.

    Returns:
        One results dict per processed model (existing results are loaded and
        included for skipped models).
    """
    logger.info("πŸ–₯️ Running CodeSearchNet evaluation locally")

    if models is None:
        models = DEFAULT_EVALUATION_MODELS.copy()

    # Discover simplified distillation models in the current directory
    logger.info("πŸ” Discovering simplified distillation models...")
    discovered_models = discover_simplified_models(".")

    # Add discovered models
    # NOTE(review): discovered models are appended even when already present in
    # ``models`` (e.g. when called via run_local_evaluation_simplified), so an
    # entry can appear twice; the second pass then reloads the just-saved
    # results. Also note a caller-provided list is mutated in place — confirm
    # both behaviors are intended.
    if discovered_models:
        logger.info(f"βœ… Found {len(discovered_models)} simplified models:")
        for model_path in discovered_models:
            models.append(model_path)
            logger.info(f" πŸ“ {model_path}")
    else:
        logger.warning("⚠️ No simplified distillation models found")

    if languages is None:
        languages = EVALUATION_LANGUAGES

    logger.info(f"πŸ“Š Evaluating {len(models)} models on {len(languages)} languages")
    logger.info(f"πŸ“ Using local output directory: {output_dir}")

    # Create local output directory
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    all_results = []
    skipped_models = []

    for model_path in models:
        model_name = Path(model_path).name

        # Check for existing evaluation results locally (same sanitization as
        # save_results, so the lookup matches the saved filename).
        safe_name = "".join(c for c in model_name if c.isalnum() or c in ("-", "_", "."))
        result_file = output_path / f"codesearchnet_eval_{safe_name}.json"

        if result_file.exists():
            logger.info(f"βœ… Model {model_name} already evaluated - loading existing results")
            try:
                with result_file.open("r") as f:
                    existing_results = json.load(f)
                all_results.append(existing_results)
                skipped_models.append(model_name)
                continue
            except Exception as e:
                # Fall through to a fresh evaluation if the file is unreadable.
                logger.warning(f"⚠️ Failed to load existing results for {model_name}: {e}")

        logger.info(f"\n{'=' * 60}")
        logger.info(f"πŸ” Evaluating model: {model_name}")
        logger.info(f"πŸ“‚ Path: {model_path}")
        logger.info(f"{'=' * 60}")

        try:
            # Create evaluator without Beam utilities (no checkpointing)
            evaluator = CodeSearchNetEvaluator(
                model_path,
                model_name,
                checkpoint_manager=None,  # No checkpointing for local evaluation
                eval_manager=None,
            )

            results = evaluator.evaluate_all_languages(max_queries, languages)

            # Save results locally only
            save_results(results, output_dir, model_name, eval_manager=None, volume_results_dir=None)

            # Print summary
            print_results_summary(results)

            all_results.append(results)

        except Exception:
            # One failing model must not abort the whole batch.
            logger.exception(f"❌ Failed to evaluate {model_name}")
            continue

    # Create comparison report locally (only meaningful for 2+ models)
    if len(all_results) > 1:
        create_comparison_report(all_results, output_dir)
        logger.info(f"πŸ“Š Comparison report saved locally: {output_dir}")

    # Log summary
    newly_evaluated = len(all_results) - len(skipped_models)
    logger.info("\nβœ… Local evaluation complete!")
    logger.info(f"πŸ“Š Newly evaluated: {newly_evaluated} models")
    logger.info(f"⏭️ Skipped (already done): {len(skipped_models)} models")
    logger.info(f"πŸ“ Total results: {len(all_results)} models")
    logger.info(f"πŸ’Ύ Results available locally: {output_dir}")

    if skipped_models:
        logger.info(f"⏭️ Skipped models: {', '.join(skipped_models)}")

    return all_results
808
+
809
+
810
def run_local_evaluation_simplified(
    max_queries: int = 1000,
    languages: list[str] | None = None,
    output_dir: str = DEFAULT_OUTPUT_DIR,
) -> list[dict[str, Any]]:
    """Run the local evaluation pipeline restricted to simplified models.

    Discovers simplified distillation models on disk and delegates to
    run_local_evaluation(); returns an empty list when none are found.
    """
    logger.info("πŸ–₯️ Running simplified model evaluation locally")

    # Only simplified distillation models are considered here.
    logger.info("πŸ” Discovering simplified distillation models...")
    candidates = discover_simplified_models(".")

    if not candidates:
        logger.error("❌ No simplified distillation models found! Run 'distiller distill-simple' first.")
        return []

    logger.info(f"βœ… Found {len(candidates)} simplified models:")
    for candidate in candidates:
        logger.info(f" πŸ“ {candidate}")

    return run_local_evaluation(
        models=candidates,
        max_queries=max_queries,
        languages=languages,
        output_dir=output_dir,
    )
836
+
837
+
838
# Allow running this module directly (outside the distiller CLI).
# NOTE(review): main is wrapped by @function — confirm whether direct
# invocation executes locally or dispatches to Beam.
if __name__ == "__main__":
    main()
src/distiller/patch_utils.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Patch utilities for applying fixes to installed packages.
3
+
4
+ This module provides functionality to automatically apply all patches
5
+ from the patches directory to fix bugs in third-party libraries.
6
+ """
7
+
8
+ import logging
9
+ import subprocess
10
+ import sys
11
+ from pathlib import Path
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
def find_patches_directory() -> Path:
    """Locate the patches directory, preferring the project-root location."""
    # Three levels up from this module: distiller -> src -> project root.
    project_root = Path(__file__).parent.parent.parent
    candidate = project_root / "patches"
    if candidate.exists():
        return candidate
    # Fall back to a patches/ directory relative to the working directory
    # (returned even if it does not exist; callers check existence).
    return Path("patches")
28
+
29
+
30
def get_site_packages_path() -> Path:
    """Return the active environment's site-packages directory."""
    import site

    # Primary source: the interpreter's registered site-packages list.
    registered = site.getsitepackages()
    if registered:
        return Path(registered[0])

    # Fallback: derive <venv>/lib/pythonX.Y/site-packages from the executable
    # (standard virtual-environment layout).
    exe = Path(sys.executable)
    if exe.name.startswith("python"):
        for entry in (exe.parent.parent / "lib").iterdir():
            if entry.name.startswith("python"):
                candidate = entry / "site-packages"
                if candidate.exists():
                    return candidate

    # Last resort: the current directory.
    return Path()
54
+
55
+
56
def apply_patch_file(patch_file: Path, target_dir: Path) -> bool:
    """
    Apply one .patch file inside ``target_dir`` using the system patch tool.

    Args:
        patch_file: Path to the .patch file.
        target_dir: Directory the patch paths are relative to (usually site-packages).

    Returns:
        True when the patch applied cleanly or was already applied, False otherwise.
    """
    try:
        logger.info(f"Applying patch: {patch_file.name}")

        # patch flags: -p1 strips one path component, -d runs inside the
        # target directory, -f never prompts, -N skips already-applied hunks.
        completed = subprocess.run(  # noqa: S603
            ["patch", "-p1", "-d", str(target_dir), "-f", "-N"],  # noqa: S607
            input=patch_file.read_text(),
            text=True,
            capture_output=True,
            check=False,  # Non-zero exit is handled below, not raised.
        )

        if completed.returncode == 0:
            logger.info(f"Successfully applied patch: {patch_file.name}")
            return True

        stderr_lower = completed.stderr.lower()
        if "already applied" in stderr_lower or "reversed" in stderr_lower:
            # `patch -N` reports previously-applied patches on stderr.
            logger.info(f"Patch {patch_file.name} already applied")
            return True

        logger.warning(f"Failed to apply patch {patch_file.name}: {completed.stderr}")
        return False

    except FileNotFoundError:
        logger.exception("'patch' command not found. Please install patch utility.")
        return False
    except Exception:
        logger.exception(f"Error applying patch {patch_file.name}")
        return False
98
+
99
+
100
def apply_all_patches() -> int:
    """
    Apply every .patch file found in the patches directory.

    Returns:
        Number of patches successfully applied.
    """
    patches_dir = find_patches_directory()
    if not patches_dir.exists():
        logger.warning(f"Patches directory not found: {patches_dir}")
        return 0

    patch_files = list(patches_dir.glob("*.patch"))
    if not patch_files:
        logger.info("No patch files found")
        return 0

    target_dir = get_site_packages_path()
    logger.info(f"Applying patches to: {target_dir}")

    # Apply in sorted order so repeated runs behave deterministically.
    applied = sum(1 for patch in sorted(patch_files) if apply_patch_file(patch, target_dir))

    logger.info(f"Applied {applied}/{len(patch_files)} patches successfully")
    return applied
133
+
134
+
135
def main() -> None:
    """Entry point for standalone execution: apply every available patch."""
    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    print("Applying all patches...")
    applied = apply_all_patches()
    print(f"Done. Applied {applied} patches.")
142
+
143
+
144
# Allow applying patches standalone: `python -m ...patch_utils`.
if __name__ == "__main__":
    main()
src/distiller/sync.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sync utility for downloading files from Beam volume to local directory.
3
+
4
+ This module provides functionality to download generated files from the Beam volume
5
+ back to the local filesystem, including:
6
+ - Final distilled model files (model.safetensors, tokenizer.json, etc.)
7
+ - Analysis reports and charts (README.md, comparison charts, etc.)
8
+ """
9
+
10
+ import logging
11
+ import shutil
12
+ from pathlib import Path
13
+
14
+ from .beam_utils import create_beam_utilities
15
+
16
+ # Configure logging
17
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
18
+ logger = logging.getLogger(__name__)
19
+
20
# Beam volume configuration (must match distill.py)
VOLUME_NAME = "gte_qwen2_m2v_code"
VOLUME_PATH = "./gte_qwen2_m2v_code"

# Model files to sync (checked at several candidate locations on the volume)
MODEL_FILES = [
    "model.safetensors",
    "tokenizer.json",
    "modules.json",
    "config.json",
    "pytorch_model.bin",  # Backup format
    "vocab.txt",  # If present
]

# Analysis directories and files
# NOTE(review): ANALYSIS_DIRS and ANALYSIS_FILES are not referenced by the
# sync functions in this module, which hard-code the same paths — confirm
# whether these constants are used elsewhere or are dead configuration.
ANALYSIS_DIRS = [
    "analysis_results/reports",
    "analysis_results/charts",
    "evaluation_results",
]

ANALYSIS_FILES = [
    "analysis_results/reports/analysis_report.md",
    "analysis_results/reports/README.md",
    "analysis_results/charts/*.png",
    "analysis_results/charts/*.html",
    "evaluation_results/*.json",
    "evaluation_results/comparisons/*.csv",
]
49
+
50
+
51
def sync_model_files(output_dir: str) -> bool:
    """Download final model files from the Beam volume into ``output_dir``.

    First logs a diagnostic listing of the volume contents, then copies any
    file named in MODEL_FILES from the known candidate locations.

    Args:
        output_dir: Local destination directory (created if missing).

    Returns:
        True if at least one model file was copied, False otherwise.
    """
    logger.info("πŸ”„ Syncing model files from Beam volume...")

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Diagnostic pass: log what's actually in the volume so failed syncs are
    # easy to debug from the output alone.
    volume_root = Path(VOLUME_PATH)
    logger.info(f"πŸ” Debugging volume contents at: {volume_root}")

    if volume_root.exists():
        logger.info("πŸ“ Volume root directory contents:")
        for item in volume_root.iterdir():
            if item.is_file():
                logger.info(f" πŸ“„ {item.name} ({item.stat().st_size} bytes)")
            elif item.is_dir():
                logger.info(f" πŸ“ {item.name}/ (directory)")
                # List files in important subdirectories
                if item.name in ["models", "checkpoints", "gte_qwen2_m2v_code"]:
                    try:
                        logger.info(f" Contents of {item.name}/:")
                        for subitem in item.iterdir():
                            if subitem.is_file():
                                logger.info(f" πŸ“„ {subitem.name} ({subitem.stat().st_size} bytes)")
                            else:
                                logger.info(f" πŸ“ {subitem.name}/")
                                # Check one level deeper for model files
                                if subitem.is_dir():
                                    for subsubitem in subitem.iterdir():
                                        if subsubitem.is_file() and subsubitem.name in MODEL_FILES:
                                            logger.info(f" 🎯 FOUND MODEL FILE: {subsubitem}")
                    except Exception as e:
                        # Best-effort diagnostics: never fail the sync here.
                        logger.warning(f" Error exploring {item.name}: {e}")

        # Also check for model files directly in root
        logger.info("πŸ” Checking for model files directly in volume root:")
        for model_file in MODEL_FILES:
            root_file = volume_root / model_file
            if root_file.exists():
                logger.info(f" 🎯 FOUND: {model_file} in root ({root_file.stat().st_size} bytes)")
    else:
        logger.error(f"❌ Volume root does not exist: {volume_root}")
        return False

    # Since training completed successfully, look for model files in all possible locations
    model_locations = [
        Path(VOLUME_PATH),  # Root of volume (where final model was saved)
        Path(VOLUME_PATH) / "models" / "refined_model",  # Refined model directory
    ]

    synced_files = []

    for location in model_locations:
        logger.info(f"πŸ“‚ Checking model location: {location}")

        if not location.exists():
            logger.info(f" ⚠️ Location does not exist: {location}")
            continue

        # Try to download each model file directly
        # NOTE(review): a file present in both locations is copied twice, the
        # second copy overwriting the first — confirm the intended precedence.
        for model_file in MODEL_FILES:
            source_path = location / model_file
            dest_path = output_path / model_file

            if source_path.exists():
                try:
                    shutil.copy2(source_path, dest_path)
                    synced_files.append(model_file)
                    logger.info(f"βœ… Downloaded: {model_file}")
                except Exception as e:
                    logger.warning(f"⚠️ Failed to copy {model_file}: {e}")

    if synced_files:
        logger.info(f"πŸŽ‰ Successfully synced {len(synced_files)} model files:")
        for file in synced_files:
            logger.info(f" βœ“ {file}")
        return True
    logger.error("❌ No model files found to sync")
    return False
131
+
132
+
133
def sync_analysis_files(output_dir: str) -> bool:
    """Download analysis reports, charts, and evaluation JSONs from the Beam volume.

    Copies Markdown reports (mirroring README.md/analysis_report.md to the
    output root as README.md), chart files, and evaluation result JSONs.

    Args:
        output_dir: Local destination directory (created if missing).

    Returns:
        True if at least one analysis file was copied, False otherwise.
    """
    logger.info("πŸ”„ Syncing analysis files from Beam volume...")

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    synced_files = []

    # Sync analysis reports (including README.md)
    analysis_reports_dir = Path(VOLUME_PATH) / "analysis_results" / "reports"
    if analysis_reports_dir.exists():
        for report_file in analysis_reports_dir.glob("*.md"):
            dest_path = output_path / report_file.name
            try:
                shutil.copy2(report_file, dest_path)
                synced_files.append(str(report_file.name))
                logger.info(f"βœ… Downloaded report: {report_file.name}")

                # Special handling for README.md - copy to root
                if report_file.name in {"analysis_report.md", "README.md"}:
                    root_readme = Path(output_dir) / "README.md"
                    shutil.copy2(report_file, root_readme)
                    logger.info("βœ… Updated root README.md")

            except Exception as e:
                # Best effort: keep syncing the remaining reports.
                logger.warning(f"⚠️ Failed to copy {report_file.name}: {e}")

    # Sync charts (any file type, e.g. .png/.html)
    charts_dir = Path(VOLUME_PATH) / "analysis_results" / "charts"
    local_charts_dir = output_path / "charts"
    if charts_dir.exists():
        local_charts_dir.mkdir(exist_ok=True)

        for chart_file in charts_dir.glob("*"):
            if chart_file.is_file():
                dest_path = local_charts_dir / chart_file.name
                try:
                    shutil.copy2(chart_file, dest_path)
                    synced_files.append(f"charts/{chart_file.name}")
                    logger.info(f"βœ… Downloaded chart: {chart_file.name}")
                except Exception as e:
                    logger.warning(f"⚠️ Failed to copy chart {chart_file.name}: {e}")

    # Sync evaluation results (JSON only)
    eval_dir = Path(VOLUME_PATH) / "evaluation_results"
    local_eval_dir = output_path / "evaluation_results"
    if eval_dir.exists():
        local_eval_dir.mkdir(exist_ok=True)

        for eval_file in eval_dir.glob("*.json"):
            dest_path = local_eval_dir / eval_file.name
            try:
                shutil.copy2(eval_file, dest_path)
                synced_files.append(f"evaluation_results/{eval_file.name}")
                logger.info(f"βœ… Downloaded evaluation: {eval_file.name}")
            except Exception as e:
                logger.warning(f"⚠️ Failed to copy evaluation {eval_file.name}: {e}")

    if synced_files:
        logger.info(f"πŸŽ‰ Successfully synced {len(synced_files)} analysis files:")
        for file in synced_files[:10]:  # Show first 10
            logger.info(f" βœ“ {file}")
        if len(synced_files) > 10:
            logger.info(f" ... and {len(synced_files) - 10} more files")
        return True
    logger.error("❌ No analysis files found to sync")
    return False
201
+
202
+
203
+ def sync_files(
204
+ model_files: bool = False,
205
+ analysis_files: bool = False,
206
+ all_files: bool = False,
207
+ output_dir: str = ".",
208
+ ) -> None:
209
+ """Main sync function to download files from Beam volume."""
210
+ logger.info("πŸš€ Starting file sync from Beam volume")
211
+ logger.info(f"πŸ“ Local output directory: {output_dir}")
212
+
213
+ # Initialize Beam utilities (read-only)
214
+ try:
215
+ volume_mgr, checkpoint_mgr, model_mgr, eval_mgr = create_beam_utilities(VOLUME_NAME, VOLUME_PATH)
216
+ logger.info(f"βœ… Connected to Beam volume: {VOLUME_NAME}")
217
+ except Exception:
218
+ logger.exception("❌ Failed to connect to Beam volume")
219
+ logger.info("Make sure you have run the distillation/evaluation on Beam first")
220
+ return
221
+
222
+ # Check what files to sync
223
+ sync_model = model_files or all_files
224
+ sync_analysis = analysis_files or all_files
225
+
226
+ if not (sync_model or sync_analysis):
227
+ logger.error("❌ No file types specified. Use --model-files, --analysis-files, or --all")
228
+ return
229
+
230
+ success_count = 0
231
+
232
+ # Sync model files
233
+ if sync_model:
234
+ logger.info("\n" + "=" * 60) # noqa: G003
235
+ logger.info("MODEL FILES SYNC")
236
+ logger.info("=" * 60)
237
+ if sync_model_files(output_dir):
238
+ success_count += 1
239
+
240
+ # Sync analysis files
241
+ if sync_analysis:
242
+ logger.info("\n" + "=" * 60) # noqa: G003
243
+ logger.info("ANALYSIS FILES SYNC")
244
+ logger.info("=" * 60)
245
+ if sync_analysis_files(output_dir):
246
+ success_count += 1
247
+
248
+ # Summary
249
+ logger.info("\n" + "=" * 60) # noqa: G003
250
+ logger.info("SYNC SUMMARY")
251
+ logger.info("=" * 60)
252
+
253
+ total_requested = sum([sync_model, sync_analysis])
254
+
255
+ if success_count == total_requested:
256
+ logger.info("πŸŽ‰ All requested files synced successfully!")
257
+ elif success_count > 0:
258
+ logger.info(f"⚠️ Partial sync: {success_count}/{total_requested} file types synced")
259
+ else:
260
+ logger.error("❌ No files were synced")
261
+
262
+ logger.info(f"πŸ“‚ Files saved to: {Path(output_dir).absolute()}")