""" Shared configuration for the distiller package. This module centralizes all configuration constants, default values, and common settings used across distillation, evaluation, and benchmarking modules. """ import logging from pathlib import Path from typing import Any from beam import GpuType, Image from pydantic import BaseModel # ============================================================================= # LOGGING CONFIGURATION # ============================================================================= def setup_logging(level: int = logging.INFO) -> None: """Set up consistent logging across the package.""" log_dir = Path("logs") log_dir.mkdir(parents=True, exist_ok=True) log_path = log_dir / "distiller.log" logging.basicConfig( level=level, format="%(asctime)s - %(levelname)s - %(message)s", handlers=[logging.StreamHandler(), logging.FileHandler(log_path, mode="a")], ) # ============================================================================= # BEAM CLOUD CONFIGURATION # ============================================================================= # Comprehensive Beam function configuration class BeamFunctionConfig(BaseModel): """Complete configuration for Beam @function decorator parameters.""" # Resource allocation cpu: float = 2.0 # Number of CPU cores memory: int = 8192 # Memory in MiB (8GB) gpu: GpuType | list[GpuType] = GpuType.A100_40 # GPU type # Execution settings timeout: int = 3600 * 12 # 12 hours timeout for long distillation jobs retries: int = 2 # Retry failed tasks up to 2 times headless: bool = False # Keep connected during execution # Optional settings callback_url: str | None = None # Webhook URL for task completion name: str | None = None # Function name for deployment task_policy: Any | None = None # Task lifecycle policy retry_for: list[str] | None = None # Specific exceptions to retry on # Environment and dependencies secrets: list[str] = ["HF_ACCESS_TOKEN"] # Required secrets env_vars: dict[str, str] = { "TOKENIZERS_PARALLELISM": "false", "CUDA_LAUNCH_BLOCKING": "0", "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True", "TORCH_CUDNN_V8_API_ENABLED": "1", # Flash attention environment variables "FLASH_ATTENTION_FORCE_USE": "1", "TORCH_COMPILE_DISABLE": "1", } # Configuration for different types of Beam jobs BEAM_CONFIGS: dict[str, BeamFunctionConfig] = { "distillation": BeamFunctionConfig( cpu=4.0, memory=16384, # 8GB for distillation gpu=GpuType.A100_40, timeout=3600 * 12, # 12 hours retries=2, secrets=["HF_ACCESS_TOKEN"], ), "training": BeamFunctionConfig( cpu=4.0, memory=16384, # 8GB for distillation gpu=[GpuType.H100, GpuType.A100_40], timeout=3600 * 12, # 12 hours retries=2, secrets=["HF_ACCESS_TOKEN"], ), "evaluation": BeamFunctionConfig( cpu=2.0, memory=8192, # 8GB for evaluation gpu=GpuType.A100_40, # Smaller GPU for evaluation timeout=3600 * 4, # 4 hours retries=3, secrets=["HF_ACCESS_TOKEN"], ), } # Default beam configuration DEFAULT_BEAM_CONFIG = BEAM_CONFIGS["distillation"] # Volume configurations for different workflows class VolumeConfig(BaseModel): """Volume configuration container.""" name: str mount_path: str description: str = "" # Define volume configurations - code_model2vec is the primary volume for all workflows VOLUMES: dict[str, VolumeConfig] = { "primary": VolumeConfig( name="code_model2vec", mount_path="./code_model2vec", description="Primary volume for all distillation models, evaluations, benchmarks, and checkpoints", ), # Legacy volume name mapping for backwards compatibility "simplified": VolumeConfig( 
name="code_model2vec", mount_path="./code_model2vec", description="Primary volume for all distillation models, evaluations, benchmarks, and checkpoints", ), } # Default volume name for all workflows DEFAULT_VOLUME = "primary" # Legacy environment settings (now part of BeamFunctionConfig) BEAM_ENV_SETTINGS: dict[str, str] = DEFAULT_BEAM_CONFIG.env_vars # Common Python packages for Beam images COMMON_PACKAGES: list[str] = [ "torch>=2.7.0", "transformers>=4.40.0", "datasets>=3.2.0", "sentence-transformers>=4.1.0", "model2vec[train]>=0.5.0", "tokenlearn>=0.2.0", "numpy>=1.26.4", "scikit-learn>=1.6.1", "pandas>=2.0.0", "tqdm>=4.65.0", "plotly>=5.0.0", "matplotlib>=3.7.0", "seaborn>=0.12.0", "typer>=0.16.0", "pydantic>=2.11.5", "hatchling>=1.27.0", ] # Create common Beam image without flash-attn due to PyTorch version conflicts IMAGE = Image(python_version="python3.12").add_python_packages(COMMON_PACKAGES) # ============================================================================= # MODEL CONFIGURATION # ============================================================================= # Teacher model configurations TEACHER_MODELS: list[str] = [ "Alibaba-NLP/gte-Qwen2-1.5B-instruct", "BAAI/bge-m3", "jinaai/jina-embeddings-v3", "lightonai/Reason-ModernColBERT", "Linq-AI-Research/Linq-Embed-Mistral", "microsoft/codebert-base", "microsoft/graphcodebert-base", "nomic-ai/nomic-embed-text-v2-moe", "Qodo/Qodo-Embed-1-1.5B", "sentence-transformers/all-MiniLM-L6-v2", "sentence-transformers/all-mpnet-base-v2", "sentence-transformers/paraphrase-MiniLM-L6-v2", "jinaai/jina-embeddings-v2-base-code", ] # Default evaluation models for comparison DEFAULT_EVALUATION_MODELS: list[str] = [ "Alibaba-NLP/gte-Qwen2-1.5B-instruct", "BAAI/bge-m3", "huggingface/CodeBERTa-small-v1", "jinaai/jina-embeddings-v3", "lightonai/Reason-ModernColBERT", "Linq-AI-Research/Linq-Embed-Mistral", "microsoft/codebert-base", "microsoft/graphcodebert-base", "minishlab/potion-base-8M", "minishlab/potion-retrieval-32M", "minishlab/potion-multilingual-128M", "nomic-ai/nomic-embed-text-v2-moe", "Qodo/Qodo-Embed-1-1.5B", "Salesforce/codet5-base", "sentence-transformers/all-MiniLM-L12-v2", "sentence-transformers/all-MiniLM-L6-v2", "sentence-transformers/all-mpnet-base-v2", "sentence-transformers/paraphrase-MiniLM-L6-v2", "jinaai/jina-embeddings-v2-base-code", ] # Model2Vec distillation parameters class DistillationConfig(BaseModel): """Configuration for Model2Vec distillation parameters.""" # Teacher models for distillation code_teacher_models: list[str] = TEACHER_MODELS # Basic distillation parameters optimal_pca_dims: int = 256 sif_coefficient: float = 1e-3 apply_zipf: bool = True # Tokenlearn-specific parameters (POTION approach) tokenlearn_dataset: str = "allenai/c4" # Dataset for tokenlearn featurization (following POTION paper) tokenlearn_dataset_name: str = "en" # Use 'en' configuration for English text tokenlearn_text_key: str = "text" # Text field to use from the dataset tokenlearn_timeout_featurize: int = 21600 # 6 hour timeout for featurization (dataset needs ~5 hours) tokenlearn_timeout_train: int = 7200 # 2 hour timeout for training # Dataset sampling configuration tokenlearn_max_samples: int = 50000 # Maximum samples to use for tokenlearn training # Dataset configuration use_optimized_dataset: bool = True # Use the pre-created optimized dataset from dataset.py custom_dataset_path: str | None = "code_model2vec/dataset" # Path to custom dataset directory distillation_config = DistillationConfig() # 
# =============================================================================
# DATASET CONFIGURATION
# =============================================================================


# Languages used in evaluation
class LanguagesConfig(BaseModel):
    """Configuration for languages used in evaluation."""

    all: list[str] = [
        "python",
        "java",
        "javascript",
        "php",
        "ruby",
        "go",
    ]


languages_config = LanguagesConfig()


# CodeSearchNet evaluation settings; defaults to all configured languages
class CodeSearchNetConfig(BaseModel):
    """Configuration for CodeSearchNet evaluation settings."""

    dataset_name: str = "code_search_net"
    evaluation_languages: list[str] = languages_config.all
    max_queries_per_language: int = 1000
    similarity_threshold: float = 0.7
    evaluation_metrics: list[str] = ["ndcg@1", "ndcg@5", "ndcg@10", "mrr", "recall@1", "recall@5", "recall@10"]


codesearchnet_config = CodeSearchNetConfig()

# Training dataset configuration
TRAINING_DATASET: str = "sentence-transformers/codesearchnet"

# =============================================================================
# OUTPUT DIRECTORY CONFIGURATION
# =============================================================================


# Standardized directory structure within code_model2vec
class StandardDirectories(BaseModel):
    """Standardized directory structure for the code_model2vec workspace."""

    # Root directory
    root: str = "code_model2vec"

    # Model directories
    base: str = "code_model2vec/base"  # Basic distilled models
    final: str = "code_model2vec/final"  # Final trained models
    models: str = "code_model2vec/models"  # Legacy/alternative models location

    # Results directories
    evaluation_results: str = "code_model2vec/evaluation_results"
    benchmark_results: str = "code_model2vec/benchmark_results"
    analysis_results: str = "code_model2vec/analysis_results"

    # Working directories
    checkpoints: str = "code_model2vec/checkpoints"
    cache: str = "code_model2vec/cache"
    temp: str = "code_model2vec/temp"


# Create global instance
directories = StandardDirectories()


# Legacy OutputDirs for backwards compatibility
class OutputDirs(BaseModel):
    """Base output directory structure for storing models, checkpoints, and results."""

    base: str = "base"
    models: str = "final"
    checkpoints: str = "checkpoints"
    evaluation_results: str = "evaluation_results"
    benchmark_results: str = "benchmark_results"
    analysis_results: str = "analysis_results"
    cache: str = "cache"


output_dirs = OutputDirs()


# File naming patterns
class FilenamePatterns(BaseModel):
    """File naming patterns for evaluation, benchmark, checkpoint, and model files."""

    evaluation: str = "codesearchnet_eval_{model_name}.json"
    benchmark: str = "benchmark_{model_name}.json"
    checkpoint: str = "checkpoints_{stage}_step_{step}.json"
    model: str = "{teacher_model}_{dims}d"


filename_patterns = FilenamePatterns()

# =============================================================================
# ANALYSIS AND VISUALIZATION
# =============================================================================


# Chart configuration
class ChartConfig(BaseModel):
    """Chart configuration for analysis and visualization."""

    figsize: tuple[int, int] = (12, 8)
    dpi: int = 300
    style: str = "whitegrid"
    color_palette: str = "Set2"
    save_formats: list[str] = ["png", "pdf"]


chart_config = ChartConfig()


# Performance thresholds for analysis
class PerformanceThresholds(BaseModel):
    """Performance thresholds for analysis results."""

    excellent: float = 0.7
    good: float = 0.5
    fair: float = 0.3
    poor: float = 0.1


performance_thresholds = PerformanceThresholds()
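
# Illustrative sketch (hypothetical helper, not referenced elsewhere in this
# package): mapping a metric score in [0, 1] to a qualitative label using the
# thresholds above.
def classify_performance(score: float) -> str:
    """Label a score as excellent/good/fair/poor/very poor per the thresholds."""
    if score >= performance_thresholds.excellent:
        return "excellent"
    if score >= performance_thresholds.good:
        return "good"
    if score >= performance_thresholds.fair:
        return "fair"
    if score >= performance_thresholds.poor:
        return "poor"
    return "very poor"
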
# =============================================================================
# HELPER FUNCTIONS
# =============================================================================


def get_volume_config() -> VolumeConfig:
    """Get the volume configuration for any workflow - always the primary code_model2vec volume."""
    return VOLUMES["primary"]


def get_output_path(base_path: str | Path, output_type: str) -> Path:
    """Get the standardized output path for different types of outputs."""
    base = Path(base_path)
    if hasattr(output_dirs, output_type):
        return base / getattr(output_dirs, output_type)
    return base / output_type


def get_standard_directory(dir_type: str) -> str:
    """Get the standardized directory path for any directory type."""
    if hasattr(directories, dir_type):
        return getattr(directories, dir_type)
    # Default to a relative path within code_model2vec
    return f"code_model2vec/{dir_type}"


def ensure_checkpoint_directory(stage: str) -> str:
    """Ensure the checkpoint directory exists for a specific stage and return its path."""
    checkpoint_dir = f"{directories.checkpoints}/{stage}"
    Path(checkpoint_dir).mkdir(parents=True, exist_ok=True)
    return checkpoint_dir


def format_filename(pattern_key: str, **kwargs: Any) -> str:
    """Format a filename using the predefined patterns."""
    if hasattr(filename_patterns, pattern_key):
        return getattr(filename_patterns, pattern_key).format(**kwargs)
    msg = f"Unknown filename pattern: {pattern_key}"
    raise ValueError(msg)


def get_safe_model_name(model_name: str) -> str:
    """Convert a model name to a filesystem-safe name."""
    # Replace path separators first; the character filter below would
    # otherwise drop "/" before the replacement could see it.
    return "".join(c for c in model_name.replace("/", "_") if c.isalnum() or c in ("-", "_", "."))


def get_beam_config(job_type: str = "distillation") -> BeamFunctionConfig:
    """Get the Beam configuration for a specific job type."""
    if job_type in BEAM_CONFIGS:
        return BEAM_CONFIGS[job_type]
    return DEFAULT_BEAM_CONFIG


def create_beam_function_kwargs(
    job_type: str = "distillation", volume_config: VolumeConfig | None = None
) -> dict[str, Any]:
    """Create the kwargs dictionary for the @function decorator."""
    from beam import Volume

    config = get_beam_config(job_type)
    volume_cfg = volume_config or get_volume_config()

    kwargs: dict[str, Any] = {
        "cpu": config.cpu,
        "memory": config.memory,
        "gpu": config.gpu,
        "image": IMAGE,
        "timeout": config.timeout,
        "retries": config.retries,
        "headless": config.headless,
        "volumes": [Volume(name=volume_cfg.name, mount_path=volume_cfg.mount_path)],
        "secrets": config.secrets,
        "env": config.env_vars,
    }

    # Add optional parameters only if they are set
    if config.callback_url:
        kwargs["callback_url"] = config.callback_url
    if config.name:
        kwargs["name"] = config.name
    if config.task_policy:
        kwargs["task_policy"] = config.task_policy
    if config.retry_for:
        kwargs["retry_for"] = config.retry_for

    return kwargs


def get_distillation_function_kwargs() -> dict[str, Any]:
    """Get function kwargs specifically for distillation jobs."""
    return create_beam_function_kwargs("distillation")


def get_training_function_kwargs() -> dict[str, Any]:
    """Get function kwargs specifically for training jobs."""
    return create_beam_function_kwargs("training")


def get_evaluation_function_kwargs() -> dict[str, Any]:
    """Get function kwargs specifically for evaluation jobs."""
    return create_beam_function_kwargs("evaluation")
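
# Usage sketch: how the kwargs helpers above are meant to feed Beam's
# @function decorator (the decorator name follows the docstrings above;
# `run_distillation` is a hypothetical entry point):
#
#   from beam import function
#
#   @function(**get_distillation_function_kwargs())
#   def run_distillation(teacher_model: str) -> None:
#       ...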