Sarthak committed on
Commit
ef6935e
·
1 Parent(s): ee673cb

feat(distiller): configure beam functions with resource settings

Browse files

This commit introduces a configuration system for Beam functions, allowing for better resource management, execution settings, and environment configurations for different types of Beam jobs like distillation and evaluation. It also simplifies the function deployment process by providing pre-defined configurations and utilities for creating @function decorator kwargs.

Files changed (1) hide show
  1. src/distiller/config.py +140 -24
src/distiller/config.py CHANGED
@@ -37,6 +37,63 @@ def setup_logging(level: int = logging.INFO) -> None:
37
  GPU_NAME = GpuType.A100_40
38
 
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  # Volume configurations for different workflows
41
  class VolumeConfig(BaseModel):
42
  """Volume configuration container."""
@@ -64,13 +121,8 @@ VOLUMES: dict[str, VolumeConfig] = {
64
  # Default volume name for all workflows
65
  DEFAULT_VOLUME = "primary"
66
 
67
- # Beam environment settings
68
- BEAM_ENV_SETTINGS: dict[str, str] = {
69
- "TOKENIZERS_PARALLELISM": "false",
70
- "CUDA_LAUNCH_BLOCKING": "0",
71
- "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True,max_split_size_mb:512",
72
- "TORCH_CUDNN_V8_API_ENABLED": "1",
73
- }
74
 
75
  # Common Python packages for Beam images
76
  COMMON_PACKAGES: list[str] = [
@@ -79,6 +131,7 @@ COMMON_PACKAGES: list[str] = [
79
  "datasets>=3.2.0",
80
  "sentence-transformers>=4.1.0",
81
  "model2vec[train]>=0.5.0",
 
82
  "numpy>=1.26.4",
83
  "scikit-learn>=1.6.1",
84
  "pandas>=2.0.0",
@@ -86,9 +139,12 @@ COMMON_PACKAGES: list[str] = [
86
  "plotly>=5.0.0",
87
  "matplotlib>=3.7.0",
88
  "seaborn>=0.12.0",
 
 
 
89
  ]
90
 
91
- # Create common Beam image
92
  IMAGE = Image(python_version="python3.12").add_python_packages(COMMON_PACKAGES)
93
 
94
  # =============================================================================
@@ -109,8 +165,7 @@ TEACHER_MODELS: list[str] = [
109
  "sentence-transformers/all-MiniLM-L6-v2",
110
  "sentence-transformers/all-mpnet-base-v2",
111
  "sentence-transformers/paraphrase-MiniLM-L6-v2",
112
- "nomic-ai/nomic-embed-code",
113
- "nomic-ai/CodeRankEmbed",
114
  ]
115
 
116
  # Default evaluation models for comparison
@@ -125,6 +180,7 @@ DEFAULT_EVALUATION_MODELS: list[str] = [
125
  "microsoft/graphcodebert-base",
126
  "minishlab/potion-base-8M",
127
  "minishlab/potion-retrieval-32M",
 
128
  "nomic-ai/nomic-embed-text-v2-moe",
129
  "Qodo/Qodo-Embed-1-1.5B",
130
  "Salesforce/codet5-base",
@@ -132,9 +188,7 @@ DEFAULT_EVALUATION_MODELS: list[str] = [
132
  "sentence-transformers/all-MiniLM-L6-v2",
133
  "sentence-transformers/all-mpnet-base-v2",
134
  "sentence-transformers/paraphrase-MiniLM-L6-v2",
135
- "nvidia/NV-Embed-v2",
136
- "nomic-ai/nomic-embed-code",
137
- "nomic-ai/CodeRankEmbed",
138
  ]
139
 
140
 
@@ -150,12 +204,12 @@ class DistillationConfig(BaseModel):
150
  sif_coefficient: float = 1e-3
151
  apply_zipf: bool = True
152
 
153
- # Training parameters (used when --train flag is enabled)
154
- training_epochs: int = 2
155
- learning_rate: float = 1e-4
156
- batch_size: int = 32
157
- max_training_samples: int = 50000
158
- teacher_model_config: dict[str, Any] = {}
159
 
160
 
161
  distillation_config = DistillationConfig()
@@ -196,11 +250,8 @@ class CodeSearchNetConfig(BaseModel):
196
 
197
  codesearchnet_config = CodeSearchNetConfig()
198
 
199
- # Training dataset configurations
200
- TRAINING_DATASETS: dict[str, str] = {
201
- "codesearchnet": "sentence-transformers/codesearchnet",
202
- "code_search_net": "code_search_net",
203
- }
204
 
205
  # =============================================================================
206
  # OUTPUT DIRECTORY CONFIGURATION
@@ -337,3 +388,68 @@ def format_filename(pattern_key: str, **kwargs: Any) -> str:
337
  def get_safe_model_name(model_name: str) -> str:
338
  """Convert model name to filesystem-safe name."""
339
  return "".join(c for c in model_name if c.isalnum() or c in ("-", "_", ".")).replace("/", "_")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  GPU_NAME = GpuType.A100_40
38
 
39
 
40
# Comprehensive Beam function configuration
class BeamFunctionConfig(BaseModel):
    """Complete configuration for Beam @function decorator parameters."""

    # Resource allocation
    cpu: float = 2.0  # Number of CPU cores
    memory: int = 8192  # Memory in MiB (8 GiB)
    gpu: str = "A100_40"  # GPU type name (resolved to a GpuType member at call time)

    # Execution settings
    timeout: int = 3600 * 12  # 12-hour timeout for long distillation jobs
    retries: int = 2  # Retry failed tasks up to 2 times
    headless: bool = False  # Keep the client connected during execution

    # Optional settings (only forwarded to @function when set/truthy)
    callback_url: str | None = None  # Webhook URL for task completion
    name: str | None = None  # Function name for deployment
    task_policy: Any | None = None  # Task lifecycle policy (opaque Beam object)
    retry_for: list[str] | None = None  # Specific exceptions to retry on

    # Environment and dependencies
    # NOTE: mutable defaults are safe here — pydantic copies field defaults
    # per model instance rather than sharing one object across instances.
    secrets: list[str] = ["HF_ACCESS_TOKEN"]  # Required secrets
    env_vars: dict[str, str] = {
        "TOKENIZERS_PARALLELISM": "false",
        "CUDA_LAUNCH_BLOCKING": "0",
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
        "TORCH_CUDNN_V8_API_ENABLED": "1",
        # Flash attention environment variables
        "FLASH_ATTENTION_FORCE_USE": "1",
        "TORCH_COMPILE_DISABLE": "1",
    }
71
+
72
+
73
# Configuration for different types of Beam jobs
BEAM_CONFIGS: dict[str, BeamFunctionConfig] = {
    "distillation": BeamFunctionConfig(
        cpu=4.0,
        memory=16384,  # 16 GiB for distillation
        gpu="A100_40",
        timeout=3600 * 12,  # 12 hours
        retries=2,
        secrets=["HF_ACCESS_TOKEN"],
    ),
    "evaluation": BeamFunctionConfig(
        cpu=2.0,
        memory=8192,  # 8 GiB for evaluation
        gpu="A100_40",  # Same GPU class as distillation
        timeout=3600 * 4,  # 4 hours
        retries=3,
        secrets=["HF_ACCESS_TOKEN"],
    ),
}

# Default beam configuration, used as the fallback for unknown job types
DEFAULT_BEAM_CONFIG = BEAM_CONFIGS["distillation"]
95
+
96
+
97
  # Volume configurations for different workflows
98
  class VolumeConfig(BaseModel):
99
  """Volume configuration container."""
 
121
  # Default volume name for all workflows
122
  DEFAULT_VOLUME = "primary"
123
 
124
# Legacy environment settings (now part of BeamFunctionConfig).
# Copy the dict instead of aliasing it: mutating BEAM_ENV_SETTINGS must not
# silently corrupt DEFAULT_BEAM_CONFIG (i.e. BEAM_CONFIGS["distillation"]).
BEAM_ENV_SETTINGS: dict[str, str] = dict(DEFAULT_BEAM_CONFIG.env_vars)
 
 
 
 
 
126
 
127
  # Common Python packages for Beam images
128
  COMMON_PACKAGES: list[str] = [
 
131
  "datasets>=3.2.0",
132
  "sentence-transformers>=4.1.0",
133
  "model2vec[train]>=0.5.0",
134
+ "tokenlearn>=0.2.0",
135
  "numpy>=1.26.4",
136
  "scikit-learn>=1.6.1",
137
  "pandas>=2.0.0",
 
139
  "plotly>=5.0.0",
140
  "matplotlib>=3.7.0",
141
  "seaborn>=0.12.0",
142
+ "typer>=0.16.0",
143
+ "pydantic>=2.11.5",
144
+ "hatchling>=1.27.0",
145
  ]
146
 
147
+ # Create common Beam image without flash-attn due to PyTorch version conflicts
148
  IMAGE = Image(python_version="python3.12").add_python_packages(COMMON_PACKAGES)
149
 
150
  # =============================================================================
 
165
  "sentence-transformers/all-MiniLM-L6-v2",
166
  "sentence-transformers/all-mpnet-base-v2",
167
  "sentence-transformers/paraphrase-MiniLM-L6-v2",
168
+ "jinaai/jina-embeddings-v2-base-code",
 
169
  ]
170
 
171
  # Default evaluation models for comparison
 
180
  "microsoft/graphcodebert-base",
181
  "minishlab/potion-base-8M",
182
  "minishlab/potion-retrieval-32M",
183
+ "minishlab/potion-multilingual-128M",
184
  "nomic-ai/nomic-embed-text-v2-moe",
185
  "Qodo/Qodo-Embed-1-1.5B",
186
  "Salesforce/codet5-base",
 
188
  "sentence-transformers/all-MiniLM-L6-v2",
189
  "sentence-transformers/all-mpnet-base-v2",
190
  "sentence-transformers/paraphrase-MiniLM-L6-v2",
191
+ "jinaai/jina-embeddings-v2-base-code",
 
 
192
  ]
193
 
194
 
 
204
  sif_coefficient: float = 1e-3
205
  apply_zipf: bool = True
206
 
207
+ # Tokenlearn-specific parameters (POTION approach)
208
+ tokenlearn_dataset: str = "sentence-transformers/codesearchnet" # Dataset for tokenlearn featurization
209
+ tokenlearn_dataset_name: str = "pair" # Use 'pair' configuration (only available config)
210
+ tokenlearn_text_key: str = "code" # Text field to use from the dataset ('code' or 'comment')
211
+ tokenlearn_timeout_featurize: int = 21600 # 6 hour timeout for featurization (dataset needs ~5 hours)
212
+ tokenlearn_timeout_train: int = 7200 # 2 hour timeout for training
213
 
214
 
215
  distillation_config = DistillationConfig()
 
250
 
251
  codesearchnet_config = CodeSearchNetConfig()
252
 
253
# Training dataset configuration — a single dataset name replaces the
# previous TRAINING_DATASETS mapping removed in this change
TRAINING_DATASET: str = "sentence-transformers/codesearchnet"
 
 
 
255
 
256
  # =============================================================================
257
  # OUTPUT DIRECTORY CONFIGURATION
 
388
  def get_safe_model_name(model_name: str) -> str:
389
  """Convert model name to filesystem-safe name."""
390
  return "".join(c for c in model_name if c.isalnum() or c in ("-", "_", ".")).replace("/", "_")
391
+
392
+
393
def get_beam_config(job_type: str = "distillation") -> BeamFunctionConfig:
    """Return the Beam configuration registered for *job_type*.

    Unknown job types fall back to ``DEFAULT_BEAM_CONFIG``.
    """
    return BEAM_CONFIGS.get(job_type, DEFAULT_BEAM_CONFIG)
398
+
399
+
400
def create_beam_function_kwargs(
    job_type: str = "distillation", volume_config: VolumeConfig | None = None
) -> dict[str, Any]:
    """Create kwargs dictionary for @function decorator.

    Resolves the job-type configuration, the volume to mount, and the GPU
    type, then assembles the keyword arguments Beam's ``@function``
    decorator expects. Optional settings are included only when truthy.
    """
    from beam import Volume

    cfg = get_beam_config(job_type)
    vol = volume_config or get_volume_config()

    # Resolve a GPU given as a string to its GpuType enum member when the
    # name is known; unrecognized strings are passed through unchanged.
    gpu = cfg.gpu
    if isinstance(gpu, str):
        known_gpus = {
            "A100_40": GpuType.A100_40,
            "A100_80": GpuType.A100_80,
            "T4": GpuType.T4,
            "A10G": GpuType.A10G,
            "NoGPU": GpuType.NoGPU,
        }
        gpu = known_gpus.get(cfg.gpu, cfg.gpu)

    function_kwargs: dict[str, Any] = {
        "cpu": cfg.cpu,
        "memory": cfg.memory,
        "gpu": gpu,
        "image": IMAGE,
        "timeout": cfg.timeout,
        "retries": cfg.retries,
        "headless": cfg.headless,
        "volumes": [Volume(name=vol.name, mount_path=vol.mount_path)],
        "secrets": cfg.secrets,
        "env": cfg.env_vars,
    }

    # Fold in optional parameters, skipping any that are unset/falsy.
    optional = {
        "callback_url": cfg.callback_url,
        "name": cfg.name,
        "task_policy": cfg.task_policy,
        "retry_for": cfg.retry_for,
    }
    function_kwargs.update({key: value for key, value in optional.items() if value})

    return function_kwargs
446
+
447
+
448
def get_distillation_function_kwargs() -> dict[str, Any]:
    """Return @function kwargs preconfigured for distillation jobs."""
    return create_beam_function_kwargs(job_type="distillation")
451
+
452
+
453
def get_evaluation_function_kwargs() -> dict[str, Any]:
    """Return @function kwargs preconfigured for evaluation jobs."""
    return create_beam_function_kwargs(job_type="evaluation")