feat(config): allow multiple GPU types for training and simplify GPU handling

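This change allows a Beam function config to specify either a single GPU type or a list of GPU types (`gpu: GpuType | list[GpuType]`), providing more flexibility in resource allocation. It adds a `training` config that lists `GpuType.H100` and `GpuType.A100_40`, plus `get_training_function_kwargs()`, which builds the function kwargs specifically for training jobs.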

The `create_beam_function_kwargs` function now passes `config.gpu` through unchanged, whether it is a single `GpuType` or a list of them; the string-to-`GpuType` mapping was removed, which simplifies GPU type handling.

Files changed (1) hide show

src/distiller/config.py +18 -18

src/distiller/config.py CHANGED Viewed

@@ -33,9 +33,6 @@ def setup_logging(level: int = logging.INFO) -> None:
 # BEAM CLOUD CONFIGURATION
 # =============================================================================
-# Beam execution settings
-GPU_NAME = GpuType.A100_40
 # Comprehensive Beam function configuration
 class BeamFunctionConfig(BaseModel):
@@ -44,7 +41,7 @@ class BeamFunctionConfig(BaseModel):
 	# Resource allocation
 	cpu: float = 2.0  # Number of CPU cores
 	memory: int = 8192  # Memory in MiB (8GB)
-	gpu: str = "A100_40"  # GPU type
 	# Execution settings
 	timeout: int = 3600 * 12  # 12 hours timeout for long distillation jobs
@@ -75,7 +72,15 @@ BEAM_CONFIGS: dict[str, BeamFunctionConfig] = {
 	"distillation": BeamFunctionConfig(
 		cpu=4.0,
 		memory=16384,  # 8GB for distillation
-		gpu="A100_40",
 		timeout=3600 * 12,  # 12 hours
 		retries=2,
 		secrets=["HF_ACCESS_TOKEN"],
@@ -83,7 +88,7 @@ BEAM_CONFIGS: dict[str, BeamFunctionConfig] = {
 	"evaluation": BeamFunctionConfig(
 		cpu=2.0,
 		memory=8192,  # 8GB for evaluation
-		gpu="A100_40",  # Smaller GPU for evaluation
 		timeout=3600 * 4,  # 4 hours
 		retries=3,
 		secrets=["HF_ACCESS_TOKEN"],
@@ -408,18 +413,8 @@ def create_beam_function_kwargs(
 	# Convert GPU string to proper type if needed
 	gpu_type = config.gpu
-	if isinstance(gpu_type, str):
-		# Map string to GpuType if it's a known type
-		gpu_mapping = {
-			"A100_40": GpuType.A100_40,
-			"A100_80": GpuType.A100_80,
-			"T4": GpuType.T4,
-			"A10G": GpuType.A10G,
-			"NoGPU": GpuType.NoGPU,
-		}
-		gpu_type = gpu_mapping.get(config.gpu, config.gpu)
-	kwargs = {
 		"cpu": config.cpu,
 		"memory": config.memory,
 		"gpu": gpu_type,
@@ -450,6 +445,11 @@ def get_distillation_function_kwargs() -> dict[str, Any]:
 	return create_beam_function_kwargs("distillation")
 def get_evaluation_function_kwargs() -> dict[str, Any]:
 	"""Get function kwargs specifically for evaluation jobs."""
 	return create_beam_function_kwargs("evaluation")

 # BEAM CLOUD CONFIGURATION
 # =============================================================================
 # Comprehensive Beam function configuration
 class BeamFunctionConfig(BaseModel):
 	# Resource allocation
 	cpu: float = 2.0  # Number of CPU cores
 	memory: int = 8192  # Memory in MiB (8GB)
+	gpu: GpuType | list[GpuType] = GpuType.A100_40  # GPU type
 	# Execution settings
 	timeout: int = 3600 * 12  # 12 hours timeout for long distillation jobs
 	"distillation": BeamFunctionConfig(
 		cpu=4.0,
 		memory=16384,  # 16GB for distillation
+		gpu=GpuType.A100_40,
+		timeout=3600 * 12,  # 12 hours
+		retries=2,
+		secrets=["HF_ACCESS_TOKEN"],
+	),
+	"training": BeamFunctionConfig(
+		cpu=4.0,
+		memory=16384,  # 16GB for training
+		gpu=[GpuType.H100, GpuType.A100_40],
 		timeout=3600 * 12,  # 12 hours
 		retries=2,
 		secrets=["HF_ACCESS_TOKEN"],
 	"evaluation": BeamFunctionConfig(
 		cpu=2.0,
 		memory=8192,  # 8GB for evaluation
+		gpu=GpuType.A100_40,  # Smaller GPU for evaluation
 		timeout=3600 * 4,  # 4 hours
 		retries=3,
 		secrets=["HF_ACCESS_TOKEN"],
 	# GPU is already a GpuType (or a list of GpuType); pass it through unchanged
 	gpu_type = config.gpu
+	kwargs: dict[str, Any] = {
 		"cpu": config.cpu,
 		"memory": config.memory,
 		"gpu": gpu_type,
 	return create_beam_function_kwargs("distillation")
+def get_training_function_kwargs() -> dict[str, Any]:
+	"""Get function kwargs specifically for training jobs."""
+	return create_beam_function_kwargs("training")
 def get_evaluation_function_kwargs() -> dict[str, Any]:
 	"""Get function kwargs specifically for evaluation jobs."""
 	return create_beam_function_kwargs("evaluation")