Spaces:
Runtime error
Runtime error
File size: 7,500 Bytes
"""
Training configuration schemas — Pydantic v2.
All training jobs are validated against these models before execution.
No raw dicts escape into the pipeline; everything is typed and constrained.
"""
from __future__ import annotations
from enum import StrEnum
from typing import Annotated
from pydantic import BaseModel, Field, HttpUrl, model_validator
from pydantic import PositiveFloat, PositiveInt
# ---------------------------------------------------------------------------
# Enums
# ---------------------------------------------------------------------------
class EvalStrategy(StrEnum):
NO = "no"
STEPS = "steps"
EPOCH = "epoch"
class Precision(StrEnum):
FP32 = "fp32"
FP16 = "fp16"
BF16 = "bf16"
INT8 = "int8"
class OptimizerType(StrEnum):
ADAMW = "adamw_torch"
ADAMW_8BIT = "adamw_8bit"
PAGED_ADAMW_8BIT = "paged_adamw_8bit"
SGD = "sgd"
class EvalMetric(StrEnum):
PASS_AT_1 = "pass_at_1"
PASS_AT_10 = "pass_at_10"
BLEU = "bleu"
EXECUTION_ACCURACY = "execution_accuracy"
EXACT_MATCH = "exact_match"
# ---------------------------------------------------------------------------
# Sub-configs
# ---------------------------------------------------------------------------
class LoRAConfig(BaseModel):
    """LoRA adapter configuration.

    Omitting the ``lora`` key from a job config does NOT turn LoRA off:
    ``TrainingJobConfig`` fills it in with these defaults (``enabled=True``).
    Pass ``lora: null`` explicitly, or set ``enabled`` to false, to opt out.
    """

    # Master switch; defaults to on.
    enabled: bool = True
    # Adapter rank, restricted to 1..256.
    r: Annotated[int, Field(ge=1, le=256)] = 16
    # Scaling numerator; must be >= 1 and, per the validator below, >= r.
    alpha: Annotated[int, Field(ge=1)] = 32
    # Dropout probability in [0.0, 1.0).
    dropout: Annotated[float, Field(ge=0.0, lt=1.0)] = 0.05
    # Names of the modules to adapt; an explicitly supplied list must be non-empty.
    target_modules: list[str] = Field(
        default_factory=lambda: ["q_proj", "v_proj"],
        min_length=1,
    )
    # NOTE(review): free-form string with no validation; presumably one of the
    # adapter library's bias modes — confirm and constrain if so.
    bias: str = "none"

    @model_validator(mode="after")
    def alpha_geq_r(self) -> "LoRAConfig":
        """Reject configurations whose ``alpha`` is smaller than ``r``.

        Runs after field validation, and runs regardless of ``enabled``.

        Returns:
            The validated instance, unchanged.

        Raises:
            ValueError: If ``alpha < r``.
        """
        if self.alpha < self.r:
            raise ValueError(f"lora.alpha ({self.alpha}) should be >= lora.r ({self.r})")
        return self
class TrainingHyperparams(BaseModel):
    # Hyperparameters for a single training run. Every bounded field is
    # range-checked when the model is validated.
    # (Kept as a comment: a class docstring would surface as ``description``
    # in the generated JSON schema.)

    # --- schedule / batch geometry -------------------------------------
    num_epochs: int = Field(default=3, ge=1, le=100)
    batch_size: int = Field(default=8, ge=1, le=256)
    gradient_accumulation_steps: int = Field(default=4, ge=1, le=128)

    # --- optimisation knobs --------------------------------------------
    learning_rate: float = Field(default=2e-5, gt=0.0, lt=1.0)
    weight_decay: float = Field(default=0.01, ge=0.0, lt=1.0)
    warmup_ratio: float = Field(default=0.1, ge=0.0, lt=1.0)
    max_seq_length: int = Field(default=1024, ge=64, le=32768)
    max_grad_norm: float = Field(default=1.0, gt=0.0)
    optimizer: OptimizerType = OptimizerType.ADAMW
    precision: Precision = Precision.BF16
    # Free-form scheduler name; not validated against a fixed set here.
    lr_scheduler: str = "cosine"

    # --- reproducibility / data loading --------------------------------
    seed: int = 42
    dataloader_num_workers: int = Field(default=4, ge=0, le=32)

    @property
    def effective_batch_size(self) -> int:
        """Return ``batch_size`` multiplied by ``gradient_accumulation_steps``."""
        per_step = self.batch_size
        accumulation = self.gradient_accumulation_steps
        return per_step * accumulation
class EvaluationConfig(BaseModel):
    # Evaluation settings for a training job.
    # (Kept as a comment: a class docstring would surface as ``description``
    # in the generated JSON schema.)

    enabled: bool = True
    strategy: EvalStrategy = EvalStrategy.EPOCH
    # Must be supplied when strategy is STEPS (see validator below).
    eval_steps: PositiveInt | None = None
    metrics: list[EvalMetric] = Field(
        default_factory=lambda: [EvalMetric.PASS_AT_1, EvalMetric.BLEU]
    )
    num_samples_per_problem: int = Field(default=10, ge=1, le=200)
    timeout_seconds: int = Field(default=10, ge=1, le=60)
    load_best_model_at_end: bool = True
    # NOTE(review): nothing here checks that this metric is also listed in
    # ``metrics`` — confirm the pipeline tolerates a mismatch.
    metric_for_best_model: EvalMetric = EvalMetric.PASS_AT_1
    greater_is_better: bool = True

    @model_validator(mode="after")
    def eval_steps_required_for_steps_strategy(self) -> "EvaluationConfig":
        """Require ``eval_steps`` whenever ``strategy`` is ``EvalStrategy.STEPS``.

        Returns:
            The validated instance, unchanged.

        Raises:
            ValueError: If the strategy is STEPS and ``eval_steps`` is None.
        """
        # Anything other than "STEPS with no interval" is acceptable.
        if self.strategy != EvalStrategy.STEPS or self.eval_steps is not None:
            return self
        raise ValueError("evaluation.eval_steps is required when strategy='steps'")
class CheckpointConfig(BaseModel):
    # Checkpoint-saving settings for a training job.
    # (Kept as a comment: a class docstring would surface as ``description``
    # in the generated JSON schema.)

    # Reuses EvalStrategy for its value set ("no" / "steps" / "epoch").
    save_strategy: EvalStrategy = EvalStrategy.EPOCH
    # Must be supplied when save_strategy is STEPS (see validator below).
    save_steps: PositiveInt | None = None
    save_total_limit: int = Field(default=3, ge=1, le=20)
    output_dir: str = "./checkpoints"
    resume_from_checkpoint: str | None = None

    @model_validator(mode="after")
    def save_steps_required_for_steps_strategy(self) -> "CheckpointConfig":
        """Require ``save_steps`` whenever ``save_strategy`` is ``EvalStrategy.STEPS``.

        Returns:
            The validated instance, unchanged.

        Raises:
            ValueError: If the strategy is STEPS and ``save_steps`` is None.
        """
        step_based = self.save_strategy == EvalStrategy.STEPS
        interval_missing = self.save_steps is None
        if step_based and interval_missing:
            raise ValueError("checkpoint.save_steps required when save_strategy='steps'")
        return self
class HubConfig(BaseModel):
    # Settings controlling whether and where results are pushed to a hub repo.
    # (Kept as a comment: a class docstring would surface as ``description``
    # in the generated JSON schema.)

    push_to_hub: bool = False
    # Needed only when push_to_hub is true (see validator below).
    repo_id: str | None = None
    private: bool = True
    commit_message: str = "Training checkpoint"

    @model_validator(mode="after")
    def repo_id_required_if_pushing(self) -> "HubConfig":
        """Require a non-empty ``repo_id`` whenever ``push_to_hub`` is true.

        Returns:
            The validated instance, unchanged.

        Raises:
            ValueError: If pushing is requested and ``repo_id`` is None or empty.
        """
        # Fine when not pushing at all, or when a truthy repo id was given.
        if not self.push_to_hub or self.repo_id:
            return self
        raise ValueError("hub.repo_id is required when hub.push_to_hub=true")
class DatasetConfig(BaseModel):
    # Which dataset a job trains on and how it is sampled.
    # (Kept as a comment: a class docstring would surface as ``description``
    # in the generated JSON schema.)

    # Internal UUID or HF Hub dataset path. Required.
    dataset_id: str
    # Train split; strictly between 0 and 1.
    split_ratio: float = Field(default=0.9, gt=0.0, lt=1.0)
    # None = use all.
    max_samples: PositiveInt | None = None
    text_column: str = "content"
    shuffle: bool = True
    shuffle_seed: int = 42
# ---------------------------------------------------------------------------
# Root job config
# ---------------------------------------------------------------------------
class TrainingJobConfig(BaseModel):
    """
    Complete training job specification.
    Validated at job submission time. If validation passes, the job is
    guaranteed to reach the pipeline with a coherent configuration.
    """

    # Docstring text is left verbatim: Pydantic publishes it as the schema
    # ``description``.

    # Required. 1-128 characters drawn from word characters and hyphens.
    job_name: str = Field(min_length=1, max_length=128, pattern=r"^[\w\-]+$")
    # Required. Carries schema metadata only; no value constraints.
    base_model: Annotated[
        str,
        Field(
            description="HuggingFace model ID or local path",
            examples=["Salesforce/codegen-350M-mono", "deepseek-ai/deepseek-coder-1.3b-base"],
        ),
    ]
    # Required sub-config; has no default.
    dataset: DatasetConfig

    # Optional sub-configs; each falls back to a freshly built default instance.
    training: TrainingHyperparams = Field(default_factory=TrainingHyperparams)
    # When the key is omitted a default LoRAConfig is created; an explicit
    # None is also accepted.
    lora: LoRAConfig | None = Field(default_factory=LoRAConfig)
    evaluation: EvaluationConfig = Field(default_factory=EvaluationConfig)
    checkpoint: CheckpointConfig = Field(default_factory=CheckpointConfig)
    hub: HubConfig = Field(default_factory=HubConfig)

    # Free-form metadata; at most 20 tags.
    tags: list[str] = Field(default_factory=list, max_length=20)
    notes: str | None = None

    # Example payload attached to the generated JSON schema.
    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "job_name": "codegen-finetune-v1",
                    "base_model": "Salesforce/codegen-350M-mono",
                    "dataset": {"dataset_id": "ds_abc123"},
                    "training": {
                        "num_epochs": 3,
                        "batch_size": 8,
                        "learning_rate": 2e-5,
                    },
                    "hub": {
                        "push_to_hub": True,
                        "repo_id": "your-org/codegen-finetune-v1",
                    },
                }
            ]
        }
    }
# ---------------------------------------------------------------------------
# Inference config (served separately but validated here for consistency)
# ---------------------------------------------------------------------------
class InferenceConfig(BaseModel):
    """Generation settings for serving a model.

    All numeric fields are range-checked at validation time; only
    ``model_id`` is required.
    """

    # ``model_id`` starts with Pydantic's protected ``model_`` prefix, which
    # makes Pydantic 2.0-2.9 emit a UserWarning at class creation. Clearing the
    # protected namespaces silences it without renaming the public field.
    model_config = {"protected_namespaces": ()}

    # Identifier of the model to serve. Required.
    model_id: str
    # Generation length cap, 1..4096.
    max_new_tokens: Annotated[int, Field(ge=1, le=4096)] = 256
    # Sampling temperature in [0.0, 2.0].
    # NOTE(review): 0.0 is accepted even when do_sample is true — confirm the
    # serving backend handles that combination.
    temperature: Annotated[float, Field(ge=0.0, le=2.0)] = 0.2
    # Top-p value in [0.0, 1.0].
    top_p: Annotated[float, Field(ge=0.0, le=1.0)] = 0.95
    # Top-k value in 0..1000.
    top_k: Annotated[int, Field(ge=0, le=1000)] = 50
    do_sample: bool = True
    # Number of sequences to return per request, 1..200.
    num_return_sequences: Annotated[int, Field(ge=1, le=200)] = 1
    # Defaults to an empty list.
    stop_sequences: list[str] = Field(default_factory=list)
    precision: Precision = Precision.BF16
|