S-Dreamer committed on
Commit
178abc4
·
verified ·
1 Parent(s): 6cf4784

Upload 3 files

Browse files
Files changed (3) hide show
  1. .env.example +36 -0
  2. Dockerfile +45 -0
  3. config.py +206 -0
.env.example ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# CodeCraftLab — Environment Configuration
# Copy to .env and fill in values. Never commit .env to git.
#
# Comments are kept on their own lines: not every env-file loader strips a
# trailing "# ..." from a value (e.g. `docker run --env-file` passes it
# through as part of the value).

# --------------------------------------------------------------------------
# App
# --------------------------------------------------------------------------
# ENV: development | staging | production
ENV=development
# LOG_LEVEL: DEBUG | INFO | WARNING | ERROR
LOG_LEVEL=INFO

# --------------------------------------------------------------------------
# Auth (REQUIRED)
# --------------------------------------------------------------------------
# Replace the placeholder before deploying, e.g. with the output of:
#   python -c "import secrets; print(secrets.token_urlsafe(48))"
SECRET_KEY=change-me-to-at-least-32-random-chars-in-production
ACCESS_TOKEN_EXPIRE_MINUTES=60

# --------------------------------------------------------------------------
# Database (REQUIRED)
# --------------------------------------------------------------------------
DATABASE_URL=postgresql+asyncpg://codecraftlab:password@localhost:5432/codecraftlab

# --------------------------------------------------------------------------
# HuggingFace (required for Hub push)
# --------------------------------------------------------------------------
HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxx
MODEL_CACHE_DIR=./cache

# --------------------------------------------------------------------------
# Training
# --------------------------------------------------------------------------
MAX_CONCURRENT_JOBS=2
JOB_OUTPUT_DIR=./checkpoints

# --------------------------------------------------------------------------
# CORS — JSON array of allowed origins; keep the brackets and quotes and add
# further origins as extra array items, in the same format as the value below.
# --------------------------------------------------------------------------
CORS_ORIGINS=["http://localhost:3000"]
Dockerfile ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# --------------------------------------------------------------------------
# CodeCraftLab — Dockerfile
# FastAPI + Uvicorn on port 8000
# Runs as non-root user (HF Spaces requirement)
# --------------------------------------------------------------------------
FROM python:3.11-slim AS base

# System deps.
# `git lfs install --system` registers the LFS filters in the system-wide git
# config so they also apply to the non-root runtime user; the plain form only
# touches the config of the user running the build (root).
RUN apt-get update && apt-get install -y --no-install-recommends \
        git \
        git-lfs \
        build-essential \
    && git lfs install --system \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Non-root user (required by HuggingFace Spaces).
# /app is created and handed to appuser *before* WORKDIR so the process can
# create its runtime directories (e.g. ./cache, ./checkpoints); a directory
# created implicitly by WORKDIR would be root-owned and not writable.
RUN useradd -m -u 1000 appuser \
    && mkdir -p /app \
    && chown appuser:appuser /app
WORKDIR /app

# --------------------------------------------------------------------------
FROM base AS deps

COPY pyproject.toml uv.lock* ./
RUN pip install uv --no-cache-dir && \
    uv sync --no-dev --frozen

# --------------------------------------------------------------------------
FROM base AS runtime

COPY --from=deps /app/.venv /app/.venv
ENV PATH="/app/.venv/bin:$PATH"

COPY --chown=appuser:appuser . .

USER appuser

EXPOSE 8000

# Worker count: Uvicorn falls back to $WEB_CONCURRENCY when --workers is not
# passed, so the default of 4 can be overridden at run time without rebuilding
# (e.g. `docker run -e WEB_CONCURRENCY=1 ...` for development).
ENV WEB_CONCURRENCY=4

# NOTE: the former `--log-config null` argument was dropped. The CLI option
# expects a path to an existing logging config file, so the literal "null"
# stops the server at startup. To customise logging, pass a real config file
# here or configure logging inside the application.
CMD ["uvicorn", "app:app", \
     "--host", "0.0.0.0", \
     "--port", "8000"]
config.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Training configuration schemas — Pydantic v2.
3
+
4
+ All training jobs are validated against these models before execution.
5
+ No raw dicts escape into the pipeline; everything is typed and constrained.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from enum import StrEnum
11
+ from typing import Annotated
12
+
13
+ from pydantic import BaseModel, Field, HttpUrl, model_validator
14
+ from pydantic import PositiveFloat, PositiveInt
15
+
16
+
17
+ # ---------------------------------------------------------------------------
18
+ # Enums
19
+ # ---------------------------------------------------------------------------
20
+ class EvalStrategy(StrEnum):
21
+ NO = "no"
22
+ STEPS = "steps"
23
+ EPOCH = "epoch"
24
+
25
+
26
+ class Precision(StrEnum):
27
+ FP32 = "fp32"
28
+ FP16 = "fp16"
29
+ BF16 = "bf16"
30
+ INT8 = "int8"
31
+
32
+
33
+ class OptimizerType(StrEnum):
34
+ ADAMW = "adamw_torch"
35
+ ADAMW_8BIT = "adamw_8bit"
36
+ PAGED_ADAMW_8BIT = "paged_adamw_8bit"
37
+ SGD = "sgd"
38
+
39
+
40
+ class EvalMetric(StrEnum):
41
+ PASS_AT_1 = "pass_at_1"
42
+ PASS_AT_10 = "pass_at_10"
43
+ BLEU = "bleu"
44
+ EXECUTION_ACCURACY = "execution_accuracy"
45
+ EXACT_MATCH = "exact_match"
46
+
47
+
48
+ # ---------------------------------------------------------------------------
49
+ # Sub-configs
50
+ # ---------------------------------------------------------------------------
51
class LoRAConfig(BaseModel):
    """LoRA adapter configuration.

    A job that omits its ``lora`` section still gets a default ``LoRAConfig``
    with ``enabled=True`` (see ``TrainingJobConfig.lora``), so omission does
    NOT turn LoRA off. To request a run without an adapter, set
    ``enabled=False`` here or pass ``lora: null`` on the job.
    NOTE(review): how the pipeline interprets those two forms is not visible
    in this module — confirm.

    Raises:
        ValueError: if ``alpha`` is smaller than ``r`` (surfaced by Pydantic
            as a validation error).
    """

    enabled: bool = True
    # Adapter rank, limited to 1..256.
    r: Annotated[int, Field(ge=1, le=256)] = 16
    # Must be at least `r`; enforced by `alpha_geq_r`.
    alpha: Annotated[int, Field(ge=1)] = 32
    dropout: Annotated[float, Field(ge=0.0, lt=1.0)] = 0.05
    # At least one target module name is required.
    target_modules: list[str] = Field(
        default_factory=lambda: ["q_proj", "v_proj"],
        min_length=1,
    )
    # Constrained instead of free-form so that a typo fails at submission time
    # rather than inside the training backend.
    # NOTE(review): the accepted set mirrors PEFT's LoraConfig.bias values —
    # confirm that PEFT is the consumer.
    bias: str = Field(default="none", pattern=r"^(none|all|lora_only)$")

    @model_validator(mode="after")
    def alpha_geq_r(self) -> "LoRAConfig":
        """Reject configurations whose ``alpha`` is below the rank ``r``."""
        if self.alpha < self.r:
            raise ValueError(f"lora.alpha ({self.alpha}) should be >= lora.r ({self.r})")
        return self
69
+
70
+
71
class TrainingHyperparams(BaseModel):
    """Bounded hyperparameters for a single training run.

    Every numeric field carries an explicit range; values outside it are
    rejected when the model is validated.
    """

    num_epochs: int = Field(default=3, ge=1, le=100)
    batch_size: int = Field(default=8, ge=1, le=256)
    gradient_accumulation_steps: int = Field(default=4, ge=1, le=128)
    learning_rate: float = Field(default=2e-5, gt=0.0, lt=1.0)
    weight_decay: float = Field(default=0.01, ge=0.0, lt=1.0)
    warmup_ratio: float = Field(default=0.1, ge=0.0, lt=1.0)
    max_seq_length: int = Field(default=1024, ge=64, le=32768)
    max_grad_norm: float = Field(default=1.0, gt=0.0)
    optimizer: OptimizerType = OptimizerType.ADAMW
    precision: Precision = Precision.BF16
    # Free-form scheduler name; not validated here.
    lr_scheduler: str = "cosine"
    seed: int = 42
    dataloader_num_workers: int = Field(default=4, ge=0, le=32)

    @property
    def effective_batch_size(self) -> int:
        """Return ``batch_size`` multiplied by ``gradient_accumulation_steps``."""
        accumulation = self.gradient_accumulation_steps
        return self.batch_size * accumulation
89
+
90
+
91
class EvaluationConfig(BaseModel):
    """Evaluation schedule, metrics and best-model selection for a job.

    Raises:
        ValueError: if ``strategy`` is ``steps`` without ``eval_steps``, or if
            best-model selection names a metric that is not in ``metrics``
            (both surfaced by Pydantic as validation errors).
    """

    enabled: bool = True
    strategy: EvalStrategy = EvalStrategy.EPOCH
    # Step interval; required when strategy is STEPS.
    eval_steps: PositiveInt | None = None
    metrics: list[EvalMetric] = Field(
        default_factory=lambda: [EvalMetric.PASS_AT_1, EvalMetric.BLEU]
    )
    num_samples_per_problem: Annotated[int, Field(ge=1, le=200)] = 10
    timeout_seconds: Annotated[int, Field(ge=1, le=60)] = 10
    load_best_model_at_end: bool = True
    metric_for_best_model: EvalMetric = EvalMetric.PASS_AT_1
    greater_is_better: bool = True

    @model_validator(mode="after")
    def eval_steps_required_for_steps_strategy(self) -> "EvaluationConfig":
        """Require ``eval_steps`` whenever evaluation runs on a step interval."""
        if self.strategy == EvalStrategy.STEPS and self.eval_steps is None:
            raise ValueError("evaluation.eval_steps is required when strategy='steps'")
        return self

    @model_validator(mode="after")
    def best_model_metric_must_be_computed(self) -> "EvaluationConfig":
        """Require the best-model metric to be one of the requested metrics.

        Only checked when evaluation actually runs (``enabled`` and a strategy
        other than ``no``) and best-model loading is requested. Otherwise the
        selection metric is never consulted.
        """
        evaluation_runs = self.enabled and self.strategy != EvalStrategy.NO
        if not (evaluation_runs and self.load_best_model_at_end):
            return self
        if self.metric_for_best_model not in self.metrics:
            raise ValueError(
                f"evaluation.metric_for_best_model ('{self.metric_for_best_model.value}') "
                "must be listed in evaluation.metrics when load_best_model_at_end=true"
            )
        return self
109
+
110
+
111
class CheckpointConfig(BaseModel):
    """Where and how often checkpoints are written, and how many are kept.

    Raises:
        ValueError: if ``save_strategy`` is ``steps`` but ``save_steps`` is
            not provided (surfaced by Pydantic as a validation error).
    """

    save_strategy: EvalStrategy = EvalStrategy.EPOCH
    # Step interval; only mandatory for the STEPS strategy.
    save_steps: PositiveInt | None = None
    save_total_limit: int = Field(default=3, ge=1, le=20)
    output_dir: str = "./checkpoints"
    resume_from_checkpoint: str | None = None

    @model_validator(mode="after")
    def save_steps_required_for_steps_strategy(self) -> "CheckpointConfig":
        """Fail when step-based saving is requested without a step interval."""
        # Anything other than "steps without an interval" is acceptable.
        if self.save_strategy != EvalStrategy.STEPS or self.save_steps is not None:
            return self
        raise ValueError("checkpoint.save_steps required when save_strategy='steps'")
123
+
124
+
125
class HubConfig(BaseModel):
    """Settings for pushing results to a Hub repository.

    Raises:
        ValueError: if ``push_to_hub`` is true while ``repo_id`` is missing or
            empty (surfaced by Pydantic as a validation error).
    """

    push_to_hub: bool = False
    repo_id: str | None = None
    private: bool = True
    commit_message: str = "Training checkpoint"

    @model_validator(mode="after")
    def repo_id_required_if_pushing(self) -> "HubConfig":
        """Insist on a non-empty ``repo_id`` whenever pushing is switched on."""
        # Nothing to check when pushing is off or a repo id was supplied.
        if not self.push_to_hub or self.repo_id:
            return self
        raise ValueError("hub.repo_id is required when hub.push_to_hub=true")
136
+
137
+
138
class DatasetConfig(BaseModel):
    """Which dataset a job uses and how it is split, capped and shuffled."""

    # Internal UUID or HF Hub dataset path.
    dataset_id: str
    # Train split ratio; must lie strictly between 0 and 1.
    split_ratio: float = Field(default=0.9, gt=0.0, lt=1.0)
    # Optional cap on the number of samples; None means use all.
    max_samples: PositiveInt | None = None
    text_column: str = "content"
    shuffle: bool = True
    shuffle_seed: int = 42
145
+
146
+
147
+ # ---------------------------------------------------------------------------
148
+ # Root job config
149
+ # ---------------------------------------------------------------------------
150
class TrainingJobConfig(BaseModel):
    """
    Root schema describing one complete training job.

    Intended to be validated when a job is submitted, so that only a typed and
    constrained configuration is handed on to the pipeline. Only ``job_name``,
    ``base_model`` and ``dataset`` are mandatory; every other section falls
    back to its own defaults.
    """

    # Example payload published with the generated JSON schema.
    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "job_name": "codegen-finetune-v1",
                    "base_model": "Salesforce/codegen-350M-mono",
                    "dataset": {"dataset_id": "ds_abc123"},
                    "training": {
                        "num_epochs": 3,
                        "batch_size": 8,
                        "learning_rate": 2e-5,
                    },
                    "hub": {
                        "push_to_hub": True,
                        "repo_id": "your-org/codegen-finetune-v1",
                    },
                }
            ]
        }
    }

    # 1-128 characters drawn from word characters and hyphens.
    job_name: str = Field(min_length=1, max_length=128, pattern=r"^[\w\-]+$")
    base_model: Annotated[
        str,
        Field(
            description="HuggingFace model ID or local path",
            examples=["Salesforce/codegen-350M-mono", "deepseek-ai/deepseek-coder-1.3b-base"],
        ),
    ]
    dataset: DatasetConfig
    training: TrainingHyperparams = Field(default_factory=TrainingHyperparams)
    # When omitted this defaults to a LoRAConfig (enabled by default), not to
    # None; None has to be passed explicitly.
    lora: LoRAConfig | None = Field(default_factory=LoRAConfig)
    evaluation: EvaluationConfig = Field(default_factory=EvaluationConfig)
    checkpoint: CheckpointConfig = Field(default_factory=CheckpointConfig)
    hub: HubConfig = Field(default_factory=HubConfig)
    # At most 20 tags.
    tags: list[str] = Field(default_factory=list, max_length=20)
    notes: str | None = None
192
+
193
+
194
+ # ---------------------------------------------------------------------------
195
+ # Inference config (served separately but validated here for consistency)
196
+ # ---------------------------------------------------------------------------
197
class InferenceConfig(BaseModel):
    """Generation settings for serving a model.

    Each numeric field is range-checked on validation; out-of-range values are
    rejected with a Pydantic validation error.
    """

    # `model_id` starts with Pydantic's protected "model_" prefix. Clearing
    # the protected namespaces stops Pydantic releases that guard the whole
    # prefix from emitting a UserWarning when this class is defined; on
    # releases that do not guard it, the setting has no effect.
    model_config = {"protected_namespaces": ()}

    model_id: str
    max_new_tokens: Annotated[int, Field(ge=1, le=4096)] = 256
    temperature: Annotated[float, Field(ge=0.0, le=2.0)] = 0.2
    top_p: Annotated[float, Field(ge=0.0, le=1.0)] = 0.95
    top_k: Annotated[int, Field(ge=0, le=1000)] = 50
    do_sample: bool = True
    num_return_sequences: Annotated[int, Field(ge=1, le=200)] = 1
    stop_sequences: list[str] = Field(default_factory=list)
    precision: Precision = Precision.BF16