"""
Pydantic configuration schema for Humigence training pipeline
"""
import json
from pathlib import Path
from typing import List

from pydantic import BaseModel, Field, validator

class TrainConfig(BaseModel):
    """Strict configuration schema for Humigence training"""
    
    # Model configuration
    model_name: str = Field(..., description="Hugging Face model name")
    training_recipe: str = Field(default="LoRA (FP16)", description="Training recipe")
    
    # Training hyperparameters
    learning_rate: float = Field(..., ge=1e-6, le=1.0, description="Learning rate")
    num_train_epochs: int = Field(..., ge=1, le=100, description="Number of training epochs")
    per_device_train_batch_size: int = Field(..., ge=1, le=32, description="Batch size per device")
    gradient_accumulation_steps: int = Field(..., ge=1, le=32, description="Gradient accumulation steps")
    eval_batch_size: int = Field(..., ge=1, le=32, description="Evaluation batch size")
    
    # Precision settings
    fp16: bool = Field(default=True, description="Use FP16 precision")
    bf16: bool = Field(default=False, description="Use BF16 precision")
    
    # Multi-GPU settings
    multi_gpu: bool = Field(default=False, description="Enable multi-GPU training")
    selected_gpus: List[int] = Field(default=[0], description="Selected GPU indices")
    
    # Dataset configuration
    dataset_path: str = Field(..., description="Path to dataset file")
    data_schema: str = Field(default="instruction_output", description="Dataset schema")
    train_val_test_split: List[float] = Field(default=[0.8, 0.1, 0.1], description="Dataset split ratios")
    split_seed: int = Field(default=42, description="Random seed for dataset split")
    max_seq_length: int = Field(default=1024, ge=64, le=4096, description="Maximum sequence length")
    
    # LoRA configuration
    lora_r: int = Field(default=16, ge=1, le=256, description="LoRA rank")
    lora_alpha: int = Field(default=32, ge=1, le=512, description="LoRA alpha")
    lora_dropout: float = Field(default=0.05, ge=0.0, le=0.5, description="LoRA dropout")
    
    # Logging and evaluation
    logging_steps: int = Field(default=10, ge=1, le=1000, description="Logging frequency")
    eval_steps: int = Field(default=100, ge=1, le=10000, description="Evaluation frequency")
    save_steps: int = Field(default=500, ge=1, le=10000, description="Save frequency")
    
    # Output configuration
    output_dir: str = Field(default="runs/humigence", description="Output directory")
    eval_single_gpu: bool = Field(default=True, description="Evaluate on single GPU")
    eval_gpu_index: int = Field(default=0, description="GPU index for evaluation")
    
    # System configuration
    num_workers: int = Field(default=4, ge=0, le=16, description="Number of data loader workers")
    pin_memory: bool = Field(default=True, description="Pin memory for data loading")
    
    @validator('train_val_test_split')
    def validate_split(cls, v):
        if len(v) != 3:
            raise ValueError("train_val_test_split must have exactly 3 values")
        if abs(sum(v) - 1.0) > 1e-6:
            raise ValueError("train_val_test_split values must sum to 1.0")
        return v
    
    @validator('bf16')
    def validate_precision(cls, v, values):
        # fp16 is declared (and therefore validated) before bf16, so its value
        # is already present in `values` here; a validator attached to fp16
        # could not see bf16 yet, which is why the check lives on bf16 only.
        if v and values.get('fp16'):
            raise ValueError("Cannot use both fp16 and bf16 simultaneously")
        return v
    
    @validator('dataset_path')
    def validate_dataset_path(cls, v):
        path = Path(v)
        if not path.exists():
            raise ValueError(f"Dataset file not found: {v}")
        if path.suffix != '.jsonl':
            raise ValueError(f"Dataset must be a .jsonl file: {v}")
        return str(path)
    
    @validator('model_name')
    def validate_model_name(cls, v):
        # Basic validation for Hugging Face model names
        if not v or len(v.strip()) == 0:
            raise ValueError("Model name cannot be empty")
        return v.strip()
    
    class Config:
        """Pydantic configuration"""
        validate_assignment = True
        extra = "forbid"  # Reject extra fields
        use_enum_values = True
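
# A minimal config covering only the required fields (every other field has a
# default). The values below are illustrative placeholders, not recommended
# settings:
#
# {
#     "model_name": "org/model-name",
#     "learning_rate": 2e-4,
#     "num_train_epochs": 3,
#     "per_device_train_batch_size": 4,
#     "gradient_accumulation_steps": 4,
#     "eval_batch_size": 4,
#     "dataset_path": "data/train.jsonl"
# }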

def load_config(config_path: str) -> TrainConfig:
    """Load and validate configuration from JSON file"""
    with open(config_path, 'r') as f:
        config_dict = json.load(f)

    try:
        return TrainConfig(**config_dict)
    except Exception as e:
        # Chain the original exception so the underlying Pydantic error
        # (which names the offending field) is preserved in the traceback.
        raise ValueError(f"Configuration validation failed: {e}") from e

def save_config(config: TrainConfig, output_path: str) -> None:
    """Save configuration to JSON file (legacy function)"""
    path = Path(output_path)
    path.parent.mkdir(parents=True, exist_ok=True)

    with open(path, 'w') as f:
        json.dump(config.dict(), f, indent=2)

def save_config_snapshot(config_dict: dict, output_path: str = "runs/humigence/config.snapshot.json") -> TrainConfig:
    """Save config with automatic migration and validation"""
    from config_migration import save_config_snapshot as _save_config_snapshot
    return _save_config_snapshot(config_dict, output_path)
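
if __name__ == "__main__":
    # Minimal smoke test (a sketch, not part of the training pipeline):
    # validate the config file passed on the command line. "config.json" is a
    # hypothetical fallback path, not a file this module is known to ship with.
    import sys

    cfg = load_config(sys.argv[1] if len(sys.argv) > 1 else "config.json")
    print(f"Config OK: model={cfg.model_name}, dataset={cfg.dataset_path}")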