{
  "model_config": {
    "model_name_or_path": "gpt2",
    "model_size": "124M",
    "description": "GPT-2 Small (124M) - v3 with proper end markers"
  },
  "training_args": {
    "num_train_epochs": 3,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "gradient_accumulation_steps": 4,
    "effective_batch_size": 32,
    "learning_rate": 5e-5,
    "weight_decay": 0.01,
    "warmup_steps": 100,
    "max_grad_norm": 1.0,
    "lr_scheduler_type": "cosine",
    "fp16": true,
    "seed": 42,
    "block_size": 128
  },
  "evaluation_args": {
    "eval_strategy": "epoch",
    "eval_steps": null,
    "metric_for_best_model": "eval_loss",
    "greater_is_better": false,
    "load_best_model_at_end": true
  },
  "save_args": {
    "save_strategy": "epoch",
    "save_steps": null,
    "save_total_limit": 2
  },
  "logging_args": {
    "logging_dir": "./output/logs",
    "logging_steps": 50,
    "report_to": "wandb"
  },
  "lora_config": {
    "r": 8,
    "lora_alpha": 32,
    "target_modules": ["c_attn"],
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM"
  },
  "dataset_config": {
    "use_local_csvs": true,
    "train_file": "./data/processed/700K_fixed/train_700K.csv",
    "validation_file": "./data/processed/700K_fixed/validation_700K.csv",
    "test_file": "./data/processed/700K_fixed/test_700K.csv",
    "data_column": "text"
  },
  "hub_config": {
    "push_to_hub": true,
    "hub_model_id": "augustocsc/Se124M_700K_infix_v3"
  },
  "special_tokens": {
    "start_token": "<|startofex|>",
    "end_token": "<|endofex|>",
    "notes": "End token configured as EOS token for proper stopping"
  },
  "estimated_time": {
    "per_epoch_minutes": 45,
    "total_hours": 2.25,
    "notes": "Estimated for AWS g5.xlarge with A10G GPU, GPT-2 Small, 3 epochs"
  },
  "version_info": {
    "model_version": "v3",
    "improvements": [
      "Training data includes proper <|endofex|> markers",
      "100% validation rate on prepared dataset",
      "Addresses v1 non-stopping issue and v2 garbage generation",
      "Uses local CSVs with validated end markers"
    ],
    "training_date": "2026-02-01"
  }
}
|