{
  "model_config": {
    "model_name_or_path": "gpt2",
    "model_size": "124M",
    "description": "GPT-2 Small (124M) - v3 with proper end markers"
  },
  "training_args": {
    "num_train_epochs": 3,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "gradient_accumulation_steps": 4,
    "effective_batch_size": 32,
    "learning_rate": 5e-5,
    "weight_decay": 0.01,
    "warmup_steps": 100,
    "max_grad_norm": 1.0,
    "lr_scheduler_type": "cosine",
    "fp16": true,
    "seed": 42,
    "block_size": 128
  },
  "evaluation_args": {
    "eval_strategy": "epoch",
    "eval_steps": null,
    "metric_for_best_model": "eval_loss",
    "greater_is_better": false,
    "load_best_model_at_end": true
  },
  "save_args": {
    "save_strategy": "epoch",
    "save_steps": null,
    "save_total_limit": 2
  },
  "logging_args": {
    "logging_dir": "./output/logs",
    "logging_steps": 50,
    "report_to": "wandb"
  },
  "lora_config": {
    "r": 8,
    "lora_alpha": 32,
    "target_modules": ["c_attn"],
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM"
  },
  "dataset_config": {
    "use_local_csvs": true,
    "train_file": "./data/processed/700K_fixed/train_700K.csv",
    "validation_file": "./data/processed/700K_fixed/validation_700K.csv",
    "test_file": "./data/processed/700K_fixed/test_700K.csv",
    "data_column": "text"
  },
  "hub_config": {
    "push_to_hub": true,
    "hub_model_id": "augustocsc/Se124M_700K_infix_v3"
  },
  "special_tokens": {
    "start_token": "<|startofex|>",
    "end_token": "<|endofex|>",
    "notes": "End token configured as EOS token for proper stopping"
  },
  "estimated_time": {
    "per_epoch_minutes": 45,
    "total_hours": 2.25,
    "notes": "Estimated for AWS g5.xlarge with A10G GPU, GPT-2 Small, 3 epochs"
  },
  "version_info": {
    "model_version": "v3",
    "improvements": [
      "Training data includes proper <|endofex|> markers",
      "100% validation rate on prepared dataset",
      "Addresses v1 non-stopping issue and v2 garbage generation",
      "Uses local CSVs with validated end markers"
    ],
    "training_date": "2026-02-01"
  }
}