{
  "model_config": {
    "model_name_or_path": "gpt2",
    "model_size": "124M",
    "description": "GPT-2 Small (124M) - v3 with proper end markers"
  },
  "training_args": {
    "num_train_epochs": 3,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "gradient_accumulation_steps": 4,
    "effective_batch_size": 32,
    "learning_rate": 5e-5,
    "weight_decay": 0.01,
    "warmup_steps": 100,
    "max_grad_norm": 1.0,
    "lr_scheduler_type": "cosine",
    "fp16": true,
    "seed": 42,
    "block_size": 128
  },
  "evaluation_args": {
    "eval_strategy": "epoch",
    "eval_steps": null,
    "metric_for_best_model": "eval_loss",
    "greater_is_better": false,
    "load_best_model_at_end": true
  },
  "save_args": {
    "save_strategy": "epoch",
    "save_steps": null,
    "save_total_limit": 2
  },
  "logging_args": {
    "logging_dir": "./output/logs",
    "logging_steps": 50,
    "report_to": "wandb"
  },
  "lora_config": {
    "r": 8,
    "lora_alpha": 32,
    "target_modules": ["c_attn"],
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM"
  },
  "dataset_config": {
    "use_local_csvs": true,
    "train_file": "./data/processed/700K_fixed/train_700K.csv",
    "validation_file": "./data/processed/700K_fixed/validation_700K.csv",
    "test_file": "./data/processed/700K_fixed/test_700K.csv",
    "data_column": "text"
  },
  "hub_config": {
    "push_to_hub": true,
    "hub_model_id": "augustocsc/Se124M_700K_infix_v3"
  },
  "special_tokens": {
    "start_token": "<|startofex|>",
    "end_token": "<|endofex|>",
    "notes": "End token configured as EOS token for proper stopping"
  },
  "estimated_time": {
    "per_epoch_minutes": 45,
    "total_hours": 2.25,
    "notes": "Estimated for AWS g5.xlarge with A10G GPU, GPT-2 Small, 3 epochs"
  },
  "version_info": {
    "model_version": "v3",
    "improvements": [
      "Training data includes proper <|endofex|> markers",
      "100% validation rate on prepared dataset",
      "Addresses v1 non-stopping issue and v2 garbage generation",
      "Uses local CSVs with validated end markers"
    ],
    "training_date": "2026-02-01"
  }
}