jeffreywallphd's picture
Publish run artifacts
780d17a verified
{
"model_name_or_path": "Qwen/Qwen3-8B",
"architecture_type": "dense",
"max_seq_length": 4096,
"trust_remote_code": true,
"recurrent_staged_loras": {
"baseline": "stage_specialized_recurrence",
"variant": {
"name": "stage_specialized_recurrence",
"base": {
"model_name": "Qwen/Qwen3-8B",
"tokenizer_name": "Qwen/Qwen3-8B",
"freeze_base": true,
"trust_remote_code": true,
"dtype": "bfloat16",
"device_map": "auto",
"max_seq_length": 4096,
"load_in_4bit": false,
"bnb_4bit_compute_dtype": "bfloat16",
"attn_implementation": "sdpa",
"gradient_checkpointing": true,
"architecture_type": "dense",
"model_loading_mode": "auto",
"model_loading_allow_offload": true,
"model_loading_require_no_meta_for_training": true
},
"standard_lora": {
"enabled": false,
"rank": 16,
"alpha": 32,
"dropout": 0.05,
"target_modules": [
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"up_proj",
"down_proj",
"gate_proj"
]
},
"refiner": {
"enabled": true,
"num_steps": 3,
"hidden_size": 0,
"recurrence_mode": "stage_specialized",
"adapter_sharing": "per_step"
},
"refiner_adapter": {
"enabled": true,
"rank": 8,
"alpha": 16,
"dropout": 0.0,
"target_modules": [
"refiner_proj"
]
},
"trainable_modules": []
},
"training": {
"batch_size": 1,
"num_epochs": 2,
"max_steps": 2000,
"learning_rate": 5e-05,
"weight_decay": 0.0,
"seed": 11,
"eval_interval_steps": 200,
"log_interval_steps": 200,
"checkpoint_interval_steps": 200,
"eval_enabled": true,
"deterministic": false,
"compute_control": {
"enabled": false,
"mode": "effective_forward_passes",
"max_wall_time_seconds": null,
"max_tokens": null
}
},
"publish": {
"enabled": true,
"hub_model_repo": "WallResearch/recurrent-staged-loras-model",
"hub_dataset_repo": "WallResearch/recurrent-staged-loras-dataset",
"private": false,
"commit_message": "Publish run artifacts",
"include_checkpoint": true,
"max_shard_size": "4GB",
"include_metrics": true,
"include_dataset_partitions": true
},
"validation": {
"enabled": true,
"blocking": true,
"write_json_diff": true,
"lora_expected": null,
"recurrent_expected": null,
"lora_merged_before_save": false,
"lora_key_patterns": [
"lora_A",
"lora_B",
"lora_embedding_A",
"lora_embedding_B",
"lora",
"adapter_bank.adapters"
],
"recurrent_key_patterns": [
"recurrent",
"recurrence",
"rnn",
"recurrent_layer",
"recurrent_projection",
"recurrent_gate",
"refiner."
]
},
"dataset": {
"name": "metamath_qa",
"settings": {
"subset_size": 25000,
"eval_fraction": 0.1,
"seed": 11,
"cache_dir": "./.cache/hf_datasets",
"split": "train"
},
"external_evaluations": []
},
"output": {
"dir": "D:\\huggingface\\generated_models\\stage_specialized_recurrence"
},
"raw": {
"baseline": "stage_specialized_recurrence",
"model": {
"name": "Qwen/Qwen3-8B",
"tokenizer_name": "Qwen/Qwen3-8B",
"trust_remote_code": true,
"dtype": "bfloat16",
"device_map": "auto",
"max_seq_length": 4096,
"load_in_4bit": false,
"bnb_4bit_compute_dtype": "bfloat16",
"attn_implementation": "sdpa",
"gradient_checkpointing": true,
"architecture_type": "dense",
"frozen_base": true,
"standard_lora": {
"enabled": false,
"rank": 16,
"alpha": 32,
"dropout": 0.05,
"target_modules": [
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"up_proj",
"down_proj",
"gate_proj"
]
},
"latent_refiner": {
"enabled": true,
"num_recurrent_steps": 3,
"recurrence_mode": "stage_specialized",
"adapter_sharing": "per_step",
"adapter": {
"enabled": true,
"rank": 8,
"alpha": 16,
"dropout": 0.0,
"target_modules": [
"refiner_proj"
]
}
}
},
"dataset": {
"name": "metamath_qa",
"settings": {
"subset_size": 25000,
"eval_fraction": 0.1,
"seed": 11,
"cache_dir": "./.cache/hf_datasets",
"split": "train"
}
},
"training": {
"batch_size": 1,
"num_epochs": 2,
"max_steps": 2000,
"learning_rate": 5e-05,
"weight_decay": 0.0,
"seed": 11,
"eval_interval_steps": 200,
"checkpoint_interval_steps": 200,
"eval_enabled": true,
"deterministic": false
},
"publish": {
"enabled": true,
"hub_model_repo": "WallResearch/recurrent-staged-loras-model",
"hub_dataset_repo": "WallResearch/recurrent-staged-loras-dataset",
"private": false,
"commit_message": "Publish run artifacts",
"include_checkpoint": true,
"include_metrics": true,
"include_dataset_partitions": true
},
"output": {
"dir": "outputs/stage_specialized_recurrence"
}
}
}
}