| { | |
| "model_name_or_path": "Qwen/Qwen3-8B", | |
| "architecture_type": "dense", | |
| "max_seq_length": 4096, | |
| "trust_remote_code": true, | |
| "recurrent_staged_loras": { | |
| "baseline": "stage_specialized_recurrence", | |
| "variant": { | |
| "name": "stage_specialized_recurrence", | |
| "base": { | |
| "model_name": "Qwen/Qwen3-8B", | |
| "tokenizer_name": "Qwen/Qwen3-8B", | |
| "freeze_base": true, | |
| "trust_remote_code": true, | |
| "dtype": "bfloat16", | |
| "device_map": "auto", | |
| "max_seq_length": 4096, | |
| "load_in_4bit": false, | |
| "bnb_4bit_compute_dtype": "bfloat16", | |
| "attn_implementation": "sdpa", | |
| "gradient_checkpointing": true, | |
| "architecture_type": "dense", | |
| "model_loading_mode": "auto", | |
| "model_loading_allow_offload": true, | |
| "model_loading_require_no_meta_for_training": true | |
| }, | |
| "standard_lora": { | |
| "enabled": false, | |
| "rank": 16, | |
| "alpha": 32, | |
| "dropout": 0.05, | |
| "target_modules": [ | |
| "q_proj", | |
| "k_proj", | |
| "v_proj", | |
| "o_proj", | |
| "up_proj", | |
| "down_proj", | |
| "gate_proj" | |
| ] | |
| }, | |
| "refiner": { | |
| "enabled": true, | |
| "num_steps": 3, | |
| "hidden_size": 0, | |
| "recurrence_mode": "stage_specialized", | |
| "adapter_sharing": "per_step" | |
| }, | |
| "refiner_adapter": { | |
| "enabled": true, | |
| "rank": 8, | |
| "alpha": 16, | |
| "dropout": 0.0, | |
| "target_modules": [ | |
| "refiner_proj" | |
| ] | |
| }, | |
| "trainable_modules": [] | |
| }, | |
| "training": { | |
| "batch_size": 1, | |
| "num_epochs": 2, | |
| "max_steps": 2000, | |
| "learning_rate": 5e-05, | |
| "weight_decay": 0.0, | |
| "seed": 11, | |
| "eval_interval_steps": 200, | |
| "log_interval_steps": 200, | |
| "checkpoint_interval_steps": 200, | |
| "eval_enabled": true, | |
| "deterministic": false, | |
| "compute_control": { | |
| "enabled": false, | |
| "mode": "effective_forward_passes", | |
| "max_wall_time_seconds": null, | |
| "max_tokens": null | |
| } | |
| }, | |
| "publish": { | |
| "enabled": true, | |
| "hub_model_repo": "WallResearch/recurrent-staged-loras-model", | |
| "hub_dataset_repo": "WallResearch/recurrent-staged-loras-dataset", | |
| "private": false, | |
| "commit_message": "Publish run artifacts", | |
| "include_checkpoint": true, | |
| "max_shard_size": "4GB", | |
| "include_metrics": true, | |
| "include_dataset_partitions": true | |
| }, | |
| "validation": { | |
| "enabled": true, | |
| "blocking": true, | |
| "write_json_diff": true, | |
| "lora_expected": null, | |
| "recurrent_expected": null, | |
| "lora_merged_before_save": false, | |
| "lora_key_patterns": [ | |
| "lora_A", | |
| "lora_B", | |
| "lora_embedding_A", | |
| "lora_embedding_B", | |
| "lora", | |
| "adapter_bank.adapters" | |
| ], | |
| "recurrent_key_patterns": [ | |
| "recurrent", | |
| "recurrence", | |
| "rnn", | |
| "recurrent_layer", | |
| "recurrent_projection", | |
| "recurrent_gate", | |
| "refiner." | |
| ] | |
| }, | |
| "dataset": { | |
| "name": "metamath_qa", | |
| "settings": { | |
| "subset_size": 25000, | |
| "eval_fraction": 0.1, | |
| "seed": 11, | |
| "cache_dir": "./.cache/hf_datasets", | |
| "split": "train" | |
| }, | |
| "external_evaluations": [] | |
| }, | |
| "output": { | |
| "dir": "D:\\huggingface\\generated_models\\stage_specialized_recurrence" | |
| }, | |
| "raw": { | |
| "baseline": "stage_specialized_recurrence", | |
| "model": { | |
| "name": "Qwen/Qwen3-8B", | |
| "tokenizer_name": "Qwen/Qwen3-8B", | |
| "trust_remote_code": true, | |
| "dtype": "bfloat16", | |
| "device_map": "auto", | |
| "max_seq_length": 4096, | |
| "load_in_4bit": false, | |
| "bnb_4bit_compute_dtype": "bfloat16", | |
| "attn_implementation": "sdpa", | |
| "gradient_checkpointing": true, | |
| "architecture_type": "dense", | |
| "frozen_base": true, | |
| "standard_lora": { | |
| "enabled": false, | |
| "rank": 16, | |
| "alpha": 32, | |
| "dropout": 0.05, | |
| "target_modules": [ | |
| "q_proj", | |
| "k_proj", | |
| "v_proj", | |
| "o_proj", | |
| "up_proj", | |
| "down_proj", | |
| "gate_proj" | |
| ] | |
| }, | |
| "latent_refiner": { | |
| "enabled": true, | |
| "num_recurrent_steps": 3, | |
| "recurrence_mode": "stage_specialized", | |
| "adapter_sharing": "per_step", | |
| "adapter": { | |
| "enabled": true, | |
| "rank": 8, | |
| "alpha": 16, | |
| "dropout": 0.0, | |
| "target_modules": [ | |
| "refiner_proj" | |
| ] | |
| } | |
| } | |
| }, | |
| "dataset": { | |
| "name": "metamath_qa", | |
| "settings": { | |
| "subset_size": 25000, | |
| "eval_fraction": 0.1, | |
| "seed": 11, | |
| "cache_dir": "./.cache/hf_datasets", | |
| "split": "train" | |
| } | |
| }, | |
| "training": { | |
| "batch_size": 1, | |
| "num_epochs": 2, | |
| "max_steps": 2000, | |
| "learning_rate": 5e-05, | |
| "weight_decay": 0.0, | |
| "seed": 11, | |
| "eval_interval_steps": 200, | |
| "checkpoint_interval_steps": 200, | |
| "eval_enabled": true, | |
| "deterministic": false | |
| }, | |
| "publish": { | |
| "enabled": true, | |
| "hub_model_repo": "WallResearch/recurrent-staged-loras-model", | |
| "hub_dataset_repo": "WallResearch/recurrent-staged-loras-dataset", | |
| "private": false, | |
| "commit_message": "Publish run artifacts", | |
| "include_checkpoint": true, | |
| "include_metrics": true, | |
| "include_dataset_partitions": true | |
| }, | |
| "output": { | |
| "dir": "outputs/stage_specialized_recurrence" | |
| } | |
| } | |
| } | |
| } |