{ "model_name_or_path": "Qwen/Qwen3-8B", "architecture_type": "dense", "max_seq_length": 4096, "trust_remote_code": true, "recurrent_staged_loras": { "baseline": "stage_specialized_recurrence", "variant": { "name": "stage_specialized_recurrence", "base": { "model_name": "Qwen/Qwen3-8B", "tokenizer_name": "Qwen/Qwen3-8B", "freeze_base": true, "trust_remote_code": true, "dtype": "bfloat16", "device_map": "auto", "max_seq_length": 4096, "load_in_4bit": false, "bnb_4bit_compute_dtype": "bfloat16", "attn_implementation": "sdpa", "gradient_checkpointing": true, "architecture_type": "dense", "model_loading_mode": "auto", "model_loading_allow_offload": true, "model_loading_require_no_meta_for_training": true }, "standard_lora": { "enabled": false, "rank": 16, "alpha": 32, "dropout": 0.05, "target_modules": [ "q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "down_proj", "gate_proj" ] }, "refiner": { "enabled": true, "num_steps": 3, "hidden_size": 0, "recurrence_mode": "stage_specialized", "adapter_sharing": "per_step" }, "refiner_adapter": { "enabled": true, "rank": 8, "alpha": 16, "dropout": 0.0, "target_modules": [ "refiner_proj" ] }, "trainable_modules": [] }, "training": { "batch_size": 1, "num_epochs": 2, "max_steps": 2000, "learning_rate": 5e-05, "weight_decay": 0.0, "seed": 11, "eval_interval_steps": 200, "log_interval_steps": 200, "checkpoint_interval_steps": 200, "eval_enabled": true, "deterministic": false, "compute_control": { "enabled": false, "mode": "effective_forward_passes", "max_wall_time_seconds": null, "max_tokens": null } }, "publish": { "enabled": true, "hub_model_repo": "WallResearch/recurrent-staged-loras-model", "hub_dataset_repo": "WallResearch/recurrent-staged-loras-dataset", "private": false, "commit_message": "Publish run artifacts", "include_checkpoint": true, "max_shard_size": "4GB", "include_metrics": true, "include_dataset_partitions": true }, "validation": { "enabled": true, "blocking": true, "write_json_diff": true, "lora_expected": null, "recurrent_expected": null, "lora_merged_before_save": false, "lora_key_patterns": [ "lora_A", "lora_B", "lora_embedding_A", "lora_embedding_B", "lora", "adapter_bank.adapters" ], "recurrent_key_patterns": [ "recurrent", "recurrence", "rnn", "recurrent_layer", "recurrent_projection", "recurrent_gate", "refiner." ] }, "dataset": { "name": "metamath_qa", "settings": { "subset_size": 25000, "eval_fraction": 0.1, "seed": 11, "cache_dir": "./.cache/hf_datasets", "split": "train" }, "external_evaluations": [] }, "output": { "dir": "D:\\huggingface\\generated_models\\stage_specialized_recurrence" }, "raw": { "baseline": "stage_specialized_recurrence", "model": { "name": "Qwen/Qwen3-8B", "tokenizer_name": "Qwen/Qwen3-8B", "trust_remote_code": true, "dtype": "bfloat16", "device_map": "auto", "max_seq_length": 4096, "load_in_4bit": false, "bnb_4bit_compute_dtype": "bfloat16", "attn_implementation": "sdpa", "gradient_checkpointing": true, "architecture_type": "dense", "frozen_base": true, "standard_lora": { "enabled": false, "rank": 16, "alpha": 32, "dropout": 0.05, "target_modules": [ "q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "down_proj", "gate_proj" ] }, "latent_refiner": { "enabled": true, "num_recurrent_steps": 3, "recurrence_mode": "stage_specialized", "adapter_sharing": "per_step", "adapter": { "enabled": true, "rank": 8, "alpha": 16, "dropout": 0.0, "target_modules": [ "refiner_proj" ] } } }, "dataset": { "name": "metamath_qa", "settings": { "subset_size": 25000, "eval_fraction": 0.1, "seed": 11, "cache_dir": "./.cache/hf_datasets", "split": "train" } }, "training": { "batch_size": 1, "num_epochs": 2, "max_steps": 2000, "learning_rate": 5e-05, "weight_decay": 0.0, "seed": 11, "eval_interval_steps": 200, "checkpoint_interval_steps": 200, "eval_enabled": true, "deterministic": false }, "publish": { "enabled": true, "hub_model_repo": "WallResearch/recurrent-staged-loras-model", "hub_dataset_repo": "WallResearch/recurrent-staged-loras-dataset", "private": false, "commit_message": "Publish run artifacts", "include_checkpoint": true, "include_metrics": true, "include_dataset_partitions": true }, "output": { "dir": "outputs/stage_specialized_recurrence" } } } }