{ "_comment": "Base models tested with RAE training methodology. Ordered by recommendation.", "models": [ { "id": "Qwen/Qwen2.5-7B-Instruct", "params": "7B", "recommended_hardware": "1x A100-40GB or 1x A10G (with int4)", "rae_fit": "excellent", "notes": "Strong baseline reasoning. Best balance of capability and trainability. Recommended default.", "lora_r": 32, "lr": 5e-6, "block_size": 4096 }, { "id": "HuggingFaceTB/SmolLM2-1.7B-Instruct", "params": "1.7B", "recommended_hardware": "1x T4 (even with fp16)", "rae_fit": "good", "notes": "Fastest iteration. Use for methodology validation before scaling. Can train on free Colab/Kaggle.", "lora_r": 16, "lr": 1e-5, "block_size": 2048 }, { "id": "meta-llama/Llama-3.2-3B-Instruct", "params": "3B", "recommended_hardware": "1x T4 (with int4)", "rae_fit": "good", "notes": "Good middle ground. Strong instruction following helps with phase structure adherence.", "lora_r": 16, "lr": 8e-6, "block_size": 4096 }, { "id": "mistralai/Mistral-7B-Instruct-v0.3", "params": "7B", "recommended_hardware": "1x A100-40GB or 1x A10G (with int4)", "rae_fit": "good", "notes": "Fast inference. Sliding window attention handles long RAE chains well.", "lora_r": 32, "lr": 5e-6, "block_size": 4096 }, { "id": "google/gemma-2-9b-it", "params": "9B", "recommended_hardware": "1x A100-80GB or 1x A10G (with int4)", "rae_fit": "excellent", "notes": "Strong reasoning baseline. Larger capacity absorbs RAE structure well.", "lora_r": 32, "lr": 3e-6, "block_size": 4096 }, { "id": "Qwen/Qwen2.5-3B-Instruct", "params": "3B", "recommended_hardware": "1x T4 (with int4)", "rae_fit": "good", "notes": "Excellent for cost-efficient experimentation. Strong reasoning for size.", "lora_r": 16, "lr": 8e-6, "block_size": 4096 } ], "hardware_tiers": { "free": { "gpu": "T4 16GB (Kaggle/Colab free)", "max_model": "3B (int4) or 1.7B (fp16)", "training_speed": "~50 examples/hour", "recommended_model": "HuggingFaceTB/SmolLM2-1.7B-Instruct" }, "budget": { "gpu": "A10G 24GB (HF Spaces ~$1.05/hr)", "max_model": "7B (int4)", "training_speed": "~150 examples/hour", "recommended_model": "Qwen/Qwen2.5-7B-Instruct" }, "standard": { "gpu": "A100 40GB (HF Spaces ~$4.13/hr)", "max_model": "13B (int4) or 7B (fp16)", "training_speed": "~400 examples/hour", "recommended_model": "Qwen/Qwen2.5-7B-Instruct" }, "premium": { "gpu": "A100 80GB", "max_model": "70B (int4) or 13B (fp16)", "training_speed": "~600 examples/hour", "recommended_model": "google/gemma-2-9b-it" } } }