| { |
| "_comment": "Base models tested with RAE training methodology. Ordered by recommendation.", |
| "models": [ |
| { |
| "id": "Qwen/Qwen2.5-7B-Instruct", |
| "params": "7B", |
| "recommended_hardware": "1x A100-40GB or 1x A10G (with int4)", |
| "rae_fit": "excellent", |
| "notes": "Strong baseline reasoning. Best balance of capability and trainability. Recommended default.", |
| "lora_r": 32, |
| "lr": 5e-6, |
| "block_size": 4096 |
| }, |
| { |
| "id": "HuggingFaceTB/SmolLM2-1.7B-Instruct", |
| "params": "1.7B", |
| "recommended_hardware": "1x T4 (even with fp16)", |
| "rae_fit": "good", |
| "notes": "Fastest iteration. Use for methodology validation before scaling. Can train on free Colab/Kaggle.", |
| "lora_r": 16, |
| "lr": 1e-5, |
| "block_size": 2048 |
| }, |
| { |
| "id": "meta-llama/Llama-3.2-3B-Instruct", |
| "params": "3B", |
| "recommended_hardware": "1x T4 (with int4)", |
| "rae_fit": "good", |
| "notes": "Good middle ground. Strong instruction following helps with phase structure adherence.", |
| "lora_r": 16, |
| "lr": 8e-6, |
| "block_size": 4096 |
| }, |
| { |
| "id": "mistralai/Mistral-7B-Instruct-v0.3", |
| "params": "7B", |
| "recommended_hardware": "1x A100-40GB or 1x A10G (with int4)", |
| "rae_fit": "good", |
| "notes": "Fast inference. 32K full-attention context window (sliding window attention was removed after v0.1) handles long RAE chains well.", |
| "lora_r": 32, |
| "lr": 5e-6, |
| "block_size": 4096 |
| }, |
| { |
| "id": "google/gemma-2-9b-it", |
| "params": "9B", |
| "recommended_hardware": "1x A100-80GB or 1x A10G (with int4)", |
| "rae_fit": "excellent", |
| "notes": "Strong reasoning baseline. Larger capacity absorbs RAE structure well.", |
| "lora_r": 32, |
| "lr": 3e-6, |
| "block_size": 4096 |
| }, |
| { |
| "id": "Qwen/Qwen2.5-3B-Instruct", |
| "params": "3B", |
| "recommended_hardware": "1x T4", |
| "rae_fit": "good", |
| "notes": "Excellent for cost-efficient experimentation. Strong reasoning for size.", |
| "lora_r": 16, |
| "lr": 8e-6, |
| "block_size": 4096 |
| } |
| ], |
| "hardware_tiers": { |
| "free": { |
| "gpu": "T4 16GB (Kaggle/Colab free)", |
| "max_model": "3B (int4) or 1.7B (fp16)", |
| "training_speed": "~50 examples/hour", |
| "recommended_model": "HuggingFaceTB/SmolLM2-1.7B-Instruct" |
| }, |
| "budget": { |
| "gpu": "A10G 24GB (HF Spaces ~$1.05/hr)", |
| "max_model": "7B (int4)", |
| "training_speed": "~150 examples/hour", |
| "recommended_model": "Qwen/Qwen2.5-7B-Instruct" |
| }, |
| "standard": { |
| "gpu": "A100 40GB (HF Spaces ~$4.13/hr)", |
| "max_model": "13B (int4) or 7B (fp16)", |
| "training_speed": "~400 examples/hour", |
| "recommended_model": "Qwen/Qwen2.5-7B-Instruct" |
| }, |
| "premium": { |
| "gpu": "A100 80GB", |
| "max_model": "70B (int4) or 13B (fp16)", |
| "training_speed": "~600 examples/hour", |
| "recommended_model": "google/gemma-2-9b-it" |
| } |
| } |
| } |
|
|