{
  "_comment": "Base models tested with RAE training methodology. Ordered by recommendation.",
  "models": [
    {
      "id": "Qwen/Qwen2.5-7B-Instruct",
      "params": "7B",
      "recommended_hardware": "1x A100-40GB or 1x A10G (with int4)",
      "rae_fit": "excellent",
      "notes": "Strong baseline reasoning. Best balance of capability and trainability. Recommended default.",
      "lora_r": 32,
      "lr": 5e-6,
      "block_size": 4096
    },
    {
      "id": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
      "params": "1.7B",
      "recommended_hardware": "1x T4 (even with fp16)",
      "rae_fit": "good",
      "notes": "Fastest iteration. Use for methodology validation before scaling. Can train on free Colab/Kaggle.",
      "lora_r": 16,
      "lr": 1e-5,
      "block_size": 2048
    },
    {
      "id": "meta-llama/Llama-3.2-3B-Instruct",
      "params": "3B",
      "recommended_hardware": "1x T4 (with int4)",
      "rae_fit": "good",
      "notes": "Good middle ground. Strong instruction following helps with phase structure adherence.",
      "lora_r": 16,
      "lr": 8e-6,
      "block_size": 4096
    },
    {
      "id": "mistralai/Mistral-7B-Instruct-v0.3",
      "params": "7B",
      "recommended_hardware": "1x A100-40GB or 1x A10G (with int4)",
      "rae_fit": "good",
      "notes": "Fast inference. Sliding window attention handles long RAE chains well.",
      "lora_r": 32,
      "lr": 5e-6,
      "block_size": 4096
    },
    {
      "id": "google/gemma-2-9b-it",
      "params": "9B",
      "recommended_hardware": "1x A100-80GB or 1x A10G (with int4)",
      "rae_fit": "excellent",
      "notes": "Strong reasoning baseline. Larger capacity absorbs RAE structure well.",
      "lora_r": 32,
      "lr": 3e-6,
      "block_size": 4096
    },
    {
      "id": "Qwen/Qwen2.5-3B-Instruct",
      "params": "3B",
      "recommended_hardware": "1x T4",
      "rae_fit": "good",
      "notes": "Excellent for cost-efficient experimentation. Strong reasoning for size.",
      "lora_r": 16,
      "lr": 8e-6,
      "block_size": 4096
    }
  ],
  "hardware_tiers": {
    "free": {
      "gpu": "T4 16GB (Kaggle/Colab free)",
      "max_model": "3B (int4) or 1.7B (fp16)",
      "training_speed": "~50 examples/hour",
      "recommended_model": "HuggingFaceTB/SmolLM2-1.7B-Instruct"
    },
    "budget": {
      "gpu": "A10G 24GB (HF Spaces ~$1.05/hr)",
      "max_model": "7B (int4)",
      "training_speed": "~150 examples/hour",
      "recommended_model": "Qwen/Qwen2.5-7B-Instruct"
    },
    "standard": {
      "gpu": "A100 40GB (HF Spaces ~$4.13/hr)",
      "max_model": "13B (int4) or 7B (fp16)",
      "training_speed": "~400 examples/hour",
      "recommended_model": "Qwen/Qwen2.5-7B-Instruct"
    },
    "premium": {
      "gpu": "A100 80GB",
      "max_model": "70B (int4) or 13B (fp16)",
      "training_speed": "~600 examples/hour",
      "recommended_model": "google/gemma-2-9b-it"
    }
  }
}