{
  "_comment": "Base models tested with RAE training methodology. Ordered by recommendation.",
  "models": [
    {
      "id": "Qwen/Qwen2.5-7B-Instruct",
      "params": "7B",
      "recommended_hardware": "1x A100-40GB or 1x A10G (with int4)",
      "rae_fit": "excellent",
      "notes": "Strong baseline reasoning. Best balance of capability and trainability. Recommended default.",
      "lora_r": 32,
      "lr": 5e-6,
      "block_size": 4096
    },
    {
      "id": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
      "params": "1.7B",
      "recommended_hardware": "1x T4 (even with fp16)",
      "rae_fit": "good",
      "notes": "Fastest iteration. Use for methodology validation before scaling. Can train on free Colab/Kaggle.",
      "lora_r": 16,
      "lr": 1e-5,
      "block_size": 2048
    },
    {
      "id": "meta-llama/Llama-3.2-3B-Instruct",
      "params": "3B",
      "recommended_hardware": "1x T4 (with int4)",
      "rae_fit": "good",
      "notes": "Good middle ground. Strong instruction following helps with phase structure adherence.",
      "lora_r": 16,
      "lr": 8e-6,
      "block_size": 4096
    },
    {
      "id": "mistralai/Mistral-7B-Instruct-v0.3",
      "params": "7B",
      "recommended_hardware": "1x A100-40GB or 1x A10G (with int4)",
      "rae_fit": "good",
      "notes": "Fast inference. Sliding window attention handles long RAE chains well.",
      "lora_r": 32,
      "lr": 5e-6,
      "block_size": 4096
    },
    {
      "id": "google/gemma-2-9b-it",
      "params": "9B",
      "recommended_hardware": "1x A100-80GB or 1x A10G (with int4)",
      "rae_fit": "excellent",
      "notes": "Strong reasoning baseline. Larger capacity absorbs RAE structure well.",
      "lora_r": 32,
      "lr": 3e-6,
      "block_size": 4096
    },
    {
      "id": "Qwen/Qwen2.5-3B-Instruct",
      "params": "3B",
      "recommended_hardware": "1x T4",
      "rae_fit": "good",
      "notes": "Excellent for cost-efficient experimentation. Strong reasoning for size.",
      "lora_r": 16,
      "lr": 8e-6,
      "block_size": 4096
    }
  ],
  "hardware_tiers": {
    "free": {
      "gpu": "T4 16GB (Kaggle/Colab free)",
      "max_model": "3B (int4) or 1.7B (fp16)",
      "training_speed": "~50 examples/hour",
      "recommended_model": "HuggingFaceTB/SmolLM2-1.7B-Instruct"
    },
    "budget": {
      "gpu": "A10G 24GB (HF Spaces ~$1.05/hr)",
      "max_model": "7B (int4)",
      "training_speed": "~150 examples/hour",
      "recommended_model": "Qwen/Qwen2.5-7B-Instruct"
    },
    "standard": {
      "gpu": "A100 40GB (HF Spaces ~$4.13/hr)",
      "max_model": "13B (int4) or 7B (fp16)",
      "training_speed": "~400 examples/hour",
      "recommended_model": "Qwen/Qwen2.5-7B-Instruct"
    },
    "premium": {
      "gpu": "A100 80GB",
      "max_model": "70B (int4) or 13B (fp16)",
      "training_speed": "~600 examples/hour",
      "recommended_model": "google/gemma-2-9b-it"
    }
  }
}