Kirim-ai
/

Kirim-1-Math

+{
+  "model_config": {
+    "base_model": "Kirim-ai/Kirim-V1-base",
+    "target_model": "Kirim-ai/Kirim-1-Math",
+    "parameters": "30B",
+    "architecture": "KirimForCausalLM",
+    "expansion_method": "width_and_depth"
+  },
+  "training_stages": {
+    "stage_1_expansion": {
+      "description": "Expand from 13B to 30B",
+      "duration_days": 15,
+      "hardware": "512x H100 80GB",
+      "method": "progressive_expansion",
+      "hidden_size": {
+        "from": 4096,
+        "to": 5120
+      },
+      "num_layers": {
+        "from": 32,
+        "to": 48
+      }
+    },
+    "stage_2_math_pretraining": {
+      "description": "Mathematical corpus pre-training",
+      "duration_days": 30,
+      "hardware": "512x H100 80GB",
+      "data": {
+        "total_tokens": "500B",
+        "sources": [
+          "mathematical_proofs",
+          "olympiad_problems",
+          "arxiv_math_papers",
+          "textbooks",
+          "math_stackexchange"
+        ],
+        "distribution": {
+          "proofs": 0.25,
+          "problems": 0.30,
+          "papers": 0.20,
+          "textbooks": 0.15,
+          "qa": 0.10
+        }
+      },
+      "hyperparameters": {
+        "batch_size": 2048,
+        "learning_rate": 1.5e-4,
+        "warmup_steps": 2000,
+        "weight_decay": 0.1,
+        "gradient_clipping": 1.0,
+        "optimizer": "AdamW",
+        "scheduler": "cosine"
+      }
+    },
+    "stage_3_instruction_tuning": {
+      "description": "Mathematical instruction following",
+      "duration_days": 5,
+      "hardware": "128x H100 80GB",
+      "data": {
+        "total_examples": 200000,
+        "categories": {
+          "algebra": 40000,
+          "calculus": 35000,
+          "geometry": 30000,
+          "number_theory": 25000,
+          "probability": 20000,
+          "linear_algebra": 20000,
+          "discrete_math": 15000,
+          "topology": 10000,
+          "other": 5000
+        }
+      },
+      "hyperparameters": {
+        "batch_size": 128,
+        "learning_rate": 2e-5,
+        "num_epochs": 3,
+        "warmup_ratio": 0.1,
+        "weight_decay": 0.01
+      }
+    },
+    "stage_4_tool_calling": {
+      "description": "Tool calling capability training",
+      "duration_days": 3,
+      "hardware": "64x H100 80GB",
+      "data": {
+        "total_examples": 50000,
+        "tool_types": {
+          "calculator": 15000,
+          "symbolic_solver": 12000,
+          "code_executor": 10000,
+          "derivative": 5000,
+          "integrate": 5000,
+          "other_tools": 3000
+        }
+      },
+      "hyperparameters": {
+        "batch_size": 64,
+        "learning_rate": 1e-5,
+        "num_epochs": 2,
+        "gradient_accumulation_steps": 4
+      }
+    },
+    "stage_5_reinforcement_learning": {
+      "description": "RL for solution correctness",
+      "duration_days": 7,
+      "hardware": "256x H100 80GB",
+      "method": "PPO",
+      "reward_model": {
+        "type": "outcome_based",
+        "verification_methods": [
+          "symbolic_verification",
+          "numerical_check",
+          "unit_tests"
+        ]
+      },
+      "hyperparameters": {
+        "ppo_epochs": 4,
+        "batch_size": 512,
+        "learning_rate": 1e-6,
+        "clip_range": 0.2,
+        "value_loss_coef": 0.5,
+        "entropy_coef": 0.01
+      }
+    }
+  },
+  "total_training": {
+    "duration_days": 60,
+    "gpu_hours": 615936,
+    "estimated_cost_usd": 450000,
+    "total_tokens_processed": "1.5T",
+    "checkpoint_frequency": "every_1000_steps"
+  },
+  "optimization": {
+    "precision": "BF16",
+    "gradient_checkpointing": true,
+    "flash_attention": true,
+    "zero_optimization": {
+      "stage": 3,
+      "offload_optimizer": true,
+      "offload_param": false
+    },
+    "tensor_parallelism": 8,
+    "pipeline_parallelism": 4,
+    "data_parallelism": 16
+  },
+  "evaluation": {
+    "frequency": "every_500_steps",
+    "benchmarks": [
+      "GSM8K",
+      "MATH",
+      "MMLU-Math",
+      "Minerva",
+      "AMC10",
+      "AMC12",
+      "AIME"
+    ],
+    "tool_calling_tests": true,
+    "unit_test_coverage": 0.95
+  },
+  "data_sources": {
+    "mathematical_corpus": {
+      "proofs": {
+        "size": "125B tokens",
+        "sources": ["ProofWiki", "Lean", "Coq", "Isabelle"]
+      },
+      "olympiad_problems": {
+        "size": "150B tokens",
+        "sources": ["IMO", "USAMO", "AMC", "AIME", "Putnam"]
+      },
+      "arxiv_papers": {
+        "size": "100B tokens",
+        "categories": ["math.AC", "math.AG", "math.NT", "math.CO"]
+      },
+      "textbooks": {
+        "size": "75B tokens",
+        "levels": ["undergraduate", "graduate", "reference"]
+      },
+      "qa_platforms": {
+        "size": "50B tokens",
+        "sources": ["Math StackExchange", "MathOverflow"]
+      }
+    }
+  }
+}