likhonsheikh
/

Sheikh-2.5-Coder

phi

Model card Files and versions

xet

Community

likhonsheikh committed on Nov 6, 2025

Commit

230f696

verified ·

1 Parent(s): d30c6be

Add config.json: Model architecture configuration file

Browse files

Files changed (1) hide show

config.json +129 -50

config.json CHANGED Viewed

@@ -1,58 +1,137 @@
 {
-  "model_type": "transformer",
-  "architectures": ["Sheikh2_5CoderForCausalLM"],
   "max_position_embeddings": 32768,
-  "vocab_size": 50257,
-  "hidden_size": 3072,
   "num_attention_heads": 16,
   "num_key_value_heads": 2,
   "num_hidden_layers": 36,
   "intermediate_size": 8192,
-  "pad_token_id": 0,
-  "eos_token_id": 1,
-  "bos_token_id": 2,
   "rope_theta": 10000.0,
-  "rope_scaling": {
-    "type": "linear",
-    "factor": 8.0
-  },
-  "use_cache": true,
-  "tie_word_embeddings": true,
-  "layer_norm_epsilon": 1e-6,
-  "mlp_bias": false,
-  "attention_bias": true,
-  "qkv_proj_bias": true,
-  "rms_norm_eps": 1e-6,
-  "activation_function": "swiglu",
-  "torch_dtype": "bfloat16",
-  "pretraining_tp": 1,
-  "reduction_factor": 32,
-  "num_experts_per_tok": 2,
-  "num_local_experts": 8,
-  "model_name": "Sheikh-2.5-Coder",
-  "model_version": "1.0.0",
-  "training_objectives": [
-    "causal_language_modeling",
-    "instruction_tuning",
-    "code_generation"
-  ],
-  "supported_languages": [
-    "python",
-    "javascript",
-    "typescript",
-    "java",
-    "cpp",
-    "c",
-    "go",
-    "rust",
-    "php",
-    "ruby",
-    "swift",
-    "kotlin",
-    "scala",
-    "r",
-    "sql",
-    "html",
-    "css"
-  ]
 }

 {
+  "model_type": "phi",
+  "architecture": "MiniMax-M2",
+  "vocab_size": 51200,
   "max_position_embeddings": 32768,
   "num_attention_heads": 16,
   "num_key_value_heads": 2,
   "num_hidden_layers": 36,
   "intermediate_size": 8192,
+  "hidden_size": 2048,
+  "rms_norm_epsilon": 1e-6,
   "rope_theta": 10000.0,
+  "pad_token_id": 50256,
+  "eos_token_id": 50256,
+  "bos_token_id": 50256,
+  "torch_dtype": "float16",
+  "model_specifics": {
+    "total_parameters": 3090000000,
+    "non_embedding_parameters": 2770000000,
+    "embedding_parameters": 320000000,
+    "parameter_percentage": {
+      "embedding_layer": 0.104,
+      "transformer_layers": 0.793,
+      "layer_norm": 0.003
+    }
+  },
+  "optimization_config": {
+    "quantization": {
+      "supported_formats": ["fp32", "fp16", "int8", "int4"],
+      "recommended": {
+        "memory_optimized": "int8",
+        "performance_optimized": "fp16",
+        "memory_constrained": "int4"
+      }
+    },
+    "memory_requirements": {
+      "fp32": 12.0,
+      "fp16": 6.0,
+      "int8": 3.5,
+      "int4": 2.0,
+      "runtime_activation": 0.5
+    },
+    "inference_optimization": {
+      "flash_attention": true,
+      "gradient_checkpointing": true,
+      "mixed_precision": true,
+      "dynamic_batching": false
+    }
+  },
+  "training_config": {
+    "base_model": "microsoft/phi-2",
+    "context_length": 32768,
+    "batch_size": {
+      "train": 8,
+      "eval": 8,
+      "gradient_accumulation": 4
+    },
+    "learning_rate": 1e-4,
+    "num_epochs": 3,
+    "warmup_steps": 1000,
+    "max_grad_norm": 1.0,
+    "weight_decay": 0.01,
+    "logging_steps": 100,
+    "save_steps": 1000,
+    "eval_steps": 1000
+  },
+  "specialization": {
+    "primary_languages": ["javascript", "typescript", "xml", "html", "css", "mdx"],
+    "domain_focus": "web_development",
+    "on_device_ready": true,
+    "memory_optimized": true,
+    "context_extended": true
+  },
+  "evaluation_targets": {
+    "mmlu_code_score": ">60%",
+    "humaneval": ">40%",
+    "codebleu": ">0.65",
+    "syntax_validity": ">95%",
+    "semantic_coherence": ">0.80"
+  },
+  "tokenization": {
+    "base_tokenizer": "microsoft/codebert-base",
+    "tokenizer_max_length": 8192,
+    "special_tokens": {
+      "javascript": ["<js>", "</js>", "<function>", "</function>", "<react>", "</react>"],
+      "xml": ["<xml>", "</xml>", "<element>", "</element>", "<config>", "</config>"],
+      "mdx": ["<mdx>", "</mdx>", "<component>", "</component>", "<interactive>", "</interactive>"]
+    }
+  },
+  "dataset_distribution": {
+    "total_training_tokens": "500B",
+    "language_distribution": {
+      "javascript_typescript": 0.35,
+      "xml_html": 0.25,
+      "mdx_markdown": 0.15,
+      "css_scss": 0.10,
+      "other_languages": 0.15
+    },
+    "task_distribution": {
+      "code_completion": 0.40,
+      "instruction_following": 0.25,
+      "code_explanation": 0.20,
+      "generation": 0.10,
+      "debugging": 0.05
+    }
+  },
+  "quality_metrics": {
+    "data_quality_threshold": 0.85,
+    "duplication_rate_max": 0.05,
+    "language_accuracy": 0.95,
+    "syntax_validity_min": 0.90,
+    "semantic_coherence_min": 0.75
+  },
+  "deployment_config": {
+    "target_memory_gb": "6-12",
+    "quantization_strategies": {
+      "mobile": "int8",
+      "edge": "int8",
+      "desktop": "fp16",
+      "server": "fp16"
+    },
+    "inference_time_target": {
+      "512_tokens": "<100ms",
+      "1024_tokens": "<200ms",
+      "2048_tokens": "<400ms"
+    }
+  }
 }