likhonsheikh committed on
Commit
230f696
·
verified ·
1 Parent(s): d30c6be

Add config.json: Model architecture configuration file

Browse files
Files changed (1) hide show
  1. config.json +129 -50
config.json CHANGED
@@ -1,58 +1,137 @@
1
  {
2
- "model_type": "transformer",
3
- "architectures": ["Sheikh2_5CoderForCausalLM"],
 
4
  "max_position_embeddings": 32768,
5
- "vocab_size": 50257,
6
- "hidden_size": 3072,
7
  "num_attention_heads": 16,
8
  "num_key_value_heads": 2,
9
  "num_hidden_layers": 36,
10
  "intermediate_size": 8192,
11
- "pad_token_id": 0,
12
- "eos_token_id": 1,
13
- "bos_token_id": 2,
14
  "rope_theta": 10000.0,
15
- "rope_scaling": {
16
- "type": "linear",
17
- "factor": 8.0
18
- },
19
- "use_cache": true,
20
- "tie_word_embeddings": true,
21
- "layer_norm_epsilon": 1e-6,
22
- "mlp_bias": false,
23
- "attention_bias": true,
24
- "qkv_proj_bias": true,
25
- "rms_norm_eps": 1e-6,
26
- "activation_function": "swiglu",
27
- "torch_dtype": "bfloat16",
28
- "pretraining_tp": 1,
29
- "reduction_factor": 32,
30
- "num_experts_per_tok": 2,
31
- "num_local_experts": 8,
32
- "model_name": "Sheikh-2.5-Coder",
33
- "model_version": "1.0.0",
34
- "training_objectives": [
35
- "causal_language_modeling",
36
- "instruction_tuning",
37
- "code_generation"
38
- ],
39
- "supported_languages": [
40
- "python",
41
- "javascript",
42
- "typescript",
43
- "java",
44
- "cpp",
45
- "c",
46
- "go",
47
- "rust",
48
- "php",
49
- "ruby",
50
- "swift",
51
- "kotlin",
52
- "scala",
53
- "r",
54
- "sql",
55
- "html",
56
- "css"
57
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  }
 
1
  {
2
+ "model_type": "phi",
3
+ "architecture": "MiniMax-M2",
4
+ "vocab_size": 51200,
5
  "max_position_embeddings": 32768,
 
 
6
  "num_attention_heads": 16,
7
  "num_key_value_heads": 2,
8
  "num_hidden_layers": 36,
9
  "intermediate_size": 8192,
10
+ "hidden_size": 2048,
11
+ "rms_norm_epsilon": 1e-6,
 
12
  "rope_theta": 10000.0,
13
+ "pad_token_id": 50256,
14
+ "eos_token_id": 50256,
15
+ "bos_token_id": 50256,
16
+ "torch_dtype": "float16",
17
+
18
+ "model_specifics": {
19
+ "total_parameters": 3090000000,
20
+ "non_embedding_parameters": 2770000000,
21
+ "embedding_parameters": 320000000,
22
+ "parameter_percentage": {
23
+ "embedding_layer": 0.104,
24
+ "transformer_layers": 0.793,
25
+ "layer_norm": 0.003
26
+ }
27
+ },
28
+
29
+ "optimization_config": {
30
+ "quantization": {
31
+ "supported_formats": ["fp32", "fp16", "int8", "int4"],
32
+ "recommended": {
33
+ "memory_optimized": "int8",
34
+ "performance_optimized": "fp16",
35
+ "memory_constrained": "int4"
36
+ }
37
+ },
38
+ "memory_requirements": {
39
+ "fp32": 12.0,
40
+ "fp16": 6.0,
41
+ "int8": 3.5,
42
+ "int4": 2.0,
43
+ "runtime_activation": 0.5
44
+ },
45
+ "inference_optimization": {
46
+ "flash_attention": true,
47
+ "gradient_checkpointing": true,
48
+ "mixed_precision": true,
49
+ "dynamic_batching": false
50
+ }
51
+ },
52
+
53
+ "training_config": {
54
+ "base_model": "microsoft/phi-2",
55
+ "context_length": 32768,
56
+ "batch_size": {
57
+ "train": 8,
58
+ "eval": 8,
59
+ "gradient_accumulation": 4
60
+ },
61
+ "learning_rate": 1e-4,
62
+ "num_epochs": 3,
63
+ "warmup_steps": 1000,
64
+ "max_grad_norm": 1.0,
65
+ "weight_decay": 0.01,
66
+ "logging_steps": 100,
67
+ "save_steps": 1000,
68
+ "eval_steps": 1000
69
+ },
70
+
71
+ "specialization": {
72
+ "primary_languages": ["javascript", "typescript", "xml", "html", "css", "mdx"],
73
+ "domain_focus": "web_development",
74
+ "on_device_ready": true,
75
+ "memory_optimized": true,
76
+ "context_extended": true
77
+ },
78
+
79
+ "evaluation_targets": {
80
+ "mmlu_code_score": ">60%",
81
+ "humaneval": ">40%",
82
+ "codebleu": ">0.65",
83
+ "syntax_validity": ">95%",
84
+ "semantic_coherence": ">0.80"
85
+ },
86
+
87
+ "tokenization": {
88
+ "base_tokenizer": "microsoft/codebert-base",
89
+ "tokenizer_max_length": 8192,
90
+ "special_tokens": {
91
+ "javascript": ["<js>", "</js>", "<function>", "</function>", "<react>", "</react>"],
92
+ "xml": ["<xml>", "</xml>", "<element>", "</element>", "<config>", "</config>"],
93
+ "mdx": ["<mdx>", "</mdx>", "<component>", "</component>", "<interactive>", "</interactive>"]
94
+ }
95
+ },
96
+
97
+ "dataset_distribution": {
98
+ "total_training_tokens": "500B",
99
+ "language_distribution": {
100
+ "javascript_typescript": 0.35,
101
+ "xml_html": 0.25,
102
+ "mdx_markdown": 0.15,
103
+ "css_scss": 0.10,
104
+ "other_languages": 0.15
105
+ },
106
+ "task_distribution": {
107
+ "code_completion": 0.40,
108
+ "instruction_following": 0.25,
109
+ "code_explanation": 0.20,
110
+ "generation": 0.10,
111
+ "debugging": 0.05
112
+ }
113
+ },
114
+
115
+ "quality_metrics": {
116
+ "data_quality_threshold": 0.85,
117
+ "duplication_rate_max": 0.05,
118
+ "language_accuracy": 0.95,
119
+ "syntax_validity_min": 0.90,
120
+ "semantic_coherence_min": 0.75
121
+ },
122
+
123
+ "deployment_config": {
124
+ "target_memory_gb": "6-12",
125
+ "quantization_strategies": {
126
+ "mobile": "int8",
127
+ "edge": "int8",
128
+ "desktop": "fp16",
129
+ "server": "fp16"
130
+ },
131
+ "inference_time_target": {
132
+ "512_tokens": "<100ms",
133
+ "1024_tokens": "<200ms",
134
+ "2048_tokens": "<400ms"
135
+ }
136
+ }
137
  }