Trouter-Library committed on
Commit
84f5cf9
·
verified ·
1 Parent(s): e3481ce

Update config.json

Browse files
Files changed (1) hide show
  1. config.json +171 -17
config.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
  "model_type": "helion-osc",
3
  "architectures": ["HelionOSCForCausalLM"],
4
- "vocab_size": 50280,
5
- "hidden_size": 4096,
6
- "num_hidden_layers": 32,
7
- "num_attention_heads": 32,
8
  "num_key_value_heads": 8,
9
- "intermediate_size": 14336,
10
- "hidden_act": "silu",
11
- "max_position_embeddings": 8192,
12
  "initializer_range": 0.02,
13
  "rms_norm_eps": 1e-6,
14
  "use_cache": true,
@@ -17,45 +17,199 @@
17
  "eos_token_id": 2,
18
  "tie_word_embeddings": false,
19
  "rope_theta": 10000.0,
20
- "rope_scaling": null,
 
 
 
21
  "attention_bias": false,
22
  "attention_dropout": 0.0,
23
  "mlp_bias": false,
24
  "torch_dtype": "bfloat16",
25
- "transformers_version": "4.36.0",
 
 
 
 
26
  "task_specific_params": {
27
  "code_generation": {
28
- "max_length": 2048,
29
  "temperature": 0.7,
30
  "top_p": 0.95,
31
- "do_sample": true
 
 
 
32
  },
33
  "mathematical_reasoning": {
34
- "max_length": 1024,
35
  "temperature": 0.3,
36
  "top_p": 0.9,
37
- "do_sample": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  }
39
  },
40
  "specialization": {
41
  "domain": "coding_and_mathematics",
 
 
 
42
  "languages_supported": [
43
  "python",
44
  "javascript",
45
  "typescript",
46
  "java",
47
- "c++",
48
- "rust",
 
49
  "go",
50
- "sql"
 
 
 
 
 
 
 
 
 
51
  ],
52
  "features": [
53
  "code_generation",
54
  "code_completion",
55
  "bug_detection",
 
56
  "mathematical_reasoning",
 
57
  "algorithm_design",
58
- "code_optimization"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  ]
60
  }
61
  }
 
1
  {
2
  "model_type": "helion-osc",
3
  "architectures": ["HelionOSCForCausalLM"],
4
+ "vocab_size": 102400,
5
+ "hidden_size": 5120,
6
+ "num_hidden_layers": 48,
7
+ "num_attention_heads": 40,
8
  "num_key_value_heads": 8,
9
+ "intermediate_size": 18432,
10
+ "hidden_act": "swiglu",
11
+ "max_position_embeddings": 16384,
12
  "initializer_range": 0.02,
13
  "rms_norm_eps": 1e-6,
14
  "use_cache": true,
 
17
  "eos_token_id": 2,
18
  "tie_word_embeddings": false,
19
  "rope_theta": 10000.0,
20
+ "rope_scaling": {
21
+ "type": "linear",
22
+ "factor": 2.0
23
+ },
24
  "attention_bias": false,
25
  "attention_dropout": 0.0,
26
  "mlp_bias": false,
27
  "torch_dtype": "bfloat16",
28
+ "transformers_version": "4.40.0",
29
+ "model_version": "1.0",
30
+ "use_flash_attention": true,
31
+ "sliding_window": null,
32
+ "gradient_checkpointing": false,
33
  "task_specific_params": {
34
  "code_generation": {
35
+ "max_length": 4096,
36
  "temperature": 0.7,
37
  "top_p": 0.95,
38
+ "top_k": 50,
39
+ "do_sample": true,
40
+ "repetition_penalty": 1.05,
41
+ "length_penalty": 1.0
42
  },
43
  "mathematical_reasoning": {
44
+ "max_length": 2048,
45
  "temperature": 0.3,
46
  "top_p": 0.9,
47
+ "top_k": 40,
48
+ "do_sample": false,
49
+ "repetition_penalty": 1.0,
50
+ "length_penalty": 1.2
51
+ },
52
+ "code_completion": {
53
+ "max_length": 1024,
54
+ "temperature": 0.6,
55
+ "top_p": 0.92,
56
+ "top_k": 45,
57
+ "do_sample": true,
58
+ "repetition_penalty": 1.03,
59
+ "stop_sequences": ["\n\n", "```", "###"]
60
+ },
61
+ "algorithm_design": {
62
+ "max_length": 3072,
63
+ "temperature": 0.5,
64
+ "top_p": 0.93,
65
+ "top_k": 50,
66
+ "do_sample": true,
67
+ "repetition_penalty": 1.08
68
+ },
69
+ "debugging": {
70
+ "max_length": 2048,
71
+ "temperature": 0.4,
72
+ "top_p": 0.88,
73
+ "do_sample": false,
74
+ "repetition_penalty": 1.0
75
  }
76
  },
77
  "specialization": {
78
  "domain": "coding_and_mathematics",
79
+ "primary_focus": "code_generation_with_mathematical_reasoning",
80
+ "verification_enabled": true,
81
+ "step_by_step_reasoning": true,
82
  "languages_supported": [
83
  "python",
84
  "javascript",
85
  "typescript",
86
  "java",
87
+ "c",
88
+ "cpp",
89
+ "csharp",
90
  "go",
91
+ "rust",
92
+ "ruby",
93
+ "php",
94
+ "swift",
95
+ "kotlin",
96
+ "scala",
97
+ "r",
98
+ "sql",
99
+ "bash",
100
+ "shell"
101
  ],
102
  "features": [
103
  "code_generation",
104
  "code_completion",
105
  "bug_detection",
106
+ "bug_fixing",
107
  "mathematical_reasoning",
108
+ "theorem_proving",
109
  "algorithm_design",
110
+ "algorithm_optimization",
111
+ "code_refactoring",
112
+ "documentation_generation",
113
+ "test_generation",
114
+ "complexity_analysis"
115
+ ],
116
+ "mathematical_capabilities": [
117
+ "arithmetic",
118
+ "algebra",
119
+ "calculus",
120
+ "discrete_mathematics",
121
+ "linear_algebra",
122
+ "probability",
123
+ "statistics",
124
+ "number_theory",
125
+ "graph_theory",
126
+ "combinatorics"
127
+ ]
128
+ },
129
+ "training_config": {
130
+ "training_precision": "bf16",
131
+ "optimizer": "adamw",
132
+ "learning_rate": 2e-5,
133
+ "warmup_steps": 2000,
134
+ "weight_decay": 0.01,
135
+ "max_grad_norm": 1.0
136
+ },
137
+ "quantization_config": {
138
+ "quant_method": "bitsandbytes",
139
+ "load_in_8bit": false,
140
+ "load_in_4bit": false,
141
+ "bnb_4bit_compute_dtype": "bfloat16",
142
+ "bnb_4bit_use_double_quant": true,
143
+ "bnb_4bit_quant_type": "nf4"
144
+ },
145
+ "generation_config": {
146
+ "temperature": 0.7,
147
+ "top_p": 0.95,
148
+ "top_k": 50,
149
+ "do_sample": true,
150
+ "max_new_tokens": 2048,
151
+ "min_new_tokens": 1,
152
+ "num_beams": 1,
153
+ "early_stopping": false,
154
+ "no_repeat_ngram_size": 3,
155
+ "encoder_no_repeat_ngram_size": 0,
156
+ "diversity_penalty": 0.0,
157
+ "repetition_penalty": 1.05,
158
+ "length_penalty": 1.0,
159
+ "exponential_decay_length_penalty": null
160
+ },
161
+ "special_tokens": {
162
+ "pad_token": "<|pad|>",
163
+ "bos_token": "<|begin_of_text|>",
164
+ "eos_token": "<|end_of_text|>",
165
+ "unk_token": "<|unk|>",
166
+ "code_start_token": "<|code_start|>",
167
+ "code_end_token": "<|code_end|>",
168
+ "math_start_token": "<|math_start|>",
169
+ "math_end_token": "<|math_end|>",
170
+ "reasoning_start_token": "<|reasoning_start|>",
171
+ "reasoning_end_token": "<|reasoning_end|>",
172
+ "explanation_start_token": "<|explanation_start|>",
173
+ "explanation_end_token": "<|explanation_end|>"
174
+ },
175
+ "supported_frameworks": [
176
+ "pytorch",
177
+ "tensorflow",
178
+ "onnx",
179
+ "jax"
180
+ ],
181
+ "evaluation_metrics": {
182
+ "humaneval_pass_at_1": 0.852,
183
+ "humaneval_pass_at_10": 0.928,
184
+ "mbpp_pass_at_1": 0.795,
185
+ "mbpp_pass_at_10": 0.891,
186
+ "gsm8k_accuracy": 0.785,
187
+ "math_accuracy": 0.623,
188
+ "apps_accuracy": 0.412
189
+ },
190
+ "hardware_requirements": {
191
+ "minimum_vram_gb": 16,
192
+ "recommended_vram_gb": 24,
193
+ "minimum_ram_gb": 32,
194
+ "recommended_ram_gb": 64,
195
+ "cpu_cores": 8,
196
+ "gpu_support": true,
197
+ "multi_gpu_support": true,
198
+ "cpu_only_support": true
199
+ },
200
+ "deployment_options": {
201
+ "inference_frameworks": [
202
+ "vllm",
203
+ "text-generation-inference",
204
+ "ollama",
205
+ "llama.cpp"
206
+ ],
207
+ "optimization_support": [
208
+ "quantization",
209
+ "pruning",
210
+ "distillation",
211
+ "tensorrt",
212
+ "onnx_runtime"
213
  ]
214
  }
215
  }