mike1210 commited on
Commit
b40e905
·
verified ·
1 Parent(s): 0aadc77

Upload logic_mini_config.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. logic_mini_config.py +302 -0
logic_mini_config.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Logic Mini Model Configuration
4
+ Optimized for scientific reasoning and multi-domain expertise
5
+ Based on MiniMind framework, customized for CrowLogic ecosystem
6
+ """
7
+
8
+ from dataclasses import dataclass, field
9
+ from typing import Optional, Dict, List
10
+ import json
11
+
12
+
13
@dataclass
class LogicMiniConfig:
    """
    Configuration for Logic Mini - Deep Scientific Reasoning Model

    Designed for:
    - Multi-domain expertise (mycology, drug discovery, AI systems, business)
    - Chain-of-thought reasoning with <think></think> tags
    - Prologic framework integration (intercept-annotate-correlate)
    - Vertical application deployment
    """

    # --- Model architecture ---
    hidden_size: int = 768        # Larger than base MiniMind for complex reasoning
    num_hidden_layers: int = 16   # Deeper for multi-step logic
    num_attention_heads: int = 12
    num_key_value_heads: int = 4  # Multi-query attention for efficiency
    intermediate_size: Optional[int] = None  # Computed as 8/3 * hidden_size if unset

    # --- Vocabulary ---
    vocab_size: int = 6400                # Can expand if needed for scientific terms
    max_position_embeddings: int = 8192   # Extended context for research papers

    # --- Mixture of Experts (domain specialization) ---
    use_moe: bool = True          # Enable for multi-domain expertise
    n_routed_experts: int = 8     # One per major domain
    n_shared_experts: int = 1
    num_experts_per_tok: int = 2  # Hybrid reasoning across domains
    scoring_func: str = 'softmax'
    aux_loss_alpha: float = 0.01  # Load balancing
    seq_aux: bool = True

    # --- Positional encoding ---
    rope_theta: float = 1e6                  # RoPE base frequency
    rope_scaling: Optional[Dict] = None      # YaRN scaling for extended context

    # --- Normalization ---
    rms_norm_eps: float = 1e-5

    # --- Activation ---
    hidden_act: str = "silu"  # SwiGLU-style activation

    # --- Training ---
    initializer_range: float = 0.02
    use_cache: bool = True
    pad_token_id: int = 0
    bos_token_id: int = 1
    eos_token_id: int = 2
    tie_word_embeddings: bool = False
    flash_attn: bool = True
    dropout: float = 0.0

    # --- Logic Mini specific ---
    reasoning_token_weight: float = 10.0  # Emphasize <think> tag learning
    domain_experts: List[str] = field(default_factory=list)

    def __post_init__(self):
        """Fill in the derived/defaulted fields the caller left unset."""
        if self.intermediate_size is None:
            self.intermediate_size = int(8 * self.hidden_size / 3)
            # Round up to the nearest multiple of 64 for efficiency
            self.intermediate_size = ((self.intermediate_size + 63) // 64) * 64

        if self.rope_scaling is None:
            # Default YaRN scaling for 4x context extension
            self.rope_scaling = {
                "type": "yarn",
                "factor": 4.0,
                "original_max_position_embeddings": 2048,
                "beta_fast": 4,
                "beta_slow": 1
            }

        if not self.domain_experts:
            # Define domain expert specializations
            self.domain_experts = [
                "mycology_cultivation",
                "drug_discovery_chemistry",
                "ai_systems_architecture",
                "prologic_methodology",
                "business_strategy",
                "scientific_reasoning",
                "technical_debugging",
                "general_knowledge"
            ]

    def to_dict(self) -> Dict:
        """Serialize the configuration to a plain dictionary.

        Emits every dataclass field -- so ``from_dict(to_dict())`` is a
        lossless round trip -- plus framework-level keys (``model_type``,
        ``norm_topk_prob``, ``inference_rope_scaling``) that ``from_dict``
        filters back out on load.
        """
        return {
            "model_type": "minimind",
            "hidden_size": self.hidden_size,
            "num_hidden_layers": self.num_hidden_layers,
            "num_attention_heads": self.num_attention_heads,
            "num_key_value_heads": self.num_key_value_heads,
            "intermediate_size": self.intermediate_size,
            "vocab_size": self.vocab_size,
            "max_position_embeddings": self.max_position_embeddings,
            "use_moe": self.use_moe,
            "n_routed_experts": self.n_routed_experts,
            "n_shared_experts": self.n_shared_experts,
            "num_experts_per_tok": self.num_experts_per_tok,
            "rope_theta": self.rope_theta,
            "rope_scaling": self.rope_scaling,
            "rms_norm_eps": self.rms_norm_eps,
            "hidden_act": self.hidden_act,
            "flash_attn": self.flash_attn,
            "dropout": self.dropout,
            "bos_token_id": self.bos_token_id,
            "eos_token_id": self.eos_token_id,
            "pad_token_id": self.pad_token_id,
            "reasoning_token_weight": self.reasoning_token_weight,
            "domain_experts": self.domain_experts,
            "scoring_func": self.scoring_func,
            "aux_loss_alpha": self.aux_loss_alpha,
            "seq_aux": self.seq_aux,
            # Previously omitted fields: without them a save()/load() round
            # trip silently reset non-default values back to the defaults.
            "initializer_range": self.initializer_range,
            "use_cache": self.use_cache,
            "tie_word_embeddings": self.tie_word_embeddings,
            "norm_topk_prob": True,
            "inference_rope_scaling": False
        }

    def save(self, path: str):
        """Save the configuration to *path* as pretty-printed JSON."""
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(self.to_dict(), f, indent=2)

    @classmethod
    def from_dict(cls, config_dict: Dict):
        """Build a config from a dictionary.

        Keys that are not dataclass fields (e.g. ``model_type``) are
        silently dropped, so dicts produced by :meth:`to_dict` -- or richer
        framework config files -- load cleanly.
        """
        valid_keys = {f.name for f in cls.__dataclass_fields__.values()}
        filtered_dict = {k: v for k, v in config_dict.items() if k in valid_keys}
        return cls(**filtered_dict)

    @classmethod
    def load(cls, path: str):
        """Load a configuration previously written by :meth:`save`."""
        with open(path, 'r', encoding='utf-8') as f:
            config_dict = json.load(f)
        return cls.from_dict(config_dict)
151
+
152
+
153
# ---------------------------------------------------------------------------
# Predefined configurations for different use cases
# ---------------------------------------------------------------------------

# Dense (no MoE) model for quick experiments and constrained hardware.
LOGIC_MINI_TINY = LogicMiniConfig(
    hidden_size=512,
    num_attention_heads=8,
    num_key_value_heads=2,
    num_hidden_layers=8,
    max_position_embeddings=2048,
    use_moe=False,
)

# Smallest MoE variant: four routed experts, 4k context.
LOGIC_MINI_SMALL = LogicMiniConfig(
    hidden_size=768,
    num_attention_heads=12,
    num_key_value_heads=4,
    num_hidden_layers=12,
    max_position_embeddings=4096,
    use_moe=True,
    n_routed_experts=4,
)

# Default deployment target: eight routed experts, full 8k context.
LOGIC_MINI_MEDIUM = LogicMiniConfig(
    hidden_size=1024,
    num_attention_heads=16,
    num_key_value_heads=4,
    num_hidden_layers=16,
    max_position_embeddings=8192,
    use_moe=True,
    n_routed_experts=8,
)

# Largest preset: sixteen routed experts for maximum domain coverage.
LOGIC_MINI_LARGE = LogicMiniConfig(
    hidden_size=1536,
    num_attention_heads=24,
    num_key_value_heads=6,
    num_hidden_layers=24,
    max_position_embeddings=8192,
    use_moe=True,
    n_routed_experts=16,
)
192
+
193
+
194
def get_config(size: str = "medium") -> LogicMiniConfig:
    """Return the predefined configuration registered under *size*.

    Args:
        size: One of "tiny", "small", "medium", "large".

    Returns:
        The matching LogicMiniConfig instance.

    Raises:
        ValueError: If *size* names no known preset.
    """
    registry = {
        "tiny": LOGIC_MINI_TINY,
        "small": LOGIC_MINI_SMALL,
        "medium": LOGIC_MINI_MEDIUM,
        "large": LOGIC_MINI_LARGE,
    }

    config = registry.get(size)
    if config is None:
        raise ValueError(f"Unknown size: {size}. Choose from {list(registry.keys())}")
    return config
215
+
216
+
217
+ def print_model_info(config: LogicMiniConfig):
218
+ """Print detailed model information"""
219
+
220
+ # Calculate approximate parameter count
221
+ embedding_params = config.vocab_size * config.hidden_size
222
+
223
+ # Per layer
224
+ attention_params = (
225
+ 4 * config.hidden_size * config.hidden_size + # QKV + O projections
226
+ config.hidden_size # Bias terms
227
+ )
228
+
229
+ if config.use_moe:
230
+ # MoE FFN
231
+ expert_params = (
232
+ config.hidden_size * config.intermediate_size +
233
+ config.intermediate_size * config.hidden_size
234
+ ) * (config.n_routed_experts + config.n_shared_experts)
235
+
236
+ gate_params = config.hidden_size * config.n_routed_experts
237
+ ffn_params = expert_params + gate_params
238
+ else:
239
+ # Standard FFN
240
+ ffn_params = (
241
+ config.hidden_size * config.intermediate_size * 2 + # Gate + Up
242
+ config.intermediate_size * config.hidden_size # Down
243
+ )
244
+
245
+ layer_params = attention_params + ffn_params + config.hidden_size * 4 # Norms
246
+ total_layer_params = layer_params * config.num_hidden_layers
247
+
248
+ lm_head_params = config.vocab_size * config.hidden_size
249
+
250
+ total_params = embedding_params + total_layer_params + lm_head_params
251
+
252
+ print("=" * 70)
253
+ print("LOGIC MINI MODEL CONFIGURATION")
254
+ print("=" * 70)
255
+ print(f"\nArchitecture:")
256
+ print(f" Hidden Size: {config.hidden_size:,}")
257
+ print(f" Layers: {config.num_hidden_layers}")
258
+ print(f" Attention Heads: {config.num_attention_heads}")
259
+ print(f" KV Heads: {config.num_key_value_heads}")
260
+ print(f" Intermediate Size: {config.intermediate_size:,}")
261
+ print(f" Vocabulary Size: {config.vocab_size:,}")
262
+ print(f" Max Context: {config.max_position_embeddings:,} tokens")
263
+
264
+ print(f"\nMixture of Experts:")
265
+ print(f" Enabled: {config.use_moe}")
266
+ if config.use_moe:
267
+ print(f" Routed Experts: {config.n_routed_experts}")
268
+ print(f" Shared Experts: {config.n_shared_experts}")
269
+ print(f" Experts per Token: {config.num_experts_per_tok}")
270
+ print(f" Domain Specializations:")
271
+ for i, domain in enumerate(config.domain_experts, 1):
272
+ print(f" {i}. {domain.replace('_', ' ').title()}")
273
+
274
+ print(f"\nCapabilities:")
275
+ print(f" Chain-of-Thought: ✓ (<think></think> tags)")
276
+ print(f" Prologic Framework: ✓ (intercept-annotate-correlate)")
277
+ print(f" Extended Context: ✓ (YaRN scaling, 4x extension)")
278
+ print(f" Multi-Domain: ✓ ({len(config.domain_experts)} specializations)")
279
+
280
+ print(f"\nParameter Count:")
281
+ print(f" Embeddings: {embedding_params:,} ({embedding_params/1e6:.1f}M)")
282
+ print(f" Transformer Layers: {total_layer_params:,} ({total_layer_params/1e6:.1f}M)")
283
+ print(f" LM Head: {lm_head_params:,} ({lm_head_params/1e6:.1f}M)")
284
+ print(f" TOTAL: {total_params:,} ({total_params/1e6:.1f}M)")
285
+
286
+ print(f"\nTraining Estimates:")
287
+ print(f" GPU Memory (bf16): ~{total_params * 2 / 1e9:.1f} GB")
288
+ print(f" Training Time: ~{total_params / 25.8e6 * 2:.1f} hours (on consumer GPU)")
289
+ print(f" Dataset Needed: ~{total_params / 1e6 * 100:.0f}M tokens minimum")
290
+ print("=" * 70)
291
+
292
+
293
if __name__ == "__main__":
    # Demo: dump the full report for every preset size.
    print("\n🚀 Logic Mini Configuration Examples\n")

    for size in ("tiny", "small", "medium", "large"):
        print(f"\n{'='*70}")
        print(f"LOGIC MINI - {size.upper()}")
        print_model_info(get_config(size))
        print()