WildnerveAI committed on
Commit
4b1fd1d
·
verified ·
1 Parent(s): 238e576

Upload 11 files

Browse files
Files changed (7) hide show
  1. adapter_layer.py +54 -10
  2. config.py +27 -0
  3. main.py +9 -9
  4. model_manager.py +10 -10
  5. optimize_attention.py +124 -0
  6. train_model.py +210 -158
  7. verify_dimensions.py +119 -0
adapter_layer.py CHANGED
@@ -234,7 +234,45 @@ class Wildnerve_tlm01(nn.Module):
234
  logger.error("Could not import load_model_weights - missing dependencies?")
235
  weight_files = {}
236
 
237
- # Rest of model loading code (unchanged)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  # Try to load model_Custm first
239
  if "model_Custm" in self.available_models:
240
  try:
@@ -249,17 +287,23 @@ class Wildnerve_tlm01(nn.Module):
249
  if hasattr(model_custm, "Wildnerve_tlm01"):
250
  logger.info("Creating Wildnerve_tlm01 from model_Custm")
251
  model_class = getattr(model_custm, "Wildnerve_tlm01")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
  self.model = model_class(
253
  tokenizer=self.tokenizer,
254
- vocab_size=50257, # GPT-2 vocab size
255
- specialization="general",
256
- embedding_dim=768,
257
- num_heads=12,
258
- hidden_dim=768,
259
- num_layers=2, # Reduced for memory efficiency
260
- output_size=50257, # Match GPT-2 vocab
261
- dropout=0.1,
262
- max_seq_length=128 # Reduced for memory
263
  )
264
 
265
  # Enhanced weight loading with detailed path information
 
234
  logger.error("Could not import load_model_weights - missing dependencies?")
235
  weight_files = {}
236
 
237
+ # Try to detect weight dimensions to avoid mismatch
238
+ transformer_weight_path = None
239
+ if weight_files and "transformer" in weight_files:
240
+ transformer_weight_path = weight_files["transformer"]
241
+
242
+ # Use config values instead of hardcoding
243
+ try:
244
+ from config import app_config
245
+ transformer_config = getattr(app_config, "TRANSFORMER_CONFIG", {})
246
+
247
+ model_params = {
248
+ "vocab_size": transformer_config.get("VOCAB_SIZE", 50257), # GPT-2 vocab size
249
+ "embedding_dim": transformer_config.get("EMBEDDING_DIM", 768),
250
+ "num_heads": transformer_config.get("NUM_HEADS", 12),
251
+ "hidden_dim": transformer_config.get("HIDDEN_DIM", 768),
252
+ "num_layers": transformer_config.get("NUM_LAYERS", 12),
253
+ "output_size": transformer_config.get("VOCAB_SIZE", 50257),
254
+ "dropout": transformer_config.get("DROPOUT", 0.1),
255
+ "max_seq_length": transformer_config.get("MAX_SEQ_LENGTH", 512)
256
+ }
257
+
258
+ logger.info(f"Using model parameters from config: hidden_dim={model_params['hidden_dim']}")
259
+
260
+ except Exception as e:
261
+ logger.warning(f"Error loading config values: {e}")
262
+ # Fallback to 768-dimensional parameters if config loading fails
263
+ model_params = {
264
+ "vocab_size": 50257, # GPT-2 vocab size
265
+ "embedding_dim": 768,
266
+ "num_heads": 12,
267
+ "hidden_dim": 768,
268
+ "num_layers": 12,
269
+ "output_size": 50257,
270
+ "dropout": 0.1,
271
+ "max_seq_length": 512
272
+ }
273
+ logger.info(f"Using fallback model parameters: hidden_dim={model_params['hidden_dim']}")
274
+
275
+ # Rest of model loading code
276
  # Try to load model_Custm first
277
  if "model_Custm" in self.available_models:
278
  try:
 
287
  if hasattr(model_custm, "Wildnerve_tlm01"):
288
  logger.info("Creating Wildnerve_tlm01 from model_Custm")
289
  model_class = getattr(model_custm, "Wildnerve_tlm01")
290
+
291
+ # Create model with safer config handling
292
+ try:
293
+ # Import config handling
294
+ from config import app_config
295
+ # Ensure config_data exists if app_config is a dict
296
+ if isinstance(app_config, dict) and "TRANSFORMER_CONFIG" in app_config:
297
+ if isinstance(app_config["TRANSFORMER_CONFIG"], dict) and "config_data" not in app_config["TRANSFORMER_CONFIG"]:
298
+ app_config["TRANSFORMER_CONFIG"]["config_data"] = app_config["TRANSFORMER_CONFIG"]
299
+ logger.info("Added config_data attribute to TRANSFORMER_CONFIG dictionary")
300
+ except Exception as config_error:
301
+ logger.warning(f"Config handling error: {config_error}")
302
+
303
+ # Create model with weight-compatible parameters
304
  self.model = model_class(
305
  tokenizer=self.tokenizer,
306
+ **model_params # Use compatible parameters detected from weights
 
 
 
 
 
 
 
 
307
  )
308
 
309
  # Enhanced weight loading with detailed path information
config.py CHANGED
@@ -507,6 +507,33 @@ def load_config() -> Union[AppConfig, Dict[str, Any]]:
507
  # Global application config
508
  app_config = load_config()
509
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
510
  if __name__ == "__main__":
511
  args = argparse.ArgumentParser(description="Tiny Language Model Configuration").parse_args()
512
  print("Configuration loaded successfully!")
 
507
  # Global application config
508
  app_config = load_config()
509
 
510
def get_model_architecture_params(config=None):
    """Return model architecture parameters from the application config.

    load_config() is annotated to return Union[AppConfig, Dict], so the
    TRANSFORMER_CONFIG section may be exposed either as an attribute on an
    object or as a key in a plain dictionary. The previous implementation
    only handled the attribute style and silently fell back to defaults for
    dict-style configs; both shapes are now supported.

    Args:
        config: Optional config object or dict to read from. Defaults to
            the module-level app_config (kept for backward compatibility).

    Returns:
        dict: Constructor keyword arguments for a Wildnerve_tlm01 model,
        falling back to GPT-2-compatible 768-dim defaults per missing key.
    """
    cfg = app_config if config is None else config

    # Locate the TRANSFORMER_CONFIG section whether cfg is an object or a dict.
    if isinstance(cfg, dict):
        tc = cfg.get("TRANSFORMER_CONFIG")
    else:
        tc = getattr(cfg, "TRANSFORMER_CONFIG", None)

    def _get(section, key, default):
        # Read a key from either a dict-style or attribute-style section.
        if section is None:
            return default
        if isinstance(section, dict):
            return section.get(key, default)
        return getattr(section, key, default)

    vocab_size = _get(tc, "VOCAB_SIZE", 50257)  # GPT-2 vocabulary size
    return {
        "vocab_size": vocab_size,
        "embedding_dim": _get(tc, "EMBEDDING_DIM", 768),
        "num_heads": _get(tc, "NUM_HEADS", 12),
        "hidden_dim": _get(tc, "HIDDEN_DIM", 768),
        "num_layers": _get(tc, "NUM_LAYERS", 12),
        # Output projection matches the vocabulary for language modeling.
        "output_size": vocab_size,
        "dropout": _get(tc, "DROPOUT", 0.1),
        "max_seq_length": _get(tc, "MAX_SEQ_LENGTH", 512),
    }
536
+
537
  if __name__ == "__main__":
538
  args = argparse.ArgumentParser(description="Tiny Language Model Configuration").parse_args()
539
  print("Configuration loaded successfully!")
main.py CHANGED
@@ -869,18 +869,18 @@ def initialize_system():
869
  try:
870
  from model_Custm import Wildnerve_tlm01
871
  model = Wildnerve_tlm01(
872
- vocab_size=50257, # Match GPT-2 vocab size
873
  specialization="general",
874
  dataset_path=None,
875
- model_name="gpt2", # Use GPT-2 compatibility
876
- embedding_dim=768,
877
- num_heads=12,
878
- hidden_dim=768,
879
- num_layers=2,
880
- output_size=50257, # Match GPT-2 vocab size
881
  dropout=0.1,
882
- max_seq_length=128,
883
- pooling_mode="mean",
884
  tokenizer=tokenizer
885
  )
886
 
 
869
  try:
870
  from model_Custm import Wildnerve_tlm01
871
  model = Wildnerve_tlm01(
872
+ vocab_size=50257, # GPT-2 vocab size
873
  specialization="general",
874
  dataset_path=None,
875
+ model_name="gpt2",
876
+ embedding_dim=768, # Ensure 768-dimensional model
877
+ num_heads=12, # 12 heads for 768-dim
878
+ hidden_dim=768, # Ensure 768-dimensional model
879
+ num_layers=12, # More layers for larger model
880
+ output_size=50257, # GPT-2 vocab size
881
  dropout=0.1,
882
+ max_seq_length=1024, # Increased for 768-dim model
883
+ pooling_mode="last",
884
  tokenizer=tokenizer
885
  )
886
 
model_manager.py CHANGED
@@ -208,18 +208,18 @@ class ModelManager:
208
  # Import and instantiate model with GPT-2 parameters instead of BERT
209
  model_cls = self._import_model_class(self.selected_models[0])
210
  params = dict(
211
- vocab_size=50257, # GPT-2 vocab size (was 30522 for BERT)
212
  specialization=spec,
213
  dataset_path=dataset_path,
214
- model_name=safe_get_config_value(app_config, "TRANSFORMER_CONFIG", {}).get("MODEL_NAME", "gpt2"), # Changed from bert-base-uncased
215
- embedding_dim=safe_get_config_value(app_config, "TRANSFORMER_CONFIG", {}).get("EMBEDDING_DIM", 768),
216
- num_heads=safe_get_config_value(app_config, "TRANSFORMER_CONFIG", {}).get("NUM_HEADS", 12),
217
- hidden_dim=safe_get_config_value(app_config, "TRANSFORMER_CONFIG", {}).get("HIDDEN_DIM", 768),
218
- num_layers=safe_get_config_value(app_config, "TRANSFORMER_CONFIG", {}).get("NUM_LAYERS", 6),
219
- output_size=safe_get_config_value(app_config, "TRANSFORMER_CONFIG", {}).get("OUTPUT_SIZE", 50257), # Match GPT-2 vocab
220
- dropout=safe_get_config_value(app_config, "TRANSFORMER_CONFIG", {}).get("DROPOUT", 0.1),
221
- max_seq_length=safe_get_config_value(app_config, "TRANSFORMER_CONFIG", {}).get("MAX_SEQ_LENGTH", 1024), # GPT-2 supports longer contexts
222
- pooling_mode=safe_get_config_value(app_config, "TRANSFORMER_CONFIG", {}).get("POOLING_MODE", "last"), # GPT-2 typically uses last token
223
  tokenizer=self.tokenizer
224
  )
225
 
 
208
  # Import and instantiate model with GPT-2 parameters instead of BERT
209
  model_cls = self._import_model_class(self.selected_models[0])
210
  params = dict(
211
+ vocab_size=50257, # GPT-2 vocab size
212
  specialization=spec,
213
  dataset_path=dataset_path,
214
+ model_name=safe_get_config_value(app_config, "TRANSFORMER_CONFIG", {}).get("MODEL_NAME", "gpt2"),
215
+ embedding_dim=768, # Ensure 768-dimensional model
216
+ num_heads=12, # 12 heads for 768-dim
217
+ hidden_dim=768, # Ensure 768-dimensional model
218
+ num_layers=12, # More layers for larger model
219
+ output_size=50257, # GPT-2 vocab size
220
+ dropout=0.1,
221
+ max_seq_length=1024, # Increased for 768-dim model
222
+ pooling_mode=safe_get_config_value(app_config, "TRANSFORMER_CONFIG", {}).get("POOLING_MODE", "last"),
223
  tokenizer=self.tokenizer
224
  )
225
 
optimize_attention.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Optimize smartHybridAttention parameters for 256-dimensional models
3
+ """
4
+ import os
5
+ import json
6
+ import logging
7
+ import torch
8
+ from typing import Dict, Any
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
def optimize_attention_for_small_dimensions(
    dim: int = 256,
    model_dir: str = None
) -> Dict[str, Any]:
    """
    Create optimized attention parameters for small-dimensional models.

    Args:
        dim: Model dimension (default: 256).
        model_dir: Directory to save the optimization settings to, or None
            to skip persisting them (annotated ``str`` for compatibility,
            but None is accepted and is the default).

    Returns:
        Dictionary with optimized attention parameters, including a
        per-layer "LAYER_CONFIG" schedule keyed by layer index string.
    """
    num_heads = 8  # 8 heads works well for 256-dim (32 dims per head)
    if dim % num_heads != 0:
        # An indivisible head count would produce an invalid attention
        # configuration; surface it here instead of failing inside the model.
        logger.warning(f"dim={dim} is not divisible by NUM_HEADS={num_heads}")

    window_size = 512  # larger window to capture more context

    config = {
        "DIM": dim,
        "NUM_HEADS": num_heads,
        "WINDOW_SIZE": window_size,
        "USE_SLIDING": True,
        "USE_GLOBAL": True,
        "USE_HIERARCHICAL": True,  # enable hierarchical attention for 256-dim
        "GLOBAL_TOKEN_RATIO": 0.12,  # more global tokens (12% vs standard 5%)
        "MEMORY_TOKENS": 48,  # more memory tokens (48 vs standard 32)
        # Derived instead of hardcoded so it cannot drift from WINDOW_SIZE.
        "STRIDE": window_size // 2,
        "MAX_SEQ_LENGTH": 2048,  # support longer sequences with sparse attention
        "LAYER_SPECIALIZATION": True,  # each layer can use a different attention mix
        "ATTENTION_DROPOUT": 0.1,
        "RECENCY_BIAS": 0.3,  # prioritize recent context
    }

    # Layer-specific schedule: lower layers focus on local patterns, middle
    # layers use a hybrid approach, upper layers lean on global connections.
    config["LAYER_CONFIG"] = {
        "0": {"WINDOW_SIZE": 128, "GLOBAL_TOKEN_RATIO": 0.05, "USE_HIERARCHICAL": False},
        "1": {"WINDOW_SIZE": 256, "GLOBAL_TOKEN_RATIO": 0.08, "USE_HIERARCHICAL": False},
        "2": {"WINDOW_SIZE": 384, "GLOBAL_TOKEN_RATIO": 0.10, "USE_HIERARCHICAL": True},
        "3": {"WINDOW_SIZE": 512, "GLOBAL_TOKEN_RATIO": 0.12, "USE_HIERARCHICAL": True},
        "4": {"WINDOW_SIZE": 768, "GLOBAL_TOKEN_RATIO": 0.15, "USE_HIERARCHICAL": True},
        "5": {"WINDOW_SIZE": 1024, "GLOBAL_TOKEN_RATIO": 0.18, "USE_HIERARCHICAL": True},
    }

    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
        config_path = os.path.join(model_dir, "attention_config_256dim.json")
        with open(config_path, "w") as f:
            json.dump(config, f, indent=2)
        logger.info(f"Saved optimized attention config to {config_path}")

    return config
64
+
65
def apply_optimized_attention_to_model(
    model,
    dim: int = 256,
    config: Dict[str, Any] = None
) -> bool:
    """
    Apply optimized attention parameters to an existing model.

    Walks the model's submodules, treats any module whose qualified name
    contains "attention" (or that exposes a ``smartHybridAttention``
    attribute) as an attention layer, and assigns matching lower-cased
    attributes onto it from the config.

    Args:
        model: The model to optimize (must provide ``named_modules()``).
        dim: Model dimension used when generating a default config.
        config: Attention configuration; generated via
            optimize_attention_for_small_dimensions(dim) when None.

    Returns:
        True when at least one attention layer was found and updated,
        False otherwise (including on any error, which is logged).
    """
    try:
        if config is None:
            config = optimize_attention_for_small_dimensions(dim)
        # Tolerate caller-supplied configs that lack a per-layer section
        # (previously a KeyError here was swallowed into a generic False).
        layer_overrides = config.get("LAYER_CONFIG", {})

        # Collect candidate attention modules by name or marker attribute.
        attention_layers = [
            (name, module)
            for name, module in model.named_modules()
            if "attention" in name.lower() or hasattr(module, 'smartHybridAttention')
        ]

        if not attention_layers:
            logger.warning("No attention layers found in model")
            return False

        logger.info(f"Found {len(attention_layers)} attention layers to optimize")

        for i, (name, layer) in enumerate(attention_layers):
            # NOTE(review): the layer index is the discovery order from
            # named_modules(), assumed to line up with the "0".."5" keys in
            # LAYER_CONFIG — confirm against the actual module hierarchy.
            layer_config = layer_overrides.get(str(i), {})

            # Layer-specific settings take precedence.
            for key, value in layer_config.items():
                if hasattr(layer, key.lower()):
                    setattr(layer, key.lower(), value)
                    logger.info(f"Set {key.lower()}={value} for layer {name}")

            # Fall back to global settings where no per-layer override exists.
            for key, value in config.items():
                if key != "LAYER_CONFIG" and hasattr(layer, key.lower()) and key not in layer_config:
                    setattr(layer, key.lower(), value)

        logger.info("Successfully applied optimized attention parameters")
        return True

    except Exception as e:
        # Deliberately broad: this optimization is best-effort and must not
        # abort model setup; failure is logged and reported via the return.
        logger.error(f"Error applying attention optimization: {e}")
        return False
119
+
120
if __name__ == "__main__":
    # Standalone usage: emit the default 256-dim attention config to stdout.
    logging.basicConfig(level=logging.INFO)
    config = optimize_attention_for_small_dimensions()
    print("Generated optimized attention config for 256-dim model:")
    print(json.dumps(config, indent=2))
train_model.py CHANGED
@@ -1,177 +1,229 @@
 
 
 
1
  import os
2
- import glob
3
- import time
4
  import torch
5
  import logging
6
- from torch import nn, optim
7
- from accelerate import Accelerator
8
- from torch.utils.data import DataLoader
9
- from typing import Optional, Dict, List, Any
10
- from datasets import load_dataset, concatenate_datasets, Features, Value
11
 
12
- # Import your core model; choose one implementation for training.
13
- from model_Custm import Wildnerve_tlm01
14
 
 
 
15
  logger = logging.getLogger(__name__)
16
- logging.basicConfig(level=logging.INFO)
17
 
18
- # New helper function to flatten JSON with hierarchical markers.
19
- def flatten_json(data):
20
- if isinstance(data, dict):
21
- parts = []
22
- for key, value in data.items():
23
- parts.append(f"{key}:{{{flatten_json(value)}}}")
24
- return " ".join(parts)
25
- elif isinstance(data, list):
26
- # Fixed the typo here: use "=" instead of "are"
27
- parts = [flatten_json(item) for item in data]
28
- return "[" + ", ".join(parts) + "]"
29
- else:
30
- return str(data)
31
-
32
- # New definition for convert_record, which uses flatten_json()
33
- def convert_record(record):
34
- raw = record.get("text", "")
35
- try:
36
- import json
37
- data = json.loads(raw)
38
- combined = flatten_json(data)
39
- return {"input": combined}
40
- except Exception:
41
- return {"input": raw}
42
-
43
- # Import tokenizer to convert text into tensor input
44
- from transformers import AutoTokenizer
45
- tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
46
-
47
- # Updated get_dataset() function to load from Hugging Face repo
48
- def get_dataset(split="train", use_hf_data=True, dataset_repo="EvolphTech/data"):
49
- if use_hf_data:
50
- try:
51
- logger.info(f"Loading dataset from Hugging Face: {dataset_repo}")
52
- dataset = load_dataset(dataset_repo, split=split)
53
-
54
- # If the dataset has a 'text' column, use it directly
55
- if 'text' in dataset.column_names:
56
- dataset = dataset.map(lambda x: {"input": x["text"]})
57
- else:
58
- logger.warning(f"No 'text' column found in {dataset_repo}. Using first text column found.")
59
- # Try to find a text column
60
- text_columns = [col for col in dataset.column_names if dataset.features[col].dtype == 'string']
61
- if text_columns:
62
- dataset = dataset.map(lambda x: {"input": x[text_columns[0]]})
63
- else:
64
- raise ValueError(f"No text columns found in {dataset_repo}")
65
-
66
- logger.info(f"Successfully loaded {len(dataset)} samples from Hugging Face")
67
- except Exception as e:
68
- logger.error(f"Failed to load dataset from Hugging Face: {e}")
69
- logger.info("Falling back to local dataset")
70
- return get_dataset(split=split, use_hf_data=False)
71
- else:
72
- # Fall back to the original local dataset loading logic
73
- data_dir = r"c:\Users\User\OneDrive\Documents\tlm\Wildnerve-tlm_HF"
74
- data_files = {
75
- "train": os.path.join(data_dir, "train.json"),
76
- "validation": os.path.join(data_dir, "validation.json")
77
- }
78
- features = Features({"text": Value("string")})
79
- dataset = load_dataset("json", data_files=data_files, features=features, split=split, download_mode="force_redownload")
80
- dataset = dataset.map(lambda x: {"input": x["text"]})
81
-
82
- class CustomDataset(torch.utils.data.Dataset):
83
- def __init__(self, data):
84
- self.data = data["input"]
85
- def __len__(self):
86
- return len(self.data)
87
- def __getitem__(self, idx):
88
- tokens = tokenizer(self.data[idx], truncation=True, padding="max_length", max_length=128, return_tensors="pt")
89
- return tokens["input_ids"].squeeze(0)
90
 
91
- return CustomDataset(dataset)
92
-
93
- def train(use_hf_data=True, dataset_repo="EvolphTech/data"):
94
- accelerator = Accelerator()
95
- # Use the training split now
96
- train_dataset = get_dataset("train", use_hf_data=use_hf_data, dataset_repo=dataset_repo)
97
- train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
98
-
99
- # Create your model (adjust constructor parameters as needed)
100
- model = Wildnerve_tlm01(
101
- vocab_size=30522,
102
- specialization="general",
103
- dataset_path="",
104
- model_name="bert-base-uncased",
105
- embedding_dim=256,
106
- num_heads=4,
107
- hidden_dim=256,
108
- num_layers=2,
109
- output_size=256,
110
- dropout=0.1,
111
- max_seq_length=128,
112
- pooling_mode="mean",
113
- use_pretrained_encoder=True
114
- )
115
- optimizer = optim.Adam(model.parameters(), lr=0.0001)
116
- # Replace MSELoss with CrossEntropyLoss.
117
- # Note: Assume model output logits are of shape [batch, seq_len, vocab_size]
118
- criterion = nn.CrossEntropyLoss()
119
 
120
- model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader)
 
 
 
121
 
122
- num_epochs = 50 # Change from 30 to 50
123
- for epoch in range(num_epochs):
124
- total_loss = 0.0
125
- for batch in train_loader:
126
- x = batch[..., :-1] # omit last token for inputs
127
- y = batch[..., 1:] # omit first token for labels
128
-
129
- optimizer.zero_grad()
130
- output = model(x) # shape is [batch_size, vocab_size]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
- # Print shapes for debugging
133
- logger.info(f"Epoch {epoch+1}, Output shape: {output.shape}, Target shape: {y.shape}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
- # Since the model returns logits for just one position, take the first token from y
136
- # If your model really needs sequence data, you'd need a different handling strategy
137
- target = y[:, 0].long()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
- # Use target directly - no reshape needed since it's already 1D
140
- loss = criterion(output, target)
141
 
142
- accelerator.backward(loss)
143
- optimizer.step()
144
- total_loss += loss.item()
145
- avg_loss = total_loss / len(train_loader)
146
- logger.info(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")
147
- time.sleep(1) # simulate longer training
148
-
149
- # Save model weights as .pt then convert to .bin
150
- results_dir = r"c:\Users\User\OneDrive\Documents\tlm\results"
151
- os.makedirs(results_dir, exist_ok=True)
152
-
153
- pt_save_path = os.path.join(results_dir, "model_weights.pt")
154
- torch.save(model.state_dict(), pt_save_path)
155
- logger.info(f"Model weights saved to {pt_save_path}")
156
-
157
- # Convert .pt file to .bin with standard Hugging Face filename
158
- bin_save_path = os.path.join(results_dir, "pytorch_model.bin") # Changed filename here
159
- state_dict = torch.load(pt_save_path, weights_only=True)
160
- torch.save(state_dict, bin_save_path)
161
- logger.info(f"Model weights also saved as binary to {bin_save_path} (Hugging Face standard name)")
162
-
163
- # Keep original name for backward compatibility
164
- compat_bin_save_path = os.path.join(results_dir, "model_weights.bin")
165
- torch.save(state_dict, compat_bin_save_path)
166
- logger.info(f"Model weights also saved with original name for compatibility: {compat_bin_save_path}")
 
 
 
 
 
 
 
167
 
 
 
 
 
168
  if __name__ == "__main__":
169
- import argparse
170
- parser = argparse.ArgumentParser(description="Train the model")
171
- parser.add_argument("--use_hf_data", action="store_true", help="Use data from Hugging Face repo")
172
- parser.add_argument("--dataset_repo", type=str, default="EvolphTech/data", help="Hugging Face dataset repository")
173
- parser.add_argument("--epochs", type=int, default=50, help="Number of training epochs")
 
 
 
174
 
175
  args = parser.parse_args()
176
 
177
- train(use_hf_data=args.use_hf_data, dataset_repo=args.dataset_repo)
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Train a new Wildnerve model with parameters loaded from config.json.
3
+ """
4
  import os
5
+ import sys
 
6
  import torch
7
  import logging
8
+ import argparse
9
+ from pathlib import Path
10
+ from typing import Dict, Any, Optional, List, Tuple
 
 
11
 
12
+ # Import configuration
13
+ from config import app_config, get_model_architecture_params
14
 
15
+ # Configure logging
16
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
17
  logger = logging.getLogger(__name__)
 
18
 
19
def train_model(
    specialization: str,
    dataset_path: str,
    output_dir: str,
    num_epochs: Optional[int] = None,
    batch_size: Optional[int] = None,
    learning_rate: Optional[float] = None,
    device: Optional[str] = None
) -> Optional[str]:
    """Train a Wildnerve_tlm01 model with parameters from config.json.

    Architecture parameters (dims, heads, layers, vocab) come from
    get_model_architecture_params(); training hyperparameters come from
    app_config sections, with explicit arguments acting as overrides.

    Args:
        specialization: Label passed to the model constructor and used to
            name the final checkpoint file.
        dataset_path: Path to a JSON file whose records each carry a
            "text" key.
        output_dir: Directory for per-epoch and final checkpoints
            (created if missing).
        num_epochs: Optional override for the configured epoch count.
        batch_size: Optional override for the configured batch size.
        learning_rate: Optional override for the configured learning rate.
        device: Optional device string ("cuda"/"cpu"); auto-detected when
            None.

    Returns:
        Path to the final saved model file, or None if training failed
        (the error is logged with a traceback).
    """
    # Get model architecture parameters from config.json
    arch_params = get_model_architecture_params()
    logger.info(f"Loaded architecture parameters from config: {arch_params}")

    # Get training parameters from config.json.
    # NOTE(review): `x = x or default` treats 0 as "unset" — passing
    # num_epochs=0 or learning_rate=0.0 falls back to the config value.
    if hasattr(app_config, "TRAINING_CONFIG"):
        training_config = app_config.TRAINING_CONFIG
        num_epochs = num_epochs or getattr(training_config, "NUM_EPOCHS", 10)
        learning_rate = learning_rate or getattr(training_config, "LEARNING_RATE", 1e-4)
    elif hasattr(app_config, "TRANSFORMER_CONFIG"):
        transformer_config = app_config.TRANSFORMER_CONFIG
        num_epochs = num_epochs or getattr(transformer_config, "NUM_EPOCHS", 10)
        learning_rate = learning_rate or getattr(transformer_config, "LEARNING_RATE", 1e-4)

    # Get data loader parameters from config.json
    if hasattr(app_config, "DATA_LOADER_CONFIG"):
        data_loader_config = app_config.DATA_LOADER_CONFIG
        batch_size = batch_size or getattr(data_loader_config, "BATCH_SIZE", 32)

    # Use command-line values as overrides, or fall back to defaults
    num_epochs = num_epochs or 10
    batch_size = batch_size or 32
    learning_rate = learning_rate or 1e-4

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Set device
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"Using device: {device}")

    try:
        # Import necessary modules (function-local so the module can still be
        # imported for configuration inspection without these dependencies)
        from model_Custm import Wildnerve_tlm01
        from transformers import AutoTokenizer
        from torch.utils.data import DataLoader, Dataset
        import json

        # Get model name from config
        model_name = getattr(app_config.TRANSFORMER_CONFIG, "MODEL_NAME", "gpt2") if hasattr(app_config, "TRANSFORMER_CONFIG") else "gpt2"

        # Initialize the tokenizer; GPT-2 has no pad token, so reuse EOS.
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Load dataset
        logger.info(f"Loading dataset from {dataset_path}")
        with open(dataset_path, 'r') as f:
            data = json.load(f)

        # Create a simple dataset class.
        # Tokenizes everything up front and holds the tensors in memory —
        # fine for small corpora, memory-heavy for large ones.
        class TextDataset(Dataset):
            def __init__(self, texts, tokenizer, max_length):
                self.encodings = tokenizer(texts, truncation=True, padding="max_length",
                                           max_length=max_length, return_tensors="pt")

            def __getitem__(self, idx):
                item = {key: val[idx] for key, val in self.encodings.items()}
                # NOTE(review): labels are an unshifted copy of input_ids;
                # assumes the model shifts logits/targets internally for
                # next-token prediction — confirm against model_Custm.
                item["labels"] = item["input_ids"].clone()
                return item

            def __len__(self):
                return len(self.encodings["input_ids"])

        # Extract texts from your dataset
        texts = [item["text"] for item in data]

        # Create dataset and dataloader
        train_dataset = TextDataset(texts, tokenizer, arch_params["max_seq_length"])
        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        # Log key parameters
        logger.info(f"Training with parameters:")
        logger.info(f"- specialization: {specialization}")
        logger.info(f"- model_name: {model_name}")
        logger.info(f"- embedding_dim: {arch_params['embedding_dim']}")
        logger.info(f"- hidden_dim: {arch_params['hidden_dim']}")
        logger.info(f"- num_heads: {arch_params['num_heads']}")
        logger.info(f"- num_layers: {arch_params['num_layers']}")
        logger.info(f"- vocab_size: {arch_params['vocab_size']}")
        logger.info(f"- num_epochs: {num_epochs}")
        logger.info(f"- batch_size: {batch_size}")
        logger.info(f"- learning_rate: {learning_rate}")

        # Initialize the model with architecture parameters from config
        model = Wildnerve_tlm01(
            vocab_size=arch_params["vocab_size"],
            specialization=specialization,
            dataset_path=dataset_path,
            model_name=model_name,
            embedding_dim=arch_params["embedding_dim"],
            num_heads=arch_params["num_heads"],
            hidden_dim=arch_params["hidden_dim"],
            num_layers=arch_params["num_layers"],
            output_size=arch_params["vocab_size"],
            dropout=arch_params.get("dropout", 0.1),
            max_seq_length=arch_params["max_seq_length"],
            tokenizer=tokenizer
        )

        # Move model to the device
        model.to(device)

        # Set up optimizer
        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

        # Training loop
        logger.info(f"Starting training for {num_epochs} epochs")
        for epoch in range(num_epochs):
            model.train()
            total_loss = 0

            for batch_idx, batch in enumerate(train_dataloader):
                # Move batch to device
                batch = {k: v.to(device) for k, v in batch.items()}

                # Forward pass.
                # NOTE(review): outputs are assumed to be logits of shape
                # [batch, seq, vocab] so that view(-1, vocab) lines up with
                # the flattened labels — confirm against Wildnerve_tlm01.
                outputs = model(batch["input_ids"],
                                attention_mask=batch.get("attention_mask"))

                # Calculate loss.
                # NOTE(review): no ignore_index is passed, so padded
                # positions contribute to the loss (pad == eos here).
                loss = torch.nn.functional.cross_entropy(
                    outputs.view(-1, outputs.size(-1)),
                    batch["labels"].view(-1)
                )

                # Backward pass
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Track loss
                total_loss += loss.item()

                if (batch_idx + 1) % 10 == 0:
                    logger.info(f"Epoch {epoch+1}/{num_epochs}, Batch {batch_idx+1}/{len(train_dataloader)}, "
                                f"Loss: {loss.item():.4f}")

            avg_loss = total_loss / len(train_dataloader)
            logger.info(f"Epoch {epoch+1}/{num_epochs} completed. Average loss: {avg_loss:.4f}")

            # Save checkpoint (one per epoch; includes optimizer state so
            # training can be resumed, plus the architecture for reloading)
            checkpoint_path = os.path.join(output_dir, f"model_epoch_{epoch+1}.bin")
            torch.save({
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "epoch": epoch,
                "loss": avg_loss,
                "config": {
                    "embedding_dim": arch_params["embedding_dim"],
                    "hidden_dim": arch_params["hidden_dim"],
                    "num_heads": arch_params["num_heads"],
                    "num_layers": arch_params["num_layers"],
                    "vocab_size": arch_params["vocab_size"]
                }
            }, checkpoint_path)
            logger.info(f"Saved checkpoint to {checkpoint_path}")

        # Save final model (weights + architecture only, no optimizer state)
        final_model_path = os.path.join(output_dir, f"{specialization}_final_model.bin")
        torch.save({
            "model_state_dict": model.state_dict(),
            "config": {
                "embedding_dim": arch_params["embedding_dim"],
                "hidden_dim": arch_params["hidden_dim"],
                "num_heads": arch_params["num_heads"],
                "num_layers": arch_params["num_layers"],
                "vocab_size": arch_params["vocab_size"]
            }
        }, final_model_path)
        logger.info(f"Training completed. Final model saved to {final_model_path}")

        return final_model_path

    except Exception as e:
        # Broad catch: training is a top-level operation here; the failure is
        # logged with a traceback and signalled to the caller via None.
        logger.error(f"Error during training: {e}", exc_info=True)
        return None
208
+
209
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Train a Wildnerve model")
    parser.add_argument("--specialization", type=str, default="general", help="Model specialization")
    parser.add_argument("--dataset", type=str, required=True, help="Path to the dataset file")
    parser.add_argument("--output", type=str, default="./checkpoints", help="Output directory")
    parser.add_argument("--epochs", type=int, help="Number of training epochs (overrides config)")
    parser.add_argument("--batch-size", type=int, help="Batch size (overrides config)")
    parser.add_argument("--learning-rate", type=float, help="Learning rate (overrides config)")
    parser.add_argument("--device", type=str, help="Device to use (cuda or cpu)")

    args = parser.parse_args()

    # Unset CLI options arrive as None; train_model then falls back to the
    # values in config.json (and finally to built-in defaults).
    train_model(
        specialization=args.specialization,
        dataset_path=args.dataset,
        output_dir=args.output,
        num_epochs=args.epochs,
        batch_size=args.batch_size,
        learning_rate=args.learning_rate,
        device=args.device
    )
verify_dimensions.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utility to verify model dimensions across the codebase
3
+ """
4
+ import os
5
+ import json
6
+ import logging
7
+ import importlib.util
8
+
9
+ # Configure logging
10
+ logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
11
+ logger = logging.getLogger(__name__)
12
+
13
def check_config_json():
    """Check that config.json declares 768/768/12 model dimensions.

    Returns:
        bool: True when TRANSFORMER_CONFIG exists with EMBEDDING_DIM=768,
        HIDDEN_DIM=768 and NUM_HEADS=12; False otherwise (missing file,
        missing section, or non-standard values).
    """
    try:
        config_path = os.path.join(os.path.dirname(__file__), "config.json")
        with open(config_path, 'r') as f:
            config = json.load(f)

        if "TRANSFORMER_CONFIG" in config:
            tc = config["TRANSFORMER_CONFIG"]
            emb_dim = tc.get("EMBEDDING_DIM", 0)
            hidden_dim = tc.get("HIDDEN_DIM", 0)
            num_heads = tc.get("NUM_HEADS", 0)

            logger.info(f"config.json dimensions: embedding={emb_dim}, hidden={hidden_dim}, heads={num_heads}")

            if emb_dim != 768 or hidden_dim != 768 or num_heads != 12:
                logger.warning(f"config.json has non-standard dimensions! Should be 768/768/12")
                return False
            return True
        # Previously this path fell through and implicitly returned None;
        # report the missing section as an explicit, logged failure.
        logger.warning("config.json has no TRANSFORMER_CONFIG section")
        return False
    except Exception as e:
        logger.error(f"Error checking config.json: {e}")
        return False
35
+
36
def _check_file_dims(filename, bad_markers, good_markers):
    """Shared scanner behind the per-file dimension checks.

    Reads *filename* (relative to this script) and searches for marker
    substrings that indicate the model dimensions used in that file.

    Args:
        filename: Basename of the source file to inspect.
        bad_markers: Substrings indicating stale 256-dim parameters.
        good_markers: Substrings that must ALL be present for a pass.

    Returns:
        bool: True when every good marker is found and no bad marker is;
        False otherwise (including read errors, which are logged).
    """
    try:
        path = os.path.join(os.path.dirname(__file__), filename)
        with open(path, 'r') as f:
            content = f.read()

        if any(marker in content for marker in bad_markers):
            logger.warning(f"{filename} contains 256 dimensions! Update to 768")
            return False
        elif all(marker in content for marker in good_markers):
            logger.info(f"{filename} has correct 768 dimensions")
            return True
        else:
            logger.warning(f"Could not determine dimensions in {filename}")
            return False
    except Exception as e:
        logger.error(f"Error checking {filename}: {e}")
        return False

def check_adapter_layer():
    """Check dimensions in adapter_layer.py"""
    # adapter_layer.py builds a model_params dict, so the markers use the
    # dict-literal spelling ('embedding_dim": 768').
    return _check_file_dims(
        "adapter_layer.py",
        ('embedding_dim": 256', 'hidden_dim": 256'),
        ('embedding_dim": 768', 'hidden_dim": 768'),
    )

def check_model_manager():
    """Check dimensions in model_manager.py"""
    return _check_file_dims(
        "model_manager.py",
        ("embedding_dim=256", "hidden_dim=256"),
        ("embedding_dim=768", "hidden_dim=768"),
    )

def check_main_py():
    """Check dimensions in main.py"""
    return _check_file_dims(
        "main.py",
        ("embedding_dim=256", "hidden_dim=256"),
        ("embedding_dim=768", "hidden_dim=768"),
    )
96
+
97
def verify_all_dimensions():
    """Run every per-file dimension check and print a summary report.

    Returns a truthy value only when every check passed.
    """
    checks = [
        ("config.json", check_config_json),
        ("adapter_layer.py", check_adapter_layer),
        ("model_manager.py", check_model_manager),
        ("main.py", check_main_py),
    ]
    results = {name: check() for name, check in checks}

    print("\n=== MODEL DIMENSION VERIFICATION ===")
    all_correct = True
    for file, correct in results.items():
        if correct:
            status = "✓ CORRECT (768)"
        else:
            status = "✗ INCORRECT (256)"
        print(f"{file:20} : {status}")
        # Fold with `and` (not bool()) to preserve the original return value.
        all_correct = all_correct and correct

    overall = "✓ ALL CORRECT" if all_correct else "✗ NEEDS FIXING"
    print("\nOverall Status:", overall)
    print("\nRun this script after making changes to verify all dimensions are set to 768.\n")

    return all_correct
117
+
118
if __name__ == "__main__":
    # Script entry point: run all dimension checks and print the report.
    verify_all_dimensions()