WildnerveAI committed on
Commit
4b1fd1d
·
verified ·
1 Parent(s): 238e576

Upload 11 files

Browse files
Files changed (7) hide show
  1. adapter_layer.py +54 -10
  2. config.py +27 -0
  3. main.py +9 -9
  4. model_manager.py +10 -10
  5. optimize_attention.py +124 -0
  6. train_model.py +210 -158
  7. verify_dimensions.py +119 -0
adapter_layer.py CHANGED
@@ -234,7 +234,45 @@ class Wildnerve_tlm01(nn.Module):
234
  logger.error("Could not import load_model_weights - missing dependencies?")
235
  weight_files = {}
236
 
237
- # Rest of model loading code (unchanged)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  # Try to load model_Custm first
239
  if "model_Custm" in self.available_models:
240
  try:
@@ -249,17 +287,23 @@ class Wildnerve_tlm01(nn.Module):
249
  if hasattr(model_custm, "Wildnerve_tlm01"):
250
  logger.info("Creating Wildnerve_tlm01 from model_Custm")
251
  model_class = getattr(model_custm, "Wildnerve_tlm01")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
  self.model = model_class(
253
  tokenizer=self.tokenizer,
254
- vocab_size=50257, # GPT-2 vocab size
255
- specialization="general",
256
- embedding_dim=768,
257
- num_heads=12,
258
- hidden_dim=768,
259
- num_layers=2, # Reduced for memory efficiency
260
- output_size=50257, # Match GPT-2 vocab
261
- dropout=0.1,
262
- max_seq_length=128 # Reduced for memory
263
  )
264
 
265
  # Enhanced weight loading with detailed path information
 
234
  logger.error("Could not import load_model_weights - missing dependencies?")
235
  weight_files = {}
236
 
237
+ # Try to detect weight dimensions to avoid mismatch
238
+ transformer_weight_path = None
239
+ if weight_files and "transformer" in weight_files:
240
+ transformer_weight_path = weight_files["transformer"]
241
+
242
+ # Use config values instead of hardcoding
243
+ try:
244
+ from config import app_config
245
+ transformer_config = getattr(app_config, "TRANSFORMER_CONFIG", {})
246
+
247
+ model_params = {
248
+ "vocab_size": transformer_config.get("VOCAB_SIZE", 50257), # GPT-2 vocab size
249
+ "embedding_dim": transformer_config.get("EMBEDDING_DIM", 768),
250
+ "num_heads": transformer_config.get("NUM_HEADS", 12),
251
+ "hidden_dim": transformer_config.get("HIDDEN_DIM", 768),
252
+ "num_layers": transformer_config.get("NUM_LAYERS", 12),
253
+ "output_size": transformer_config.get("VOCAB_SIZE", 50257),
254
+ "dropout": transformer_config.get("DROPOUT", 0.1),
255
+ "max_seq_length": transformer_config.get("MAX_SEQ_LENGTH", 512)
256
+ }
257
+
258
+ logger.info(f"Using model parameters from config: hidden_dim={model_params['hidden_dim']}")
259
+
260
+ except Exception as e:
261
+ logger.warning(f"Error loading config values: {e}")
262
+ # Fallback to 768-dimensional parameters if config loading fails
263
+ model_params = {
264
+ "vocab_size": 50257, # GPT-2 vocab size
265
+ "embedding_dim": 768,
266
+ "num_heads": 12,
267
+ "hidden_dim": 768,
268
+ "num_layers": 12,
269
+ "output_size": 50257,
270
+ "dropout": 0.1,
271
+ "max_seq_length": 512
272
+ }
273
+ logger.info(f"Using fallback model parameters: hidden_dim={model_params['hidden_dim']}")
274
+
275
+ # Rest of model loading code
276
  # Try to load model_Custm first
277
  if "model_Custm" in self.available_models:
278
  try:
 
287
  if hasattr(model_custm, "Wildnerve_tlm01"):
288
  logger.info("Creating Wildnerve_tlm01 from model_Custm")
289
  model_class = getattr(model_custm, "Wildnerve_tlm01")
290
+
291
+ # Create model with safer config handling
292
+ try:
293
+ # Import config handling
294
+ from config import app_config
295
+ # Ensure config_data exists if app_config is a dict
296
+ if isinstance(app_config, dict) and "TRANSFORMER_CONFIG" in app_config:
297
+ if isinstance(app_config["TRANSFORMER_CONFIG"], dict) and "config_data" not in app_config["TRANSFORMER_CONFIG"]:
298
+ app_config["TRANSFORMER_CONFIG"]["config_data"] = app_config["TRANSFORMER_CONFIG"]
299
+ logger.info("Added config_data attribute to TRANSFORMER_CONFIG dictionary")
300
+ except Exception as config_error:
301
+ logger.warning(f"Config handling error: {config_error}")
302
+
303
+ # Create model with weight-compatible parameters
304
  self.model = model_class(
305
  tokenizer=self.tokenizer,
306
+ **model_params # Use compatible parameters detected from weights
 
 
 
 
 
 
 
 
307
  )
308
 
309
  # Enhanced weight loading with detailed path information
config.py CHANGED
@@ -507,6 +507,33 @@ def load_config() -> Union[AppConfig, Dict[str, Any]]:
507
  # Global application config
508
  app_config = load_config()
509
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
510
  if __name__ == "__main__":
511
  args = argparse.ArgumentParser(description="Tiny Language Model Configuration").parse_args()
512
  print("Configuration loaded successfully!")
 
507
  # Global application config
508
  app_config = load_config()
509
 
510
def get_model_architecture_params(config=None):
    """Return model architecture parameters from the application config.

    load_config() is annotated to return Union[AppConfig, Dict], so the
    TRANSFORMER_CONFIG section may be exposed either as an attribute on an
    object or as a key in a plain dictionary. The previous implementation
    only handled the attribute style and silently fell back to defaults for
    dict-style configs; both shapes are now supported.

    Args:
        config: Optional config object or dict to read from. Defaults to
            the module-level app_config (kept for backward compatibility).

    Returns:
        dict: Constructor keyword arguments for a Wildnerve_tlm01 model,
        falling back to GPT-2-compatible 768-dim defaults per missing key.
    """
    cfg = app_config if config is None else config

    # Locate the TRANSFORMER_CONFIG section whether cfg is an object or a dict.
    if isinstance(cfg, dict):
        tc = cfg.get("TRANSFORMER_CONFIG")
    else:
        tc = getattr(cfg, "TRANSFORMER_CONFIG", None)

    def _get(section, key, default):
        # Read a key from either a dict-style or attribute-style section.
        if section is None:
            return default
        if isinstance(section, dict):
            return section.get(key, default)
        return getattr(section, key, default)

    vocab_size = _get(tc, "VOCAB_SIZE", 50257)  # GPT-2 vocabulary size
    return {
        "vocab_size": vocab_size,
        "embedding_dim": _get(tc, "EMBEDDING_DIM", 768),
        "num_heads": _get(tc, "NUM_HEADS", 12),
        "hidden_dim": _get(tc, "HIDDEN_DIM", 768),
        "num_layers": _get(tc, "NUM_LAYERS", 12),
        # Output projection matches the vocabulary for language modeling.
        "output_size": vocab_size,
        "dropout": _get(tc, "DROPOUT", 0.1),
        "max_seq_length": _get(tc, "MAX_SEQ_LENGTH", 512),
    }
536
+
537
  if __name__ == "__main__":
538
  args = argparse.ArgumentParser(description="Tiny Language Model Configuration").parse_args()
539
  print("Configuration loaded successfully!")
main.py CHANGED
@@ -869,18 +869,18 @@ def initialize_system():
869
  try:
870
  from model_Custm import Wildnerve_tlm01
871
  model = Wildnerve_tlm01(
872
- vocab_size=50257, # Match GPT-2 vocab size
873
  specialization="general",
874
  dataset_path=None,
875
- model_name="gpt2", # Use GPT-2 compatibility
876
- embedding_dim=768,
877
- num_heads=12,
878
- hidden_dim=768,
879
- num_layers=2,
880
- output_size=50257, # Match GPT-2 vocab size
881
  dropout=0.1,
882
- max_seq_length=128,
883
- pooling_mode="mean",
884
  tokenizer=tokenizer
885
  )
886
 
 
869
  try:
870
  from model_Custm import Wildnerve_tlm01
871
  model = Wildnerve_tlm01(
872
+ vocab_size=50257, # GPT-2 vocab size
873
  specialization="general",
874
  dataset_path=None,
875
+ model_name="gpt2",
876
+ embedding_dim=768, # Ensure 768-dimensional model
877
+ num_heads=12, # 12 heads for 768-dim
878
+ hidden_dim=768, # Ensure 768-dimensional model
879
+ num_layers=12, # More layers for larger model
880
+ output_size=50257, # GPT-2 vocab size
881
  dropout=0.1,
882
+ max_seq_length=1024, # Increased for 768-dim model
883
+ pooling_mode="last",
884
  tokenizer=tokenizer
885
  )
886
 
model_manager.py CHANGED
@@ -208,18 +208,18 @@ class ModelManager:
208
  # Import and instantiate model with GPT-2 parameters instead of BERT
209
  model_cls = self._import_model_class(self.selected_models[0])
210
  params = dict(
211
- vocab_size=50257, # GPT-2 vocab size (was 30522 for BERT)
212
  specialization=spec,
213
  dataset_path=dataset_path,
214
- model_name=safe_get_config_value(app_config, "TRANSFORMER_CONFIG", {}).get("MODEL_NAME", "gpt2"), # Changed from bert-base-uncased
215
- embedding_dim=safe_get_config_value(app_config, "TRANSFORMER_CONFIG", {}).get("EMBEDDING_DIM", 768),
216
- num_heads=safe_get_config_value(app_config, "TRANSFORMER_CONFIG", {}).get("NUM_HEADS", 12),
217
- hidden_dim=safe_get_config_value(app_config, "TRANSFORMER_CONFIG", {}).get("HIDDEN_DIM", 768),
218
- num_layers=safe_get_config_value(app_config, "TRANSFORMER_CONFIG", {}).get("NUM_LAYERS", 6),
219
- output_size=safe_get_config_value(app_config, "TRANSFORMER_CONFIG", {}).get("OUTPUT_SIZE", 50257), # Match GPT-2 vocab
220
- dropout=safe_get_config_value(app_config, "TRANSFORMER_CONFIG", {}).get("DROPOUT", 0.1),
221
- max_seq_length=safe_get_config_value(app_config, "TRANSFORMER_CONFIG", {}).get("MAX_SEQ_LENGTH", 1024), # GPT-2 supports longer contexts
222
- pooling_mode=safe_get_config_value(app_config, "TRANSFORMER_CONFIG", {}).get("POOLING_MODE", "last"), # GPT-2 typically uses last token
223
  tokenizer=self.tokenizer
224
  )
225
 
 
208
  # Import and instantiate model with GPT-2 parameters instead of BERT
209
  model_cls = self._import_model_class(self.selected_models[0])
210
  params = dict(
211
+ vocab_size=50257, # GPT-2 vocab size
212
  specialization=spec,
213
  dataset_path=dataset_path,
214
+ model_name=safe_get_config_value(app_config, "TRANSFORMER_CONFIG", {}).get("MODEL_NAME", "gpt2"),
215
+ embedding_dim=768, # Ensure 768-dimensional model
216
+ num_heads=12, # 12 heads for 768-dim
217
+ hidden_dim=768, # Ensure 768-dimensional model
218
+ num_layers=12, # More layers for larger model
219
+ output_size=50257, # GPT-2 vocab size
220
+ dropout=0.1,
221
+ max_seq_length=1024, # Increased for 768-dim model
222
+ pooling_mode=safe_get_config_value(app_config, "TRANSFORMER_CONFIG", {}).get("POOLING_MODE", "last"),
223
  tokenizer=self.tokenizer
224
  )
225
 
optimize_attention.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Optimize smartHybridAttention parameters for 256-dimensional models
3
+ """
4
+ import os
5
+ import json
6
+ import logging
7
+ import torch
8
+ from typing import Dict, Any
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
def optimize_attention_for_small_dimensions(
    dim: int = 256,
    model_dir: str = None
) -> Dict[str, Any]:
    """
    Create optimized attention parameters for small-dimensional models.

    Args:
        dim: Model dimension (default: 256).
        model_dir: Directory to save the optimization settings to, or None
            to skip persisting them (annotated ``str`` for compatibility,
            but None is accepted and is the default).

    Returns:
        Dictionary with optimized attention parameters, including a
        per-layer "LAYER_CONFIG" schedule keyed by layer index string.
    """
    num_heads = 8  # 8 heads works well for 256-dim (32 dims per head)
    if dim % num_heads != 0:
        # An indivisible head count would produce an invalid attention
        # configuration; surface it here instead of failing inside the model.
        logger.warning(f"dim={dim} is not divisible by NUM_HEADS={num_heads}")

    window_size = 512  # larger window to capture more context

    config = {
        "DIM": dim,
        "NUM_HEADS": num_heads,
        "WINDOW_SIZE": window_size,
        "USE_SLIDING": True,
        "USE_GLOBAL": True,
        "USE_HIERARCHICAL": True,  # enable hierarchical attention for 256-dim
        "GLOBAL_TOKEN_RATIO": 0.12,  # more global tokens (12% vs standard 5%)
        "MEMORY_TOKENS": 48,  # more memory tokens (48 vs standard 32)
        # Derived instead of hardcoded so it cannot drift from WINDOW_SIZE.
        "STRIDE": window_size // 2,
        "MAX_SEQ_LENGTH": 2048,  # support longer sequences with sparse attention
        "LAYER_SPECIALIZATION": True,  # each layer can use a different attention mix
        "ATTENTION_DROPOUT": 0.1,
        "RECENCY_BIAS": 0.3,  # prioritize recent context
    }

    # Layer-specific schedule: lower layers focus on local patterns, middle
    # layers use a hybrid approach, upper layers lean on global connections.
    config["LAYER_CONFIG"] = {
        "0": {"WINDOW_SIZE": 128, "GLOBAL_TOKEN_RATIO": 0.05, "USE_HIERARCHICAL": False},
        "1": {"WINDOW_SIZE": 256, "GLOBAL_TOKEN_RATIO": 0.08, "USE_HIERARCHICAL": False},
        "2": {"WINDOW_SIZE": 384, "GLOBAL_TOKEN_RATIO": 0.10, "USE_HIERARCHICAL": True},
        "3": {"WINDOW_SIZE": 512, "GLOBAL_TOKEN_RATIO": 0.12, "USE_HIERARCHICAL": True},
        "4": {"WINDOW_SIZE": 768, "GLOBAL_TOKEN_RATIO": 0.15, "USE_HIERARCHICAL": True},
        "5": {"WINDOW_SIZE": 1024, "GLOBAL_TOKEN_RATIO": 0.18, "USE_HIERARCHICAL": True},
    }

    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
        config_path = os.path.join(model_dir, "attention_config_256dim.json")
        with open(config_path, "w") as f:
            json.dump(config, f, indent=2)
        logger.info(f"Saved optimized attention config to {config_path}")

    return config
64
+
65
def apply_optimized_attention_to_model(
    model,
    dim: int = 256,
    config: Dict[str, Any] = None
) -> bool:
    """
    Apply optimized attention parameters to an existing model.

    Walks the model's submodules, treats any module whose qualified name
    contains "attention" (or that exposes a ``smartHybridAttention``
    attribute) as an attention layer, and assigns matching lower-cased
    attributes onto it from the config.

    Args:
        model: The model to optimize (must provide ``named_modules()``).
        dim: Model dimension used when generating a default config.
        config: Attention configuration; generated via
            optimize_attention_for_small_dimensions(dim) when None.

    Returns:
        True when at least one attention layer was found and updated,
        False otherwise (including on any error, which is logged).
    """
    try:
        if config is None:
            config = optimize_attention_for_small_dimensions(dim)
        # Tolerate caller-supplied configs that lack a per-layer section
        # (previously a KeyError here was swallowed into a generic False).
        layer_overrides = config.get("LAYER_CONFIG", {})

        # Collect candidate attention modules by name or marker attribute.
        attention_layers = [
            (name, module)
            for name, module in model.named_modules()
            if "attention" in name.lower() or hasattr(module, 'smartHybridAttention')
        ]

        if not attention_layers:
            logger.warning("No attention layers found in model")
            return False

        logger.info(f"Found {len(attention_layers)} attention layers to optimize")

        for i, (name, layer) in enumerate(attention_layers):
            # NOTE(review): the layer index is the discovery order from
            # named_modules(), assumed to line up with the "0".."5" keys in
            # LAYER_CONFIG — confirm against the actual module hierarchy.
            layer_config = layer_overrides.get(str(i), {})

            # Layer-specific settings take precedence.
            for key, value in layer_config.items():
                if hasattr(layer, key.lower()):
                    setattr(layer, key.lower(), value)
                    logger.info(f"Set {key.lower()}={value} for layer {name}")

            # Fall back to global settings where no per-layer override exists.
            for key, value in config.items():
                if key != "LAYER_CONFIG" and hasattr(layer, key.lower()) and key not in layer_config:
                    setattr(layer, key.lower(), value)

        logger.info("Successfully applied optimized attention parameters")
        return True

    except Exception as e:
        # Deliberately broad: this optimization is best-effort and must not
        # abort model setup; failure is logged and reported via the return.
        logger.error(f"Error applying attention optimization: {e}")
        return False
119
+
120
if __name__ == "__main__":
    # Standalone usage: emit the default 256-dim attention config to stdout.
    logging.basicConfig(level=logging.INFO)
    config = optimize_attention_for_small_dimensions()
    print("Generated optimized attention config for 256-dim model:")
    print(json.dumps(config, indent=2))
train_model.py CHANGED
@@ -1,177 +1,229 @@
 
 
 
1
  import os
2
- import glob
3
- import time
4
  import torch
5
  import logging
6
- from torch import nn, optim
7
- from accelerate import Accelerator
8
- from torch.utils.data import DataLoader
9
- from typing import Optional, Dict, List, Any
10
- from datasets import load_dataset, concatenate_datasets, Features, Value
11
 
12
- # Import your core model; choose one implementation for training.
13
- from model_Custm import Wildnerve_tlm01
14
 
 
 
15
  logger = logging.getLogger(__name__)
16
- logging.basicConfig(level=logging.INFO)
17
 
18
- # New helper function to flatten JSON with hierarchical markers.
19
- def flatten_json(data):
20
- if isinstance(data, dict):
21
- parts = []
22
- for key, value in data.items():
23
- parts.append(f"{key}:{{{flatten_json(value)}}}")
24
- return " ".join(parts)
25
- elif isinstance(data, list):
26
- # Fixed the typo here: use "=" instead of "are"
27
- parts = [flatten_json(item) for item in data]
28
- return "[" + ", ".join(parts) + "]"
29
- else:
30
- return str(data)
31
-
32
- # New definition for convert_record, which uses flatten_json()
33
- def convert_record(record):
34
- raw = record.get("text", "")
35
- try:
36
- import json
37
- data = json.loads(raw)
38
- combined = flatten_json(data)
39
- return {"input": combined}
40
- except Exception:
41
- return {"input": raw}
42
-
43
- # Import tokenizer to convert text into tensor input
44
- from transformers import AutoTokenizer
45
- tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
46
-
47
- # Updated get_dataset() function to load from Hugging Face repo
48
- def get_dataset(split="train", use_hf_data=True, dataset_repo="EvolphTech/data"):
49
- if use_hf_data:
50
- try:
51
- logger.info(f"Loading dataset from Hugging Face: {dataset_repo}")
52
- dataset = load_dataset(dataset_repo, split=split)
53
-
54
- # If the dataset has a 'text' column, use it directly
55
- if 'text' in dataset.column_names:
56
- dataset = dataset.map(lambda x: {"input": x["text"]})
57
- else:
58
- logger.warning(f"No 'text' column found in {dataset_repo}. Using first text column found.")
59
- # Try to find a text column
60
- text_columns = [col for col in dataset.column_names if dataset.features[col].dtype == 'string']
61
- if text_columns:
62
- dataset = dataset.map(lambda x: {"input": x[text_columns[0]]})
63
- else:
64
- raise ValueError(f"No text columns found in {dataset_repo}")
65
-
66
- logger.info(f"Successfully loaded {len(dataset)} samples from Hugging Face")
67
- except Exception as e:
68
- logger.error(f"Failed to load dataset from Hugging Face: {e}")
69
- logger.info("Falling back to local dataset")
70
- return get_dataset(split=split, use_hf_data=False)
71
- else:
72
- # Fall back to the original local dataset loading logic
73
- data_dir = r"c:\Users\User\OneDrive\Documents\tlm\Wildnerve-tlm_HF"
74
- data_files = {
75
- "train": os.path.join(data_dir, "train.json"),
76
- "validation": os.path.join(data_dir, "validation.json")
77
- }
78
- features = Features({"text": Value("string")})
79
- dataset = load_dataset("json", data_files=data_files, features=features, split=split, download_mode="force_redownload")
80
- dataset = dataset.map(lambda x: {"input": x["text"]})
81
-
82
- class CustomDataset(torch.utils.data.Dataset):
83
- def __init__(self, data):
84
- self.data = data["input"]
85
- def __len__(self):
86
- return len(self.data)
87
- def __getitem__(self, idx):
88
- tokens = tokenizer(self.data[idx], truncation=True, padding="max_length", max_length=128, return_tensors="pt")
89
- return tokens["input_ids"].squeeze(0)
90
 
91
- return CustomDataset(dataset)
92
-
93
- def train(use_hf_data=True, dataset_repo="EvolphTech/data"):
94
- accelerator = Accelerator()
95
- # Use the training split now
96
- train_dataset = get_dataset("train", use_hf_data=use_hf_data, dataset_repo=dataset_repo)
97
- train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
98
-
99
- # Create your model (adjust constructor parameters as needed)
100
- model = Wildnerve_tlm01(
101
- vocab_size=30522,
102
- specialization="general",
103
- dataset_path="",
104
- model_name="bert-base-uncased",
105
- embedding_dim=256,
106
- num_heads=4,
107
- hidden_dim=256,
108
- num_layers=2,
109
- output_size=256,
110
- dropout=0.1,
111
- max_seq_length=128,
112
- pooling_mode="mean",
113
- use_pretrained_encoder=True
114
- )
115
- optimizer = optim.Adam(model.parameters(), lr=0.0001)
116
- # Replace MSELoss with CrossEntropyLoss.
117
- # Note: Assume model output logits are of shape [batch, seq_len, vocab_size]
118
- criterion = nn.CrossEntropyLoss()
119
 
120
- model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader)
 
 
 
121
 
122
- num_epochs = 50 # Change from 30 to 50
123
- for epoch in range(num_epochs):
124
- total_loss = 0.0
125
- for batch in train_loader:
126
- x = batch[..., :-1] # omit last token for inputs
127
- y = batch[..., 1:] # omit first token for labels
128
-
129
- optimizer.zero_grad()
130
- output = model(x) # shape is [batch_size, vocab_size]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
- # Print shapes for debugging
133
- logger.info(f"Epoch {epoch+1}, Output shape: {output.shape}, Target shape: {y.shape}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
- # Since the model returns logits for just one position, take the first token from y
136
- # If your model really needs sequence data, you'd need a different handling strategy
137
- target = y[:, 0].long()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
- # Use target directly - no reshape needed since it's already 1D
140
- loss = criterion(output, target)
141
 
142
- accelerator.backward(loss)
143
- optimizer.step()
144
- total_loss += loss.item()
145
- avg_loss = total_loss / len(train_loader)
146
- logger.info(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")
147
- time.sleep(1) # simulate longer training
148
-
149
- # Save model weights as .pt then convert to .bin
150
- results_dir = r"c:\Users\User\OneDrive\Documents\tlm\results"
151
- os.makedirs(results_dir, exist_ok=True)
152
-
153
- pt_save_path = os.path.join(results_dir, "model_weights.pt")
154
- torch.save(model.state_dict(), pt_save_path)
155
- logger.info(f"Model weights saved to {pt_save_path}")
156
-
157
- # Convert .pt file to .bin with standard Hugging Face filename
158
- bin_save_path = os.path.join(results_dir, "pytorch_model.bin") # Changed filename here
159
- state_dict = torch.load(pt_save_path, weights_only=True)
160
- torch.save(state_dict, bin_save_path)
161
- logger.info(f"Model weights also saved as binary to {bin_save_path} (Hugging Face standard name)")
162
-
163
- # Keep original name for backward compatibility
164
- compat_bin_save_path = os.path.join(results_dir, "model_weights.bin")
165
- torch.save(state_dict, compat_bin_save_path)
166
- logger.info(f"Model weights also saved with original name for compatibility: {compat_bin_save_path}")
 
 
 
 
 
 
 
167
 
 
 
 
 
168
  if __name__ == "__main__":
169
- import argparse
170
- parser = argparse.ArgumentParser(description="Train the model")
171
- parser.add_argument("--use_hf_data", action="store_true", help="Use data from Hugging Face repo")
172
- parser.add_argument("--dataset_repo", type=str, default="EvolphTech/data", help="Hugging Face dataset repository")
173
- parser.add_argument("--epochs", type=int, default=50, help="Number of training epochs")
 
 
 
174
 
175
  args = parser.parse_args()
176
 
177
- train(use_hf_data=args.use_hf_data, dataset_repo=args.dataset_repo)
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Train a new Wildnerve model with parameters loaded from config.json.
3
+ """
4
  import os
5
+ import sys
 
6
  import torch
7
  import logging
8
+ import argparse
9
+ from pathlib import Path
10
+ from typing import Dict, Any, Optional, List, Tuple
 
 
11
 
12
+ # Import configuration
13
+ from config import app_config, get_model_architecture_params
14
 
15
+ # Configure logging
16
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
17
  logger = logging.getLogger(__name__)
 
18
 
19
def train_model(
    specialization: str,
    dataset_path: str,
    output_dir: str,
    num_epochs: Optional[int] = None,
    batch_size: Optional[int] = None,
    learning_rate: Optional[float] = None,
    device: Optional[str] = None
) -> Optional[str]:
    """Train a Wildnerve_tlm01 model with parameters from config.json.

    Architecture parameters (dims, heads, layers, vocab) come from
    get_model_architecture_params(); training hyperparameters come from
    app_config sections, with explicit arguments acting as overrides.

    Args:
        specialization: Label passed to the model constructor and used to
            name the final checkpoint file.
        dataset_path: Path to a JSON file whose records each carry a
            "text" key.
        output_dir: Directory for per-epoch and final checkpoints
            (created if missing).
        num_epochs: Optional override for the configured epoch count.
        batch_size: Optional override for the configured batch size.
        learning_rate: Optional override for the configured learning rate.
        device: Optional device string ("cuda"/"cpu"); auto-detected when
            None.

    Returns:
        Path to the final saved model file, or None if training failed
        (the error is logged with a traceback).
    """
    # Get model architecture parameters from config.json
    arch_params = get_model_architecture_params()
    logger.info(f"Loaded architecture parameters from config: {arch_params}")

    # Get training parameters from config.json.
    # NOTE(review): `x = x or default` treats 0 as "unset" — passing
    # num_epochs=0 or learning_rate=0.0 falls back to the config value.
    if hasattr(app_config, "TRAINING_CONFIG"):
        training_config = app_config.TRAINING_CONFIG
        num_epochs = num_epochs or getattr(training_config, "NUM_EPOCHS", 10)
        learning_rate = learning_rate or getattr(training_config, "LEARNING_RATE", 1e-4)
    elif hasattr(app_config, "TRANSFORMER_CONFIG"):
        transformer_config = app_config.TRANSFORMER_CONFIG
        num_epochs = num_epochs or getattr(transformer_config, "NUM_EPOCHS", 10)
        learning_rate = learning_rate or getattr(transformer_config, "LEARNING_RATE", 1e-4)

    # Get data loader parameters from config.json
    if hasattr(app_config, "DATA_LOADER_CONFIG"):
        data_loader_config = app_config.DATA_LOADER_CONFIG
        batch_size = batch_size or getattr(data_loader_config, "BATCH_SIZE", 32)

    # Use command-line values as overrides, or fall back to defaults
    num_epochs = num_epochs or 10
    batch_size = batch_size or 32
    learning_rate = learning_rate or 1e-4

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Set device
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"Using device: {device}")

    try:
        # Import necessary modules (function-local so the module can still be
        # imported for configuration inspection without these dependencies)
        from model_Custm import Wildnerve_tlm01
        from transformers import AutoTokenizer
        from torch.utils.data import DataLoader, Dataset
        import json

        # Get model name from config
        model_name = getattr(app_config.TRANSFORMER_CONFIG, "MODEL_NAME", "gpt2") if hasattr(app_config, "TRANSFORMER_CONFIG") else "gpt2"

        # Initialize the tokenizer; GPT-2 has no pad token, so reuse EOS.
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Load dataset
        logger.info(f"Loading dataset from {dataset_path}")
        with open(dataset_path, 'r') as f:
            data = json.load(f)

        # Create a simple dataset class.
        # Tokenizes everything up front and holds the tensors in memory —
        # fine for small corpora, memory-heavy for large ones.
        class TextDataset(Dataset):
            def __init__(self, texts, tokenizer, max_length):
                self.encodings = tokenizer(texts, truncation=True, padding="max_length",
                                           max_length=max_length, return_tensors="pt")

            def __getitem__(self, idx):
                item = {key: val[idx] for key, val in self.encodings.items()}
                # NOTE(review): labels are an unshifted copy of input_ids;
                # assumes the model shifts logits/targets internally for
                # next-token prediction — confirm against model_Custm.
                item["labels"] = item["input_ids"].clone()
                return item

            def __len__(self):
                return len(self.encodings["input_ids"])

        # Extract texts from your dataset
        texts = [item["text"] for item in data]

        # Create dataset and dataloader
        train_dataset = TextDataset(texts, tokenizer, arch_params["max_seq_length"])
        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        # Log key parameters
        logger.info(f"Training with parameters:")
        logger.info(f"- specialization: {specialization}")
        logger.info(f"- model_name: {model_name}")
        logger.info(f"- embedding_dim: {arch_params['embedding_dim']}")
        logger.info(f"- hidden_dim: {arch_params['hidden_dim']}")
        logger.info(f"- num_heads: {arch_params['num_heads']}")
        logger.info(f"- num_layers: {arch_params['num_layers']}")
        logger.info(f"- vocab_size: {arch_params['vocab_size']}")
        logger.info(f"- num_epochs: {num_epochs}")
        logger.info(f"- batch_size: {batch_size}")
        logger.info(f"- learning_rate: {learning_rate}")

        # Initialize the model with architecture parameters from config
        model = Wildnerve_tlm01(
            vocab_size=arch_params["vocab_size"],
            specialization=specialization,
            dataset_path=dataset_path,
            model_name=model_name,
            embedding_dim=arch_params["embedding_dim"],
            num_heads=arch_params["num_heads"],
            hidden_dim=arch_params["hidden_dim"],
            num_layers=arch_params["num_layers"],
            output_size=arch_params["vocab_size"],
            dropout=arch_params.get("dropout", 0.1),
            max_seq_length=arch_params["max_seq_length"],
            tokenizer=tokenizer
        )

        # Move model to the device
        model.to(device)

        # Set up optimizer
        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

        # Training loop
        logger.info(f"Starting training for {num_epochs} epochs")
        for epoch in range(num_epochs):
            model.train()
            total_loss = 0

            for batch_idx, batch in enumerate(train_dataloader):
                # Move batch to device
                batch = {k: v.to(device) for k, v in batch.items()}

                # Forward pass.
                # NOTE(review): outputs are assumed to be logits of shape
                # [batch, seq, vocab] so that view(-1, vocab) lines up with
                # the flattened labels — confirm against Wildnerve_tlm01.
                outputs = model(batch["input_ids"],
                                attention_mask=batch.get("attention_mask"))

                # Calculate loss.
                # NOTE(review): no ignore_index is passed, so padded
                # positions contribute to the loss (pad == eos here).
                loss = torch.nn.functional.cross_entropy(
                    outputs.view(-1, outputs.size(-1)),
                    batch["labels"].view(-1)
                )

                # Backward pass
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Track loss
                total_loss += loss.item()

                if (batch_idx + 1) % 10 == 0:
                    logger.info(f"Epoch {epoch+1}/{num_epochs}, Batch {batch_idx+1}/{len(train_dataloader)}, "
                                f"Loss: {loss.item():.4f}")

            avg_loss = total_loss / len(train_dataloader)
            logger.info(f"Epoch {epoch+1}/{num_epochs} completed. Average loss: {avg_loss:.4f}")

            # Save checkpoint (one per epoch; includes optimizer state so
            # training can be resumed, plus the architecture for reloading)
            checkpoint_path = os.path.join(output_dir, f"model_epoch_{epoch+1}.bin")
            torch.save({
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "epoch": epoch,
                "loss": avg_loss,
                "config": {
                    "embedding_dim": arch_params["embedding_dim"],
                    "hidden_dim": arch_params["hidden_dim"],
                    "num_heads": arch_params["num_heads"],
                    "num_layers": arch_params["num_layers"],
                    "vocab_size": arch_params["vocab_size"]
                }
            }, checkpoint_path)
            logger.info(f"Saved checkpoint to {checkpoint_path}")

        # Save final model (weights + architecture only, no optimizer state)
        final_model_path = os.path.join(output_dir, f"{specialization}_final_model.bin")
        torch.save({
            "model_state_dict": model.state_dict(),
            "config": {
                "embedding_dim": arch_params["embedding_dim"],
                "hidden_dim": arch_params["hidden_dim"],
                "num_heads": arch_params["num_heads"],
                "num_layers": arch_params["num_layers"],
                "vocab_size": arch_params["vocab_size"]
            }
        }, final_model_path)
        logger.info(f"Training completed. Final model saved to {final_model_path}")

        return final_model_path

    except Exception as e:
        # Broad catch: training is a top-level operation here; the failure is
        # logged with a traceback and signalled to the caller via None.
        logger.error(f"Error during training: {e}", exc_info=True)
        return None
208
+
209
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Train a Wildnerve model")
    parser.add_argument("--specialization", type=str, default="general", help="Model specialization")
    parser.add_argument("--dataset", type=str, required=True, help="Path to the dataset file")
    parser.add_argument("--output", type=str, default="./checkpoints", help="Output directory")
    parser.add_argument("--epochs", type=int, help="Number of training epochs (overrides config)")
    parser.add_argument("--batch-size", type=int, help="Batch size (overrides config)")
    parser.add_argument("--learning-rate", type=float, help="Learning rate (overrides config)")
    parser.add_argument("--device", type=str, help="Device to use (cuda or cpu)")

    args = parser.parse_args()

    # Unset CLI options arrive as None; train_model then falls back to the
    # values in config.json (and finally to built-in defaults).
    train_model(
        specialization=args.specialization,
        dataset_path=args.dataset,
        output_dir=args.output,
        num_epochs=args.epochs,
        batch_size=args.batch_size,
        learning_rate=args.learning_rate,
        device=args.device
    )
verify_dimensions.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utility to verify model dimensions across the codebase
3
+ """
4
+ import os
5
+ import json
6
+ import logging
7
+ import importlib.util
8
+
9
+ # Configure logging
10
+ logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
11
+ logger = logging.getLogger(__name__)
12
+
13
def check_config_json():
    """Check that config.json declares 768/768/12 model dimensions.

    Returns:
        bool: True when TRANSFORMER_CONFIG exists with EMBEDDING_DIM=768,
        HIDDEN_DIM=768 and NUM_HEADS=12; False otherwise (missing file,
        missing section, or non-standard values).
    """
    try:
        config_path = os.path.join(os.path.dirname(__file__), "config.json")
        with open(config_path, 'r') as f:
            config = json.load(f)

        if "TRANSFORMER_CONFIG" in config:
            tc = config["TRANSFORMER_CONFIG"]
            emb_dim = tc.get("EMBEDDING_DIM", 0)
            hidden_dim = tc.get("HIDDEN_DIM", 0)
            num_heads = tc.get("NUM_HEADS", 0)

            logger.info(f"config.json dimensions: embedding={emb_dim}, hidden={hidden_dim}, heads={num_heads}")

            if emb_dim != 768 or hidden_dim != 768 or num_heads != 12:
                logger.warning(f"config.json has non-standard dimensions! Should be 768/768/12")
                return False
            return True
        # Previously this path fell through and implicitly returned None;
        # report the missing section as an explicit, logged failure.
        logger.warning("config.json has no TRANSFORMER_CONFIG section")
        return False
    except Exception as e:
        logger.error(f"Error checking config.json: {e}")
        return False
35
+
36
def _check_file_dims(filename, bad_markers, good_markers):
    """Shared scanner behind the per-file dimension checks.

    Reads *filename* (relative to this script) and searches for marker
    substrings that indicate the model dimensions used in that file.

    Args:
        filename: Basename of the source file to inspect.
        bad_markers: Substrings indicating stale 256-dim parameters.
        good_markers: Substrings that must ALL be present for a pass.

    Returns:
        bool: True when every good marker is found and no bad marker is;
        False otherwise (including read errors, which are logged).
    """
    try:
        path = os.path.join(os.path.dirname(__file__), filename)
        with open(path, 'r') as f:
            content = f.read()

        if any(marker in content for marker in bad_markers):
            logger.warning(f"{filename} contains 256 dimensions! Update to 768")
            return False
        elif all(marker in content for marker in good_markers):
            logger.info(f"{filename} has correct 768 dimensions")
            return True
        else:
            logger.warning(f"Could not determine dimensions in {filename}")
            return False
    except Exception as e:
        logger.error(f"Error checking {filename}: {e}")
        return False

def check_adapter_layer():
    """Check dimensions in adapter_layer.py"""
    # adapter_layer.py builds a model_params dict, so the markers use the
    # dict-literal spelling ('embedding_dim": 768').
    return _check_file_dims(
        "adapter_layer.py",
        ('embedding_dim": 256', 'hidden_dim": 256'),
        ('embedding_dim": 768', 'hidden_dim": 768'),
    )

def check_model_manager():
    """Check dimensions in model_manager.py"""
    return _check_file_dims(
        "model_manager.py",
        ("embedding_dim=256", "hidden_dim=256"),
        ("embedding_dim=768", "hidden_dim=768"),
    )

def check_main_py():
    """Check dimensions in main.py"""
    return _check_file_dims(
        "main.py",
        ("embedding_dim=256", "hidden_dim=256"),
        ("embedding_dim=768", "hidden_dim=768"),
    )
96
+
97
def verify_all_dimensions():
    """Run every per-file dimension check and print a summary report.

    Returns a truthy value only when every check passed.
    """
    checks = [
        ("config.json", check_config_json),
        ("adapter_layer.py", check_adapter_layer),
        ("model_manager.py", check_model_manager),
        ("main.py", check_main_py),
    ]
    results = {name: check() for name, check in checks}

    print("\n=== MODEL DIMENSION VERIFICATION ===")
    all_correct = True
    for file, correct in results.items():
        if correct:
            status = "✓ CORRECT (768)"
        else:
            status = "✗ INCORRECT (256)"
        print(f"{file:20} : {status}")
        # Fold with `and` (not bool()) to preserve the original return value.
        all_correct = all_correct and correct

    overall = "✓ ALL CORRECT" if all_correct else "✗ NEEDS FIXING"
    print("\nOverall Status:", overall)
    print("\nRun this script after making changes to verify all dimensions are set to 768.\n")

    return all_correct
117
+
118
if __name__ == "__main__":
    # Script entry point: run all dimension checks and print the report.
    verify_all_dimensions()