# config.py - configuration module for the wildnerve/tlm project.
#
# Provides: filesystem path resolution (PathConfig), model architecture
# constants, per-specialization dataset paths, pydantic configuration models,
# and the global `app_config` loaded from config.json at import time.

import os
import json
import logging
import argparse

from utils import dependency_helpers  # project-local; imported for its side effects — TODO confirm
from pathlib import Path
from typing import Optional, Dict, List, Literal, Any, Union

# Flag indicating whether the real pydantic package is present.
pydantic_available = True

# Attempt to load pydantic and fall back on dummy types.
# BUGFIX: `import pydantic` previously sat at module top level OUTSIDE this
# try-block, so a missing pydantic raised ImportError before the fallback
# could engage. It now lives inside the guarded import.
try:
    import pydantic
    from pydantic import BaseModel, Field, ValidationError, ConfigDict
except ImportError:
    pydantic_available = False
    logging.getLogger(__name__).warning("pydantic not available, using dummy BaseModel")

    class BaseModel:
        """Minimal stand-in for pydantic.BaseModel: stores kwargs as attributes."""
        def __init__(self, **kwargs):
            for k, v in kwargs.items():
                setattr(self, k, v)

    Field = lambda *args, **kwargs: None  # noqa: E731 - mirrors pydantic.Field signature
    ValidationError = Exception
    ConfigDict = dict

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

if pydantic_available:
    logger.info(f"Loaded pydantic v{pydantic.__version__}")
else:
    logger.debug("Operating with dummy pydantic types")


class PathConfig:
    """Handle path configurations"""

    @staticmethod
    def get_project_root() -> Path:
        """Return the directory containing this config module."""
        return Path(__file__).resolve().parent

    @staticmethod
    def get_data_dir() -> Path:
        """Get writable data directory, falling back to temp if needed"""
        # First try in project directory
        project_dir = PathConfig.get_project_root()
        data_dir = project_dir / "data"
        # Check if we can write to this location
        try:
            if not data_dir.exists():
                data_dir.mkdir(parents=True, exist_ok=True)
            # Test write access with a small file
            test_file = data_dir / ".write_test"
            test_file.touch()
            test_file.unlink()
            return data_dir
        except (PermissionError, IOError):
            # Fall back to temp directory
            import tempfile
            tmp_dir = Path(tempfile.gettempdir()) / "wildnerve_data"
            tmp_dir.mkdir(parents=True, exist_ok=True)
            logger.info("Using temporary directory for data: %s", tmp_dir)
            return tmp_dir

    @staticmethod
    def get_checkpoint_dir() -> Path:
        """Return a writable checkpoint directory (temp fallback if project dir is read-only)."""
        # First try in project directory
        project_dir = PathConfig.get_project_root()
        checkpoint_dir = project_dir / "checkpoints"
        # Check if we can write to this directory
        if os.access(project_dir, os.W_OK):
            return checkpoint_dir
        # If not writable, fall back to temp directory
        import tempfile
        tmp_dir = Path(tempfile.gettempdir()) / "wildnerve_checkpoints"
        return tmp_dir


# Resolved directories used throughout the module.
BASE_DIR = PathConfig.get_project_root()
DATA_DIR = PathConfig.get_data_dir()
CHECKPOINT_DIR = PathConfig.get_checkpoint_dir()

# Model architecture parameters.
INPUT_SIZE = 768   # BERT base hidden size
OUTPUT_SIZE = 768  # Output embedding size
HIDDEN_SIZE = 768  # Hidden layer size

# Supported specialization domains.
SPECIALIZATIONS = [
    "python", "rust", "solidity", "computer", "cpp", "go", "java",
    "javascript", "mathematics", "nim", "other_information", "physics"
]

# Map each specialization to its JSON dataset files.
# NOTE(review): DATA_DIR already points at <project>/data, so these paths
# resolve to <project>/data/data/... — presumably intentional nesting; verify
# against the on-disk layout before changing.
DATASET_PATHS = {
    "python": [
        str(DATA_DIR / "data" / "python_mbpp.json"),
        str(DATA_DIR / "data" / "python_programming.json"),
        str(DATA_DIR / "data" / "python_transformer_model.json")
    ],
    "rust": [
        str(DATA_DIR / "data" / "rust_ai_language_model.json"),
        str(DATA_DIR / "data" / "rust_blockchain.json"),
        str(DATA_DIR / "data" / "rust_mbrp.json"),
        str(DATA_DIR / "data" / "rust_programming.json")
    ],
    "solidity": [
        str(DATA_DIR / "data" / "solidity_programming.json")
    ],
    "computer": [
        str(DATA_DIR / "data" / "computer_advanced_debugging.json"),
        str(DATA_DIR / "data" / "computer_agenticAI.json"),
        str(DATA_DIR / "data" / "computer_architecture.json"),
        str(DATA_DIR / "data" / "computer_cloud_security.json"),
        str(DATA_DIR / "data" / "computer_cloudCI-CD.json"),
        str(DATA_DIR / "data" / "computer_creativity.json"),
        str(DATA_DIR / "data" / "computer_crossplatform.json"),
        str(DATA_DIR / "data" / "computer_cybersecurity.json"),
        str(DATA_DIR / "data" / "computer_error_handling_examples.json"),
        str(DATA_DIR / "data" / "computer_gitInstruct.json")
    ],
    "cpp": [
        str(DATA_DIR / "data" / "cpp_ai_language_model.json"),
        str(DATA_DIR / "data" / "cpp_blockchain.json"),
        str(DATA_DIR / "data" / "cpp_mbcppp.json"),
        str(DATA_DIR / "data" / "cpp_programming.json")
    ],
    "go": [
        str(DATA_DIR / "data" / "golang_ai_language_model.json"),
        str(DATA_DIR / "data" / "golang_mbgp.json"),
        str(DATA_DIR / "data" / "golang_programming.json")
    ],
    "java": [
        str(DATA_DIR / "data" / "java_ai_language_model.json"),
        str(DATA_DIR / "data" / "java_blockchain.json"),
        str(DATA_DIR / "data" / "java_mbjp.json"),
        str(DATA_DIR / "data" / "java_programming.json"),
        str(DATA_DIR / "data" / "java_transformer_language_model.json")
    ],
    "javascript": [
        str(DATA_DIR / "data" / "javascript_chatbot.json"),
        str(DATA_DIR / "data" / "javascript_n_Typescript_backend.json"),
        str(DATA_DIR / "data" / "javascript_n_Typescript_frontend.json"),
        str(DATA_DIR / "data" / "javascript_n_Typescript_programming.json")
    ],
    "mathematics": [
        str(DATA_DIR / "data" / "mathematics.json"),
        str(DATA_DIR / "data" / "mathematics_training.json")
    ],
    "nim": [
        str(DATA_DIR / "data" / "nim_ai_language_model.json"),
        str(DATA_DIR / "data" / "nim_blockchain.json"),
        str(DATA_DIR / "data" / "nim_chatbot.json"),
        str(DATA_DIR / "data" / "nim_conversation.json"),
        str(DATA_DIR / "data" / "nim_mbnp.json"),
        str(DATA_DIR / "data" / "nim_programming.json")
    ],
    "other_information": [
        str(DATA_DIR / "data" / "other_information.json")
    ],
    "physics": [
        str(DATA_DIR / "data" / "physics_n_engineering.json"),
        str(DATA_DIR / "data" / "physics_n_engineering_applied.json"),
        str(DATA_DIR / "data" / "project_structure.json"),
        str(DATA_DIR / "data" / "python_chatbot_guide.json")
    ]
}


# ---------------------------------------------------------------------------
# Nested configuration models
# ---------------------------------------------------------------------------

class TrainingConfig(BaseModel):
    """Training-loop hyperparameters (all required)."""
    PATIENCE: int = Field(..., description="Early stopping patience")
    DELTA: float = Field(..., description="Minimum change in the monitored value")
    VERBOSE: bool = Field(..., description="Verbosity of training logs")
    NUM_EPOCHS: int = Field(..., description="Number of training epochs")
    LEARNING_RATE: float = Field(..., description="Learning rate for optimizer")
    TRANSFORMER_LEARNING_RATE: float = Field(..., description="Learning rate for transformer")
    TRANSFORMER_NUM_EPOCHS: int = Field(..., description="Transformer training epochs")

    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )


class CheckpointConfig(BaseModel):
    """Checkpoint storage locations and filename formats."""
    PATH: str = Field(..., description="Checkpoint saving folder")
    BASE_DIR: str = Field(..., description="Base directory for checkpoints")
    TRANSFORMER_FILENAME_FORMAT: str = Field(..., description="Transformer checkpoint filename format")
    SNN_FILENAME_FORMAT: str = Field(..., description="SNN checkpoint filename format")

    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )


class TokenizerConfig(BaseModel):
    """Tokenizer selection and sequence handling options."""
    MODEL_NAME: str = Field(..., description="Name of the tokenizer model")
    MAX_SEQ_LENGTH: int = Field(..., description="Maximum length the tokenizer handles")
    POOLING_MODE: str = Field(..., description="Pooling mode for embeddings")

    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )


class DataLoaderConfig(BaseModel):
    """DataLoader construction options."""
    SHUFFLE: bool = Field(..., description="Whether to Shuffle the dataset")
    BATCH_SIZE: int = Field(..., description="Batch size for dataloader")
    NUM_WORKERS: int = Field(..., description="Number of workers for dataloader")
    INCLUDE_CRAWL: bool = Field(..., description="Include crawl parameter")

    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )


class GenerationConfig(BaseModel):
    """Text-generation sampling defaults."""
    temperature: float = Field(0.7, description="Decoding temperature.")
    top_p: float = Field(0.9, description="Nucleus sampling probability.")
    num_return_sequences: int = Field(1, description="Number of sequences to generate.")

    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )


class PretrainedLimitsConfig(BaseModel):
    """Sequence-length ceilings imposed by pretrained backbones."""
    GPT2: int = Field(1024, description="Maximum sequence length for GPT-2")
    BERT: int = Field(512, description="Maximum sequence length for BERT")

    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )


class CustomWindowsConfig(BaseModel):
    """Sliding-window attention geometry for custom models."""
    MAX_SEQ_LENGTH: int = Field(2048, description="Maximum sequence length for custom models")
    WINDOW_SIZE: int = Field(1024, description="Window size for sliding window attention")
    STRIDE: int = Field(512, description="Stride for sliding window attention")

    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )


class AttentionConfig(BaseModel):
    """Aggregates pretrained limits and custom-window attention settings."""
    PRETRAINED_LIMITS: PretrainedLimitsConfig = Field(default_factory=PretrainedLimitsConfig)
    CUSTOM_WINDOWS: CustomWindowsConfig = Field(default_factory=CustomWindowsConfig)

    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )


class TransformerConfig(BaseModel):
    """Full transformer model/training configuration."""
    ATTENTION_MECHANISM: Dict[str, Any] = Field(
        default={
            "TYPE": "hybrid",
            "WINDOW_SIZE": 1024,
            "STRIDE": 512,
            "USE_MEMORY": True,
            "ATTENTION_TYPES": {
                "SLIDING": True,
                "HIERARCHICAL": True,
                "GLOBAL": True
            }
        },
        description="Attention mechanism configuration"
    )
    BASE_DIR: str = Field(..., description="Base directory for transformer checkpoints")
    TRANSFORMER_FILENAME_FORMAT: str = Field(..., description="Filename format for transformer checkpoints")
    MODEL_NAME: str = Field("bert-base-uncased", description="Name of the primary model from Hugging Face")  # Changed from Wildnerve-tlm01
    NUM_EPOCHS: int = Field(30, description="Number of epochs for transformer training")  # Increased from whatever value was here before
    LEARNING_RATE: float = Field(..., description="Learning rate for transformer")
    BATCH_SIZE: int = Field(..., description="Batch size for transformer training")
    EMBEDDING_DIM: int = Field(..., description="Embedding dimension")
    NUM_HEADS: int = Field(..., description="Number of attention heads")
    HIDDEN_DIM: int = Field(..., description="Hidden dimension")
    NUM_LAYERS: int = Field(..., description="Number of layers")
    DROPOUT: float = Field(..., description="Dropout rate")
    specialization: Optional[str] = Field(
        default="general",
        description="Specialization type (defaults to 'general')"
    )
    DATASET_PATH: str = Field(..., description="Path to the dataset")
    OUTPUT_SIZE: int = Field(..., description="Size of the output (usually vocab size)")
    MAX_SEQ_LENGTH: int = Field(..., description="Maximum sequence length")
    POOLING_MODE: str = Field(..., description="Pooling mode")
    VOCAB_SIZE: int = Field(..., description="Vocabulary size")
    MAX_RATE: int = Field(..., description="Maximum rate")
    MODE: str = Field(..., description="Model mode")
    MODE2: str = Field(..., description="Secondary mode")
    SHUFFLE: bool = Field(..., description="Shuffle flag for transformer")
    SIMILARITY_THRESHOLD: float = Field(..., description="Similarity threshold for weight sharing")
    USE_PRETRAINED_ENCODER: bool = Field(..., description="Enable pretrained encoder branch")

    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )


class PreprocessingConfig(BaseModel):
    """Text preprocessing toggles."""
    LOWERCASE: bool = Field(True, description="Convert text to lowercase")
    REMOVE_SPECIAL_CHARACTERS: bool = Field(True, description="Remove special characters from text")
    REPLACE_MULTIPLE_SPACES: bool = Field(True, description="Replace multiple spaces with a single space")

    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )


class STDPConfig(BaseModel):
    """Spike-timing-dependent plasticity (SNN) training configuration."""
    WEIGHT_THRESHOLD: float = Field(..., description="Threshold for STDP weight update")
    ACTIVATION_THRESHOLD: float = Field(..., description="Threshold for STDP activation")
    USE_SNN: bool = Field(..., description="Use spiking neural network")
    ALPHA: float = Field(..., description="STDP alpha parameter")
    BETA: float = Field(..., description="STDP beta parameter")
    BASE_DIR: str = Field(..., description="Directory for STDP checkpoints")
    SNN_FILENAME_FORMAT: str = Field(..., description="Filename format for SNN checkpoints")
    STDPLearningRate: float = Field(..., description="STDP learning rate")
    STDPMemDecay: float = Field(..., description="STDP memory decay factor")
    SpikeThreshold: float = Field(..., description="Spike threshold")
    firing_rate: int = Field(..., description="Firing rate")
    MAX_SEQ_LENGTH: int = Field(..., description="Maximum sequence length")
    STDP_PRETRAIN_EPOCHS: int = Field(..., description="Pre-training epochs for STDP")
    STDP_FINETUNE_EPOCHS: int = Field(..., description="Fine-tuning epochs for STDP")
    BATCH_SIZE_PRETRAIN: int = Field(..., description="Batch size during STDP pre-training")
    BATCH_SIZE_FINETUNE: int = Field(..., description="Batch size during STDP fine-tuning")
    NUM_NEURONS: int = Field(..., description="Number of neurons in the STDP model")
    MAX_RATE: int = Field(..., description="Maximum rate for STDP")

    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )


class SerializableDict(dict):
    """Dictionary subclass with attribute-style access that can be serialized safely"""

    def __getattr__(self, key):
        # NOTE: returns None (not AttributeError) for missing keys, so
        # getattr(obj, key, default) never falls back to `default`.
        if key in self:
            return self[key]
        return None

    def __setattr__(self, key, value):
        self[key] = value

    def __delattr__(self, key):
        if key in self:
            del self[key]

    # Special methods to handle JSON/pickle serialization of self-references.
    def __getstate__(self):
        """Return state for pickling - exclude config_data if it's self"""
        state = dict(self)
        if 'config_data' in state and id(state['config_data']) == id(self):
            state['config_data'] = '__self__'  # Replace self-reference with marker
        return state

    def __repr__(self):
        """Safe representation that handles circular references"""
        items = []
        for k, v in self.items():
            if k == "config_data" and v is self:
                items.append(f"{k}=")
            else:
                items.append(f"{k}={v!r}")
        return f"{self.__class__.__name__}({', '.join(items)})"


class AppConfig(BaseModel):
    """Main application configuration with proper serialization handling"""

    # which model files to load by default
    SELECTED_MODEL: List[str] = Field(
        default=["model_Custm.py", "model_PrTr.py"],
        description="Default model files (custom first, then pretrained)"
    )
    DATA_DIR: str = Field(default="/tmp/tlm_data", description="Local data directory")
    MODEL_DIR: str = Field(default="/tmp/tlm_data/models", description="Local model weights directory")
    HF_DATASET_URL: str = Field(
        default="https://huggingface.co/datasets/EvolphTech/data",
        description="Remote dataset repository URL"
    )
    HF_WEIGHTS_URL: str = Field(
        default="https://huggingface.co/EvolphTech/Weights",
        description="Remote weights repository URL"
    )
    HF_MODEL_URL: str = Field(
        default="https://huggingface.co/EvolphTech/Wildnerve-tlm01_Hybrid_Model",
        description="Remote model repository URL"
    )
    HF_CHATBOT_SPACE_URL: str = Field(
        default="https://huggingface.co/spaces/EvolphTech/Wildnerve-tlm01-0.05Bx12",
        description="Chatbot Space URL"
    )
    WP_PLUGIN_FILE: str = Field(
        default="wildnerve-chatbot.php",
        description="WordPress chatbot plugin file"
    )
    TRANSFORMER_CONFIG: Dict[str, Any] = Field(
        default_factory=dict,
        description="Transformer configuration overrides"
    )
    SIMILARITY_THRESHOLD: float = Field(default=0.85)
    TOP_K: int = Field(default=3)
    MAX_ACTIVE_MODELS: int = Field(default=2)
    MODEL_IDLE_THRESHOLD: int = Field(default=600)

    # Pydantic model_config to fix serialization issues
    model_config = ConfigDict(
        extra="allow",                   # Allow extra fields not in the model
        arbitrary_types_allowed=True,    # Allow arbitrary types
        populate_by_name=True,           # Allow population by field name
        json_encoders={                  # Custom encoders for non-serializable types
            SerializableDict: lambda v: {k: v[k] for k in v if not k.startswith("_")}
        },
        validate_assignment=False        # Don't validate on attribute assignment
    )


def load_config() -> Union[AppConfig, Dict[str, Any]]:
    """Load configuration from JSON file with robust error handling"""
    config_path = os.path.join(os.path.dirname(__file__), "config.json")
    logger.info(f"Loading config from {config_path}")
    raw_config = {}
    try:
        with open(config_path, "r") as f:
            try:
                raw = json.load(f)
                raw_config = raw  # Save raw config in case Pydantic validation fails
            except json.JSONDecodeError as e:
                logger.error(f"JSON parsing error in config.json: {e}")
                logger.error(f"Error at line {e.lineno}, column {e.colno}: {e.msg}")
                raise

        # Process the TRANSFORMER_CONFIG section
        if isinstance(raw.get("TRANSFORMER_CONFIG"), dict):
            # Create SerializableDict with safe self-reference handling
            transformer_config = SerializableDict(raw["TRANSFORMER_CONFIG"])
            # Add config_data attribute directly
            transformer_config['config_data'] = transformer_config
            # Replace the dict with our enhanced SerializableDict
            raw["TRANSFORMER_CONFIG"] = transformer_config

            # Ensure GPT-2 parameters are set
            if not isinstance(transformer_config.get("VOCAB_SIZE"), int) or transformer_config["VOCAB_SIZE"] != 50257:
                transformer_config["VOCAB_SIZE"] = 50257  # Standard GPT-2 vocab size
            if transformer_config.get("MODEL_NAME") != "gpt2":
                transformer_config["MODEL_NAME"] = "gpt2"
            # Ensure OUTPUT_SIZE matches VOCAB_SIZE
            transformer_config["OUTPUT_SIZE"] = transformer_config["VOCAB_SIZE"]

        # Add generation parameters if missing
        if "GENERATION_CONFIG" not in raw:
            raw["GENERATION_CONFIG"] = {
                "temperature": 0.7,
                "top_p": 0.95,
                "top_k": 50,
                "repetition_penalty": 1.3,
                "no_repeat_ngram_size": 3,
                "do_sample": True,
                "penalty_alpha": 0.6
            }
    except Exception as e:
        logger.error(f"Failed to read config.json: {e}", exc_info=True)
        raise

    # Try to create AppConfig with pydantic validation
    if pydantic_available:
        try:
            cfg = AppConfig(**raw)
            logger.debug("Config loaded successfully")
            return cfg
        except ValidationError as ve:
            logger.error(f"Config validation error: {ve}", exc_info=True)
            # Fall back to returning the raw config as a dict
            logger.warning("Using raw config dictionary due to validation failure")
            return raw_config
    else:
        # If pydantic not available, just return the raw dict
        return raw_config


# Global application config
app_config = load_config()


def _cfg_value(tc, key, default):
    """Fetch `key` from a config object, treating None as missing.

    BUGFIX helper: SerializableDict.__getattr__ returns None for absent keys
    instead of raising AttributeError, so a plain getattr(tc, key, default)
    never applied its default. This makes the fallback actually take effect.
    """
    value = getattr(tc, key, None)
    return default if value is None else value


def get_model_architecture_params():
    """Get model architecture parameters from config file"""
    if hasattr(app_config, "TRANSFORMER_CONFIG"):
        tc = app_config.TRANSFORMER_CONFIG
        # CRITICAL: Use 767 consistently for max_seq_length to match config.json
        return {
            "vocab_size": _cfg_value(tc, "VOCAB_SIZE", 50257),
            "embedding_dim": 768,  # Fixed to 768 for embedding dimensions
            "num_heads": 12,       # 12 heads works with 768 (768/12=64)
            "hidden_dim": 768,     # Fixed to 768 for hidden dimensions
            "num_layers": _cfg_value(tc, "NUM_LAYERS", 12),
            "output_size": _cfg_value(tc, "VOCAB_SIZE", 50257),
            "dropout": _cfg_value(tc, "DROPOUT", 0.1),
            "max_seq_length": 767  # IMPORTANT: Fixed to 767 to match config.json
        }
    else:
        # Default parameters if config not available
        return {
            "vocab_size": 50257,
            "embedding_dim": 768,
            "num_heads": 12,
            "hidden_dim": 768,
            "num_layers": 12,
            "output_size": 50257,
            "dropout": 0.1,
            "max_seq_length": 767  # IMPORTANT: Fixed to 767 to match config.json
        }


if __name__ == "__main__":
    args = argparse.ArgumentParser(description="Tiny Language Model Configuration").parse_args()
    print("Configuration loaded successfully!")