# config.py - configuration module for the wildnerve/tlm project.
#
# Provides: filesystem path resolution (PathConfig), model architecture
# constants, per-specialization dataset paths, pydantic configuration models,
# and the global `app_config` loaded from config.json at import time.

import os
import json
import logging
import argparse

from utils import dependency_helpers  # project-local; imported for its side effects — TODO confirm
from pathlib import Path
from typing import Optional, Dict, List, Literal, Any, Union

# Flag indicating whether the real pydantic package is present.
pydantic_available = True

# Attempt to load pydantic and fall back on dummy types.
# BUGFIX: `import pydantic` previously sat at module top level OUTSIDE this
# try-block, so a missing pydantic raised ImportError before the fallback
# could engage. It now lives inside the guarded import.
try:
    import pydantic
    from pydantic import BaseModel, Field, ValidationError, ConfigDict
except ImportError:
    pydantic_available = False
    logging.getLogger(__name__).warning("pydantic not available, using dummy BaseModel")

    class BaseModel:
        """Minimal stand-in for pydantic.BaseModel: stores kwargs as attributes."""
        def __init__(self, **kwargs):
            for k, v in kwargs.items():
                setattr(self, k, v)

    Field = lambda *args, **kwargs: None  # noqa: E731 - mirrors pydantic.Field signature
    ValidationError = Exception
    ConfigDict = dict

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

if pydantic_available:
    logger.info(f"Loaded pydantic v{pydantic.__version__}")
else:
    logger.debug("Operating with dummy pydantic types")


class PathConfig:
    """Handle path configurations"""

    @staticmethod
    def get_project_root() -> Path:
        """Return the directory containing this config module."""
        return Path(__file__).resolve().parent

    @staticmethod
    def get_data_dir() -> Path:
        """Get writable data directory, falling back to temp if needed"""
        # First try in project directory
        project_dir = PathConfig.get_project_root()
        data_dir = project_dir / "data"
        # Check if we can write to this location
        try:
            if not data_dir.exists():
                data_dir.mkdir(parents=True, exist_ok=True)
            # Test write access with a small file
            test_file = data_dir / ".write_test"
            test_file.touch()
            test_file.unlink()
            return data_dir
        except (PermissionError, IOError):
            # Fall back to temp directory
            import tempfile
            tmp_dir = Path(tempfile.gettempdir()) / "wildnerve_data"
            tmp_dir.mkdir(parents=True, exist_ok=True)
            logger.info("Using temporary directory for data: %s", tmp_dir)
            return tmp_dir

    @staticmethod
    def get_checkpoint_dir() -> Path:
        """Return a writable checkpoint directory (temp fallback if project dir is read-only)."""
        # First try in project directory
        project_dir = PathConfig.get_project_root()
        checkpoint_dir = project_dir / "checkpoints"
        # Check if we can write to this directory
        if os.access(project_dir, os.W_OK):
            return checkpoint_dir
        # If not writable, fall back to temp directory
        import tempfile
        tmp_dir = Path(tempfile.gettempdir()) / "wildnerve_checkpoints"
        return tmp_dir


# Resolved directories used throughout the module.
BASE_DIR = PathConfig.get_project_root()
DATA_DIR = PathConfig.get_data_dir()
CHECKPOINT_DIR = PathConfig.get_checkpoint_dir()

# Model architecture parameters.
INPUT_SIZE = 768   # BERT base hidden size
OUTPUT_SIZE = 768  # Output embedding size
HIDDEN_SIZE = 768  # Hidden layer size

# Supported specialization domains.
SPECIALIZATIONS = [
    "python", "rust", "solidity", "computer", "cpp", "go", "java",
    "javascript", "mathematics", "nim", "other_information", "physics"
]

# Map each specialization to its JSON dataset files.
# NOTE(review): DATA_DIR already points at <project>/data, so these paths
# resolve to <project>/data/data/... — presumably intentional nesting; verify
# against the on-disk layout before changing.
DATASET_PATHS = {
    "python": [
        str(DATA_DIR / "data" / "python_mbpp.json"),
        str(DATA_DIR / "data" / "python_programming.json"),
        str(DATA_DIR / "data" / "python_transformer_model.json")
    ],
    "rust": [
        str(DATA_DIR / "data" / "rust_ai_language_model.json"),
        str(DATA_DIR / "data" / "rust_blockchain.json"),
        str(DATA_DIR / "data" / "rust_mbrp.json"),
        str(DATA_DIR / "data" / "rust_programming.json")
    ],
    "solidity": [
        str(DATA_DIR / "data" / "solidity_programming.json")
    ],
    "computer": [
        str(DATA_DIR / "data" / "computer_advanced_debugging.json"),
        str(DATA_DIR / "data" / "computer_agenticAI.json"),
        str(DATA_DIR / "data" / "computer_architecture.json"),
        str(DATA_DIR / "data" / "computer_cloud_security.json"),
        str(DATA_DIR / "data" / "computer_cloudCI-CD.json"),
        str(DATA_DIR / "data" / "computer_creativity.json"),
        str(DATA_DIR / "data" / "computer_crossplatform.json"),
        str(DATA_DIR / "data" / "computer_cybersecurity.json"),
        str(DATA_DIR / "data" / "computer_error_handling_examples.json"),
        str(DATA_DIR / "data" / "computer_gitInstruct.json")
    ],
    "cpp": [
        str(DATA_DIR / "data" / "cpp_ai_language_model.json"),
        str(DATA_DIR / "data" / "cpp_blockchain.json"),
        str(DATA_DIR / "data" / "cpp_mbcppp.json"),
        str(DATA_DIR / "data" / "cpp_programming.json")
    ],
    "go": [
        str(DATA_DIR / "data" / "golang_ai_language_model.json"),
        str(DATA_DIR / "data" / "golang_mbgp.json"),
        str(DATA_DIR / "data" / "golang_programming.json")
    ],
    "java": [
        str(DATA_DIR / "data" / "java_ai_language_model.json"),
        str(DATA_DIR / "data" / "java_blockchain.json"),
        str(DATA_DIR / "data" / "java_mbjp.json"),
        str(DATA_DIR / "data" / "java_programming.json"),
        str(DATA_DIR / "data" / "java_transformer_language_model.json")
    ],
    "javascript": [
        str(DATA_DIR / "data" / "javascript_chatbot.json"),
        str(DATA_DIR / "data" / "javascript_n_Typescript_backend.json"),
        str(DATA_DIR / "data" / "javascript_n_Typescript_frontend.json"),
        str(DATA_DIR / "data" / "javascript_n_Typescript_programming.json")
    ],
    "mathematics": [
        str(DATA_DIR / "data" / "mathematics.json"),
        str(DATA_DIR / "data" / "mathematics_training.json")
    ],
    "nim": [
        str(DATA_DIR / "data" / "nim_ai_language_model.json"),
        str(DATA_DIR / "data" / "nim_blockchain.json"),
        str(DATA_DIR / "data" / "nim_chatbot.json"),
        str(DATA_DIR / "data" / "nim_conversation.json"),
        str(DATA_DIR / "data" / "nim_mbnp.json"),
        str(DATA_DIR / "data" / "nim_programming.json")
    ],
    "other_information": [
        str(DATA_DIR / "data" / "other_information.json")
    ],
    "physics": [
        str(DATA_DIR / "data" / "physics_n_engineering.json"),
        str(DATA_DIR / "data" / "physics_n_engineering_applied.json"),
        str(DATA_DIR / "data" / "project_structure.json"),
        str(DATA_DIR / "data" / "python_chatbot_guide.json")
    ]
}


# ---------------------------------------------------------------------------
# Nested configuration models
# ---------------------------------------------------------------------------

class TrainingConfig(BaseModel):
    """Training-loop hyperparameters (all required)."""
    PATIENCE: int = Field(..., description="Early stopping patience")
    DELTA: float = Field(..., description="Minimum change in the monitored value")
    VERBOSE: bool = Field(..., description="Verbosity of training logs")
    NUM_EPOCHS: int = Field(..., description="Number of training epochs")
    LEARNING_RATE: float = Field(..., description="Learning rate for optimizer")
    TRANSFORMER_LEARNING_RATE: float = Field(..., description="Learning rate for transformer")
    TRANSFORMER_NUM_EPOCHS: int = Field(..., description="Transformer training epochs")

    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )


class CheckpointConfig(BaseModel):
    """Checkpoint storage locations and filename formats."""
    PATH: str = Field(..., description="Checkpoint saving folder")
    BASE_DIR: str = Field(..., description="Base directory for checkpoints")
    TRANSFORMER_FILENAME_FORMAT: str = Field(..., description="Transformer checkpoint filename format")
    SNN_FILENAME_FORMAT: str = Field(..., description="SNN checkpoint filename format")

    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )


class TokenizerConfig(BaseModel):
    """Tokenizer selection and sequence handling options."""
    MODEL_NAME: str = Field(..., description="Name of the tokenizer model")
    MAX_SEQ_LENGTH: int = Field(..., description="Maximum length the tokenizer handles")
    POOLING_MODE: str = Field(..., description="Pooling mode for embeddings")

    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )


class DataLoaderConfig(BaseModel):
    """DataLoader construction options."""
    SHUFFLE: bool = Field(..., description="Whether to Shuffle the dataset")
    BATCH_SIZE: int = Field(..., description="Batch size for dataloader")
    NUM_WORKERS: int = Field(..., description="Number of workers for dataloader")
    INCLUDE_CRAWL: bool = Field(..., description="Include crawl parameter")

    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )


class GenerationConfig(BaseModel):
    """Text-generation sampling defaults."""
    temperature: float = Field(0.7, description="Decoding temperature.")
    top_p: float = Field(0.9, description="Nucleus sampling probability.")
    num_return_sequences: int = Field(1, description="Number of sequences to generate.")

    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )


class PretrainedLimitsConfig(BaseModel):
    """Sequence-length ceilings imposed by pretrained backbones."""
    GPT2: int = Field(1024, description="Maximum sequence length for GPT-2")
    BERT: int = Field(512, description="Maximum sequence length for BERT")

    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )


class CustomWindowsConfig(BaseModel):
    """Sliding-window attention geometry for custom models."""
    MAX_SEQ_LENGTH: int = Field(2048, description="Maximum sequence length for custom models")
    WINDOW_SIZE: int = Field(1024, description="Window size for sliding window attention")
    STRIDE: int = Field(512, description="Stride for sliding window attention")

    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )


class AttentionConfig(BaseModel):
    """Aggregates pretrained limits and custom-window attention settings."""
    PRETRAINED_LIMITS: PretrainedLimitsConfig = Field(default_factory=PretrainedLimitsConfig)
    CUSTOM_WINDOWS: CustomWindowsConfig = Field(default_factory=CustomWindowsConfig)

    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )


class TransformerConfig(BaseModel):
    """Full transformer model/training configuration."""
    ATTENTION_MECHANISM: Dict[str, Any] = Field(
        default={
            "TYPE": "hybrid",
            "WINDOW_SIZE": 1024,
            "STRIDE": 512,
            "USE_MEMORY": True,
            "ATTENTION_TYPES": {
                "SLIDING": True,
                "HIERARCHICAL": True,
                "GLOBAL": True
            }
        },
        description="Attention mechanism configuration"
    )
    BASE_DIR: str = Field(..., description="Base directory for transformer checkpoints")
    TRANSFORMER_FILENAME_FORMAT: str = Field(..., description="Filename format for transformer checkpoints")
    MODEL_NAME: str = Field("bert-base-uncased", description="Name of the primary model from Hugging Face")  # Changed from Wildnerve-tlm01
    NUM_EPOCHS: int = Field(30, description="Number of epochs for transformer training")  # Increased from whatever value was here before
    LEARNING_RATE: float = Field(..., description="Learning rate for transformer")
    BATCH_SIZE: int = Field(..., description="Batch size for transformer training")
    EMBEDDING_DIM: int = Field(..., description="Embedding dimension")
    NUM_HEADS: int = Field(..., description="Number of attention heads")
    HIDDEN_DIM: int = Field(..., description="Hidden dimension")
    NUM_LAYERS: int = Field(..., description="Number of layers")
    DROPOUT: float = Field(..., description="Dropout rate")
    specialization: Optional[str] = Field(
        default="general",
        description="Specialization type (defaults to 'general')"
    )
    DATASET_PATH: str = Field(..., description="Path to the dataset")
    OUTPUT_SIZE: int = Field(..., description="Size of the output (usually vocab size)")
    MAX_SEQ_LENGTH: int = Field(..., description="Maximum sequence length")
    POOLING_MODE: str = Field(..., description="Pooling mode")
    VOCAB_SIZE: int = Field(..., description="Vocabulary size")
    MAX_RATE: int = Field(..., description="Maximum rate")
    MODE: str = Field(..., description="Model mode")
    MODE2: str = Field(..., description="Secondary mode")
    SHUFFLE: bool = Field(..., description="Shuffle flag for transformer")
    SIMILARITY_THRESHOLD: float = Field(..., description="Similarity threshold for weight sharing")
    USE_PRETRAINED_ENCODER: bool = Field(..., description="Enable pretrained encoder branch")

    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )


class PreprocessingConfig(BaseModel):
    """Text preprocessing toggles."""
    LOWERCASE: bool = Field(True, description="Convert text to lowercase")
    REMOVE_SPECIAL_CHARACTERS: bool = Field(True, description="Remove special characters from text")
    REPLACE_MULTIPLE_SPACES: bool = Field(True, description="Replace multiple spaces with a single space")

    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )


class STDPConfig(BaseModel):
    """Spike-timing-dependent plasticity (SNN) training configuration."""
    WEIGHT_THRESHOLD: float = Field(..., description="Threshold for STDP weight update")
    ACTIVATION_THRESHOLD: float = Field(..., description="Threshold for STDP activation")
    USE_SNN: bool = Field(..., description="Use spiking neural network")
    ALPHA: float = Field(..., description="STDP alpha parameter")
    BETA: float = Field(..., description="STDP beta parameter")
    BASE_DIR: str = Field(..., description="Directory for STDP checkpoints")
    SNN_FILENAME_FORMAT: str = Field(..., description="Filename format for SNN checkpoints")
    STDPLearningRate: float = Field(..., description="STDP learning rate")
    STDPMemDecay: float = Field(..., description="STDP memory decay factor")
    SpikeThreshold: float = Field(..., description="Spike threshold")
    firing_rate: int = Field(..., description="Firing rate")
    MAX_SEQ_LENGTH: int = Field(..., description="Maximum sequence length")
    STDP_PRETRAIN_EPOCHS: int = Field(..., description="Pre-training epochs for STDP")
    STDP_FINETUNE_EPOCHS: int = Field(..., description="Fine-tuning epochs for STDP")
    BATCH_SIZE_PRETRAIN: int = Field(..., description="Batch size during STDP pre-training")
    BATCH_SIZE_FINETUNE: int = Field(..., description="Batch size during STDP fine-tuning")
    NUM_NEURONS: int = Field(..., description="Number of neurons in the STDP model")
    MAX_RATE: int = Field(..., description="Maximum rate for STDP")

    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )


class SerializableDict(dict):
    """Dictionary subclass with attribute-style access that can be serialized safely"""

    def __getattr__(self, key):
        # NOTE: returns None (not AttributeError) for missing keys, so
        # getattr(obj, key, default) never falls back to `default`.
        if key in self:
            return self[key]
        return None

    def __setattr__(self, key, value):
        self[key] = value

    def __delattr__(self, key):
        if key in self:
            del self[key]

    # Special methods to handle JSON/pickle serialization of self-references.
    def __getstate__(self):
        """Return state for pickling - exclude config_data if it's self"""
        state = dict(self)
        if 'config_data' in state and id(state['config_data']) == id(self):
            state['config_data'] = '__self__'  # Replace self-reference with marker
        return state

    def __repr__(self):
        """Safe representation that handles circular references"""
        items = []
        for k, v in self.items():
            if k == "config_data" and v is self:
                items.append(f"{k}=")
            else:
                items.append(f"{k}={v!r}")
        return f"{self.__class__.__name__}({', '.join(items)})"


class AppConfig(BaseModel):
    """Main application configuration with proper serialization handling"""

    # which model files to load by default
    SELECTED_MODEL: List[str] = Field(
        default=["model_Custm.py", "model_PrTr.py"],
        description="Default model files (custom first, then pretrained)"
    )
    DATA_DIR: str = Field(default="/tmp/tlm_data", description="Local data directory")
    MODEL_DIR: str = Field(default="/tmp/tlm_data/models", description="Local model weights directory")
    HF_DATASET_URL: str = Field(
        default="https://huggingface.co/datasets/EvolphTech/data",
        description="Remote dataset repository URL"
    )
    HF_WEIGHTS_URL: str = Field(
        default="https://huggingface.co/EvolphTech/Weights",
        description="Remote weights repository URL"
    )
    HF_MODEL_URL: str = Field(
        default="https://huggingface.co/EvolphTech/Wildnerve-tlm01_Hybrid_Model",
        description="Remote model repository URL"
    )
    HF_CHATBOT_SPACE_URL: str = Field(
        default="https://huggingface.co/spaces/EvolphTech/Wildnerve-tlm01-0.05Bx12",
        description="Chatbot Space URL"
    )
    WP_PLUGIN_FILE: str = Field(
        default="wildnerve-chatbot.php",
        description="WordPress chatbot plugin file"
    )
    TRANSFORMER_CONFIG: Dict[str, Any] = Field(
        default_factory=dict,
        description="Transformer configuration overrides"
    )
    SIMILARITY_THRESHOLD: float = Field(default=0.85)
    TOP_K: int = Field(default=3)
    MAX_ACTIVE_MODELS: int = Field(default=2)
    MODEL_IDLE_THRESHOLD: int = Field(default=600)

    # Pydantic model_config to fix serialization issues
    model_config = ConfigDict(
        extra="allow",                   # Allow extra fields not in the model
        arbitrary_types_allowed=True,    # Allow arbitrary types
        populate_by_name=True,           # Allow population by field name
        json_encoders={                  # Custom encoders for non-serializable types
            SerializableDict: lambda v: {k: v[k] for k in v if not k.startswith("_")}
        },
        validate_assignment=False        # Don't validate on attribute assignment
    )


def load_config() -> Union[AppConfig, Dict[str, Any]]:
    """Load configuration from JSON file with robust error handling"""
    config_path = os.path.join(os.path.dirname(__file__), "config.json")
    logger.info(f"Loading config from {config_path}")
    raw_config = {}
    try:
        with open(config_path, "r") as f:
            try:
                raw = json.load(f)
                raw_config = raw  # Save raw config in case Pydantic validation fails
            except json.JSONDecodeError as e:
                logger.error(f"JSON parsing error in config.json: {e}")
                logger.error(f"Error at line {e.lineno}, column {e.colno}: {e.msg}")
                raise

        # Process the TRANSFORMER_CONFIG section
        if isinstance(raw.get("TRANSFORMER_CONFIG"), dict):
            # Create SerializableDict with safe self-reference handling
            transformer_config = SerializableDict(raw["TRANSFORMER_CONFIG"])
            # Add config_data attribute directly
            transformer_config['config_data'] = transformer_config
            # Replace the dict with our enhanced SerializableDict
            raw["TRANSFORMER_CONFIG"] = transformer_config

            # Ensure GPT-2 parameters are set
            if not isinstance(transformer_config.get("VOCAB_SIZE"), int) or transformer_config["VOCAB_SIZE"] != 50257:
                transformer_config["VOCAB_SIZE"] = 50257  # Standard GPT-2 vocab size
            if transformer_config.get("MODEL_NAME") != "gpt2":
                transformer_config["MODEL_NAME"] = "gpt2"
            # Ensure OUTPUT_SIZE matches VOCAB_SIZE
            transformer_config["OUTPUT_SIZE"] = transformer_config["VOCAB_SIZE"]

        # Add generation parameters if missing
        if "GENERATION_CONFIG" not in raw:
            raw["GENERATION_CONFIG"] = {
                "temperature": 0.7,
                "top_p": 0.95,
                "top_k": 50,
                "repetition_penalty": 1.3,
                "no_repeat_ngram_size": 3,
                "do_sample": True,
                "penalty_alpha": 0.6
            }
    except Exception as e:
        logger.error(f"Failed to read config.json: {e}", exc_info=True)
        raise

    # Try to create AppConfig with pydantic validation
    if pydantic_available:
        try:
            cfg = AppConfig(**raw)
            logger.debug("Config loaded successfully")
            return cfg
        except ValidationError as ve:
            logger.error(f"Config validation error: {ve}", exc_info=True)
            # Fall back to returning the raw config as a dict
            logger.warning("Using raw config dictionary due to validation failure")
            return raw_config
    else:
        # If pydantic not available, just return the raw dict
        return raw_config


# Global application config
app_config = load_config()


def _cfg_value(tc, key, default):
    """Fetch `key` from a config object, treating None as missing.

    BUGFIX helper: SerializableDict.__getattr__ returns None for absent keys
    instead of raising AttributeError, so a plain getattr(tc, key, default)
    never applied its default. This makes the fallback actually take effect.
    """
    value = getattr(tc, key, None)
    return default if value is None else value


def get_model_architecture_params():
    """Get model architecture parameters from config file"""
    if hasattr(app_config, "TRANSFORMER_CONFIG"):
        tc = app_config.TRANSFORMER_CONFIG
        # CRITICAL: Use 767 consistently for max_seq_length to match config.json
        return {
            "vocab_size": _cfg_value(tc, "VOCAB_SIZE", 50257),
            "embedding_dim": 768,  # Fixed to 768 for embedding dimensions
            "num_heads": 12,       # 12 heads works with 768 (768/12=64)
            "hidden_dim": 768,     # Fixed to 768 for hidden dimensions
            "num_layers": _cfg_value(tc, "NUM_LAYERS", 12),
            "output_size": _cfg_value(tc, "VOCAB_SIZE", 50257),
            "dropout": _cfg_value(tc, "DROPOUT", 0.1),
            "max_seq_length": 767  # IMPORTANT: Fixed to 767 to match config.json
        }
    else:
        # Default parameters if config not available
        return {
            "vocab_size": 50257,
            "embedding_dim": 768,
            "num_heads": 12,
            "hidden_dim": 768,
            "num_layers": 12,
            "output_size": 50257,
            "dropout": 0.1,
            "max_seq_length": 767  # IMPORTANT: Fixed to 767 to match config.json
        }


if __name__ == "__main__":
    args = argparse.ArgumentParser(description="Tiny Language Model Configuration").parse_args()
    print("Configuration loaded successfully!")