# WildnerveAI's picture
# Upload config.py
# 45c2916 verified
# config.py - 21/02/2025, cleaned up version, 5:14pm, C:\Users\User\OneDrive\Documents\tlm\config.py
import os
import json
import logging
import argparse
from utils import dependency_helpers  # Changed to direct import from utils package
from pathlib import Path
from typing import Optional, Dict, List, Literal, Any, Union

# Configure logging BEFORE any logger call so the pydantic-fallback warning
# below is emitted with the intended format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# Attempt to load pydantic and fall back on dummy types.
# FIX: `import pydantic` now lives inside the try block — previously it ran
# unconditionally before the try/except, so a missing pydantic raised
# ImportError immediately and the fallback below could never trigger.
pydantic_available = True
try:
    import pydantic
    from pydantic import BaseModel, Field, ValidationError, ConfigDict
except ImportError:
    pydantic_available = False
    logger.warning("pydantic not available, using dummy BaseModel")

    class BaseModel:
        """Minimal stand-in for pydantic.BaseModel: stores kwargs as attributes."""
        def __init__(self, **kwargs):
            for k, v in kwargs.items():
                setattr(self, k, v)

    Field = lambda *args, **kwargs: None  # no-op replacement for pydantic.Field
    ValidationError = Exception           # broad alias so `except ValidationError` still works
    ConfigDict = dict                     # plain dict stands in for pydantic.ConfigDict

if pydantic_available:
    logger.info(f"Loaded pydantic v{pydantic.__version__}")
else:
    logger.debug("Operating with dummy pydantic types")
class PathConfig:
    """Resolve project-relative directories, falling back to a temp directory
    when the project tree is not writable (e.g. read-only deployments)."""

    @staticmethod
    def get_project_root() -> Path:
        """Return the directory containing this config module."""
        return Path(__file__).resolve().parent

    @staticmethod
    def get_data_dir() -> Path:
        """Get writable data directory, falling back to temp if needed."""
        # First try in project directory
        project_dir = PathConfig.get_project_root()
        data_dir = project_dir / "data"
        try:
            if not data_dir.exists():
                data_dir.mkdir(parents=True, exist_ok=True)
            # Probe write access with a throwaway file
            test_file = data_dir / ".write_test"
            test_file.touch()
            test_file.unlink()
            return data_dir
        except (PermissionError, IOError):
            # Fall back to temp directory
            import tempfile
            tmp_dir = Path(tempfile.gettempdir()) / "wildnerve_data"
            tmp_dir.mkdir(parents=True, exist_ok=True)
            logger.info("Using temporary directory for data: %s", tmp_dir)
            return tmp_dir

    @staticmethod
    def get_checkpoint_dir() -> Path:
        """Get a writable checkpoint directory, falling back to temp if needed.

        FIX: the returned directory is now created if absent — previously both
        the project path and the temp fallback were returned without being
        created (unlike get_data_dir), so the first checkpoint save could fail.
        """
        project_dir = PathConfig.get_project_root()
        checkpoint_dir = project_dir / "checkpoints"
        if os.access(project_dir, os.W_OK):
            checkpoint_dir.mkdir(parents=True, exist_ok=True)
            return checkpoint_dir
        # If not writable, fall back to temp directory
        import tempfile
        tmp_dir = Path(tempfile.gettempdir()) / "wildnerve_checkpoints"
        tmp_dir.mkdir(parents=True, exist_ok=True)
        return tmp_dir
# Directory constants, resolved once at import time; all paths below derive
# from these. DATA_DIR and CHECKPOINT_DIR may point into the system temp
# directory when the project tree is not writable.
BASE_DIR = PathConfig.get_project_root()
DATA_DIR = PathConfig.get_data_dir()
CHECKPOINT_DIR = PathConfig.get_checkpoint_dir()
# Model architecture parameters (BERT-base sized: 768 hidden units)
INPUT_SIZE = 768  # BERT base hidden size
OUTPUT_SIZE = 768  # Output embedding size
HIDDEN_SIZE = 768  # Hidden layer size
# Supported dataset specializations — each entry is a key of DATASET_PATHS below
SPECIALIZATIONS = [
    "python",
    "rust",
    "solidity",
    "computer",
    "cpp",
    "go",
    "java",
    "javascript",
    "mathematics",
    "nim",
    "other_information",
    "physics"
]
# JSON dataset file names per specialization. Kept as bare file names here;
# the comprehension below resolves each one under DATA_DIR / "data".
# NOTE(review): the resulting paths contain a doubled "data" segment
# (DATA_DIR already ends in "data") — preserved as-is; confirm it is intended.
_DATASET_FILES = {
    "python": [
        "python_mbpp.json",
        "python_programming.json",
        "python_transformer_model.json",
    ],
    "rust": [
        "rust_ai_language_model.json",
        "rust_blockchain.json",
        "rust_mbrp.json",
        "rust_programming.json",
    ],
    "solidity": [
        "solidity_programming.json",
    ],
    "computer": [
        "computer_advanced_debugging.json",
        "computer_agenticAI.json",
        "computer_architecture.json",
        "computer_cloud_security.json",
        "computer_cloudCI-CD.json",
        "computer_creativity.json",
        "computer_crossplatform.json",
        "computer_cybersecurity.json",
        "computer_error_handling_examples.json",
        "computer_gitInstruct.json",
    ],
    "cpp": [
        "cpp_ai_language_model.json",
        "cpp_blockchain.json",
        "cpp_mbcppp.json",
        "cpp_programming.json",
    ],
    "go": [
        "golang_ai_language_model.json",
        "golang_mbgp.json",
        "golang_programming.json",
    ],
    "java": [
        "java_ai_language_model.json",
        "java_blockchain.json",
        "java_mbjp.json",
        "java_programming.json",
        "java_transformer_language_model.json",
    ],
    "javascript": [
        "javascript_chatbot.json",
        "javascript_n_Typescript_backend.json",
        "javascript_n_Typescript_frontend.json",
        "javascript_n_Typescript_programming.json",
    ],
    "mathematics": [
        "mathematics.json",
        "mathematics_training.json",
    ],
    "nim": [
        "nim_ai_language_model.json",
        "nim_blockchain.json",
        "nim_chatbot.json",
        "nim_conversation.json",
        "nim_mbnp.json",
        "nim_programming.json",
    ],
    "other_information": [
        "other_information.json",
    ],
    "physics": [
        "physics_n_engineering.json",
        "physics_n_engineering_applied.json",
        "project_structure.json",
        "python_chatbot_guide.json",
    ],
}

# Each specialization maps to the absolute paths of its JSON files.
DATASET_PATHS = {
    spec: [str(DATA_DIR / "data" / fname) for fname in files]
    for spec, files in _DATASET_FILES.items()
}
# Nested configuration models
class TrainingConfig(BaseModel):
    """Hyperparameters for the training loop (early stopping, epochs, LRs)."""
    PATIENCE: int = Field(..., description="Early stopping patience")
    DELTA: float = Field(..., description="Minimum change in the monitored value")
    VERBOSE: bool = Field(..., description="Verbosity of training logs")
    NUM_EPOCHS: int = Field(..., description="Number of training epochs")
    LEARNING_RATE: float = Field(..., description="Learning rate for optimizer")
    TRANSFORMER_LEARNING_RATE: float = Field(..., description="Learning rate for transformer")
    TRANSFORMER_NUM_EPOCHS: int = Field(..., description="Transformer training epochs")
    # Pydantic v2 settings: re-validate on assignment, tolerate unknown keys
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
class CheckpointConfig(BaseModel):
    """Locations and filename templates for saved model checkpoints."""
    PATH: str = Field(..., description="Checkpoint saving folder")
    BASE_DIR: str = Field(..., description="Base directory for checkpoints")
    TRANSFORMER_FILENAME_FORMAT: str = Field(..., description="Transformer checkpoint filename format")
    SNN_FILENAME_FORMAT: str = Field(..., description="SNN checkpoint filename format")
    # Pydantic v2 settings: re-validate on assignment, tolerate unknown keys
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
class TokenizerConfig(BaseModel):
    """Tokenizer selection and sequence-length / pooling settings."""
    MODEL_NAME: str = Field(..., description="Name of the tokenizer model")
    MAX_SEQ_LENGTH: int = Field(..., description="Maximum length the tokenizer handles")
    POOLING_MODE: str = Field(..., description="Pooling mode for embeddings")
    # Pydantic v2 settings: re-validate on assignment, tolerate unknown keys
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
class DataLoaderConfig(BaseModel):
    """DataLoader parameters (batching, shuffling, worker processes)."""
    SHUFFLE: bool = Field(..., description="Whether to Shuffle the dataset")
    BATCH_SIZE: int = Field(..., description="Batch size for dataloader")
    NUM_WORKERS: int = Field(..., description="Number of workers for dataloader")
    INCLUDE_CRAWL: bool = Field(..., description="Include crawl parameter")
    # Pydantic v2 settings: re-validate on assignment, tolerate unknown keys
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
class GenerationConfig(BaseModel):
    """Text-generation decoding defaults.

    NOTE(review): load_config() injects a raw GENERATION_CONFIG dict with more
    keys (top_k, repetition_penalty, ...) than modeled here; extra="allow"
    keeps them — confirm whether this model should declare them explicitly.
    """
    temperature: float = Field(0.7, description="Decoding temperature.")
    top_p: float = Field(0.9, description="Nucleus sampling probability.")
    num_return_sequences: int = Field(1, description="Number of sequences to generate.")
    # Pydantic v2 settings: re-validate on assignment, tolerate unknown keys
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
class PretrainedLimitsConfig(BaseModel):
    """Hard sequence-length limits of the supported pretrained backbones."""
    GPT2: int = Field(1024, description="Maximum sequence length for GPT-2")
    BERT: int = Field(512, description="Maximum sequence length for BERT")
    # Pydantic v2 settings: re-validate on assignment, tolerate unknown keys
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
class CustomWindowsConfig(BaseModel):
    """Sliding-window attention geometry for the custom model."""
    MAX_SEQ_LENGTH: int = Field(2048, description="Maximum sequence length for custom models")
    WINDOW_SIZE: int = Field(1024, description="Window size for sliding window attention")
    STRIDE: int = Field(512, description="Stride for sliding window attention")
    # Pydantic v2 settings: re-validate on assignment, tolerate unknown keys
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
class AttentionConfig(BaseModel):
    """Combines pretrained limits with custom window settings.

    Both sub-models have full defaults, so default_factory builds them with
    no arguments.
    """
    PRETRAINED_LIMITS: PretrainedLimitsConfig = Field(default_factory=PretrainedLimitsConfig)
    CUSTOM_WINDOWS: CustomWindowsConfig = Field(default_factory=CustomWindowsConfig)
    # Pydantic v2 settings: re-validate on assignment, tolerate unknown keys
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
class TransformerConfig(BaseModel):
    """Full transformer architecture / training configuration.

    NOTE(review): ATTENTION_MECHANISM is an untyped dict; the typed
    AttentionConfig model defined above is not referenced here — confirm
    whether that is intentional.
    """
    ATTENTION_MECHANISM: Dict[str, Any] = Field(
        default={
            "TYPE": "hybrid",
            "WINDOW_SIZE": 1024,
            "STRIDE": 512,
            "USE_MEMORY": True,
            "ATTENTION_TYPES": {
                "SLIDING": True,
                "HIERARCHICAL": True,
                "GLOBAL": True
            }
        },
        description="Attention mechanism configuration"
    )
    BASE_DIR: str = Field(..., description="Base directory for transformer checkpoints")
    TRANSFORMER_FILENAME_FORMAT: str = Field(..., description="Filename format for transformer checkpoints")
    MODEL_NAME: str = Field("bert-base-uncased", description="Name of the primary model from Hugging Face")  # Changed from Wildnerve-tlm01
    NUM_EPOCHS: int = Field(30, description="Number of epochs for transformer training")  # Increased from whatever value was here before
    LEARNING_RATE: float = Field(..., description="Learning rate for transformer")
    BATCH_SIZE: int = Field(..., description="Batch size for transformer training")
    EMBEDDING_DIM: int = Field(..., description="Embedding dimension")
    NUM_HEADS: int = Field(..., description="Number of attention heads")
    HIDDEN_DIM: int = Field(..., description="Hidden dimension")
    NUM_LAYERS: int = Field(..., description="Number of layers")
    DROPOUT: float = Field(..., description="Dropout rate")
    # Lowercase by design, unlike the other fields — preserved for callers
    specialization: Optional[str] = Field(
        default="general",
        description="Specialization type (defaults to 'general')"
    )
    DATASET_PATH: str = Field(..., description="Path to the dataset")
    OUTPUT_SIZE: int = Field(..., description="Size of the output (usually vocab size)")
    MAX_SEQ_LENGTH: int = Field(..., description="Maximum sequence length")
    POOLING_MODE: str = Field(..., description="Pooling mode")
    VOCAB_SIZE: int = Field(..., description="Vocabulary size")
    MAX_RATE: int = Field(..., description="Maximum rate")
    MODE: str = Field(..., description="Model mode")
    MODE2: str = Field(..., description="Secondary mode")
    SHUFFLE: bool = Field(..., description="Shuffle flag for transformer")
    SIMILARITY_THRESHOLD: float = Field(..., description="Similarity threshold for weight sharing")
    USE_PRETRAINED_ENCODER: bool = Field(..., description="Enable pretrained encoder branch")
    # Pydantic v2 settings: re-validate on assignment, tolerate unknown keys
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
class PreprocessingConfig(BaseModel):
    """Text normalization switches applied before tokenization."""
    LOWERCASE: bool = Field(True, description="Convert text to lowercase")
    REMOVE_SPECIAL_CHARACTERS: bool = Field(True, description="Remove special characters from text")
    REPLACE_MULTIPLE_SPACES: bool = Field(True, description="Replace multiple spaces with a single space")
    # Pydantic v2 settings: re-validate on assignment, tolerate unknown keys
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
class STDPConfig(BaseModel):
    """Spike-timing-dependent-plasticity (SNN) training parameters.

    NOTE(review): field naming mixes UPPER_SNAKE, CamelCase (STDPLearningRate)
    and lowercase (firing_rate); preserved because the names are the external
    schema keys.
    """
    WEIGHT_THRESHOLD: float = Field(..., description="Threshold for STDP weight update")
    ACTIVATION_THRESHOLD: float = Field(..., description="Threshold for STDP activation")
    USE_SNN: bool = Field(..., description="Use spiking neural network")
    ALPHA: float = Field(..., description="STDP alpha parameter")
    BETA: float = Field(..., description="STDP beta parameter")
    BASE_DIR: str = Field(..., description="Directory for STDP checkpoints")
    SNN_FILENAME_FORMAT: str = Field(..., description="Filename format for SNN checkpoints")
    STDPLearningRate: float = Field(..., description="STDP learning rate")
    STDPMemDecay: float = Field(..., description="STDP memory decay factor")
    SpikeThreshold: float = Field(..., description="Spike threshold")
    firing_rate: int = Field(..., description="Firing rate")
    MAX_SEQ_LENGTH: int = Field(..., description="Maximum sequence length")
    STDP_PRETRAIN_EPOCHS: int = Field(..., description="Pre-training epochs for STDP")
    STDP_FINETUNE_EPOCHS: int = Field(..., description="Fine-tuning epochs for STDP")
    BATCH_SIZE_PRETRAIN: int = Field(..., description="Batch size during STDP pre-training")
    BATCH_SIZE_FINETUNE: int = Field(..., description="Batch size during STDP fine-tuning")
    NUM_NEURONS: int = Field(..., description="Number of neurons in the STDP model")
    MAX_RATE: int = Field(..., description="Maximum rate for STDP")
    # Pydantic v2 settings: re-validate on assignment, tolerate unknown keys
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
class SerializableDict(dict):
    """Dict with attribute-style access that serializes safely.

    Missing attributes resolve to None rather than raising AttributeError,
    and a self-referencing 'config_data' entry is replaced with the marker
    '__self__' during pickling so the cycle never reaches the serializer.
    """

    def __getattr__(self, name):
        # Only invoked when normal attribute lookup fails; fall through to
        # the mapping, yielding None for unknown keys.
        return self[name] if name in self else None

    def __setattr__(self, name, value):
        self[name] = value

    def __delattr__(self, name):
        if name in self:
            del self[name]

    def __getstate__(self):
        """Return picklable state, swapping a self-reference for a marker."""
        state = dict(self)
        if state.get('config_data') is self:
            state['config_data'] = '__self__'  # Replace self-reference with marker
        return state

    def __repr__(self):
        """Representation that tolerates the circular config_data entry."""
        rendered = ", ".join(
            f"{key}=<self>" if key == "config_data" and value is self
            else f"{key}={value!r}"
            for key, value in self.items()
        )
        return f"{self.__class__.__name__}({rendered})"
class AppConfig(BaseModel):
    """Main application configuration with proper serialization handling.

    Populated from config.json by load_config(); extra="allow" keeps any
    file keys not declared here (e.g. the injected GENERATION_CONFIG).
    """
    # Which model files to load by default
    SELECTED_MODEL: List[str] = Field(
        default=["model_Custm.py", "model_PrTr.py"],
        description="Default model files (custom first, then pretrained)"
    )
    DATA_DIR: str = Field(default="/tmp/tlm_data", description="Local data directory")
    MODEL_DIR: str = Field(default="/tmp/tlm_data/models", description="Local model weights directory")
    HF_DATASET_URL: str = Field(
        default="https://huggingface.co/datasets/EvolphTech/data",
        description="Remote dataset repository URL"
    )
    HF_WEIGHTS_URL: str = Field(
        default="https://huggingface.co/EvolphTech/Weights",
        description="Remote weights repository URL"
    )
    HF_MODEL_URL: str = Field(
        default="https://huggingface.co/EvolphTech/Wildnerve-tlm01_Hybrid_Model",
        description="Remote model repository URL"
    )
    HF_CHATBOT_SPACE_URL: str = Field(
        default="https://huggingface.co/spaces/EvolphTech/Wildnerve-tlm01-0.05Bx12",
        description="Chatbot Space URL"
    )
    WP_PLUGIN_FILE: str = Field(
        default="wildnerve-chatbot.php",
        description="WordPress chatbot plugin file"
    )
    TRANSFORMER_CONFIG: Dict[str, Any] = Field(
        default_factory=dict,
        description="Transformer configuration overrides"
    )
    SIMILARITY_THRESHOLD: float = Field(default=0.85)
    TOP_K: int = Field(default=3)
    MAX_ACTIVE_MODELS: int = Field(default=2)
    MODEL_IDLE_THRESHOLD: int = Field(default=600)
    model_config = ConfigDict(
        extra="allow",  # Allow extra fields not in the model
        arbitrary_types_allowed=True,  # Allow arbitrary types
        populate_by_name=True,  # Allow population by field name
        # NOTE(review): json_encoders is a pydantic v1-era option and is
        # deprecated/ignored under v2 — confirm against the installed version
        json_encoders={
            SerializableDict: lambda v: {k: v[k] for k in v if not k.startswith("_")}
        },
        validate_assignment=False  # Don't validate on attribute assignment
    )
def load_config() -> Union[AppConfig, Dict[str, Any]]:
    """Load configuration from the config.json sitting next to this module.

    Returns:
        A validated AppConfig when pydantic is available and validation
        succeeds; otherwise the raw parsed dict (including the mutations
        applied below, since raw_config aliases the same object).

    Raises:
        Propagates file errors and json.JSONDecodeError — the module-level
        call below therefore fails fast at import time if config.json is
        missing or malformed. TODO confirm fail-fast is intended.
    """
    config_path = os.path.join(os.path.dirname(__file__), "config.json")
    logger.info(f"Loading config from {config_path}")
    raw_config = {}
    try:
        with open(config_path, "r") as f:
            try:
                raw = json.load(f)
                raw_config = raw  # Save raw config in case Pydantic validation fails
            except json.JSONDecodeError as e:
                logger.error(f"JSON parsing error in config.json: {e}")
                logger.error(f"Error at line {e.lineno}, column {e.colno}: {e.msg}")
                raise
        # Process the TRANSFORMER_CONFIG section
        if isinstance(raw.get("TRANSFORMER_CONFIG"), dict):
            # Wrap in SerializableDict so attribute access and the
            # self-reference below serialize safely
            transformer_config = SerializableDict(raw["TRANSFORMER_CONFIG"])
            # Deliberate self-reference; SerializableDict.__getstate__ and
            # __repr__ guard against the resulting cycle
            transformer_config['config_data'] = transformer_config
            # Replace the dict with our enhanced SerializableDict
            raw["TRANSFORMER_CONFIG"] = transformer_config
            # Force GPT-2 parameters regardless of what config.json said
            if not isinstance(transformer_config.get("VOCAB_SIZE"), int) or transformer_config["VOCAB_SIZE"] != 50257:
                transformer_config["VOCAB_SIZE"] = 50257  # Standard GPT-2 vocab size
            if transformer_config.get("MODEL_NAME") != "gpt2":
                transformer_config["MODEL_NAME"] = "gpt2"
            # Ensure OUTPUT_SIZE matches VOCAB_SIZE
            transformer_config["OUTPUT_SIZE"] = transformer_config["VOCAB_SIZE"]
        # Add generation parameters if missing
        if "GENERATION_CONFIG" not in raw:
            raw["GENERATION_CONFIG"] = {
                "temperature": 0.7,
                "top_p": 0.95,
                "top_k": 50,
                "repetition_penalty": 1.3,
                "no_repeat_ngram_size": 3,
                "do_sample": True,
                "penalty_alpha": 0.6
            }
    except Exception as e:
        logger.error(f"Failed to read config.json: {e}", exc_info=True)
        raise
    # Try to create AppConfig with pydantic validation
    if pydantic_available:
        try:
            cfg = AppConfig(**raw)
            logger.debug("Config loaded successfully")
            return cfg
        except ValidationError as ve:
            logger.error(f"Config validation error: {ve}", exc_info=True)
            # Fall back to returning the raw config as a dict
            logger.warning("Using raw config dictionary due to validation failure")
            return raw_config
    else:
        # If pydantic not available, just return the raw dict
        return raw_config
# Global application config, loaded once at import time. NOTE: this raises if
# config.json is missing or malformed, making module import fail fast.
app_config = load_config()
def get_model_architecture_params(config=None):
    """Get model architecture parameters from the loaded config.

    Args:
        config: optional config object/dict to read from; defaults to the
            module-level app_config (parameter added for injection/testing,
            backward compatible).

    Returns:
        dict of architecture parameters. embedding_dim / num_heads /
        hidden_dim are pinned to 768/12/768 and max_seq_length to 767 to
        match config.json, as in the original.

    FIX: TRANSFORMER_CONFIG may be a plain dict (pydantic coerces it to
    Dict[str, Any]) — getattr() never finds dict keys, so the old code
    silently returned defaults. It may also be a SerializableDict, whose
    __getattr__ returns None for missing keys, bypassing getattr defaults
    and leaking None into e.g. vocab_size. _lookup handles both: it reads
    keys or attributes as appropriate and treats None as "missing".
    """
    cfg = config if config is not None else app_config

    # Find the transformer section on either a mapping or an object.
    if isinstance(cfg, dict):
        tc = cfg.get("TRANSFORMER_CONFIG")
    else:
        tc = getattr(cfg, "TRANSFORMER_CONFIG", None)

    def _lookup(key, default):
        """Read key from tc (dict key or attribute), using default when the
        section is absent, the key is missing, or the stored value is None."""
        if tc is None:
            return default
        if isinstance(tc, dict):
            value = tc.get(key, default)
        else:
            value = getattr(tc, key, default)
        return default if value is None else value

    return {
        "vocab_size": _lookup("VOCAB_SIZE", 50257),
        "embedding_dim": 768,  # Fixed to 768 for embedding dimensions
        "num_heads": 12,       # 12 heads works with 768 (768/12=64)
        "hidden_dim": 768,     # Fixed to 768 for hidden dimensions
        "num_layers": _lookup("NUM_LAYERS", 12),
        "output_size": _lookup("VOCAB_SIZE", 50257),
        "dropout": _lookup("DROPOUT", 0.1),
        "max_seq_length": 767  # IMPORTANT: Fixed to 767 to match config.json
    }
if __name__ == "__main__":
    # Parse (currently empty) CLI options so --help works, then confirm that
    # the module-level load_config() above succeeded.
    parser = argparse.ArgumentParser(description="Tiny Language Model Configuration")
    parser.parse_args()
    print("Configuration loaded successfully!")