# WildnerveAI's picture
# Upload config.py
# 45c2916 verified
# config.py - 21/02/2025, cleaned up version, 5:14pm, C:\Users\User\OneDrive\Documents\tlm\config.py
import os
import json
import logging
import argparse
from utils import dependency_helpers  # Changed to direct import from utils package
from pathlib import Path
from typing import Optional, Dict, List, Literal, Any, Union

# Configure logging BEFORE any logger call so the pydantic-fallback warning
# below is emitted with the intended format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# Attempt to load pydantic and fall back on dummy types.
# FIX: `import pydantic` now lives inside the try block — previously it ran
# unconditionally before the try/except, so a missing pydantic raised
# ImportError immediately and the fallback below could never trigger.
pydantic_available = True
try:
    import pydantic
    from pydantic import BaseModel, Field, ValidationError, ConfigDict
except ImportError:
    pydantic_available = False
    logger.warning("pydantic not available, using dummy BaseModel")

    class BaseModel:
        """Minimal stand-in for pydantic.BaseModel: stores kwargs as attributes."""
        def __init__(self, **kwargs):
            for k, v in kwargs.items():
                setattr(self, k, v)

    Field = lambda *args, **kwargs: None  # no-op replacement for pydantic.Field
    ValidationError = Exception           # broad alias so `except ValidationError` still works
    ConfigDict = dict                     # plain dict stands in for pydantic.ConfigDict

if pydantic_available:
    logger.info(f"Loaded pydantic v{pydantic.__version__}")
else:
    logger.debug("Operating with dummy pydantic types")
class PathConfig:
    """Resolve project-relative directories, falling back to a temp directory
    when the project tree is not writable (e.g. read-only deployments)."""

    @staticmethod
    def get_project_root() -> Path:
        """Return the directory containing this config module."""
        return Path(__file__).resolve().parent

    @staticmethod
    def get_data_dir() -> Path:
        """Get writable data directory, falling back to temp if needed."""
        # First try in project directory
        project_dir = PathConfig.get_project_root()
        data_dir = project_dir / "data"
        try:
            if not data_dir.exists():
                data_dir.mkdir(parents=True, exist_ok=True)
            # Probe write access with a throwaway file
            test_file = data_dir / ".write_test"
            test_file.touch()
            test_file.unlink()
            return data_dir
        except (PermissionError, IOError):
            # Fall back to temp directory
            import tempfile
            tmp_dir = Path(tempfile.gettempdir()) / "wildnerve_data"
            tmp_dir.mkdir(parents=True, exist_ok=True)
            logger.info("Using temporary directory for data: %s", tmp_dir)
            return tmp_dir

    @staticmethod
    def get_checkpoint_dir() -> Path:
        """Get a writable checkpoint directory, falling back to temp if needed.

        FIX: the returned directory is now created if absent — previously both
        the project path and the temp fallback were returned without being
        created (unlike get_data_dir), so the first checkpoint save could fail.
        """
        project_dir = PathConfig.get_project_root()
        checkpoint_dir = project_dir / "checkpoints"
        if os.access(project_dir, os.W_OK):
            checkpoint_dir.mkdir(parents=True, exist_ok=True)
            return checkpoint_dir
        # If not writable, fall back to temp directory
        import tempfile
        tmp_dir = Path(tempfile.gettempdir()) / "wildnerve_checkpoints"
        tmp_dir.mkdir(parents=True, exist_ok=True)
        return tmp_dir
# Directory constants, resolved once at import time; all paths below derive
# from these. DATA_DIR and CHECKPOINT_DIR may point into the system temp
# directory when the project tree is not writable.
BASE_DIR = PathConfig.get_project_root()
DATA_DIR = PathConfig.get_data_dir()
CHECKPOINT_DIR = PathConfig.get_checkpoint_dir()
# Model architecture parameters (BERT-base sized: 768 hidden units)
INPUT_SIZE = 768  # BERT base hidden size
OUTPUT_SIZE = 768  # Output embedding size
HIDDEN_SIZE = 768  # Hidden layer size
# Supported dataset specializations — each entry is a key of DATASET_PATHS below
SPECIALIZATIONS = [
    "python",
    "rust",
    "solidity",
    "computer",
    "cpp",
    "go",
    "java",
    "javascript",
    "mathematics",
    "nim",
    "other_information",
    "physics"
]
# JSON dataset file names per specialization. Kept as bare file names here;
# the comprehension below resolves each one under DATA_DIR / "data".
# NOTE(review): the resulting paths contain a doubled "data" segment
# (DATA_DIR already ends in "data") — preserved as-is; confirm it is intended.
_DATASET_FILES = {
    "python": [
        "python_mbpp.json",
        "python_programming.json",
        "python_transformer_model.json",
    ],
    "rust": [
        "rust_ai_language_model.json",
        "rust_blockchain.json",
        "rust_mbrp.json",
        "rust_programming.json",
    ],
    "solidity": [
        "solidity_programming.json",
    ],
    "computer": [
        "computer_advanced_debugging.json",
        "computer_agenticAI.json",
        "computer_architecture.json",
        "computer_cloud_security.json",
        "computer_cloudCI-CD.json",
        "computer_creativity.json",
        "computer_crossplatform.json",
        "computer_cybersecurity.json",
        "computer_error_handling_examples.json",
        "computer_gitInstruct.json",
    ],
    "cpp": [
        "cpp_ai_language_model.json",
        "cpp_blockchain.json",
        "cpp_mbcppp.json",
        "cpp_programming.json",
    ],
    "go": [
        "golang_ai_language_model.json",
        "golang_mbgp.json",
        "golang_programming.json",
    ],
    "java": [
        "java_ai_language_model.json",
        "java_blockchain.json",
        "java_mbjp.json",
        "java_programming.json",
        "java_transformer_language_model.json",
    ],
    "javascript": [
        "javascript_chatbot.json",
        "javascript_n_Typescript_backend.json",
        "javascript_n_Typescript_frontend.json",
        "javascript_n_Typescript_programming.json",
    ],
    "mathematics": [
        "mathematics.json",
        "mathematics_training.json",
    ],
    "nim": [
        "nim_ai_language_model.json",
        "nim_blockchain.json",
        "nim_chatbot.json",
        "nim_conversation.json",
        "nim_mbnp.json",
        "nim_programming.json",
    ],
    "other_information": [
        "other_information.json",
    ],
    "physics": [
        "physics_n_engineering.json",
        "physics_n_engineering_applied.json",
        "project_structure.json",
        "python_chatbot_guide.json",
    ],
}

# Each specialization maps to the absolute paths of its JSON files.
DATASET_PATHS = {
    spec: [str(DATA_DIR / "data" / fname) for fname in files]
    for spec, files in _DATASET_FILES.items()
}
# Nested configuration models
class TrainingConfig(BaseModel):
    """Hyperparameters for the training loop (early stopping, epochs, LRs)."""
    PATIENCE: int = Field(..., description="Early stopping patience")
    DELTA: float = Field(..., description="Minimum change in the monitored value")
    VERBOSE: bool = Field(..., description="Verbosity of training logs")
    NUM_EPOCHS: int = Field(..., description="Number of training epochs")
    LEARNING_RATE: float = Field(..., description="Learning rate for optimizer")
    TRANSFORMER_LEARNING_RATE: float = Field(..., description="Learning rate for transformer")
    TRANSFORMER_NUM_EPOCHS: int = Field(..., description="Transformer training epochs")
    # Pydantic v2 settings: re-validate on assignment, tolerate unknown keys
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
class CheckpointConfig(BaseModel):
    """Locations and filename templates for saved model checkpoints."""
    PATH: str = Field(..., description="Checkpoint saving folder")
    BASE_DIR: str = Field(..., description="Base directory for checkpoints")
    TRANSFORMER_FILENAME_FORMAT: str = Field(..., description="Transformer checkpoint filename format")
    SNN_FILENAME_FORMAT: str = Field(..., description="SNN checkpoint filename format")
    # Pydantic v2 settings: re-validate on assignment, tolerate unknown keys
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
class TokenizerConfig(BaseModel):
    """Tokenizer selection and sequence-length / pooling settings."""
    MODEL_NAME: str = Field(..., description="Name of the tokenizer model")
    MAX_SEQ_LENGTH: int = Field(..., description="Maximum length the tokenizer handles")
    POOLING_MODE: str = Field(..., description="Pooling mode for embeddings")
    # Pydantic v2 settings: re-validate on assignment, tolerate unknown keys
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
class DataLoaderConfig(BaseModel):
    """DataLoader parameters (batching, shuffling, worker processes)."""
    SHUFFLE: bool = Field(..., description="Whether to Shuffle the dataset")
    BATCH_SIZE: int = Field(..., description="Batch size for dataloader")
    NUM_WORKERS: int = Field(..., description="Number of workers for dataloader")
    INCLUDE_CRAWL: bool = Field(..., description="Include crawl parameter")
    # Pydantic v2 settings: re-validate on assignment, tolerate unknown keys
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
class GenerationConfig(BaseModel):
    """Text-generation decoding defaults.

    NOTE(review): load_config() injects a raw GENERATION_CONFIG dict with more
    keys (top_k, repetition_penalty, ...) than modeled here; extra="allow"
    keeps them — confirm whether this model should declare them explicitly.
    """
    temperature: float = Field(0.7, description="Decoding temperature.")
    top_p: float = Field(0.9, description="Nucleus sampling probability.")
    num_return_sequences: int = Field(1, description="Number of sequences to generate.")
    # Pydantic v2 settings: re-validate on assignment, tolerate unknown keys
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
class PretrainedLimitsConfig(BaseModel):
    """Hard sequence-length limits of the supported pretrained backbones."""
    GPT2: int = Field(1024, description="Maximum sequence length for GPT-2")
    BERT: int = Field(512, description="Maximum sequence length for BERT")
    # Pydantic v2 settings: re-validate on assignment, tolerate unknown keys
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
class CustomWindowsConfig(BaseModel):
    """Sliding-window attention geometry for the custom model."""
    MAX_SEQ_LENGTH: int = Field(2048, description="Maximum sequence length for custom models")
    WINDOW_SIZE: int = Field(1024, description="Window size for sliding window attention")
    STRIDE: int = Field(512, description="Stride for sliding window attention")
    # Pydantic v2 settings: re-validate on assignment, tolerate unknown keys
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
class AttentionConfig(BaseModel):
    """Combines pretrained limits with custom window settings.

    Both sub-models have full defaults, so default_factory builds them with
    no arguments.
    """
    PRETRAINED_LIMITS: PretrainedLimitsConfig = Field(default_factory=PretrainedLimitsConfig)
    CUSTOM_WINDOWS: CustomWindowsConfig = Field(default_factory=CustomWindowsConfig)
    # Pydantic v2 settings: re-validate on assignment, tolerate unknown keys
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
class TransformerConfig(BaseModel):
    """Full transformer architecture / training configuration.

    NOTE(review): ATTENTION_MECHANISM is an untyped dict; the typed
    AttentionConfig model defined above is not referenced here — confirm
    whether that is intentional.
    """
    ATTENTION_MECHANISM: Dict[str, Any] = Field(
        default={
            "TYPE": "hybrid",
            "WINDOW_SIZE": 1024,
            "STRIDE": 512,
            "USE_MEMORY": True,
            "ATTENTION_TYPES": {
                "SLIDING": True,
                "HIERARCHICAL": True,
                "GLOBAL": True
            }
        },
        description="Attention mechanism configuration"
    )
    BASE_DIR: str = Field(..., description="Base directory for transformer checkpoints")
    TRANSFORMER_FILENAME_FORMAT: str = Field(..., description="Filename format for transformer checkpoints")
    MODEL_NAME: str = Field("bert-base-uncased", description="Name of the primary model from Hugging Face")  # Changed from Wildnerve-tlm01
    NUM_EPOCHS: int = Field(30, description="Number of epochs for transformer training")  # Increased from whatever value was here before
    LEARNING_RATE: float = Field(..., description="Learning rate for transformer")
    BATCH_SIZE: int = Field(..., description="Batch size for transformer training")
    EMBEDDING_DIM: int = Field(..., description="Embedding dimension")
    NUM_HEADS: int = Field(..., description="Number of attention heads")
    HIDDEN_DIM: int = Field(..., description="Hidden dimension")
    NUM_LAYERS: int = Field(..., description="Number of layers")
    DROPOUT: float = Field(..., description="Dropout rate")
    # Lowercase by design, unlike the other fields — preserved for callers
    specialization: Optional[str] = Field(
        default="general",
        description="Specialization type (defaults to 'general')"
    )
    DATASET_PATH: str = Field(..., description="Path to the dataset")
    OUTPUT_SIZE: int = Field(..., description="Size of the output (usually vocab size)")
    MAX_SEQ_LENGTH: int = Field(..., description="Maximum sequence length")
    POOLING_MODE: str = Field(..., description="Pooling mode")
    VOCAB_SIZE: int = Field(..., description="Vocabulary size")
    MAX_RATE: int = Field(..., description="Maximum rate")
    MODE: str = Field(..., description="Model mode")
    MODE2: str = Field(..., description="Secondary mode")
    SHUFFLE: bool = Field(..., description="Shuffle flag for transformer")
    SIMILARITY_THRESHOLD: float = Field(..., description="Similarity threshold for weight sharing")
    USE_PRETRAINED_ENCODER: bool = Field(..., description="Enable pretrained encoder branch")
    # Pydantic v2 settings: re-validate on assignment, tolerate unknown keys
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
class PreprocessingConfig(BaseModel):
    """Text normalization switches applied before tokenization."""
    LOWERCASE: bool = Field(True, description="Convert text to lowercase")
    REMOVE_SPECIAL_CHARACTERS: bool = Field(True, description="Remove special characters from text")
    REPLACE_MULTIPLE_SPACES: bool = Field(True, description="Replace multiple spaces with a single space")
    # Pydantic v2 settings: re-validate on assignment, tolerate unknown keys
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
class STDPConfig(BaseModel):
    """Spike-timing-dependent-plasticity (SNN) training parameters.

    NOTE(review): field naming mixes UPPER_SNAKE, CamelCase (STDPLearningRate)
    and lowercase (firing_rate); preserved because the names are the external
    schema keys.
    """
    WEIGHT_THRESHOLD: float = Field(..., description="Threshold for STDP weight update")
    ACTIVATION_THRESHOLD: float = Field(..., description="Threshold for STDP activation")
    USE_SNN: bool = Field(..., description="Use spiking neural network")
    ALPHA: float = Field(..., description="STDP alpha parameter")
    BETA: float = Field(..., description="STDP beta parameter")
    BASE_DIR: str = Field(..., description="Directory for STDP checkpoints")
    SNN_FILENAME_FORMAT: str = Field(..., description="Filename format for SNN checkpoints")
    STDPLearningRate: float = Field(..., description="STDP learning rate")
    STDPMemDecay: float = Field(..., description="STDP memory decay factor")
    SpikeThreshold: float = Field(..., description="Spike threshold")
    firing_rate: int = Field(..., description="Firing rate")
    MAX_SEQ_LENGTH: int = Field(..., description="Maximum sequence length")
    STDP_PRETRAIN_EPOCHS: int = Field(..., description="Pre-training epochs for STDP")
    STDP_FINETUNE_EPOCHS: int = Field(..., description="Fine-tuning epochs for STDP")
    BATCH_SIZE_PRETRAIN: int = Field(..., description="Batch size during STDP pre-training")
    BATCH_SIZE_FINETUNE: int = Field(..., description="Batch size during STDP fine-tuning")
    NUM_NEURONS: int = Field(..., description="Number of neurons in the STDP model")
    MAX_RATE: int = Field(..., description="Maximum rate for STDP")
    # Pydantic v2 settings: re-validate on assignment, tolerate unknown keys
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
class SerializableDict(dict):
    """Dict with attribute-style access that serializes safely.

    Missing attributes resolve to None rather than raising AttributeError,
    and a self-referencing 'config_data' entry is replaced with the marker
    '__self__' during pickling so the cycle never reaches the serializer.
    """

    def __getattr__(self, name):
        # Only invoked when normal attribute lookup fails; fall through to
        # the mapping, yielding None for unknown keys.
        return self[name] if name in self else None

    def __setattr__(self, name, value):
        self[name] = value

    def __delattr__(self, name):
        if name in self:
            del self[name]

    def __getstate__(self):
        """Return picklable state, swapping a self-reference for a marker."""
        state = dict(self)
        if state.get('config_data') is self:
            state['config_data'] = '__self__'  # Replace self-reference with marker
        return state

    def __repr__(self):
        """Representation that tolerates the circular config_data entry."""
        rendered = ", ".join(
            f"{key}=<self>" if key == "config_data" and value is self
            else f"{key}={value!r}"
            for key, value in self.items()
        )
        return f"{self.__class__.__name__}({rendered})"
class AppConfig(BaseModel):
    """Main application configuration with proper serialization handling.

    Populated from config.json by load_config(); extra="allow" keeps any
    file keys not declared here (e.g. the injected GENERATION_CONFIG).
    """
    # Which model files to load by default
    SELECTED_MODEL: List[str] = Field(
        default=["model_Custm.py", "model_PrTr.py"],
        description="Default model files (custom first, then pretrained)"
    )
    DATA_DIR: str = Field(default="/tmp/tlm_data", description="Local data directory")
    MODEL_DIR: str = Field(default="/tmp/tlm_data/models", description="Local model weights directory")
    HF_DATASET_URL: str = Field(
        default="https://huggingface.co/datasets/EvolphTech/data",
        description="Remote dataset repository URL"
    )
    HF_WEIGHTS_URL: str = Field(
        default="https://huggingface.co/EvolphTech/Weights",
        description="Remote weights repository URL"
    )
    HF_MODEL_URL: str = Field(
        default="https://huggingface.co/EvolphTech/Wildnerve-tlm01_Hybrid_Model",
        description="Remote model repository URL"
    )
    HF_CHATBOT_SPACE_URL: str = Field(
        default="https://huggingface.co/spaces/EvolphTech/Wildnerve-tlm01-0.05Bx12",
        description="Chatbot Space URL"
    )
    WP_PLUGIN_FILE: str = Field(
        default="wildnerve-chatbot.php",
        description="WordPress chatbot plugin file"
    )
    TRANSFORMER_CONFIG: Dict[str, Any] = Field(
        default_factory=dict,
        description="Transformer configuration overrides"
    )
    SIMILARITY_THRESHOLD: float = Field(default=0.85)
    TOP_K: int = Field(default=3)
    MAX_ACTIVE_MODELS: int = Field(default=2)
    MODEL_IDLE_THRESHOLD: int = Field(default=600)
    model_config = ConfigDict(
        extra="allow",  # Allow extra fields not in the model
        arbitrary_types_allowed=True,  # Allow arbitrary types
        populate_by_name=True,  # Allow population by field name
        # NOTE(review): json_encoders is a pydantic v1-era option and is
        # deprecated/ignored under v2 — confirm against the installed version
        json_encoders={
            SerializableDict: lambda v: {k: v[k] for k in v if not k.startswith("_")}
        },
        validate_assignment=False  # Don't validate on attribute assignment
    )
def load_config() -> Union[AppConfig, Dict[str, Any]]:
    """Load configuration from the config.json sitting next to this module.

    Returns:
        A validated AppConfig when pydantic is available and validation
        succeeds; otherwise the raw parsed dict (including the mutations
        applied below, since raw_config aliases the same object).

    Raises:
        Propagates file errors and json.JSONDecodeError — the module-level
        call below therefore fails fast at import time if config.json is
        missing or malformed. TODO confirm fail-fast is intended.
    """
    config_path = os.path.join(os.path.dirname(__file__), "config.json")
    logger.info(f"Loading config from {config_path}")
    raw_config = {}
    try:
        with open(config_path, "r") as f:
            try:
                raw = json.load(f)
                raw_config = raw  # Save raw config in case Pydantic validation fails
            except json.JSONDecodeError as e:
                logger.error(f"JSON parsing error in config.json: {e}")
                logger.error(f"Error at line {e.lineno}, column {e.colno}: {e.msg}")
                raise
        # Process the TRANSFORMER_CONFIG section
        if isinstance(raw.get("TRANSFORMER_CONFIG"), dict):
            # Wrap in SerializableDict so attribute access and the
            # self-reference below serialize safely
            transformer_config = SerializableDict(raw["TRANSFORMER_CONFIG"])
            # Deliberate self-reference; SerializableDict.__getstate__ and
            # __repr__ guard against the resulting cycle
            transformer_config['config_data'] = transformer_config
            # Replace the dict with our enhanced SerializableDict
            raw["TRANSFORMER_CONFIG"] = transformer_config
            # Force GPT-2 parameters regardless of what config.json said
            if not isinstance(transformer_config.get("VOCAB_SIZE"), int) or transformer_config["VOCAB_SIZE"] != 50257:
                transformer_config["VOCAB_SIZE"] = 50257  # Standard GPT-2 vocab size
            if transformer_config.get("MODEL_NAME") != "gpt2":
                transformer_config["MODEL_NAME"] = "gpt2"
            # Ensure OUTPUT_SIZE matches VOCAB_SIZE
            transformer_config["OUTPUT_SIZE"] = transformer_config["VOCAB_SIZE"]
        # Add generation parameters if missing
        if "GENERATION_CONFIG" not in raw:
            raw["GENERATION_CONFIG"] = {
                "temperature": 0.7,
                "top_p": 0.95,
                "top_k": 50,
                "repetition_penalty": 1.3,
                "no_repeat_ngram_size": 3,
                "do_sample": True,
                "penalty_alpha": 0.6
            }
    except Exception as e:
        logger.error(f"Failed to read config.json: {e}", exc_info=True)
        raise
    # Try to create AppConfig with pydantic validation
    if pydantic_available:
        try:
            cfg = AppConfig(**raw)
            logger.debug("Config loaded successfully")
            return cfg
        except ValidationError as ve:
            logger.error(f"Config validation error: {ve}", exc_info=True)
            # Fall back to returning the raw config as a dict
            logger.warning("Using raw config dictionary due to validation failure")
            return raw_config
    else:
        # If pydantic not available, just return the raw dict
        return raw_config
# Global application config, loaded once at import time. NOTE: this raises if
# config.json is missing or malformed, making module import fail fast.
app_config = load_config()
def get_model_architecture_params(config=None):
    """Get model architecture parameters from the loaded config.

    Args:
        config: optional config object/dict to read from; defaults to the
            module-level app_config (parameter added for injection/testing,
            backward compatible).

    Returns:
        dict of architecture parameters. embedding_dim / num_heads /
        hidden_dim are pinned to 768/12/768 and max_seq_length to 767 to
        match config.json, as in the original.

    FIX: TRANSFORMER_CONFIG may be a plain dict (pydantic coerces it to
    Dict[str, Any]) — getattr() never finds dict keys, so the old code
    silently returned defaults. It may also be a SerializableDict, whose
    __getattr__ returns None for missing keys, bypassing getattr defaults
    and leaking None into e.g. vocab_size. _lookup handles both: it reads
    keys or attributes as appropriate and treats None as "missing".
    """
    cfg = config if config is not None else app_config

    # Find the transformer section on either a mapping or an object.
    if isinstance(cfg, dict):
        tc = cfg.get("TRANSFORMER_CONFIG")
    else:
        tc = getattr(cfg, "TRANSFORMER_CONFIG", None)

    def _lookup(key, default):
        """Read key from tc (dict key or attribute), using default when the
        section is absent, the key is missing, or the stored value is None."""
        if tc is None:
            return default
        if isinstance(tc, dict):
            value = tc.get(key, default)
        else:
            value = getattr(tc, key, default)
        return default if value is None else value

    return {
        "vocab_size": _lookup("VOCAB_SIZE", 50257),
        "embedding_dim": 768,  # Fixed to 768 for embedding dimensions
        "num_heads": 12,       # 12 heads works with 768 (768/12=64)
        "hidden_dim": 768,     # Fixed to 768 for hidden dimensions
        "num_layers": _lookup("NUM_LAYERS", 12),
        "output_size": _lookup("VOCAB_SIZE", 50257),
        "dropout": _lookup("DROPOUT", 0.1),
        "max_seq_length": 767  # IMPORTANT: Fixed to 767 to match config.json
    }
if __name__ == "__main__":
    # Parse (currently empty) CLI options so --help works, then confirm that
    # the module-level load_config() above succeeded.
    parser = argparse.ArgumentParser(description="Tiny Language Model Configuration")
    parser.parse_args()
    print("Configuration loaded successfully!")