| |
|
| | import os
|
| | import json
|
| | import logging
|
| | import argparse
|
| | import pydantic
|
| | from utils import dependency_helpers
|
| | from pathlib import Path
|
| | from typing import Optional, Dict, List, Literal, Any, Union
|
| |
|
| |
|
# Tracks whether real pydantic validation is available; flipped off below
# if the import fails.
pydantic_available = True


# Optional pydantic import: fall back to minimal stand-ins so this module
# still imports (with degraded validation) when pydantic is not installed.
try:
    from pydantic import BaseModel, Field, ValidationError, ConfigDict
except ImportError:
    pydantic_available = False
    logger = logging.getLogger(__name__)
    logger.warning("pydantic not available, using dummy BaseModel")

    class BaseModel:
        # Minimal stand-in: stores keyword arguments as attributes; performs
        # no validation or type coercion.
        def __init__(self, **kwargs):
            for k, v in kwargs.items(): setattr(self, k, v)

    # In fallback mode Field() metadata is discarded and every field default
    # collapses to None.
    Field = lambda *args, **kwargs: None
    ValidationError = Exception
    ConfigDict = dict
|
| |
|
| |
|
# Module-wide logging setup: INFO level with a timestamped format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# Report which pydantic mode the module ended up in (real vs dummy types).
if pydantic_available:
    logger.info(f"Loaded pydantic v{pydantic.__version__}")
else:
    logger.debug("Operating with dummy pydantic types")
|
| |
|
class PathConfig:
    """Resolve project-relative directories, falling back to locations
    under the system temp dir when the project tree is not writable."""

    @staticmethod
    def get_project_root() -> Path:
        """Return the directory containing this module."""
        return Path(__file__).resolve().parent

    @staticmethod
    def get_data_dir() -> Path:
        """Get writable data directory, falling back to temp if needed.

        Tries ``<project root>/data`` first, creating it if necessary and
        probing writability with a throwaway marker file; on failure falls
        back to ``<tempdir>/wildnerve_data``.
        """
        project_dir = PathConfig.get_project_root()
        data_dir = project_dir / "data"

        try:
            # exist_ok avoids the check-then-create race of the naive
            # "if not exists(): mkdir()" pattern.
            data_dir.mkdir(parents=True, exist_ok=True)

            # Probe writability explicitly: touching and removing a marker
            # catches read-only mounts that mkdir(exist_ok=True) would not.
            test_file = data_dir / ".write_test"
            test_file.touch()
            test_file.unlink()
            return data_dir
        except OSError:
            # OSError covers the original (PermissionError, IOError) pair:
            # IOError is an alias of OSError and PermissionError a subclass.
            import tempfile
            tmp_dir = Path(tempfile.gettempdir()) / "wildnerve_data"
            tmp_dir.mkdir(parents=True, exist_ok=True)
            logger.info("Using temporary directory for data: %s", tmp_dir)
            return tmp_dir

    @staticmethod
    def get_checkpoint_dir() -> Path:
        """Return the checkpoint directory path (not created here).

        Uses ``<project root>/checkpoints`` when the project directory is
        writable, otherwise ``<tempdir>/wildnerve_checkpoints``.
        """
        project_dir = PathConfig.get_project_root()
        checkpoint_dir = project_dir / "checkpoints"

        if os.access(project_dir, os.W_OK):
            return checkpoint_dir

        import tempfile
        tmp_dir = Path(tempfile.gettempdir()) / "wildnerve_checkpoints"
        return tmp_dir
|
| |
|
| |
|
# Resolved once at import time and shared module-wide.
BASE_DIR = PathConfig.get_project_root()
DATA_DIR = PathConfig.get_data_dir()
CHECKPOINT_DIR = PathConfig.get_checkpoint_dir()


# Default model widths (input/output/hidden all share the 768 dimension).
INPUT_SIZE = 768
OUTPUT_SIZE = 768
HIDDEN_SIZE = 768
|
| |
|
| |
|
# Domain specializations; mirrors the keys of DATASET_PATHS defined below.
SPECIALIZATIONS = [
    "python",
    "rust",
    "solidity",
    "computer",
    "cpp",
    "go",
    "java",
    "javascript",
    "mathematics",
    "nim",
    "other_information",
    "physics"
]
|
| |
|
| |
|
# Mapping: specialization -> list of JSON dataset file paths.
# NOTE(review): every path is DATA_DIR / "data" / <file>, i.e. a nested
# "data/data" layout since DATA_DIR itself typically ends in "data" —
# looks deliberate but verify against the actual dataset checkout layout.
DATASET_PATHS = {
    "python": [
        str(DATA_DIR / "data" / "python_mbpp.json"),
        str(DATA_DIR / "data" / "python_programming.json"),
        str(DATA_DIR / "data" / "python_transformer_model.json")
    ],
    "rust": [
        str(DATA_DIR / "data" / "rust_ai_language_model.json"),
        str(DATA_DIR / "data" / "rust_blockchain.json"),
        str(DATA_DIR / "data" / "rust_mbrp.json"),
        str(DATA_DIR / "data" / "rust_programming.json")
    ],
    "solidity": [
        str(DATA_DIR / "data" / "solidity_programming.json")
    ],
    "computer": [
        str(DATA_DIR / "data" / "computer_advanced_debugging.json"),
        str(DATA_DIR / "data" / "computer_agenticAI.json"),
        str(DATA_DIR / "data" / "computer_architecture.json"),
        str(DATA_DIR / "data" / "computer_cloud_security.json"),
        str(DATA_DIR / "data" / "computer_cloudCI-CD.json"),
        str(DATA_DIR / "data" / "computer_creativity.json"),
        str(DATA_DIR / "data" / "computer_crossplatform.json"),
        str(DATA_DIR / "data" / "computer_cybersecurity.json"),
        str(DATA_DIR / "data" / "computer_error_handling_examples.json"),
        str(DATA_DIR / "data" / "computer_gitInstruct.json")
    ],
    "cpp": [
        str(DATA_DIR / "data" / "cpp_ai_language_model.json"),
        str(DATA_DIR / "data" / "cpp_blockchain.json"),
        str(DATA_DIR / "data" / "cpp_mbcppp.json"),
        str(DATA_DIR / "data" / "cpp_programming.json")
    ],
    "go": [
        str(DATA_DIR / "data" / "golang_ai_language_model.json"),
        str(DATA_DIR / "data" / "golang_mbgp.json"),
        str(DATA_DIR / "data" / "golang_programming.json")
    ],
    "java": [
        str(DATA_DIR / "data" / "java_ai_language_model.json"),
        str(DATA_DIR / "data" / "java_blockchain.json"),
        str(DATA_DIR / "data" / "java_mbjp.json"),
        str(DATA_DIR / "data" / "java_programming.json"),
        str(DATA_DIR / "data" / "java_transformer_language_model.json")
    ],
    "javascript": [
        str(DATA_DIR / "data" / "javascript_chatbot.json"),
        str(DATA_DIR / "data" / "javascript_n_Typescript_backend.json"),
        str(DATA_DIR / "data" / "javascript_n_Typescript_frontend.json"),
        str(DATA_DIR / "data" / "javascript_n_Typescript_programming.json")
    ],
    "mathematics": [
        str(DATA_DIR / "data" / "mathematics.json"),
        str(DATA_DIR / "data" / "mathematics_training.json")
    ],
    "nim": [
        str(DATA_DIR / "data" / "nim_ai_language_model.json"),
        str(DATA_DIR / "data" / "nim_blockchain.json"),
        str(DATA_DIR / "data" / "nim_chatbot.json"),
        str(DATA_DIR / "data" / "nim_conversation.json"),
        str(DATA_DIR / "data" / "nim_mbnp.json"),
        str(DATA_DIR / "data" / "nim_programming.json")
    ],
    "other_information": [
        str(DATA_DIR / "data" / "other_information.json")
    ],
    "physics": [
        str(DATA_DIR / "data" / "physics_n_engineering.json"),
        str(DATA_DIR / "data" / "physics_n_engineering_applied.json"),
        str(DATA_DIR / "data" / "project_structure.json"),
        str(DATA_DIR / "data" / "python_chatbot_guide.json")
    ]
}
|
| |
|
| |
|
class TrainingConfig(BaseModel):
    """Early-stopping and optimization hyperparameters (field semantics
    are given by each Field's description)."""
    PATIENCE: int = Field(..., description="Early stopping patience")
    DELTA: float = Field(..., description="Minimum change in the monitored value")
    VERBOSE: bool = Field(..., description="Verbosity of training logs")
    NUM_EPOCHS: int = Field(..., description="Number of training epochs")
    LEARNING_RATE: float = Field(..., description="Learning rate for optimizer")
    TRANSFORMER_LEARNING_RATE: float = Field(..., description="Learning rate for transformer")
    TRANSFORMER_NUM_EPOCHS: int = Field(..., description="Transformer training epochs")

    # Re-validate on assignment; tolerate unknown keys from config.json.
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
|
| |
|
class CheckpointConfig(BaseModel):
    """Locations and filename templates for model checkpoints."""
    PATH: str = Field(..., description="Checkpoint saving folder")
    BASE_DIR: str = Field(..., description="Base directory for checkpoints")
    TRANSFORMER_FILENAME_FORMAT: str = Field(..., description="Transformer checkpoint filename format")
    SNN_FILENAME_FORMAT: str = Field(..., description="SNN checkpoint filename format")

    # Re-validate on assignment; tolerate unknown keys from config.json.
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
|
| |
|
class TokenizerConfig(BaseModel):
    """Tokenizer model selection and sequence-length settings."""
    MODEL_NAME: str = Field(..., description="Name of the tokenizer model")
    MAX_SEQ_LENGTH: int = Field(..., description="Maximum length the tokenizer handles")
    POOLING_MODE: str = Field(..., description="Pooling mode for embeddings")

    # Re-validate on assignment; tolerate unknown keys from config.json.
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
|
| |
|
class DataLoaderConfig(BaseModel):
    """Batching and shuffling settings for dataset loading."""
    SHUFFLE: bool = Field(..., description="Whether to Shuffle the dataset")
    BATCH_SIZE: int = Field(..., description="Batch size for dataloader")
    NUM_WORKERS: int = Field(..., description="Number of workers for dataloader")
    INCLUDE_CRAWL: bool = Field(..., description="Include crawl parameter")

    # Re-validate on assignment; tolerate unknown keys from config.json.
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
|
| |
|
class GenerationConfig(BaseModel):
    """Text-generation sampling defaults."""
    temperature: float = Field(0.7, description="Decoding temperature.")
    top_p: float = Field(0.9, description="Nucleus sampling probability.")
    num_return_sequences: int = Field(1, description="Number of sequences to generate.")

    # Re-validate on assignment; tolerate unknown keys from config.json.
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
|
| |
|
class PretrainedLimitsConfig(BaseModel):
    """Hard sequence-length limits of the pretrained backbones."""
    GPT2: int = Field(1024, description="Maximum sequence length for GPT-2")
    BERT: int = Field(512, description="Maximum sequence length for BERT")

    # Re-validate on assignment; tolerate unknown keys from config.json.
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
|
| |
|
class CustomWindowsConfig(BaseModel):
    """Sliding-window attention geometry for custom models."""
    MAX_SEQ_LENGTH: int = Field(2048, description="Maximum sequence length for custom models")
    WINDOW_SIZE: int = Field(1024, description="Window size for sliding window attention")
    STRIDE: int = Field(512, description="Stride for sliding window attention")

    # Re-validate on assignment; tolerate unknown keys from config.json.
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
|
| |
|
class AttentionConfig(BaseModel):
    """Groups pretrained sequence limits with custom-window settings.

    default_factory works here because both nested models define
    defaults for every field.
    """
    PRETRAINED_LIMITS: PretrainedLimitsConfig = Field(default_factory=PretrainedLimitsConfig)
    CUSTOM_WINDOWS: CustomWindowsConfig = Field(default_factory=CustomWindowsConfig)

    # Re-validate on assignment; tolerate unknown keys from config.json.
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
|
| |
|
class TransformerConfig(BaseModel):
    """Full transformer model and training configuration.

    Most fields are required (``...``); defaults exist only for the
    attention mechanism, MODEL_NAME, NUM_EPOCHS and specialization.
    """
    # Nested attention settings kept as a free-form dict rather than a
    # typed sub-model (pydantic deep-copies dict defaults per instance).
    ATTENTION_MECHANISM: Dict[str, Any] = Field(
        default={
            "TYPE": "hybrid",
            "WINDOW_SIZE": 1024,
            "STRIDE": 512,
            "USE_MEMORY": True,
            "ATTENTION_TYPES": {
                "SLIDING": True,
                "HIERARCHICAL": True,
                "GLOBAL": True
            }
        },
        description="Attention mechanism configuration"
    )

    BASE_DIR: str = Field(..., description="Base directory for transformer checkpoints")
    TRANSFORMER_FILENAME_FORMAT: str = Field(..., description="Filename format for transformer checkpoints")
    MODEL_NAME: str = Field("bert-base-uncased", description="Name of the primary model from Hugging Face")
    NUM_EPOCHS: int = Field(30, description="Number of epochs for transformer training")
    LEARNING_RATE: float = Field(..., description="Learning rate for transformer")
    BATCH_SIZE: int = Field(..., description="Batch size for transformer training")
    EMBEDDING_DIM: int = Field(..., description="Embedding dimension")
    NUM_HEADS: int = Field(..., description="Number of attention heads")
    HIDDEN_DIM: int = Field(..., description="Hidden dimension")
    NUM_LAYERS: int = Field(..., description="Number of layers")
    DROPOUT: float = Field(..., description="Dropout rate")
    specialization: Optional[str] = Field(
        default="general",
        description="Specialization type (defaults to 'general')"
    )
    DATASET_PATH: str = Field(..., description="Path to the dataset")
    OUTPUT_SIZE: int = Field(..., description="Size of the output (usually vocab size)")
    MAX_SEQ_LENGTH: int = Field(..., description="Maximum sequence length")
    POOLING_MODE: str = Field(..., description="Pooling mode")
    VOCAB_SIZE: int = Field(..., description="Vocabulary size")
    MAX_RATE: int = Field(..., description="Maximum rate")
    MODE: str = Field(..., description="Model mode")
    MODE2: str = Field(..., description="Secondary mode")
    SHUFFLE: bool = Field(..., description="Shuffle flag for transformer")
    SIMILARITY_THRESHOLD: float = Field(..., description="Similarity threshold for weight sharing")
    USE_PRETRAINED_ENCODER: bool = Field(..., description="Enable pretrained encoder branch")

    # Re-validate on assignment; tolerate unknown keys from config.json.
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
|
| |
|
class PreprocessingConfig(BaseModel):
    """Text cleanup switches."""
    LOWERCASE: bool = Field(True, description="Convert text to lowercase")
    REMOVE_SPECIAL_CHARACTERS: bool = Field(True, description="Remove special characters from text")
    REPLACE_MULTIPLE_SPACES: bool = Field(True, description="Replace multiple spaces with a single space")

    # Re-validate on assignment; tolerate unknown keys from config.json.
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
|
| |
|
class STDPConfig(BaseModel):
    """Spiking-network (STDP) training and checkpoint settings.

    All fields are required; semantics per each Field description.
    Note the mixed naming (UPPER_SNAKE vs CamelCase vs lower) is kept
    as-is because it must match the keys in config.json.
    """
    WEIGHT_THRESHOLD: float = Field(..., description="Threshold for STDP weight update")
    ACTIVATION_THRESHOLD: float = Field(..., description="Threshold for STDP activation")
    USE_SNN: bool = Field(..., description="Use spiking neural network")
    ALPHA: float = Field(..., description="STDP alpha parameter")
    BETA: float = Field(..., description="STDP beta parameter")
    BASE_DIR: str = Field(..., description="Directory for STDP checkpoints")
    SNN_FILENAME_FORMAT: str = Field(..., description="Filename format for SNN checkpoints")
    STDPLearningRate: float = Field(..., description="STDP learning rate")
    STDPMemDecay: float = Field(..., description="STDP memory decay factor")
    SpikeThreshold: float = Field(..., description="Spike threshold")
    firing_rate: int = Field(..., description="Firing rate")
    MAX_SEQ_LENGTH: int = Field(..., description="Maximum sequence length")
    STDP_PRETRAIN_EPOCHS: int = Field(..., description="Pre-training epochs for STDP")
    STDP_FINETUNE_EPOCHS: int = Field(..., description="Fine-tuning epochs for STDP")
    BATCH_SIZE_PRETRAIN: int = Field(..., description="Batch size during STDP pre-training")
    BATCH_SIZE_FINETUNE: int = Field(..., description="Batch size during STDP fine-tuning")
    NUM_NEURONS: int = Field(..., description="Number of neurons in the STDP model")
    MAX_RATE: int = Field(..., description="Maximum rate for STDP")

    # Re-validate on assignment; tolerate unknown keys from config.json.
    model_config = ConfigDict(
        validate_assignment=True,
        extra="allow"
    )
|
| |
|
class SerializableDict(dict):
    """Dictionary subclass with attribute-style access that can be
    serialized safely, including a self-referencing ``config_data`` entry.

    Missing keys read as None through attribute access; missing *dunder*
    names raise AttributeError so protocol probes (pickle, copy) behave
    normally instead of receiving a callable-looking None.
    """

    def __getattr__(self, key):
        # Map attribute reads onto dict items; absent plain keys -> None.
        if key in self:
            return self[key]
        if key.startswith('__') and key.endswith('__'):
            # Returning None for e.g. __getnewargs_ex__ would make pickle
            # try to call None; raise like a normal missing attribute.
            raise AttributeError(key)
        return None

    def __setattr__(self, key, value):
        # Attribute writes are stored as dict items.
        self[key] = value

    def __delattr__(self, key):
        # Deleting a missing attribute is a silent no-op (lenient by design).
        if key in self:
            del self[key]

    def __getstate__(self):
        """Return state for pickling - exclude config_data if it's self"""
        state = dict(self)
        if 'config_data' in state and id(state['config_data']) == id(self):
            state['config_data'] = '__self__'
        return state

    def __setstate__(self, state):
        """Restore state produced by __getstate__, re-linking the
        ``config_data`` self-reference.

        Without this, pickle's default BUILD dumps the state (including
        the '__self__' sentinel) into the instance __dict__, where it
        shadows the real dict items on attribute access.
        """
        for key, value in state.items():
            if key == 'config_data' and value == '__self__':
                self[key] = self
            else:
                self[key] = value

    def __repr__(self):
        """Safe representation that handles circular references"""
        items = []
        for k, v in self.items():
            if k == "config_data" and v is self:
                items.append(f"{k}=<self>")
            else:
                items.append(f"{k}={v!r}")
        return f"{self.__class__.__name__}({', '.join(items)})"
|
| |
|
class AppConfig(BaseModel):
    """Main application configuration with proper serialization handling"""

    # Candidate model implementation files, custom-first ordering.
    SELECTED_MODEL: List[str] = Field(
        default=["model_Custm.py", "model_PrTr.py"],
        description="Default model files (custom first, then pretrained)"
    )
    DATA_DIR: str = Field(default="/tmp/tlm_data", description="Local data directory")
    MODEL_DIR: str = Field(default="/tmp/tlm_data/models", description="Local model weights directory")
    # Remote Hugging Face locations for datasets, weights, model and Space.
    HF_DATASET_URL: str = Field(
        default="https://huggingface.co/datasets/EvolphTech/data",
        description="Remote dataset repository URL"
    )
    HF_WEIGHTS_URL: str = Field(
        default="https://huggingface.co/EvolphTech/Weights",
        description="Remote weights repository URL"
    )
    HF_MODEL_URL: str = Field(
        default="https://huggingface.co/EvolphTech/Wildnerve-tlm01_Hybrid_Model",
        description="Remote model repository URL"
    )
    HF_CHATBOT_SPACE_URL: str = Field(
        default="https://huggingface.co/spaces/EvolphTech/Wildnerve-tlm01-0.05Bx12",
        description="Chatbot Space URL"
    )
    WP_PLUGIN_FILE: str = Field(
        default="wildnerve-chatbot.php",
        description="WordPress chatbot plugin file"
    )
    # Free-form dict; load_config() replaces it with a SerializableDict.
    TRANSFORMER_CONFIG: Dict[str, Any] = Field(
        default_factory=dict,
        description="Transformer configuration overrides"
    )
    SIMILARITY_THRESHOLD: float = Field(default=0.85)
    TOP_K: int = Field(default=3)
    MAX_ACTIVE_MODELS: int = Field(default=2)
    MODEL_IDLE_THRESHOLD: int = Field(default=600)

    # validate_assignment is off here so load_config's post-processing can
    # mutate freely; extra="allow" keeps unknown config.json keys.
    # NOTE(review): json_encoders is a pydantic-v1-era hook (deprecated in
    # v2) — confirm it still takes effect with the installed version.
    model_config = ConfigDict(
        extra="allow",
        arbitrary_types_allowed=True,
        populate_by_name=True,
        json_encoders={
            # Drop private ("_"-prefixed) keys when serializing.
            SerializableDict: lambda v: {k: v[k] for k in v if not k.startswith("_")}
        },
        validate_assignment=False
    )
|
| |
|
def load_config() -> Union[AppConfig, Dict[str, Any]]:
    """Load configuration from JSON file with robust error handling.

    Reads ``config.json`` next to this module, normalizes the
    TRANSFORMER_CONFIG section (wrapping it in SerializableDict and
    forcing GPT-2 vocab/model settings), injects generation defaults when
    absent, then validates with pydantic when available.

    Returns:
        An ``AppConfig`` on successful validation, otherwise the raw
        config dict (validation failure or pydantic unavailable).

    Raises:
        OSError: if config.json cannot be read (logged before re-raising).
        json.JSONDecodeError: if config.json is malformed.
    """
    config_path = os.path.join(os.path.dirname(__file__), "config.json")
    logger.info(f"Loading config from {config_path}")
    raw_config = {}

    try:
        # Explicit encoding: JSON is UTF-8 by spec; without it the platform
        # default codec is used and non-ASCII config values can break.
        with open(config_path, "r", encoding="utf-8") as f:
            try:
                raw = json.load(f)
                raw_config = raw
            except json.JSONDecodeError as e:
                logger.error(f"JSON parsing error in config.json: {e}")
                logger.error(f"Error at line {e.lineno}, column {e.colno}: {e.msg}")
                raise

        if isinstance(raw.get("TRANSFORMer_CONFIG".upper()), dict):
            transformer_config = SerializableDict(raw["TRANSFORMER_CONFIG"])

            # Intentional self-reference; SerializableDict knows how to
            # serialize/repr it safely.
            transformer_config['config_data'] = transformer_config
            raw["TRANSFORMER_CONFIG"] = transformer_config

            # Force GPT-2 vocabulary and model name regardless of the file
            # contents; output layer size tracks vocabulary size.
            if not isinstance(transformer_config.get("VOCAB_SIZE"), int) or transformer_config["VOCAB_SIZE"] != 50257:
                transformer_config["VOCAB_SIZE"] = 50257
            if transformer_config.get("MODEL_NAME") != "gpt2":
                transformer_config["MODEL_NAME"] = "gpt2"
            transformer_config["OUTPUT_SIZE"] = transformer_config["VOCAB_SIZE"]

        # Provide sampling defaults when the file omits them.
        if "GENERATION_CONFIG" not in raw:
            raw["GENERATION_CONFIG"] = {
                "temperature": 0.7,
                "top_p": 0.95,
                "top_k": 50,
                "repetition_penalty": 1.3,
                "no_repeat_ngram_size": 3,
                "do_sample": True,
                "penalty_alpha": 0.6
            }
    except Exception as e:
        logger.error(f"Failed to read config.json: {e}", exc_info=True)
        raise

    if pydantic_available:
        try:
            cfg = AppConfig(**raw)
            logger.debug("Config loaded successfully")
            return cfg
        except ValidationError as ve:
            logger.error(f"Config validation error: {ve}", exc_info=True)
            # Degrade gracefully: callers get the raw dict instead of dying.
            logger.warning("Using raw config dictionary due to validation failure")
            return raw_config
    else:
        # No pydantic: the raw dict is the best we can offer.
        return raw_config
|
| |
|
| |
|
# Loaded once at import time; downstream modules import this singleton.
app_config = load_config()
|
| |
|
def _transformer_section(config):
    """Return the TRANSFORMER_CONFIG section of *config*, or None.

    Supports both the validated AppConfig object and the raw-dict
    fallback that load_config() can return.
    """
    if isinstance(config, dict):
        return config.get("TRANSFORMER_CONFIG")
    return getattr(config, "TRANSFORMER_CONFIG", None)


def _lookup(section, key, default):
    """Fetch *key* from a dict-or-object section, treating None as missing.

    pydantic coerces TRANSFORMER_CONFIG to a plain dict, where getattr()
    always returned the default and silently ignored configured values;
    SerializableDict's __getattr__ returns None for absent keys, so a
    plain getattr default never triggered either. This helper handles
    both shapes.
    """
    if isinstance(section, dict):
        value = section.get(key, default)
    else:
        value = getattr(section, key, default)
    return default if value is None else value


def get_model_architecture_params(config=None):
    """Get model architecture parameters from config file.

    Args:
        config: optional config object or dict; defaults to the
            module-level ``app_config`` (parameter added for testability,
            backward compatible — existing zero-arg calls are unchanged).

    Returns:
        dict of architecture hyperparameters; hard-coded defaults are
        used for anything the config does not supply.
    """
    if config is None:
        config = app_config
    tc = _transformer_section(config)

    if tc is not None:
        return {
            "vocab_size": _lookup(tc, "VOCAB_SIZE", 50257),
            "embedding_dim": 768,
            "num_heads": 12,
            "hidden_dim": 768,
            "num_layers": _lookup(tc, "NUM_LAYERS", 12),
            "output_size": _lookup(tc, "VOCAB_SIZE", 50257),
            "dropout": _lookup(tc, "DROPOUT", 0.1),
            # NOTE(review): 767 looks like an off-by-one vs the 768-wide
            # model constants — preserved as-is; confirm intent.
            "max_seq_length": 767
        }
    else:
        # No transformer section at all: fully hard-coded defaults.
        return {
            "vocab_size": 50257,
            "embedding_dim": 768,
            "num_heads": 12,
            "hidden_dim": 768,
            "num_layers": 12,
            "output_size": 50257,
            "dropout": 0.1,
            "max_seq_length": 767
        }
|
| |
|
| | if __name__ == "__main__":
|
| | args = argparse.ArgumentParser(description="Tiny Language Model Configuration").parse_args()
|
| | print("Configuration loaded successfully!") |