Upload 20 files
Browse files
- utils/__init__.py +18 -0
- utils/attention_connector.py +135 -0
- utils/attention_trigger_system.py +198 -0
- utils/collator.py +100 -0
- utils/convert_checkpoints.py +253 -0
- utils/debug_helper.py +124 -0
- utils/dual_encoder_utils.py +151 -0
- utils/emergency_abort.py +65 -0
- utils/event_bus.py +48 -0
- utils/event_system.py +169 -0
- utils/gpu_config_optimizer.py +143 -0
- utils/model_utils.py +84 -0
- utils/nltk_stub.py +119 -0
- utils/output_formatter.py +175 -0
- utils/prepare_hf_training.py +149 -0
- utils/prepare_hf_transformer_training.py +335 -0
- utils/sentence_transformer_utils.py +41 -0
- utils/smartHybridAttention.py +675 -0
- utils/tokenizer_utils.py +147 -0
- utils/transformer_utils.py +160 -0
utils/__init__.py
CHANGED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Utils package initialization: path setup and public re-exports."""
import os
import sys

# Make the project root importable so sibling top-level modules resolve.
_PACKAGE_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.dirname(_PACKAGE_DIR))

from .transformer_utils import get_tokenizer, get_sentence_transformer

# SmartHybridAttention is optional: try the package-qualified import first,
# then the bare top-level module, and finally fall back to None placeholders.
try:
    from utils.smartHybridAttention import SmartHybridAttention, get_hybrid_attention_config
except ImportError:
    try:
        from smartHybridAttention import SmartHybridAttention, get_hybrid_attention_config
    except ImportError:
        print("Warning: Could not import SmartHybridAttention")
        SmartHybridAttention = None
        get_hybrid_attention_config = None

__all__ = [
    'get_tokenizer',
    'get_sentence_transformer',
    'SmartHybridAttention',
    'get_hybrid_attention_config',
]
utils/attention_connector.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import logging
|
| 4 |
+
from typing import Dict, Any, Optional, List, Tuple, Union
|
| 5 |
+
import torch
|
| 6 |
+
|
| 7 |
+
# Conditional import for attention profile selector
|
| 8 |
+
try:
|
| 9 |
+
from attention_trigger_system import AttentionProfileSelector
|
| 10 |
+
ATTENTION_SELECTOR_AVAILABLE = True
|
| 11 |
+
except ImportError:
|
| 12 |
+
ATTENTION_SELECTOR_AVAILABLE = False
|
| 13 |
+
logging.warning("AttentionProfileSelector not available - content-aware attention disabled")
|
| 14 |
+
|
| 15 |
+
class AttentionConnector:
    """
    Connects the core architecture with content-aware attention mechanisms.

    This class serves as the integration layer between:
    1. Original text from user input
    2. The attention configuration system
    3. The SmartHybridAttention implementation
    """

    def __init__(self, config_path: Optional[str] = None):
        """Initialize the connector.

        Args:
            config_path: Path to the attention configuration JSON. Defaults
                to ``attention_configuration.json`` next to this module.
        """
        self.logger = logging.getLogger(__name__)

        if config_path is None:
            self.config_path = os.path.join(os.path.dirname(__file__), "attention_configuration.json")
        else:
            self.config_path = config_path

        # None when the selector module is unavailable or fails to load;
        # every public method degrades gracefully in that case.
        self.profile_selector = self._init_profile_selector()

        # Current-request state. NOTE(review): these are plain instance
        # attributes shared by every thread using this connector instance —
        # not actual thread-local storage, despite the original comment.
        self.current_input_text = None
        self.current_context = {}
        self.active_profile_id = "standard"
        self.profile_confidence = 1.0

    def _init_profile_selector(self) -> Optional[Any]:
        """Initialize the attention profile selector, or return None."""
        if not ATTENTION_SELECTOR_AVAILABLE:
            self.logger.warning("AttentionProfileSelector not available - using default attention")
            return None

        try:
            selector = AttentionProfileSelector(self.config_path)
            # Lazy %-style args avoid string formatting when INFO is disabled.
            self.logger.info("Initialized AttentionProfileSelector with %d profiles", len(selector.profiles))
            return selector
        except Exception as e:
            self.logger.error("Error initializing AttentionProfileSelector: %s", e)
            return None

    def set_input_text(self, text: str, context: Optional[Dict[str, Any]] = None):
        """Record the current input text/context and re-select the profile.

        Args:
            text: The raw user input for the upcoming forward pass.
            context: Optional extra context (e.g. an explicit profile request).
        """
        self.current_input_text = text
        self.current_context = context or {}

        # With a selector present, pick the profile that fits this input.
        if self.profile_selector:
            self.active_profile_id, self.profile_confidence = self.profile_selector.select_profile(
                text, self.current_context
            )
            self.logger.info(
                "Selected attention profile: %s (confidence: %.2f)",
                self.active_profile_id, self.profile_confidence,
            )

    def get_attention_parameters(self) -> Dict[str, Any]:
        """Return parameters for the active profile ({} when no selector)."""
        if not self.profile_selector:
            return {}

        return self.profile_selector.get_profile_parameters(self.active_profile_id)

    def inject_attention_parameters(self, attention_module: Any) -> Any:
        """Apply the active profile's parameters to ``attention_module``.

        The module must expose ``set_parameters(**kwargs)``; otherwise it is
        returned unchanged after logging a warning.
        """
        if not hasattr(attention_module, 'set_parameters'):
            self.logger.warning("Attention module does not support parameter injection")
            return attention_module

        params = self.get_attention_parameters()
        attention_module.set_parameters(**params)
        return attention_module

    def get_input_context(self) -> Dict[str, Any]:
        """Get the current input context for the attention mechanism."""
        return {
            "input_text": self.current_input_text,
            "context": self.current_context,
            "profile_id": self.active_profile_id,
            "confidence": self.profile_confidence
        }
| 94 |
+
|
| 95 |
+
# Lazily-created, process-wide connector shared by the hook functions below.
_connector_instance = None

def get_attention_connector() -> AttentionConnector:
    """Return the shared AttentionConnector, creating it on first use."""
    global _connector_instance
    if _connector_instance is None:
        _connector_instance = AttentionConnector()
    return _connector_instance
| 104 |
+
|
| 105 |
+
# Hook functions to integrate with existing architecture

def inject_input_text(input_text: str, model_forward_kwargs: Dict[str, Any]) -> Dict[str, Any]:
    """Record ``input_text`` on the global connector and mirror it in kwargs.

    Intended to be called in the communicator just before forwarding to the
    model. The kwargs dict is mutated in place and also returned.
    """
    get_attention_connector().set_input_text(input_text)

    # Models that accept the raw text directly read it from this key.
    model_forward_kwargs["original_text"] = input_text
    return model_forward_kwargs
| 119 |
+
|
| 120 |
+
def prepare_attention_context(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor) -> Dict[str, Any]:
    """Build the attention-context dict used inside an attention forward().

    Combines the global connector's current input context with the shapes of
    the query/key/value tensors for the attention mechanism's reference.
    """
    ctx = get_attention_connector().get_input_context()

    # Record tensor shapes alongside the textual context.
    ctx["query_shape"] = query.shape
    ctx["key_shape"] = key.shape
    ctx["value_shape"] = value.shape
    return ctx
utils/attention_trigger_system.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
from typing import Dict, Any, Optional, List, Tuple
|
| 4 |
+
import re
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
class AttentionProfileSelector:
    """
    Selects appropriate attention profiles based on input characteristics
    and configuration specified in the JSON dataset.
    """

    def __init__(self, config_path: Optional[str] = None):
        """
        Initialize the selector with the provided configuration.

        Args:
            config_path: Path to the attention configuration JSON; defaults
                to ``attention_configuration.json`` next to this module.
        """
        if config_path is None:
            # Default to the standard location
            config_path = os.path.join(os.path.dirname(__file__), "attention_configuration.json")

        self.config = self._load_config(config_path)
        # Index profiles by id for O(1) lookup.
        self.profiles = {p["profile_id"]: p for p in self.config.get("attention_profiles", [])}
        self.default_profile_id = self.config.get("default_profile", "standard")
        self.selection_strategy = self.config.get("profile_selection_strategy", {})

    def _load_config(self, config_path: str) -> Dict[str, Any]:
        """Load configuration from JSON file; return {} when missing/invalid."""
        try:
            with open(config_path, 'r') as f:
                return json.load(f)
        except (FileNotFoundError, json.JSONDecodeError) as e:
            print(f"Error loading attention configuration: {e}")
            return {}

    def _strategy_params(self) -> Dict[str, Any]:
        """Return the strategy-parameter sub-dict (hoisted shared lookup)."""
        return self.selection_strategy.get("strategy_parameters", {})

    def select_profile(self,
                       input_text: str,
                       context: Optional[Dict[str, Any]] = None) -> Tuple[str, float]:
        """
        Select the most appropriate attention profile based on input characteristics.

        Args:
            input_text: The user's input text
            context: Additional context about the interaction

        Returns:
            Tuple of (profile_id, confidence)
        """
        if not self.profiles:
            return self.default_profile_id, 1.0

        # Hoist invariants out of the per-profile loop (was recomputed
        # repeatedly, and input_text.lower() per signal).
        params = self._strategy_params()
        length_weight = params.get("input_length_weight", 0.3)
        content_weight = params.get("content_type_weight", 0.5)
        text_lower = input_text.lower()
        input_length = len(input_text)

        # Single pass over profiles (previously three identical loops).
        scores: Dict[str, float] = {}
        for profile_id, profile in self.profiles.items():
            signals = profile.get("activation_signals", {})
            score = 0.0

            # Document length threshold (0 means "not configured").
            length_threshold = signals.get("document_length_threshold", 0)
            if 0 < length_threshold < input_length:
                score += length_weight

            # Keyword lists: fraction of matched signals, weighted.
            for signal_key in ("content_type_signals", "structure_indicators"):
                keywords = signals.get(signal_key, [])
                if keywords:
                    matched = sum(1 for kw in keywords if kw.lower() in text_lower)
                    score += (matched / len(keywords)) * content_weight

            scores[profile_id] = score

        # An explicit request in the context gets a strong boost.
        if context and "requested_attention" in context:
            requested = context["requested_attention"]
            if requested in scores:
                scores[requested] += params.get("explicit_request_weight", 1.0)

        best_profile_id = max(scores, key=scores.get)
        confidence = scores[best_profile_id]

        # Fall back to the default profile below the confidence floor
        # (the low confidence is still reported to the caller).
        min_confidence = params.get("minimum_confidence", 0.65)
        if confidence < min_confidence:
            return self.default_profile_id, confidence

        return best_profile_id, confidence

    def get_profile_parameters(self, profile_id: str) -> Dict[str, Any]:
        """
        Get the parameters for the specified attention profile.

        Args:
            profile_id: ID of the attention profile

        Returns:
            Dictionary of attention parameters ({} for unknown profiles)
        """
        if profile_id in self.profiles:
            return self.profiles[profile_id].get("parameters", {})
        return {}

    def get_attention_type(self, profile_id: str) -> str:
        """
        Get the attention mechanism type for the specified profile.

        Args:
            profile_id: ID of the attention profile

        Returns:
            String identifying the attention type ("standard" if unknown)
        """
        if profile_id in self.profiles:
            return self.profiles[profile_id].get("attention_type", "standard")
        return "standard"
| 128 |
+
|
| 129 |
+
|
| 130 |
+
# Factory method to create appropriate attention mechanism
def create_attention_mechanism(profile_id: str, model_dim: int, selector: AttentionProfileSelector):
    """
    Factory function to create an attention mechanism based on the selected profile.

    Args:
        profile_id: ID of the selected attention profile
        model_dim: Model hidden dimension
        selector: AttentionProfileSelector instance

    Returns:
        Configured attention mechanism
    """
    attention_type = selector.get_attention_type(profile_id)
    parameters = selector.get_profile_parameters(profile_id)

    try:
        # Import here to avoid circular imports
        from smartHybridAttention import EnhancedSmartHybridAttention, create_smart_hybrid_attention

        # (constructor kwarg, JSON key, default) mapping from profile config.
        param_map = (
            ("num_heads", "num_heads", 8),
            ("window_size", "window_size", 256),
            ("use_sliding", "use_sliding_window", True),
            ("use_global", "use_global_tokens", True),
            ("global_token_ratio", "global_token_ratio", 0.05),
            ("memory_tokens", "memory_token_count", 16),
        )
        attention_params = {"dim": model_dim}
        for kwarg, json_key, fallback in param_map:
            attention_params[kwarg] = parameters.get(json_key, fallback)

        # Hierarchical profiles flip an extra constructor flag.
        if attention_type == "hierarchical":
            attention_params["use_hierarchical"] = True

        return create_smart_hybrid_attention(**attention_params)

    except ImportError:
        print("Warning: smartHybridAttention not found. Using placeholder.")
        # Return a placeholder if the module is not available
        import torch.nn as nn
        return nn.MultiheadAttention(model_dim, 8)
| 174 |
+
|
| 175 |
+
|
| 176 |
+
# Usage example:
if __name__ == "__main__":
    # Smoke test: run profile selection on three representative input styles
    # and print which profile each one lands on.
    selector = AttentionProfileSelector()

    # Example inputs
    # Source code — intended to exercise any code-oriented signal keywords.
    code_input = "def calculate_fibonacci(n):\n    if n <= 1:\n        return n\n    else:\n        return calculate_fibonacci(n-1) + calculate_fibonacci(n-2)"

    # Markdown-style document — intended to exercise structure indicators.
    document_input = """# Chapter 1: Introduction
This technical document covers the architecture of our system.
## Section 1.1: Overview
The system consists of multiple components working together.
"""

    # Casual conversational text — should typically keep the default profile.
    conversation_input = "How did you like the movie we saw yesterday? I thought the ending was unexpected."

    # Test profile selection
    code_profile, code_conf = selector.select_profile(code_input)
    doc_profile, doc_conf = selector.select_profile(document_input)
    conv_profile, conv_conf = selector.select_profile(conversation_input)

    print(f"Code input → {code_profile} (confidence: {code_conf:.2f})")
    print(f"Document input → {doc_profile} (confidence: {doc_conf:.2f})")
    print(f"Conversation input → {conv_profile} (confidence: {conv_conf:.2f})")
utils/collator.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Custom data collators for transformer training.
|
| 3 |
+
"""
|
| 4 |
+
import torch
|
| 5 |
+
import random
|
| 6 |
+
from typing import Dict, List, Any, Union
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
|
| 9 |
+
@dataclass
|
| 10 |
+
class DataCollatorForLanguageModeling:
|
| 11 |
+
"""
|
| 12 |
+
Data collator for language modeling.
|
| 13 |
+
|
| 14 |
+
This collator will tokenize inputs and dynamically mask tokens
|
| 15 |
+
for masked language modeling tasks.
|
| 16 |
+
"""
|
| 17 |
+
tokenizer: Any
|
| 18 |
+
mlm: bool = True # Whether to use masked language modeling
|
| 19 |
+
mlm_probability: float = 0.15 # Probability of masking a token
|
| 20 |
+
|
| 21 |
+
def __call__(self, examples: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
|
| 22 |
+
"""
|
| 23 |
+
Collate a batch of examples.
|
| 24 |
+
|
| 25 |
+
Args:
|
| 26 |
+
examples: List of examples from dataset
|
| 27 |
+
|
| 28 |
+
Returns:
|
| 29 |
+
Batch dictionary for model
|
| 30 |
+
"""
|
| 31 |
+
# Extract input_ids
|
| 32 |
+
input_ids = [example["input_ids"] for example in examples]
|
| 33 |
+
|
| 34 |
+
# Concatenate inputs
|
| 35 |
+
batch = self.tokenizer.pad(
|
| 36 |
+
{"input_ids": input_ids},
|
| 37 |
+
return_tensors="pt"
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
# If masked language modeling is enabled
|
| 41 |
+
if self.mlm:
|
| 42 |
+
inputs, labels = self.mask_tokens(batch["input_ids"])
|
| 43 |
+
return {"input_ids": inputs, "labels": labels}
|
| 44 |
+
else:
|
| 45 |
+
labels = batch["input_ids"].clone()
|
| 46 |
+
return {
|
| 47 |
+
"input_ids": batch["input_ids"],
|
| 48 |
+
"labels": labels,
|
| 49 |
+
"attention_mask": batch.get("attention_mask", None)
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
def mask_tokens(
|
| 53 |
+
self, inputs: torch.Tensor
|
| 54 |
+
) -> tuple[torch.Tensor, torch.Tensor]:
|
| 55 |
+
"""
|
| 56 |
+
Prepare masked tokens inputs/labels for masked language modeling.
|
| 57 |
+
|
| 58 |
+
Args:
|
| 59 |
+
inputs: Input tensor
|
| 60 |
+
|
| 61 |
+
Returns:
|
| 62 |
+
Tuple of (masked inputs, labels)
|
| 63 |
+
"""
|
| 64 |
+
labels = inputs.clone()
|
| 65 |
+
|
| 66 |
+
# Get probability mask
|
| 67 |
+
probability_matrix = torch.full(labels.shape, self.mlm_probability)
|
| 68 |
+
|
| 69 |
+
# Create special tokens mask
|
| 70 |
+
if hasattr(self.tokenizer, "get_special_tokens_mask"):
|
| 71 |
+
special_tokens_mask = [
|
| 72 |
+
self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
|
| 73 |
+
for val in labels.tolist()
|
| 74 |
+
]
|
| 75 |
+
special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
|
| 76 |
+
else:
|
| 77 |
+
special_tokens_mask = torch.tensor(
|
| 78 |
+
[
|
| 79 |
+
[self._is_special_token(x) for x in val]
|
| 80 |
+
for val in labels.tolist()
|
| 81 |
+
],
|
| 82 |
+
dtype=torch.bool,
|
| 83 |
+
)
|
| 84 |
+
|
| 85 |
+
# Don't mask special tokens
|
| 86 |
+
probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
|
| 87 |
+
|
| 88 |
+
# Get mask indices
|
| 89 |
+
masked_indices = torch.bernoulli(probability_matrix).bool()
|
| 90 |
+
|
| 91 |
+
# Set labels for non-masked tokens to -100 (ignored in loss)
|
| 92 |
+
labels[~masked_indices] = -100
|
| 93 |
+
|
| 94 |
+
# Set 80% of masked tokens to [MASK]
|
| 95 |
+
indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
|
| 96 |
+
|
| 97 |
+
if hasattr(self.tokenizer, "mask_token_id") and self.tokenizer.mask_token_id is not None:
|
| 98 |
+
inputs[indices_replaced] = self.tokenizer.mask_token_id
|
| 99 |
+
|
| 100 |
+
# Set
|
utils/convert_checkpoints.py
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Utility to convert PyTorch (.pt) checkpoints to Hugging Face (.bin) format
|
| 3 |
+
python -m utils.convert_checkpoints --checkpoints checkpoints/stdp_model_epoch_15.pt checkpoints/stdp_model_epoch_20.pt --output hf_stdp_model
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
import torch
|
| 7 |
+
import logging
|
| 8 |
+
import argparse
|
| 9 |
+
import datetime # Added missing import
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Dict, Any, Optional
|
| 12 |
+
import json
|
| 13 |
+
import shutil
|
| 14 |
+
|
| 15 |
+
# Configure logging - Fix the typo in format string (levellevel → levelname)
|
| 16 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
def convert_stdp_checkpoint(
    checkpoint_path: str,
    output_dir: str,
    config_path: Optional[str] = None
) -> str:
    """
    Convert STDP/SNN PyTorch checkpoint to Hugging Face format.

    Args:
        checkpoint_path: Path to the .pt checkpoint file
        output_dir: Directory to save the converted model
        config_path: Optional path to a config.json whose "STDP_CONFIG"
            section is merged into the emitted model config

    Returns:
        Path to the converted model directory (== output_dir)

    Raises:
        Exception: any failure is logged and re-raised to the caller.
    """
    # Same logger object as the module-level one; local binding keeps the
    # function usable standalone.
    logger = logging.getLogger(__name__)
    logger.info(f"Converting checkpoint: {checkpoint_path}")

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    try:
        # NOTE(security): torch.load unpickles arbitrary objects on older
        # torch versions — only convert checkpoints from trusted sources.
        checkpoint = torch.load(checkpoint_path, map_location="cpu")

        # Extract epoch from a filename like "stdp_model_epoch_15.pt".
        checkpoint_filename = os.path.basename(checkpoint_path)
        epoch = None
        if "epoch_" in checkpoint_filename:
            try:
                epoch = int(checkpoint_filename.split("epoch_")[1].split(".")[0])
            except (ValueError, IndexError):
                pass  # non-numeric suffix; epoch stays unknown

        # Base config; checkpoint/file configs merged below may override keys.
        config = {
            "model_type": "stdp_snn",
            "architectures": ["STDPSpikeNeuralNetwork"],
            "epoch": epoch,
            "original_checkpoint": checkpoint_path,
            "conversion_date": str(datetime.datetime.now())
        }

        # Update with loaded config if it exists in checkpoint
        if isinstance(checkpoint, dict) and "config" in checkpoint:
            config.update(checkpoint["config"])

        # Load additional config from file if provided
        if config_path and os.path.exists(config_path):
            with open(config_path, 'r') as f:
                file_config = json.load(f)
            if "STDP_CONFIG" in file_config:
                config.update(file_config["STDP_CONFIG"])

        # Locate the weights under the common checkpoint layouts.
        # FIX: guard on isinstance first — `"key" in checkpoint` raised or
        # misbehaved when the checkpoint was a bare tensor/state object.
        if not isinstance(checkpoint, dict):
            model_weights = checkpoint
        elif "model_state_dict" in checkpoint:
            model_weights = checkpoint["model_state_dict"]
        elif "state_dict" in checkpoint:
            model_weights = checkpoint["state_dict"]
        elif "weights" in checkpoint:
            model_weights = {"weights": checkpoint["weights"]}
        elif "synaptic_weights" in checkpoint:
            model_weights = {"synaptic_weights": checkpoint["synaptic_weights"]}
        else:
            # No recognized layout: assume the dict itself is the weights.
            model_weights = checkpoint

        # FIX: renamed from "model_dir" — this is the weights *file* path.
        weights_path = os.path.join(output_dir, "pytorch_model.bin")

        # Save converted weights in HF format
        torch.save(model_weights, weights_path)
        logger.info(f"Saved model weights to {weights_path}")

        # Save config file
        config_file = os.path.join(output_dir, "config.json")
        with open(config_file, 'w') as f:
            json.dump(config, f, indent=2)
        logger.info(f"Saved model config to {config_file}")

        # Create a simple README
        readme_file = os.path.join(output_dir, "README.md")
        with open(readme_file, 'w') as f:
            f.write("# Converted STDP/SNN Model\n\n")
            f.write(f"This model was converted from PyTorch checkpoint: `{checkpoint_path}`\n\n")
            f.write(f"Converted on: {config['conversion_date']}\n")
            if epoch is not None:
                f.write(f"Training epoch: {epoch}\n")

        return output_dir

    except Exception as e:
        logger.error(f"Error converting checkpoint: {e}")
        raise
| 114 |
+
|
| 115 |
+
def prepare_for_hf_upload(
|
| 116 |
+
checkpoint_paths: list,
|
| 117 |
+
output_dir: str,
|
| 118 |
+
config_path: Optional[str] = None,
|
| 119 |
+
include_code: bool = True
|
| 120 |
+
) -> str:
|
| 121 |
+
"""
|
| 122 |
+
Prepare multiple checkpoints for HF upload with code.
|
| 123 |
+
|
| 124 |
+
Args:
|
| 125 |
+
checkpoint_paths: List of paths to checkpoint files
|
| 126 |
+
output_dir: Directory to save the prepared model
|
| 127 |
+
config_path: Optional path to config.json file
|
| 128 |
+
include_code: Whether to include inference code
|
| 129 |
+
|
| 130 |
+
Returns:
|
| 131 |
+
Path to the prepared directory
|
| 132 |
+
"""
|
| 133 |
+
# Create output directory
|
| 134 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 135 |
+
|
| 136 |
+
# Convert each checkpoint
|
| 137 |
+
converted_models = []
|
| 138 |
+
for cp_path in checkpoint_paths:
|
| 139 |
+
model_name = os.path.splitext(os.path.basename(cp_path))[0]
|
| 140 |
+
model_dir = os.path.join(output_dir, model_name)
|
| 141 |
+
converted_models.append(convert_stdp_checkpoint(cp_path, model_dir, config_path))
|
| 142 |
+
|
| 143 |
+
# Include necessary code files
|
| 144 |
+
if include_code:
|
| 145 |
+
code_files = [
|
| 146 |
+
"communicator_STDP.py",
|
| 147 |
+
"config.py",
|
| 148 |
+
"model_Custm.py"
|
| 149 |
+
]
|
| 150 |
+
|
| 151 |
+
for file in code_files:
|
| 152 |
+
src_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), file)
|
| 153 |
+
if os.path.exists(src_path):
|
| 154 |
+
dst_path = os.path.join(output_dir, file)
|
| 155 |
+
shutil.copy2(src_path, dst_path)
|
| 156 |
+
logger.info(f"Copied {file} to {dst_path}")
|
| 157 |
+
|
| 158 |
+
# Create an inference script - FIX: Use single quotes for inner docstring
|
| 159 |
+
inference_script = '''
|
| 160 |
+
import torch
|
| 161 |
+
import os
|
| 162 |
+
import json
|
| 163 |
+
import argparse
|
| 164 |
+
from pathlib import Path
|
| 165 |
+
|
| 166 |
+
def load_stdp_model(model_dir):
|
| 167 |
+
"""Load STDP model from directory."""
|
| 168 |
+
weights_path = os.path.join(model_dir, "pytorch_model.bin")
|
| 169 |
+
config_path = os.path.join(model_dir, "config.json")
|
| 170 |
+
|
| 171 |
+
# Load weights
|
| 172 |
+
weights = torch.load(weights_path, map_location="cpu")
|
| 173 |
+
|
| 174 |
+
# Load config
|
| 175 |
+
with open(config_path, 'r') as f:
|
| 176 |
+
config = json.load(f)
|
| 177 |
+
|
| 178 |
+
return weights, config
|
| 179 |
+
|
| 180 |
+
def main():
|
| 181 |
+
parser = argparse.ArgumentParser(description="Run inference with STDP model")
|
| 182 |
+
parser.add_argument("--model", type=str, required=True, help="Model directory")
|
| 183 |
+
parser.add_argument("--input", type=str, required=True, help="Input text or file")
|
| 184 |
+
args = parser.parse_args()
|
| 185 |
+
|
| 186 |
+
# Load model
|
| 187 |
+
weights, config = load_stdp_model(args.model)
|
| 188 |
+
print(f"Loaded model from {args.model}")
|
| 189 |
+
print(f"Model config: {json.dumps(config, indent=2)}")
|
| 190 |
+
|
| 191 |
+
# Get input
|
| 192 |
+
input_text = args.input
|
| 193 |
+
if os.path.exists(args.input):
|
| 194 |
+
with open(args.input, 'r') as f:
|
| 195 |
+
input_text = f.read()
|
| 196 |
+
|
| 197 |
+
print(f"Input text: {input_text[:100]}...")
|
| 198 |
+
|
| 199 |
+
# Run inference using communicator_STDP if available
|
| 200 |
+
try:
|
| 201 |
+
from communicator_STDP import CommSTDP
|
| 202 |
+
communicator = CommSTDP({}, device="cpu")
|
| 203 |
+
result = communicator.process(input_text, weights)
|
| 204 |
+
print(f"Result: {result}")
|
| 205 |
+
except ImportError:
|
| 206 |
+
print("communicator_STDP not available. Weights loaded successfully.")
|
| 207 |
+
print(f"Weights shape: {weights.shape if hasattr(weights, 'shape') else '[dict of tensors]'}")
|
| 208 |
+
|
| 209 |
+
if __name__ == "__main__":
|
| 210 |
+
main()
|
| 211 |
+
'''
|
| 212 |
+
|
| 213 |
+
inference_path = os.path.join(output_dir, "inference.py")
|
| 214 |
+
with open(inference_path, 'w') as f:
|
| 215 |
+
f.write(inference_script.strip())
|
| 216 |
+
logger.info(f"Created inference script: {inference_path}")
|
| 217 |
+
|
| 218 |
+
# Create an overall README
|
| 219 |
+
readme_file = os.path.join(output_dir, "README.md")
|
| 220 |
+
with open(readme_file, 'w') as f:
|
| 221 |
+
f.write("# STDP/SNN Trained Models\n\n")
|
| 222 |
+
f.write("This repository contains STDP/SNN models converted from PyTorch checkpoints for use with Hugging Face's infrastructure.\n\n")
|
| 223 |
+
f.write("## Models Included\n\n")
|
| 224 |
+
for i, model in enumerate(converted_models):
|
| 225 |
+
f.write(f"{i+1}. `{os.path.basename(model)}`\n")
|
| 226 |
+
|
| 227 |
+
f.write("\n## Usage\n\n")
|
| 228 |
+
f.write("```python\n")
|
| 229 |
+
f.write("from transformers import AutoModel\n\n")
|
| 230 |
+
f.write("# Load the model\n")
|
| 231 |
+
f.write("model = AutoModel.from_pretrained('your-username/your-model-name')\n")
|
| 232 |
+
f.write("```\n\n")
|
| 233 |
+
f.write("Or use the included inference.py script:\n\n")
|
| 234 |
+
f.write("```bash\npython inference.py --model ./stdp_model_epoch_15 --input 'Your input text here'\n```")
|
| 235 |
+
|
| 236 |
+
logger.info(f"Prepared {len(converted_models)} models for HF upload in {output_dir}")
|
| 237 |
+
return output_dir
|
| 238 |
+
|
| 239 |
+
if __name__ == "__main__":
    # CLI entry point: convert the given checkpoints and stage them for HF upload.
    cli_parser = argparse.ArgumentParser(description="Convert PyTorch checkpoints to Hugging Face format")
    cli_parser.add_argument("--checkpoints", nargs="+", required=True, help="Paths to checkpoint files")
    cli_parser.add_argument("--output", type=str, default="hf_model", help="Output directory")
    cli_parser.add_argument("--config", type=str, help="Path to config.json file")
    cli_parser.add_argument("--no-code", action="store_true", help="Don't include inference code")

    cli_args = cli_parser.parse_args()

    # --no-code inverts to the include_code flag expected by the helper.
    include_code = not cli_args.no_code
    prepare_for_hf_upload(
        cli_args.checkpoints,
        cli_args.output,
        cli_args.config,
        include_code,
    )
|
utils/debug_helper.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import time
|
| 4 |
+
import psutil
|
| 5 |
+
import traceback
|
| 6 |
+
import logging
|
| 7 |
+
import threading
|
| 8 |
+
from typing import Dict, Any, Optional, List
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
class DebugHelper:
    """
    Helper class for debugging hanging issues in STDP training.

    This provides tools for:
    1. Process monitoring and status reporting
    2. Timeout management
    3. Recovery mechanisms for hanging processes
    4. Detailed diagnostics
    """

    @staticmethod
    def get_process_info(pid: Optional[int] = None) -> Dict[str, Any]:
        """Get detailed information about a process.

        Args:
            pid: Process id to inspect; defaults to the current process.

        Returns:
            Dict of process metrics, or ``{'error': ...}`` if inspection failed.
        """
        pid = pid or os.getpid()

        try:
            process = psutil.Process(pid)

            # Basic process metrics straight from psutil.
            info = {
                'pid': pid,
                'name': process.name(),
                'status': process.status(),
                'cpu_percent': process.cpu_percent(),
                'memory_percent': process.memory_percent(),
                'memory_info': dict(process.memory_info()._asdict()),
                'create_time': process.create_time(),
                'runtime': time.time() - process.create_time(),
                'num_threads': process.num_threads(),
                'open_files': len(process.open_files()),
                # NOTE(review): psutil deprecated Process.connections() in favor of
                # net_connections() in newer releases — confirm installed version.
                'connections': len(process.connections()),
            }

            # Names of live Python threads in *this* interpreter (threading is
            # already imported at module level; the previous local re-import was
            # redundant).
            try:
                info['active_threads'] = [t.name for t in threading.enumerate()]
            except Exception:  # was a bare except: don't swallow KeyboardInterrupt/SystemExit
                info['active_threads'] = "Could not retrieve thread information"

            # Stack of the caller at the time of the snapshot.
            info['current_stack'] = traceback.format_stack()

            return info

        except Exception as e:
            logger.error(f"Error getting process info: {e}")
            return {'error': str(e)}

    @staticmethod
    def check_resource_leaks() -> Dict[str, Any]:
        """Check for potential resource leaks.

        Returns:
            Dict with garbage-collector counters and, when CUDA is available,
            torch GPU memory statistics.
        """
        import gc

        leaks = {
            'gc_counts': gc.get_count(),
            'gc_objects': len(gc.get_objects()),
        }

        # Check for torch memory usage if available.
        try:
            import torch
            if torch.cuda.is_available():
                leaks['torch_memory_allocated'] = torch.cuda.memory_allocated()
                leaks['torch_memory_reserved'] = torch.cuda.memory_reserved()
                leaks['torch_max_memory_allocated'] = torch.cuda.max_memory_allocated()
        except ImportError:
            pass

        return leaks

    @staticmethod
    def register_timeout(seconds: int, callback=None):
        """Register a timeout that calls the callback after the given seconds.

        Args:
            seconds: Delay before firing.
            callback: Optional callable; when omitted, diagnostics are printed
                instead.

        Returns:
            The daemon watchdog thread (already started).
        """
        def _timeout_handler():
            time.sleep(seconds)
            if callback:
                callback()
            else:
                # Default action: dump diagnostics so a hang can be triaged.
                print(f"TIMEOUT: Operation took longer than {seconds} seconds")
                info = DebugHelper.get_process_info()
                print(f"Process info: {info}")
                traceback.print_stack()

        thread = threading.Thread(target=_timeout_handler)
        thread.daemon = True  # never keep the process alive just for the watchdog
        thread.start()
        return thread

    @staticmethod
    def dump_debug_info(filename: str):
        """Dump process, resource, environment, and stack info to a file.

        Args:
            filename: Path of the report file to write (overwritten).
        """
        process_info = DebugHelper.get_process_info()
        leak_info = DebugHelper.check_resource_leaks()

        with open(filename, 'w') as f:
            f.write("===== PROCESS INFORMATION =====\n")
            for key, value in process_info.items():
                f.write(f"{key}: {value}\n")

            f.write("\n===== RESOURCE LEAK INFORMATION =====\n")
            for key, value in leak_info.items():
                f.write(f"{key}: {value}\n")

            f.write("\n===== ENVIRONMENT VARIABLES =====\n")
            for key, value in os.environ.items():
                f.write(f"{key}: {value}\n")

            f.write("\n===== STACK TRACE =====\n")
            f.write(''.join(traceback.format_stack()))

        # BUGFIX: the log line previously said "(unknown)" instead of reporting
        # the actual destination path.
        logger.info(f"Debug info dumped to {filename}")
|
utils/dual_encoder_utils.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Utilities for dual encoder configuration and initialization.
|
| 3 |
+
"""
|
| 4 |
+
import logging
|
| 5 |
+
from typing import Dict, Any, Optional, Union
|
| 6 |
+
import torch
|
| 7 |
+
import torch.nn as nn
|
| 8 |
+
|
| 9 |
+
from config import load_config, app_config
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
class DualEncoderConfig:
    """Configuration object for dual encoders"""

    def __init__(self, config_dict: Optional[Dict[str, Any]] = None):
        """
        Build the dual-encoder configuration.

        Hard-coded defaults are applied first, then app_config's
        DUAL_ENCODER_CONFIG section (when present), and finally any
        caller-supplied overrides.

        Args:
            config_dict: Optional configuration dictionary to override defaults
        """
        base_config = load_config()

        # Baseline defaults.
        self.USE_PRETRAINED_ENCODER = True
        self.USE_CUSTOM_ENCODER = True
        self.FUSION_METHOD = "concat"  # Options: concat, add, weighted_sum
        self.FUSION_WEIGHTS = [0.5, 0.5]  # Weights for pretrained and custom encoders
        self.TRAINING_MODE = "joint"  # Options: joint, alternating, pretrained_first

        # Layer app-level overrides on top of the defaults.
        if hasattr(base_config, "DUAL_ENCODER_CONFIG"):
            for name, value in base_config.DUAL_ENCODER_CONFIG.items():
                setattr(self, name, value)

        # Explicit caller overrides win last.
        for name, value in (config_dict or {}).items():
            setattr(self, name, value)
|
| 43 |
+
|
| 44 |
+
class DualEncoderFusion(nn.Module):
    """
    Module that combines outputs from pretrained and custom encoders.
    """

    def __init__(self, config: Optional[Union[Dict[str, Any], DualEncoderConfig]] = None):
        """
        Set up the fusion module.

        Args:
            config: Configuration for fusion (dict or DualEncoderConfig object)
        """
        super().__init__()

        # Normalize whatever we were given into a DualEncoderConfig.
        if config is None:
            self.config = DualEncoderConfig()
        elif isinstance(config, dict):
            self.config = DualEncoderConfig(config)
        else:
            self.config = config

        # Normalized weights are only needed for the weighted-sum strategy.
        if self.config.FUSION_METHOD == "weighted_sum":
            raw = torch.tensor(self.config.FUSION_WEIGHTS, dtype=torch.float32)
            self.register_buffer('fusion_weights', raw / raw.sum())

    def forward(self, pretrained_output: torch.Tensor, custom_output: torch.Tensor) -> torch.Tensor:
        """
        Combine encoder outputs based on the configured fusion method.

        Args:
            pretrained_output: Output from pretrained encoder
            custom_output: Output from custom encoder

        Returns:
            Combined tensor

        Raises:
            ValueError: if shapes are incompatible or the method is unknown.
        """
        # Degenerate cases: exactly one encoder enabled -> pass it through.
        if not self.config.USE_PRETRAINED_ENCODER:
            return custom_output
        if not self.config.USE_CUSTOM_ENCODER:
            return pretrained_output

        method = self.config.FUSION_METHOD
        if method == "concat":
            return torch.cat([pretrained_output, custom_output], dim=-1)
        if method == "add":
            # Element-wise addition requires identical shapes.
            if pretrained_output.shape != custom_output.shape:
                raise ValueError(f"Cannot add tensors with different shapes: {pretrained_output.shape} and {custom_output.shape}")
            return pretrained_output + custom_output
        if method == "weighted_sum":
            # Weighted blend also requires identical shapes.
            if pretrained_output.shape != custom_output.shape:
                raise ValueError(f"Cannot use weighted sum with different shapes: {pretrained_output.shape} and {custom_output.shape}")
            w1, w2 = self.fusion_weights
            return w1 * pretrained_output + w2 * custom_output
        raise ValueError(f"Unknown fusion method: {self.config.FUSION_METHOD}")
|
| 105 |
+
|
| 106 |
+
def get_dual_encoder_config() -> DualEncoderConfig:
    """
    Build a DualEncoderConfig populated from app_config defaults.

    Returns:
        DualEncoderConfig object
    """
    cfg = DualEncoderConfig()
    return cfg
|
| 114 |
+
|
| 115 |
+
# Testing function
|
| 116 |
+
def test_fusion_methods():
    """Exercise each fusion strategy and return the fused tensors."""
    cfg = DualEncoderConfig()

    # Two random encoder outputs with identical shapes.
    left = torch.randn(2, 10, 768)
    right = torch.randn(2, 10, 768)

    results = {}

    # Concat doubles the feature dimension.
    cfg.FUSION_METHOD = "concat"
    results["concat"] = DualEncoderFusion(cfg)(left, right)
    print(f"Concat output shape: {results['concat'].shape}")  # Should be [2, 10, 1536]

    # Add keeps the feature dimension.
    cfg.FUSION_METHOD = "add"
    results["add"] = DualEncoderFusion(cfg)(left, right)
    print(f"Add output shape: {results['add'].shape}")  # Should be [2, 10, 768]

    # Weighted sum with explicit 70/30 weights.
    cfg.FUSION_METHOD = "weighted_sum"
    cfg.FUSION_WEIGHTS = [0.7, 0.3]
    results["weighted_sum"] = DualEncoderFusion(cfg)(left, right)
    print(f"Weighted sum output shape: {results['weighted_sum'].shape}")  # Should be [2, 10, 768]

    return results
|
| 148 |
+
|
| 149 |
+
if __name__ == "__main__":
    # Exercise every fusion strategy when executed as a script.
    test_results = test_fusion_methods()
|
utils/emergency_abort.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import os
|
| 3 |
+
import time
|
| 4 |
+
import signal
|
| 5 |
+
import logging
|
| 6 |
+
import threading
|
| 7 |
+
|
| 8 |
+
logger = logging.getLogger(__name__)
|
| 9 |
+
|
| 10 |
+
class EmergencyAbort:
    """Creates an abort file that can be touched to trigger process termination.

    A background daemon thread polls the abort file's mtime every
    ``check_interval`` seconds; when an external actor updates the file's
    timestamp, the current process is sent SIGTERM.
    """

    def __init__(self, abort_file="emergency_abort.txt", check_interval=5):
        """
        Args:
            abort_file: Path of the sentinel file to watch (created here).
            check_interval: Seconds between mtime checks.
        """
        self.abort_file = abort_file
        self.check_interval = check_interval
        self.running = False
        self.thread = None

        # Create initial abort file with instructions.
        self._write_status()

    def _write_status(self):
        """(Re)write the abort file with usage instructions and a timestamp."""
        with open(self.abort_file, 'w') as f:
            f.write("# Emergency Abort File\n")
            f.write("# To abort training, update the timestamp of this file\n")
            f.write(f"# Last checked: {time.ctime()}\n")

    def _check_file(self):
        """Poll the abort file; terminate the process if it was touched externally."""
        last_modified = os.path.getmtime(self.abort_file)

        while self.running:
            time.sleep(self.check_interval)

            try:
                current_modified = os.path.getmtime(self.abort_file)

                if current_modified > last_modified:
                    logger.warning("Emergency abort file modified! Initiating abort sequence.")
                    # Kill this process
                    os.kill(os.getpid(), signal.SIGTERM)
                    return

                # Update check timestamp in file.
                self._write_status()

                # BUGFIX: re-read the mtime *after* rewriting the file. The
                # rewrite above bumps the mtime, so keeping the pre-write value
                # (as the old code did with `last_modified = current_modified`)
                # made the very next poll look like an external touch and
                # falsely triggered the abort.
                last_modified = os.path.getmtime(self.abort_file)

            except Exception as e:
                logger.error(f"Error checking abort file: {e}")

    def start(self):
        """Start the abort file monitor.

        Returns:
            self, so callers can chain ``EmergencyAbort(...).start()``.
        """
        self.running = True
        self.thread = threading.Thread(target=self._check_file)
        self.thread.daemon = True  # never block interpreter shutdown
        self.thread.start()
        logger.info(f"Emergency abort monitor started. Modify {self.abort_file} to terminate training.")
        return self

    def stop(self):
        """Stop the abort file monitor."""
        self.running = False
        if self.thread:
            self.thread.join(timeout=2)
        logger.info("Emergency abort monitor stopped.")
|
utils/event_bus.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Simple event bus for direct component-to-component communication.
|
| 3 |
+
Provides a lightweight alternative to the full EventSystem.
|
| 4 |
+
"""
|
| 5 |
+
import logging
|
| 6 |
+
from typing import Dict, List, Callable, Any
|
| 7 |
+
|
| 8 |
+
logger = logging.getLogger(__name__)

class EventBus:
    """Simple synchronous event bus for direct event routing."""

    def __init__(self):
        """Start with no registered subscribers."""
        self.subscribers: Dict[str, List[Callable[[str, Any], None]]] = {}
        logger.info("Initialized EventBus")

    def subscribe(self, event_type: str, callback: Callable[[str, Any], None]) -> None:
        """Register `callback` to receive events of `event_type`."""
        self.subscribers.setdefault(event_type, []).append(callback)
        logger.debug(f"Added subscriber to {event_type}")

    def unsubscribe(self, event_type: str, callback: Callable[[str, Any], None]) -> None:
        """Remove `callback` from `event_type`, if registered."""
        handlers = self.subscribers.get(event_type)
        if handlers is not None and callback in handlers:
            handlers.remove(callback)
            logger.debug(f"Removed subscriber from {event_type}")

    def publish(self, event_type: str, data: Any = None) -> None:
        """Deliver an event to every subscriber, synchronously and in order."""
        # Snapshot the handler list so callbacks that (un)subscribe during
        # delivery don't perturb this dispatch.
        listeners = tuple(self.subscribers.get(event_type, ()))

        if not listeners:
            logger.debug(f"No subscribers for event {event_type}")
            return

        logger.debug(f"Dispatching event {event_type} to {len(listeners)} subscribers")

        for handler in listeners:
            try:
                handler(event_type, data)
            except Exception as e:
                # A failing subscriber must not break the others.
                logger.error(f"Error in subscriber callback: {e}")

# Create a global instance for convenience
event_bus = EventBus()
|
utils/event_system.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Event system module for enabling parallel processing across components.
|
| 3 |
+
Implements a publisher-subscriber pattern to decouple components.
|
| 4 |
+
"""
|
| 5 |
+
import logging
|
| 6 |
+
import threading
|
| 7 |
+
import queue
|
| 8 |
+
import time
|
| 9 |
+
from typing import Dict, List, Callable, Any, Optional, Set
|
| 10 |
+
import concurrent.futures
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
class Event:
    """A single event: a type tag plus optional payload and origin metadata."""

    def __init__(self, event_type: str, data: Any = None, source: str = None):
        # Payload and provenance; the timestamp is captured at creation time.
        self.event_type, self.data, self.source = event_type, data, source
        self.timestamp = time.time()

    def __repr__(self) -> str:
        return f"Event({self.event_type}, source={self.source}, timestamp={self.timestamp})"
|
| 24 |
+
|
| 25 |
+
class EventSystem:
    """Event system for parallel processing of prompts and responses.

    Events are queued by publishers, drained by a single dispatcher thread,
    and each subscriber callback runs on a shared ThreadPoolExecutor.
    """

    def __init__(self, max_workers: int = 4):
        """Initialize the event system.

        Args:
            max_workers: Size of the thread pool used to run subscriber callbacks.
        """
        # event_type -> list of callbacks; guarded by self.lock.
        self.subscribers: Dict[str, List[Callable[[Event], None]]] = {}
        self.lock = threading.RLock()  # Reentrant lock for thread safety
        self.event_queue = queue.Queue()
        self.running = False
        self.dispatcher_thread = None
        self.max_workers = max_workers
        self.thread_pool = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers)
        # In-flight callback futures, used by wait_for_all_events.
        # NOTE(review): this plain set is mutated from pool threads via
        # add_done_callback without holding self.lock — confirm that is
        # acceptable for the intended concurrency level.
        self.futures = set()
        logger.info(f"Initialized EventSystem with {max_workers} workers")

    def subscribe(self, event_type: str, callback: Callable[[Event], None]) -> None:
        """Subscribe a callback to a specific event type ("*" acts as a wildcard)."""
        with self.lock:
            if event_type not in self.subscribers:
                self.subscribers[event_type] = []
            self.subscribers[event_type].append(callback)
            logger.debug(f"Added subscriber to {event_type}, total: {len(self.subscribers[event_type])}")

    def unsubscribe(self, event_type: str, callback: Callable[[Event], None]) -> None:
        """Unsubscribe a callback from a specific event type (no-op if absent)."""
        with self.lock:
            if event_type in self.subscribers and callback in self.subscribers[event_type]:
                self.subscribers[event_type].remove(callback)
                logger.debug(f"Removed subscriber from {event_type}, remaining: {len(self.subscribers[event_type])}")

    def publish(self, event: Event) -> None:
        """Publish an event to all subscribers (asynchronously, via the queue)."""
        self.event_queue.put(event)
        logger.debug(f"Published event: {event}")

        # Start dispatcher lazily on first publish (or after a stop).
        with self.lock:
            if not self.running:
                self.start()

    def publish_from_dict(self, event_type: str, data: Dict[str, Any], source: str = None) -> None:
        """Convenient method to publish an event from a dictionary."""
        event = Event(event_type, data, source)
        self.publish(event)

    def start(self) -> None:
        """Start the event dispatcher thread (idempotent)."""
        with self.lock:
            if not self.running:
                self.running = True
                self.dispatcher_thread = threading.Thread(target=self._dispatch_events)
                # Daemon thread: never blocks interpreter shutdown.
                self.dispatcher_thread.daemon = True
                self.dispatcher_thread.start()
                logger.info("Event dispatcher thread started")

    def stop(self) -> None:
        """Stop the event dispatcher thread and shut down the pool."""
        with self.lock:
            if self.running:
                self.running = False
                self.event_queue.put(None)  # Sentinel to stop the thread
                if self.dispatcher_thread and self.dispatcher_thread.is_alive():
                    self.dispatcher_thread.join(timeout=2.0)
                logger.info("Event dispatcher thread stopped")

                # Shut down thread pool without waiting for running callbacks.
                self.thread_pool.shutdown(wait=False)

    def _dispatch_events(self) -> None:
        """Dispatcher thread that processes events from the queue."""
        while self.running:
            try:
                # Get next event with timeout to allow checking running flag.
                event = self.event_queue.get(timeout=0.5)

                # Handle sentinel value.
                # NOTE(review): the sentinel is never marked with task_done(),
                # so a later event_queue.join() (see wait_for_all_events) can
                # block forever after stop() — confirm whether that matters.
                if event is None:
                    break

                # Process the event.
                self._process_event(event)

                # Mark task as done so event_queue.join() can make progress.
                self.event_queue.task_done()
            except queue.Empty:
                # Timeout expired with no event: loop to re-check running flag.
                continue
            except Exception as e:
                logger.error(f"Error in event dispatcher: {e}")

        logger.info("Event dispatcher thread exiting")

    def _process_event(self, event: Event) -> None:
        """Process a single event by fanning callbacks out to the thread pool."""
        with self.lock:
            # Copy subscriber lists under the lock so concurrent
            # (un)subscribes cannot mutate them mid-dispatch.
            subscribers = self.subscribers.get(event.event_type, []).copy()
            # Also check for wildcard subscribers.
            wildcard_subscribers = self.subscribers.get("*", []).copy()
            all_subscribers = subscribers + wildcard_subscribers

            if not all_subscribers:
                logger.debug(f"No subscribers for event {event.event_type}")
                return

            logger.debug(f"Dispatching event {event.event_type} to {len(all_subscribers)} subscribers")

            # Submit a task to the thread pool for each subscriber.
            for callback in all_subscribers:
                future = self.thread_pool.submit(self._safe_callback, callback, event)
                self.futures.add(future)
                # Remove the future from the tracking set once it finishes.
                future.add_done_callback(lambda f: self.futures.remove(f))

    def _safe_callback(self, callback: Callable[[Event], None], event: Event) -> None:
        """Execute a callback safely, catching exceptions so one bad
        subscriber cannot poison the pool."""
        try:
            callback(event)
        except Exception as e:
            logger.error(f"Error in subscriber callback: {e}")

    def wait_for_all_events(self, timeout: Optional[float] = None) -> bool:
        """Wait for all pending events to be processed.

        Args:
            timeout: Max seconds to wait for in-flight callbacks (None = forever).

        Returns:
            True if the queue drained and all callback futures completed.
        """
        try:
            # Blocks until task_done() has been called for every queued event.
            self.event_queue.join()

            # Also wait for all futures to complete.
            done, not_done = concurrent.futures.wait(
                self.futures,
                timeout=timeout,
                return_when=concurrent.futures.ALL_COMPLETED
            )

            return len(not_done) == 0
        except Exception as e:
            logger.error(f"Error waiting for events: {e}")
            return False
|
| 160 |
+
|
| 161 |
+
# Common event types: string channel names shared by EventSystem publishers
# and subscribers. The semantics of each channel are defined by the
# components that emit and consume it.
EVENT_USER_INPUT = "user_input"
EVENT_MODEL_REQUEST = "model_request"
EVENT_MODEL_RESPONSE = "model_response"
EVENT_STDP_REQUEST = "stdp_request"
EVENT_STDP_RESPONSE = "stdp_response"
EVENT_TOKEN_GENERATED = "token_generated"
EVENT_RESPONSE_COMPLETE = "response_complete"
EVENT_ERROR = "error"
|
utils/gpu_config_optimizer.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Utility to optimize transformer configuration for GPU memory constraints.
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
import json
|
| 6 |
+
import torch
|
| 7 |
+
import logging
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
import argparse
|
| 10 |
+
|
| 11 |
+
# Configure logging
|
| 12 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
def optimize_config_for_gpu(config_path, target_vram_mb=None, batch_reduction_factor=0.5):
    """
    Optimize config settings for the available GPU memory.

    Args:
        config_path: Path to the config.json file
        target_vram_mb: Target VRAM usage in MB (if None, will use 80% of available)
        batch_reduction_factor: How much to reduce batch size (0.5 = half)

    Returns:
        Path of the optimized config file that was written next to the input.
    """
    # Get current GPU memory capacity
    if torch.cuda.is_available():
        device = torch.cuda.current_device()
        gpu_properties = torch.cuda.get_device_properties(device)
        total_memory = gpu_properties.total_memory / (1024 * 1024)  # bytes -> MB
        gpu_name = gpu_properties.name
        logger.info(f"GPU detected: {gpu_name} with {total_memory:.0f}MB VRAM")

        # Default budget: 80% of physical VRAM leaves headroom for the CUDA context.
        if target_vram_mb is None:
            target_vram_mb = int(total_memory * 0.8)
    else:
        logger.warning("No GPU detected, using conservative settings for CPU")
        # Fix: only fall back to the conservative 2048MB default when the
        # caller did not pass an explicit budget (previously an explicit
        # target_vram_mb was silently overridden on the CPU path).
        if target_vram_mb is None:
            target_vram_mb = 2048
        gpu_name = "CPU"

    logger.info(f"Target VRAM usage: {target_vram_mb}MB")

    # Load current config
    with open(config_path, 'r') as f:
        config = json.load(f)

    # Store original values purely for the change report below
    original_batch_size = config["TRANSFORMER_CONFIG"]["BATCH_SIZE"]
    original_sequence_length = config["TRANSFORMER_CONFIG"]["MAX_SEQ_LENGTH"]
    original_num_layers = config["TRANSFORMER_CONFIG"]["NUM_LAYERS"]

    # Adjust batch size based on GPU
    if torch.cuda.is_available():
        # RTX 4050 specific optimizations (around 6GB VRAM)
        if "4050" in gpu_name or total_memory < 7000:
            # Significant reductions needed for 4050; never go below 4.
            config["TRANSFORMER_CONFIG"]["BATCH_SIZE"] = max(4, int(original_batch_size * batch_reduction_factor))

            # If still too large after batch reduction, reduce sequence length too
            if target_vram_mb < 5500:  # RTX 4050 has ~6GB VRAM
                if config["TRANSFORMER_CONFIG"]["MAX_SEQ_LENGTH"] > 256:
                    config["TRANSFORMER_CONFIG"]["MAX_SEQ_LENGTH"] = 256

                # Reduce model complexity if still needed
                if config["TRANSFORMER_CONFIG"]["NUM_LAYERS"] > 6:
                    config["TRANSFORMER_CONFIG"]["NUM_LAYERS"] = 6

    # Enable gradient checkpointing and mixed precision regardless of device:
    # both trade compute for memory and are reported unconditionally below.
    if "OPTIMIZATION" not in config:
        config["OPTIMIZATION"] = {}
    config["OPTIMIZATION"]["USE_GRADIENT_CHECKPOINTING"] = True
    config["OPTIMIZATION"]["USE_MIXED_PRECISION"] = True

    # Create optimized filename with the GPU type, e.g. config_cpu_optimized.json
    gpu_name_simple = gpu_name.replace(" ", "_").lower()
    opt_config_path = config_path.replace(".json", f"_{gpu_name_simple}_optimized.json")

    # Save optimized config
    with open(opt_config_path, 'w') as f:
        json.dump(config, f, indent=2)

    # Report changes
    logger.info(f"Optimized configuration saved to: {opt_config_path}")
    logger.info("Changes made:")
    logger.info(f"  - Batch size: {original_batch_size} → {config['TRANSFORMER_CONFIG']['BATCH_SIZE']}")
    logger.info(f"  - Sequence length: {original_sequence_length} → {config['TRANSFORMER_CONFIG']['MAX_SEQ_LENGTH']}")
    logger.info(f"  - Num layers: {original_num_layers} → {config['TRANSFORMER_CONFIG']['NUM_LAYERS']}")
    logger.info(f"  - Gradient checkpointing: Enabled")
    logger.info(f"  - Mixed precision: Enabled")

    return opt_config_path
|
| 93 |
+
|
| 94 |
+
def apply_optimized_config(opt_config_path):
    """Copy an optimized configuration over the project's main config file.

    A one-time ``.backup`` of the original main config is kept beside it;
    subsequent calls never overwrite an existing backup.
    """
    # Read the optimized settings first so a malformed file fails before writes.
    with open(opt_config_path, 'r') as f:
        optimized = json.load(f)

    # The main config lives two directory levels above the optimized file.
    project_root = os.path.dirname(os.path.dirname(opt_config_path))
    main_config_path = os.path.join(project_root, "config.json")

    # Preserve the pristine config exactly once.
    backup_path = main_config_path + '.backup'
    if not os.path.exists(backup_path):
        import shutil
        shutil.copy2(main_config_path, backup_path)
        logger.info(f"Original config backed up to: {backup_path}")

    # Overwrite the main config with the optimized settings.
    with open(main_config_path, 'w') as f:
        json.dump(optimized, f, indent=2)

    logger.info(f"Applied optimized config to {main_config_path}")
|
| 117 |
+
|
| 118 |
+
if __name__ == "__main__":
    # CLI entry point: produce an optimized copy of the transformer config
    # and optionally apply it over the main config file.
    parser = argparse.ArgumentParser(description="Optimize transformer config for GPU memory constraints")
    parser.add_argument("--config", type=str, default="config.json", help="Path to config file")
    parser.add_argument("--apply", action="store_true", help="Apply optimized config")
    parser.add_argument("--batch-factor", type=float, default=0.5, help="Batch size reduction factor")
    parser.add_argument("--target-vram", type=int, default=None, help="Target VRAM usage in MB")

    args = parser.parse_args()

    # Resolve config path: relative paths are taken against the project root
    # (the parent of the utils/ directory containing this script).
    if not os.path.isabs(args.config):
        config_dir = Path(__file__).resolve().parent.parent
        config_path = os.path.join(config_dir, args.config)
    else:
        config_path = args.config

    # Optimize config
    opt_config_path = optimize_config_for_gpu(
        config_path,
        target_vram_mb=args.target_vram,
        batch_reduction_factor=args.batch_factor
    )

    # Apply if requested
    if args.apply:
        apply_optimized_config(opt_config_path)
|
utils/model_utils.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import os
|
| 3 |
+
from typing import Optional, Tuple, Dict, Any
|
| 4 |
+
|
| 5 |
+
logger = logging.getLogger(__name__)
|
| 6 |
+
|
| 7 |
+
# List of validated model types from Hugging Face
VALIDATED_MODEL_TYPES = [
    'bert', 'roberta', 'distilbert', 'gpt2', 't5', 'albert',
    'xlm-roberta', 'bart', 'electra', 'xlnet'
]

def validate_model_name(model_name: str) -> Tuple[bool, Optional[str]]:
    """Check whether *model_name* belongs to a recognized model family.

    Args:
        model_name: Name of the model to validate

    Returns:
        ``(True, None)`` when the lowercased name contains one of
        ``VALIDATED_MODEL_TYPES``; otherwise ``(False, 'bert-base-uncased')``
        where the second element is the recommended fallback model.
    """
    lowered = model_name.lower()
    for model_type in VALIDATED_MODEL_TYPES:
        if model_type in lowered:
            return True, None
    # Unknown family: advise the default fallback model.
    return False, 'bert-base-uncased'

def get_safe_model_name(config):
    """Return a model name that is safe to hand to Hugging Face loaders.

    Args:
        config: Either a plain model-name string or a config dict holding a
            'MODEL_NAME' entry (missing entry defaults to 'bert-base-uncased').

    Returns:
        str: The original name when recognized, otherwise the fallback.
    """
    # Accept both a raw string and the project's config mapping.
    model_name = config if isinstance(config, str) else config.get('MODEL_NAME', 'bert-base-uncased')

    is_valid, fallback = validate_model_name(model_name)
    return model_name if is_valid else fallback
|
| 56 |
+
|
| 57 |
+
def create_model_config_json(model_dir: str, model_type: str = 'bert') -> None:
    """
    Creates a config.json file for a custom model with proper model_type key.

    The "model_type" key is what transformers' AutoConfig uses to dispatch,
    so a config without it cannot be loaded automatically.

    Args:
        model_dir: Directory where model is/will be stored (created if absent)
        model_type: The type of model (e.g., 'bert', 'roberta')
    """
    import json

    # exist_ok avoids the race between an exists() check and makedirs(),
    # and makes repeated calls idempotent.
    os.makedirs(model_dir, exist_ok=True)

    config_path = os.path.join(model_dir, 'config.json')

    # Create a minimal config with the required model_type key.
    # Sizes mirror the standard "base" architecture defaults.
    config = {
        "model_type": model_type,
        "architectures": [f"{model_type.capitalize()}Model"],
        "hidden_size": 768,
        "num_attention_heads": 12,
        "num_hidden_layers": 12
    }

    with open(config_path, 'w') as f:
        json.dump(config, f, indent=2)

    logger.info(f"Created model config.json with model_type: {model_type} in {model_dir}")
|
utils/nltk_stub.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Stub implementation of NLTK to avoid dependencies in container environments
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import logging
|
| 6 |
+
logger = logging.getLogger(__name__)
|
| 7 |
+
|
| 8 |
+
logger.info("Using stub NLTK implementation")
|
| 9 |
+
|
| 10 |
+
# Stub for download - do nothing
def download(*args, **kwargs):
    """No-op replacement for ``nltk.download``; accepts any arguments and
    always reports success without touching the network."""
    logger.warning("NLTK download stub called - no actual download performed")
    return True
|
| 14 |
+
|
| 15 |
+
# Add the missing SimpleTokenizer class
class SimpleTokenizer:
    """Whitespace word tokenizer used by the NLTK stub."""

    def __init__(self):
        # Announce that the stub (not real NLTK) is in use.
        logger.info("Stub SimpleTokenizer initialized")

    def tokenize(self, text):
        """Split *text* on runs of whitespace and return the token list."""
        return text.split()
|
| 25 |
+
|
| 26 |
+
# Tokenization stubs
class WordTokenizer:
    """Class-based whitespace tokenizer (mirrors nltk tokenizer objects)."""

    def tokenize(self, text):
        return text.split()

def word_tokenize(text):
    """Function-style whitespace tokenization (stub for nltk.word_tokenize)."""
    return text.split()

class SentenceTokenizer:
    """Naive sentence splitter: breaks on every '.' character."""

    def tokenize(self, text):
        return text.split('.')

def sent_tokenize(text):
    """Function-style sentence splitting (stub for nltk.sent_tokenize).

    Note: splits on every period, so abbreviations and decimals are split too.
    """
    return text.split('.')
|
| 40 |
+
|
| 41 |
+
# Stemmer stubs
class PorterStemmer:
    """Tiny heuristic stemmer standing in for nltk's PorterStemmer.

    Strips at most one common suffix: 'ing', then 'ed', then a trailing 's'
    (but never from words ending in 'ss', e.g. "pass").
    """

    # Ordered (suffix, chars-to-cut) rules; first match wins.
    _SUFFIX_RULES = (('ing', 3), ('ed', 2), ('s', 1))

    def stem(self, word):
        for suffix, cut in self._SUFFIX_RULES:
            if word.endswith(suffix):
                if suffix == 's' and word.endswith('ss'):
                    continue  # keep words like "pass" intact
                return word[:-cut]
        return word

class LancasterStemmer:
    """Stub Lancaster stemmer; applies the same rules as the Porter stub."""

    def stem(self, word):
        return PorterStemmer().stem(word)

class SimpleStemmer:
    """Standalone copy of the heuristic stemmer with an init log message."""

    def __init__(self):
        logger.info("SimpleStemmer stub initialized")

    def stem(self, word):
        # Very basic stemming: remove one common ending if present.
        if word.endswith('ing'):
            return word[:-3]
        if word.endswith('ed'):
            return word[:-2]
        if word.endswith('s') and not word.endswith('ss'):
            return word[:-1]
        return word
|
| 70 |
+
|
| 71 |
+
# Stub WordNetLemmatizer class
class WordNetLemmatizer:
    """Identity lemmatizer standing in for nltk's WordNetLemmatizer."""

    def __init__(self):
        logger.info("Stub WordNetLemmatizer initialized")

    def lemmatize(self, word, pos=None):
        # The stub performs no morphological analysis: the input word is
        # returned unchanged regardless of the part-of-speech tag.
        return word
|
| 79 |
+
|
| 80 |
+
# Namespace stubs for import compatibility
# These classes emulate the `nltk.tokenize` and `nltk.stem` submodules so
# that code doing `from nltk import tokenize, stem` (with this module
# substituted for nltk) keeps working. Attribute access via the class
# returns the aliased callables directly.
class tokenize:
    WordTokenizer = WordTokenizer
    SentenceTokenizer = SentenceTokenizer
    word_tokenize = word_tokenize
    sent_tokenize = sent_tokenize

class stem:
    PorterStemmer = PorterStemmer
    LancasterStemmer = LancasterStemmer
    SimpleStemmer = SimpleStemmer
|
| 91 |
+
|
| 92 |
+
# Stub for corpus
|
| 93 |
+
class _CorpusModule:
|
| 94 |
+
class stopwords:
|
| 95 |
+
@staticmethod
|
| 96 |
+
def words(language="english"):
|
| 97 |
+
# Return basic English stopwords
|
| 98 |
+
return {
|
| 99 |
+
"i", "me", "my", "myself", "we", "our", "ours", "ourselves",
|
| 100 |
+
"you", "your", "yours", "yourself", "yourselves", "he", "him",
|
| 101 |
+
"his", "himself", "she", "her", "hers", "herself", "it", "its",
|
| 102 |
+
"itself", "they", "them", "their", "theirs", "themselves",
|
| 103 |
+
"what", "which", "who", "whom", "this", "that", "these",
|
| 104 |
+
"those", "am", "is", "are", "was", "were", "be", "been",
|
| 105 |
+
"being", "have", "has", "had", "having", "do", "does", "did",
|
| 106 |
+
"doing", "a", "an", "the", "and", "but", "if", "or", "because",
|
| 107 |
+
"as", "until", "while", "of", "at", "by", "for", "with",
|
| 108 |
+
"about", "against", "between", "into", "through", "during",
|
| 109 |
+
"before", "after", "above", "below", "to", "from", "up", "down",
|
| 110 |
+
"in", "out", "on", "off", "over", "under", "again", "further",
|
| 111 |
+
"then", "once", "here", "there", "when", "where", "why", "how",
|
| 112 |
+
"all", "any", "both", "each", "few", "more", "most", "other",
|
| 113 |
+
"some", "such", "no", "nor", "not", "only", "own", "same", "so",
|
| 114 |
+
"than", "too", "very", "s", "t", "can", "will", "just", "don",
|
| 115 |
+
"should", "now", "d", "ll", "m", "o", "re", "ve", "y", "ain",
|
| 116 |
+
"aren", "couldn", "didn", "doesn", "hadn", "hasn", "haven",
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
corpus = _CorpusModule()
|
utils/output_formatter.py
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import re
|
| 3 |
+
from typing import Optional, Dict, Any, List
|
| 4 |
+
|
| 5 |
+
logger = logging.getLogger(__name__)
|
| 6 |
+
|
| 7 |
+
class OutputFormatter:
    """
    Formats model responses for better presentation and usability.

    A specialization name selects one of several regex-based post-processors;
    unknown (or absent) specializations fall back to a generic formatter.
    """

    def __init__(self):
        """
        Initialize the OutputFormatter.
        """
        # Maps specialization name -> bound formatting method; the "default"
        # entry is the fallback used by format_response().
        self.post_processors = {
            "programming_software_dev": self._format_code,
            "mbpp": self._format_code,
            "machine_learning_ai_data_science": self._format_technical_content,
            "mathematics": self._format_equations,
            "default": self._default_formatter
        }
        logger.info("OutputFormatter initialized")

    def format_response(self, response: str, specialization: Optional[str] = None) -> str:
        """
        Format the model response based on specialization.

        Args:
            response: The raw response from the model
            specialization: The specialization area (optional)

        Returns:
            Formatted response (empty string when the input is empty/falsy)
        """
        if not response:
            return ""

        # Apply basic formatting to all responses
        formatted_response = self._clean_whitespace(response)

        # Apply specialization-specific formatting; dict.get falls through to
        # the default processor for None or unrecognized specializations.
        processor = self.post_processors.get(specialization, self.post_processors["default"])
        formatted_response = processor(formatted_response)

        return formatted_response

    def _clean_whitespace(self, text: str) -> str:
        """
        Clean up excessive whitespace.
        """
        # Replace multiple newlines with double newlines
        text = re.sub(r'\n{3,}', '\n\n', text)
        # Replace multiple spaces with a single space
        text = re.sub(r' {2,}', ' ', text)
        return text.strip()

    def _format_code(self, text: str) -> str:
        """
        Format code blocks with proper syntax highlighting markers.
        """
        # Identify unmarked code blocks and add markdown code block syntax.
        # Look for patterns that suggest code: a line starting with a common
        # programming/SQL keyword followed by indented continuation lines.
        code_patterns = [
            r'((?:^|\n)(?:def |class |import |function |public |private |var |let |const |if |for |while ).+(?:\n[ \t]+.+)+)',
            r'((?:^|\n)(?:SELECT |INSERT |UPDATE |DELETE |CREATE |ALTER |DROP ).+(?:;)(?:\n|$))'
        ]

        for pattern in code_patterns:
            def add_code_markers(match):
                # Wrap the matched block in fenced markdown with a language tag.
                code_block = match.group(1)
                # Try to determine the language based on keywords
                lang = self._detect_language(code_block)
                return f"\n```{lang}\n{code_block}\n```\n"

            text = re.sub(pattern, add_code_markers, text)

        return text

    def _detect_language(self, code_block: str) -> str:
        """
        Attempt to detect the programming language from a code block.

        Checks are ordered; the first keyword family that matches wins.
        Returns "" when no family matches (generic fenced block).
        """
        if re.search(r'def |class |import |if __name__ ==|print\(', code_block):
            return "python"
        elif re.search(r'function |var |const |let |=> |document\.', code_block):
            return "javascript"
        elif re.search(r'public |private |class .+ {|void |String |int |boolean', code_block):
            return "java"
        elif re.search(r'#include|int main|std::|printf|scanf', code_block):
            return "c++"
        elif re.search(r'SELECT |INSERT |UPDATE |DELETE |CREATE TABLE|ALTER TABLE', code_block):
            return "sql"
        else:
            return ""  # Generic code block

    def _format_equations(self, text: str) -> str:
        """
        Format mathematical equations with LaTeX markers if needed.
        """
        # Basic patterns for unmarked equations: LaTeX commands, subscripted
        # identifiers, and set-membership fragments not already inside $...$.
        equation_patterns = [
            r'([^$])(\\frac{.+?}{.+?}|\\sum_|\\int_|\\lim_)',
            r'([^$])([a-zA-Z]_[0-9]+)',
            r'([^$])([a-zA-Z]\\in)'
        ]

        for pattern in equation_patterns:
            # Wrap the equation fragment in inline-math delimiters.
            text = re.sub(pattern, r'\1$\2$', text)

        # Ensure equation blocks use proper LaTeX display-math delimiters
        text = re.sub(r'\\begin{equation}(.+?)\\end{equation}', r'$$\1$$', text, flags=re.DOTALL)

        return text

    def _format_technical_content(self, text: str) -> str:
        """
        Format technical content with proper highlighting of terms and concepts.
        """
        # Highlight technical terms with markdown emphasis (*term*).
        technical_terms = [
            "neural network", "machine learning", "deep learning", "algorithm",
            "regression", "classification", "clustering", "backpropagation",
            "gradient descent", "optimization", "hyperparameter"
        ]

        for term in technical_terms:
            # Only highlight whole words, not substrings; the (?![*_])
            # lookahead avoids re-wrapping already-emphasized occurrences.
            text = re.sub(r'\b(' + re.escape(term) + r')\b(?![*_])', r'*\1*', text)

        return text

    def _default_formatter(self, text: str) -> str:
        """
        Default formatter that applies general improvements.
        """
        # Add paragraph breaks for readability: a sentence end followed by a
        # capitalized word starts a new paragraph.
        text = re.sub(r'(\w\.)\s+([A-Z])', r'\1\n\n\2', text)

        # Format numbered lists for readability if they're not already on
        # their own lines.
        text = re.sub(r'(?<!\n)(\d+\.)\s+', r'\n\1 ', text)

        return text

    def format_structured_output(self, data: Dict[str, Any]) -> str:
        """
        Format structured data outputs (like JSON) into readable text.

        Args:
            data: Dictionary containing structured data

        Returns:
            Formatted string representation (str(data) for non-dict input)
        """
        if not isinstance(data, dict):
            return str(data)

        formatted_parts = []

        # Format main response if present
        if "response" in data:
            formatted_parts.append(self.format_response(data["response"]))

        # Collect metadata: every key except "response" and private "_" keys.
        metadata = {}
        for key, value in data.items():
            if key != "response" and not key.startswith("_"):
                metadata[key] = value

        # Render metadata below a horizontal rule as bolded key/value lines.
        if metadata:
            formatted_parts.append("\n\n---\n")
            for key, value in metadata.items():
                formatted_parts.append(f"**{key.replace('_', ' ').title()}**: {value}")

        return "\n".join(formatted_parts)
|
utils/prepare_hf_training.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Script to prepare your model training for Hugging Face's training infrastructure.
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
import shutil
|
| 6 |
+
import argparse
|
| 7 |
+
import logging
|
| 8 |
+
import json
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
|
| 13 |
+
|
| 14 |
+
def prepare_hf_training(output_dir="hf_training"):
    """Assemble a self-contained training package for Hugging Face.

    Copies the project's core modules and data (resolved relative to the
    current working directory) into *output_dir*, then generates the HF
    launcher script, requirements.txt, and an HF-tuned config.
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Required files for HF training
    required_files = [
        "config.json",          # Configuration file
        "model_Custm.py",       # Custom model implementation
        "model_PrTr.py",        # Pretrained model implementation
        "model_Combn.py",       # Combined model implementation
        "tokenizer.py",         # Tokenizer wrapper
        "dataloader.py",        # Data loading utilities
        "utils",                # Utility functions
        "data",                 # Training data
        "run_gpu_training.py"   # Main training script
    ]

    destination_root = Path(output_dir)
    for file_or_dir in required_files:
        src_path = Path(file_or_dir)
        if not src_path.exists():
            logger.warning(f"{file_or_dir} not found, skipping...")
            continue

        dst_path = destination_root / src_path.name
        if src_path.is_dir():
            # Replace any stale copy of the directory wholesale.
            if dst_path.exists():
                shutil.rmtree(dst_path)
            shutil.copytree(src_path, dst_path)
            logger.info(f"Copied directory {file_or_dir} to {dst_path}")
        else:
            shutil.copy2(src_path, dst_path)
            logger.info(f"Copied file {file_or_dir} to {dst_path}")

    # Generate the HF-specific launcher, dependency list and config tweaks.
    create_hf_train_script(output_dir)
    create_requirements_file(output_dir)
    update_config_for_hf(output_dir)

    logger.info(f"Training package prepared in {output_dir}")
    logger.info(f"Upload this directory to Hugging Face for training")
    logger.info("See https://huggingface.co/docs/hub/spaces-manage-deploy for deployment instructions")
|
| 62 |
+
|
| 63 |
+
def create_hf_train_script(output_dir):
    """Write ``train_hf.py``, the entry point executed on Hugging Face infra."""
    train_script = """
# Hugging Face training script for Transformer model
import os
import torch
from run_gpu_training import run_training

if __name__ == "__main__":
    # HF automatically provides CUDA device if available
    # Always use mixed precision on HF
    run_training(use_mixed_precision=True)

    # Save model to the /tmp/model directory, which HF preserves
    os.makedirs("/tmp/model", exist_ok=True)
    torch.save({
        "config": "final_model_config",
        "type": "transformer_trained",
        "epochs_completed": 30
    }, "/tmp/model/model_info.json")

    print("Training completed, model saved to /tmp/model")
"""

    # Strip the leading/trailing blank lines of the template before writing.
    script_path = os.path.join(output_dir, "train_hf.py")
    with open(script_path, "w") as f:
        f.write(train_script.strip())

    logger.info("Created HF training script: train_hf.py")
|
| 91 |
+
|
| 92 |
+
def create_requirements_file(output_dir):
    """Write ``requirements.txt`` listing the training dependencies."""
    # Lower-bound pins matching the versions the project was developed on.
    requirements = [
        "torch>=2.0.0",
        "transformers>=4.30.0",
        "datasets>=2.12.0",
        "pydantic>=2.0.0",
        "sentence-transformers>=2.2.2",
        "scikit-learn>=1.2.2",
        "numpy>=1.24.0",
        "pandas>=2.0.0",
        "tqdm>=4.65.0",
        "matplotlib>=3.7.1"
    ]

    requirements_path = os.path.join(output_dir, "requirements.txt")
    with open(requirements_path, "w") as f:
        f.write("\n".join(requirements))

    logger.info("Created requirements.txt")
|
| 111 |
+
|
| 112 |
+
def update_config_for_hf(output_dir):
    """Rewrite the packaged config.json with full-size settings for HF GPUs.

    Missing config files are skipped with a warning; any failure while
    reading/writing is logged rather than raised.
    """
    config_path = os.path.join(output_dir, "config.json")

    if not os.path.exists(config_path):
        logger.warning("config.json not found, skipping configuration update")
        return

    try:
        with open(config_path, "r") as f:
            config = json.load(f)

        # HF hardware can handle the full model, so restore the big settings.
        transformer_cfg = config["TRANSFORMER_CONFIG"]
        transformer_cfg["BATCH_SIZE"] = 32
        transformer_cfg["MAX_SEQ_LENGTH"] = 512
        transformer_cfg["NUM_LAYERS"] = 12

        # Add HF-specific optimization flags.
        config.setdefault("OPTIMIZATION", {})
        config["OPTIMIZATION"]["USE_MIXED_PRECISION"] = True
        config["OPTIMIZATION"]["PLATFORM"] = "huggingface"

        with open(config_path, "w") as f:
            json.dump(config, f, indent=2)

        logger.info("Updated config.json for Hugging Face environment")
    except Exception as e:
        logger.error(f"Error updating config: {e}")
|
| 142 |
+
|
| 143 |
+
if __name__ == "__main__":
    # CLI entry point: build the Hugging Face training package.
    parser = argparse.ArgumentParser(description="Prepare training package for Hugging Face")
    parser.add_argument("--output-dir", type=str, default="hf_training",
                        help="Output directory for training package")
    args = parser.parse_args()

    prepare_hf_training(args.output_dir)
|
utils/prepare_hf_transformer_training.py
ADDED
|
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
How to Use prepare_hf_transformer_training.py Safely
|
| 3 |
+
|
| 4 |
+
Here's a secure way to prepare and upload your model to Hugging Face:
|
| 5 |
+
|
| 6 |
+
Step 1: Navigate to Your Project Directory
|
| 7 |
+
cd C:/Users/User/OneDrive/Documents/tlm
|
| 8 |
+
|
| 9 |
+
Step 2: Set Up Authentication for Hugging Face
|
| 10 |
+
huggingface-cli login
|
| 11 |
+
|
| 12 |
+
Step 3: Run the Preparation Script
|
| 13 |
+
python -m utils.prepare_hf_transformer_training --stdp_checkpoint "checkpoints/stdp_model_epoch_20.pt" --output_dir "C:/Users/User/OneDrive/Documents/tlm/Wildnerve-tlm_HF/hf_upload"
|
| 14 |
+
|
| 15 |
+
Step 4: Initialize Git and Upload to Hugging Face
|
| 16 |
+
cd hf_upload
|
| 17 |
+
git init
|
| 18 |
+
git add .
|
| 19 |
+
git commit -m "Add TLM model with STDP checkpoint"
|
| 20 |
+
git remote add origin https://huggingface.co/YOUR-USERNAME/Wildnerve-tlm01
|
| 21 |
+
git pull origin main --allow-unrelated-histories
|
| 22 |
+
git push origin main
|
| 23 |
+
"""
|
| 24 |
+
import os
|
| 25 |
+
import shutil
|
| 26 |
+
import logging
|
| 27 |
+
import argparse
|
| 28 |
+
from pathlib import Path
|
| 29 |
+
|
| 30 |
+
logger = logging.getLogger(__name__)
|
| 31 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
|
| 32 |
+
|
| 33 |
+
def prepare_training_package(
    stdp_checkpoint_path,
    output_dir="hf_transformer_training",
    include_all=False
):
    """Prepare a clean training package for Hugging Face with STDP checkpoint.

    Copies the project files needed to continue transformer training on
    Hugging Face, drops the STDP checkpoint into ``<output_dir>/checkpoints``,
    and generates the training script plus requirements.txt / README.md when
    they were not copied from the project itself.

    Args:
        stdp_checkpoint_path: Path to the STDP checkpoint file.
        output_dir: Directory where to create the package.
        include_all: Whether to include all supporting files (utils, analyzers, etc.).

    Returns:
        The output directory path (same value as ``output_dir``).
    """
    os.makedirs(output_dir, exist_ok=True)

    # Core files needed for transformer training
    essential_files = [
        # Core components
        "app.py",
        "main.py",
        "config.json",
        "config.py",
        "inference.py",

        # Model implementations
        "model_List.py",
        "model_Custm.py",
        "model_PrTr.py",
        "model_Combn.py",
        "model_manager.py",

        # Communication components
        "communicator.py",
        "communicator_STDP.py",

        # Data and training
        "tokenizer.py",
        "trainer.py",
        "dataloader.py",
        "dataset.py",
        "data",

        # STDP specific components
        "STDP_Communicator/datasets_stdp.py",
        "STDP_Communicator/train_stdp.py",

        # Utils (only essential ones)
        "utils/convert_checkpoints.py",
    ]

    # Additional support files (only included if include_all=True)
    additional_files = [
        "utils/transformer_utils.py",
        "utils/smartHybridAttention.py",
        "utils/sentence_transformer_utils.py",
        "utils/output_formatter.py",
        "emergency_monitor.py",
    ]

    # Choose which files to copy
    required_files = essential_files + (additional_files if include_all else [])

    logger.info(f"Starting package preparation in {output_dir}")
    logger.info(f"Including {'all' if include_all else 'only essential'} files")

    # Track successful and failed copies
    copied_files = []
    missing_files = []

    # Copy files
    for file_path in required_files:
        src = Path(file_path)
        if not src.exists():
            logger.warning(f"File {file_path} not found, skipping")
            missing_files.append(str(src))
            continue

        # Create destination directories.
        # NOTE(review): this assumes every entry in required_files is a path
        # relative to the current working directory; an absolute src would
        # yield an unexpected dst — confirm callers run from the project root.
        dst = Path(output_dir) / src
        os.makedirs(dst.parent, exist_ok=True)

        # Copy file or directory
        try:
            if src.is_dir():
                shutil.copytree(src, dst, dirs_exist_ok=True)
            else:
                shutil.copy2(src, dst)
            copied_files.append(str(src))
            logger.info(f"Copied {src} to {dst}")
        except Exception as e:
            logger.error(f"Error copying {src}: {e}")
            # Bug fix: a failed copy previously landed in neither list, so the
            # final summary under-reported failures; record it as missing,
            # mirroring the checkpoint-copy handling below.
            missing_files.append(str(src))

    # Copy STDP checkpoint
    if os.path.exists(stdp_checkpoint_path):
        stdp_dst = Path(output_dir) / "checkpoints" / Path(stdp_checkpoint_path).name
        os.makedirs(stdp_dst.parent, exist_ok=True)
        try:
            shutil.copy2(stdp_checkpoint_path, stdp_dst)
            logger.info(f"Copied STDP checkpoint to {stdp_dst}")
            copied_files.append(str(stdp_checkpoint_path))
        except Exception as e:
            logger.error(f"Error copying checkpoint: {e}")
            missing_files.append(str(stdp_checkpoint_path))
    else:
        logger.warning(f"STDP checkpoint not found at {stdp_checkpoint_path}")
        missing_files.append(str(stdp_checkpoint_path))

    # Create Hugging Face training script
    create_transformer_training_script(output_dir, stdp_checkpoint_path)

    # Create requirements.txt if not already copied
    if "requirements.txt" not in copied_files:
        create_requirements(output_dir)
        copied_files.append("requirements.txt (generated)")

    # Create README.md if not already copied
    if "README.md" not in copied_files:
        create_readme(output_dir, stdp_checkpoint_path)
        copied_files.append("README.md (generated)")

    # Summarize what was done
    logger.info(f"Package prepared in {output_dir}")
    logger.info(f"Copied {len(copied_files)} files: {', '.join(copied_files[:5])}...")
    if missing_files:
        logger.warning(f"Missing {len(missing_files)} files: {', '.join(missing_files)}")

    return output_dir
|
| 157 |
+
|
| 158 |
+
def create_transformer_training_script(output_dir, stdp_checkpoint_path):
    """Create a script to load STDP checkpoint and train transformer.

    Writes ``train_transformer_hf.py`` into *output_dir*. The generated file
    is self-contained: it loads project config, builds tokenizer/model/data
    loaders, optionally loads the STDP checkpoint, trains, and saves the
    final model.

    Args:
        output_dir: Directory in which to write the generated script.
        stdp_checkpoint_path: Path to the STDP checkpoint.
            NOTE(review): this argument is currently unused — the generated
            script hard-codes its own argparse default
            ("checkpoints/stdp_model_epoch_20.pt"); confirm whether the path
            should be embedded instead.
    """
    # Fix: Change the inner docstring to use single quotes to avoid conflict with the outer triple quotes
    # The template below is a plain (non-f) string, so the {…} placeholders
    # inside it are emitted literally and become f-strings in the output file.
    script = """
import os
import torch
import logging
from config import load_config, app_config
from tokenizer import TokenizerWrapper
from model_manager import ModelManager
from dataloader import prepare_data_loaders
from trainer import Trainer, EarlyStopping

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

def train_transformer(stdp_checkpoint_path):
    '''Train the transformer component after loading STDP weights.'''
    logger.info(f"Starting transformer training with STDP checkpoint: {stdp_checkpoint_path}")

    # Initialize components
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info(f"Using device: {device}")

    # Create tokenizer
    tokenizer = TokenizerWrapper()

    # Get model manager
    model_manager = ModelManager()

    # Get specialization
    specialization = app_config.TRANSFORMER_CONFIG.specialization

    # Load STDP weights
    if os.path.exists(stdp_checkpoint_path):
        try:
            stdp_checkpoint = torch.load(stdp_checkpoint_path, map_location=device)
            logger.info(f"Loaded STDP checkpoint from {stdp_checkpoint_path}")

            # Now integrate STDP weights with transformer model if needed
            # This depends on your specific architecture
        except Exception as e:
            logger.error(f"Error loading STDP checkpoint: {e}")
    else:
        logger.warning(f"STDP checkpoint not found at {stdp_checkpoint_path}")

    # Get model and move to device
    model = model_manager.get_model(specialization)
    model.to(device)

    # Get data loaders
    data_path = app_config.DATASET_PATHS.get(specialization)
    if not data_path or not os.path.exists(data_path):
        # Use a default dataset path
        data_path = next(iter(app_config.DATASET_PATHS.values()))
        logger.warning(f"Dataset for {specialization} not found, using {data_path}")

    train_loader, val_loader = prepare_data_loaders(
        data_path,
        tokenizer,
        batch_size=app_config.TRANSFORMER_CONFIG.BATCH_SIZE
    )

    # Set up checkpoint directory
    checkpoint_dir = os.path.join("checkpoints", "transformer")
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Set up early stopping
    early_stopping = EarlyStopping(
        patience=app_config.TRAINING_CONFIG.PATIENCE,
        delta=app_config.TRAINING_CONFIG.DELTA,
        verbose=True,
        path=os.path.join(checkpoint_dir, "best_model.pt")
    )

    # Create trainer
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        train_dataloader=train_loader,
        val_dataloader=val_loader,
        device=device,
        early_stopping=early_stopping,
        checkpoint_dir=checkpoint_dir,
        total_epochs=app_config.TRAINING_CONFIG.TRANSFORMER_NUM_EPOCHS
    )

    # Train the model
    logger.info("Starting transformer training...")
    trainer.train()

    # Save final model
    final_model_path = os.path.join(checkpoint_dir, "final_model.pt")
    torch.save({
        'model_state_dict': model.state_dict(),
        'config': {
            'transformer_epochs': app_config.TRAINING_CONFIG.TRANSFORMER_NUM_EPOCHS,
            'stdp_epochs': 20, # Assuming the STDP checkpoint is from epoch 20
            'specialization': specialization
        }
    }, final_model_path)
    logger.info(f"Final model saved to {final_model_path}")

    return final_model_path

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Train transformer after STDP")
    parser.add_argument("--stdp_checkpoint", type=str, default="checkpoints/stdp_model_epoch_20.pt",
                        help="Path to pre-trained STDP checkpoint")

    args = parser.parse_args()

    # Train transformer
    train_transformer(args.stdp_checkpoint)
"""
    # Write the generated script; .strip() removes the leading/trailing
    # newlines that the triple-quoted template carries.
    script_path = os.path.join(output_dir, "train_transformer_hf.py")
    with open(script_path, "w") as f:
        f.write(script.strip())
    logger.info(f"Created training script at {script_path}")
|
| 280 |
+
|
| 281 |
+
def create_requirements(output_dir):
    """Write a requirements.txt listing every runtime dependency of the package."""
    # Pinned minimum versions for the training/inference stack.
    dependencies = (
        "torch>=2.0.0",
        "transformers>=4.30.0",
        "datasets>=2.12.0",
        "pydantic>=2.0.0",
        "sentence-transformers>=2.2.2",
        "scikit-learn>=1.2.2",
        "numpy>=1.24.0",
        "pandas>=2.0.0",
        "tqdm>=4.65.0",
        "matplotlib>=3.7.1",
        "snntorch>=0.7.0",
    )

    requirements_path = os.path.join(output_dir, "requirements.txt")
    with open(requirements_path, "w") as handle:
        handle.write("\n".join(dependencies))
    logger.info("Created requirements.txt")
|
| 300 |
+
|
| 301 |
+
def create_readme(output_dir, stdp_checkpoint_path):
    """Create README with model information and usage instructions."""
    # Only the checkpoint's basename is surfaced in the README, not its full path.
    checkpoint_name = os.path.basename(stdp_checkpoint_path)
    readme_text = f"""# Wildnerve-tlm01: Transformer Language Model with STDP

This repository contains the Wildnerve-tlm01 model, a transformer-based language model enhanced with
STDP (Spike-Timing-Dependent Plasticity) for improved learning capabilities.

## Pre-trained STDP Checkpoint

The STDP component was trained for 20 epochs and saved in: `{checkpoint_name}`

## Model Architecture

Wildnerve-tlm01 combines:
- Transformer architecture for language understanding
- Spiking Neural Network (SNN) with STDP for biological learning
- Smart Hybrid Attention for efficient processing

## Usage
"""
    readme_path = os.path.join(output_dir, "README.md")
    with open(readme_path, "w") as handle:
        handle.write(readme_text)
    logger.info("Created README.md")
|
| 324 |
+
|
| 325 |
+
if __name__ == "__main__":
    # CLI entry point: assemble the Hugging Face upload package from a
    # previously trained STDP checkpoint.
    parser = argparse.ArgumentParser(description="Prepare Hugging Face training package")
    parser.add_argument("--stdp_checkpoint", type=str, default="checkpoints/stdp_model_epoch_20.pt",
                        help="Path to pre-trained STDP checkpoint")
    parser.add_argument("--output_dir", type=str, default="hf_upload",
                        help="Output directory for training package")
    parser.add_argument("--include_all", action="store_true",
                        help="Include additional supporting files")

    args = parser.parse_args()
    prepare_training_package(args.stdp_checkpoint, args.output_dir, args.include_all)
|
utils/sentence_transformer_utils.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Utilities for loading and working with sentence transformers.
|
| 3 |
+
"""
|
| 4 |
+
import logging
|
| 5 |
+
import torch
|
| 6 |
+
import os
|
| 7 |
+
from typing import Optional
|
| 8 |
+
from sentence_transformers import SentenceTransformer
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
# Constants
|
| 13 |
+
DEFAULT_SENTENCE_TRANSFORMER = "Wildnerve-tlm01-0.05Bx12" # Removed fallback to all-MiniLM-L6-v2
|
| 14 |
+
|
| 15 |
+
# Cache for loaded models to avoid reloading
|
| 16 |
+
_sentence_transformer_cache = {}
|
| 17 |
+
def get_sentence_transformer(model_name: str = DEFAULT_SENTENCE_TRANSFORMER):
    """
    Get a sentence transformer model, reusing a cached instance when possible.

    Bug fix: the module declares ``_sentence_transformer_cache`` ("Cache for
    loaded models to avoid reloading") and exposes
    ``clear_sentence_transformer_cache()``, but this function previously never
    consulted the cache, so every call reloaded the model from disk/hub.

    Args:
        model_name: Name of the model to load (default is our primary model).

    Returns:
        SentenceTransformer model.

    Raises:
        Exception: re-raises whatever SentenceTransformer raises on load failure.
    """
    # Fast path: return the previously loaded instance.
    if model_name in _sentence_transformer_cache:
        return _sentence_transformer_cache[model_name]

    # Define the expected local directory for your custom model.
    # NOTE(review): hard-coded Windows path — consider making this configurable.
    local_model_dir = os.path.join("c:/Users/User/OneDrive/Documents/tlm/Wildnerve-tlm_HF/models", model_name)
    # Use the local directory if it exists; otherwise, use the provided model_name identifier (which must be on HuggingFace)
    model_path = local_model_dir if os.path.isdir(local_model_dir) else model_name
    try:
        model = SentenceTransformer(model_path)
    except Exception as e:
        logger.error(f"Failed to load SentenceTransformer from {model_path}: {e}")
        raise
    _sentence_transformer_cache[model_name] = model
    return model
|
| 36 |
+
|
| 37 |
+
def clear_sentence_transformer_cache():
    """Release all cached sentence transformer models to free memory."""
    # dict.clear() mutates the mapping in place, so no `global` declaration
    # is required to empty the module-level cache.
    _sentence_transformer_cache.clear()
    logger.info("Cleared sentence transformer cache")
|
utils/smartHybridAttention.py
ADDED
|
@@ -0,0 +1,675 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# smartHybridAttention.py - Enhanced SmartHybridAttention that combines the best features of both implementations:
|
| 2 |
+
import os
|
| 3 |
+
import sys
|
| 4 |
+
import math
|
| 5 |
+
import json
|
| 6 |
+
import torch
|
| 7 |
+
import torch.nn as nn
|
| 8 |
+
import logging as logger
|
| 9 |
+
from typing import Optional, Tuple, List, Dict, Any, Union
|
| 10 |
+
|
| 11 |
+
# Fix imports for service_registry - make it more robust with fallbacks
|
| 12 |
+
try:
|
| 13 |
+
# Try direct import first
|
| 14 |
+
from service_registry import ServiceRegistry
|
| 15 |
+
except ImportError:
|
| 16 |
+
try:
|
| 17 |
+
# Try adding parent directories to path
|
| 18 |
+
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
| 19 |
+
sys.path.append(parent_dir)
|
| 20 |
+
from service_registry import ServiceRegistry
|
| 21 |
+
except ImportError:
|
| 22 |
+
# Create dummy registry if not found
|
| 23 |
+
class DummyRegistry:
|
| 24 |
+
def get_service(self, name): return None
|
| 25 |
+
def register_service(self, name, service): pass
|
| 26 |
+
registry = DummyRegistry()
|
| 27 |
+
|
| 28 |
+
# Use conditional import for AttentionProfileSelector
|
| 29 |
+
try:
|
| 30 |
+
# Try direct import first
|
| 31 |
+
from utils.attention_trigger_system import AttentionProfileSelector
|
| 32 |
+
except ImportError:
|
| 33 |
+
# Try setting up different paths
|
| 34 |
+
try:
|
| 35 |
+
data_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data')
|
| 36 |
+
sys.path.append(data_dir)
|
| 37 |
+
from utils.attention_trigger_system import AttentionProfileSelector
|
| 38 |
+
except ImportError:
|
| 39 |
+
# Create a minimal placeholder if not found
|
| 40 |
+
class DummyAttentionProfileSelector:
|
| 41 |
+
def __init__(self, config_path=None): pass
|
| 42 |
+
def select_profile(self, text, context=None): return "standard", 1.0
|
| 43 |
+
def get_attention_type(self, profile_id): return "standard"
|
| 44 |
+
def get_profile_parameters(self, profile_id): return {}
|
| 45 |
+
AttentionProfileSelector = DummyAttentionProfileSelector
|
| 46 |
+
|
| 47 |
+
# Merging the two functions into a single robust implementation
|
| 48 |
+
def get_hybrid_attention_config(config_path: Optional[str] = None) -> Dict[str, Any]:
    """Build the hybrid-attention configuration, layering sources by priority.

    Priority, lowest to highest: built-in defaults, the project's
    ``app_config`` (when importable), then an optional JSON file.

    Args:
        config_path: Optional path to a JSON configuration file.

    Returns:
        Dictionary with attention configuration parameters.
    """
    # Layer 1: built-in defaults.
    cfg: Dict[str, Any] = {
        "DIM": 768,
        "NUM_HEADS": 12,
        "WINDOW_SIZE": 256,
        "USE_SLIDING": True,
        "USE_GLOBAL": True,
        "USE_HIERARCHICAL": False,
        "GLOBAL_TOKEN_RATIO": 0.05,
        "MEMORY_TOKENS": 32,
        "STRIDE": 128,
    }

    # Layer 2: values from the project's app_config, when importable.
    try:
        from config import app_config

        # Safe value loading with proper type checking.
        cfg["DIM"] = safe_get_int_value(app_config, 'EMBEDDING_DIM', cfg["DIM"])
        cfg["NUM_HEADS"] = safe_get_int_value(app_config, 'NUM_HEADS', cfg["NUM_HEADS"])
        window = safe_get_int_value(app_config, 'WINDOW_SIZE', cfg["WINDOW_SIZE"])
        cfg["WINDOW_SIZE"] = window

        # These overrides mirror the app_config variant of this module.
        cfg["USE_HIERARCHICAL"] = True
        cfg["GLOBAL_TOKEN_RATIO"] = 0.2
        cfg["MEMORY_TOKENS"] = 16

        # Derive stride from the (validated) window size.
        cfg["STRIDE"] = window // 2 if isinstance(window, int) and window > 0 else 128
    except Exception as e:
        logger.warning(f"Error loading config from app_config: {e}, using defaults")

    # Layer 3: explicit JSON file overrides everything else.
    if config_path and os.path.exists(config_path):
        try:
            with open(config_path, "r") as f:
                user_config = json.load(f)
            # User keys win; normalize them to upper case to match cfg's keys.
            for key, value in user_config.items():
                cfg[key.upper()] = value
            logger.info(f"Loaded attention config from {config_path}")
        except Exception as e:
            logger.warning(f"Error loading attention config from {config_path}: {e}")

    return cfg
|
| 112 |
+
|
| 113 |
+
# Update the get_attention_config function to use our new merged function
|
| 114 |
+
def get_attention_config(config_path: Optional[str] = None) -> Dict[str, Any]:
    """Get attention configuration using the most appropriate method available.

    Thin compatibility wrapper kept for existing call sites; simply delegates
    to get_hybrid_attention_config().

    Args:
        config_path: Optional path to a JSON configuration file.

    Returns:
        Dictionary with attention configuration parameters.
    """
    return get_hybrid_attention_config(config_path)
|
| 117 |
+
|
| 118 |
+
# Add helper function for the above
|
| 119 |
+
def safe_get_int_value(config_obj, key, default=512):
    """Fetch *key* from a config object (or its TRANSFORMER_CONFIG) as an int.

    Falls back to *default* when the attribute is absent, not numeric, or any
    lookup error occurs.
    """
    try:
        # Look on the object itself first, then on its nested TRANSFORMER_CONFIG.
        if hasattr(config_obj, key):
            raw = getattr(config_obj, key)
        elif hasattr(config_obj, 'TRANSFORMER_CONFIG') and hasattr(config_obj.TRANSFORMER_CONFIG, key):
            raw = getattr(config_obj.TRANSFORMER_CONFIG, key)
        else:
            return default

        # Guard clauses: reject non-numeric values with a warning.
        if isinstance(raw, dict):
            logger.warning(f"Config value {key} is a dictionary, using default: {default}")
            return default
        if isinstance(raw, (int, float)):
            return int(raw)
        logger.warning(f"Config value {key} is not a number, using default: {default}")
        return default
    except Exception as e:
        logger.warning(f"Error getting config value {key}: {e}")
        return default
|
| 140 |
+
|
| 141 |
+
class SmartHybridAttention(nn.Module):
|
| 142 |
+
"""SmartHybridAttention that combines the best features of both implementations:
|
| 143 |
+
- Memory storage via global token selection from Wildnerve-tlm_HF
|
| 144 |
+
- Multiple attention strategies from utils version
|
| 145 |
+
- HuggingFace compatibility layer
|
| 146 |
+
- Optimized for extremely large context windows"""
|
| 147 |
+
    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        window_size: int = 256,
        use_sliding: bool = True,
        use_global: bool = True,
        use_hierarchical: bool = False,
        global_token_ratio: float = 0.05,
        memory_tokens: int = 32,
        config_path: Optional[str] = None,
        registry_key: Optional[str] = None,
        attention_config_path: Optional[str] = None
    ):
        """Initialize the hybrid attention module.

        Args:
            dim: Embedding dimension (falls back to 768 if not numeric).
            num_heads: Number of attention heads (falls back to 8).
            window_size: Sliding-window span in tokens (falls back to 256).
            use_sliding: Enable sliding-window attention.
            use_global: Enable global-token attention.
            use_hierarchical: Enable hierarchical attention.
            global_token_ratio: Fraction of tokens promoted to global tokens.
            memory_tokens: Number of learned persistent memory tokens.
            config_path: Optional JSON config file loaded into self.config.
            registry_key: Optional key under which to register self in the
                service registry.
            attention_config_path: Optional config for the content-aware
                AttentionProfileSelector.
        """
        super().__init__()
        # Ensure all parameters are the correct types; non-numeric inputs are
        # silently replaced by hard-coded defaults rather than raising.
        self.dim = int(dim) if isinstance(dim, (int, float)) else 768
        self.num_heads = int(num_heads) if isinstance(num_heads, (int, float)) else 8
        self.head_dim = self.dim // self.num_heads  # Safe integer division
        self.window_size = int(window_size) if isinstance(window_size, (int, float)) else 256
        # Standard scaled-dot-product factor 1/sqrt(head_dim).
        self.scale = self.head_dim ** -0.5

        # Feature flags - ensure boolean types
        self.use_sliding = bool(use_sliding)
        self.use_global = bool(use_global)
        self.use_hierarchical = bool(use_hierarchical)

        # Ensure float type for ratio
        self.global_token_ratio = float(global_token_ratio) if isinstance(global_token_ratio, (int, float)) else 0.05

        # Ensure int type for memory tokens
        self.memory_tokens = int(memory_tokens) if isinstance(memory_tokens, (int, float)) else 32

        # Initialize memory parameter: learned tokens of shape
        # [memory_tokens, 1, dim], expanded across the batch at forward time.
        self.persistent_memory = nn.Parameter(torch.zeros(self.memory_tokens, 1, self.dim))
        nn.init.normal_(self.persistent_memory, mean=0.0, std=0.02)

        # Projections (query/key/value and output), all dim -> dim.
        self.q_proj = nn.Linear(self.dim, self.dim)
        self.k_proj = nn.Linear(self.dim, self.dim)
        self.v_proj = nn.Linear(self.dim, self.dim)
        self.out_proj = nn.Linear(self.dim, self.dim)

        # Initialize optional components
        self.config = self._load_config(config_path) if config_path else {}
        self.registry_key = registry_key
        self.prompt_analyzer = None
        # May overwrite self.prompt_analyzer and register self in the registry.
        self._init_external_services()

        # Initialize content-aware attention selector; failures are logged
        # and degrade to profile_selector = None rather than raising.
        self.attention_config_path = attention_config_path
        if self.attention_config_path:
            try:
                self.profile_selector = AttentionProfileSelector(self.attention_config_path)
            except Exception as e:
                logger.warning(f"Could not initialize AttentionProfileSelector: {e}")
                self.profile_selector = None
        else:
            self.profile_selector = None
|
| 206 |
+
|
| 207 |
+
def _init_external_services(self):
|
| 208 |
+
"""Initialize external services like registry and analyzer if available."""
|
| 209 |
+
try:
|
| 210 |
+
# Try to import service registry
|
| 211 |
+
sys.path.extend([
|
| 212 |
+
os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "utils"),
|
| 213 |
+
os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "Wildnerve-tlm_HF", "utils")
|
| 214 |
+
])
|
| 215 |
+
|
| 216 |
+
if self.registry_key:
|
| 217 |
+
registry = ServiceRegistry.get_instance()
|
| 218 |
+
self.prompt_analyzer = registry.get_service("prompt_analyzer")
|
| 219 |
+
# Register self if registry key provided
|
| 220 |
+
registry.register_service(self.registry_key, self)
|
| 221 |
+
except (ImportError, AttributeError, Exception):
|
| 222 |
+
logger.debug("External services not available")
|
| 223 |
+
|
| 224 |
+
def _load_config(self, config_path: str) -> Dict:
|
| 225 |
+
"""Load configuration from JSON file."""
|
| 226 |
+
if not os.path.exists(config_path):
|
| 227 |
+
return {}
|
| 228 |
+
|
| 229 |
+
try:
|
| 230 |
+
with open(config_path, "r") as f:
|
| 231 |
+
return json.load(f)
|
| 232 |
+
except:
|
| 233 |
+
return {}
|
| 234 |
+
|
| 235 |
+
def _create_sliding_window_mask(
|
| 236 |
+
self,
|
| 237 |
+
seq_len: int,
|
| 238 |
+
window_size: int,
|
| 239 |
+
global_token_indices: Optional[List[int]] = None,
|
| 240 |
+
memory_size: int = 0
|
| 241 |
+
) -> torch.Tensor:
|
| 242 |
+
"""Create attention mask for sliding window with memory tokens and global tokens."""
|
| 243 |
+
total_len = seq_len + memory_size
|
| 244 |
+
mask = torch.zeros(total_len, total_len, dtype=torch.bool)
|
| 245 |
+
|
| 246 |
+
# Memory tokens attend to everything and everything attends to memory
|
| 247 |
+
if memory_size > 0:
|
| 248 |
+
mask[:memory_size, :] = True # Memory attends to all
|
| 249 |
+
mask[:, :memory_size] = True # All attend to memory
|
| 250 |
+
|
| 251 |
+
# Set sliding window attention for content tokens
|
| 252 |
+
for i in range(memory_size, total_len):
|
| 253 |
+
# Adjust window bounds
|
| 254 |
+
start = max(memory_size, i - window_size // 2)
|
| 255 |
+
end = min(total_len, i + window_size // 2 + 1)
|
| 256 |
+
mask[i, start:end] = True
|
| 257 |
+
|
| 258 |
+
# Add global token attention if provided
|
| 259 |
+
if global_token_indices is not None:
|
| 260 |
+
# Adjust indices to account for memory tokens
|
| 261 |
+
adjusted_indices = [idx + memory_size for idx in global_token_indices]
|
| 262 |
+
# Global tokens attend to all tokens
|
| 263 |
+
mask[adjusted_indices, :] = True
|
| 264 |
+
# All tokens attend to global tokens
|
| 265 |
+
mask[:, adjusted_indices] = True
|
| 266 |
+
return mask
|
| 267 |
+
|
| 268 |
+
def _select_global_tokens(
|
| 269 |
+
self,
|
| 270 |
+
key_layer: torch.Tensor,
|
| 271 |
+
ratio: float = None,
|
| 272 |
+
memory_size: int = 0
|
| 273 |
+
) -> List[int]:
|
| 274 |
+
"""Select global tokens based on importance scoring.
|
| 275 |
+
Returns: List of indices of selected global tokens"""
|
| 276 |
+
if ratio is None:
|
| 277 |
+
ratio = self.global_token_ratio
|
| 278 |
+
|
| 279 |
+
seq_len = key_layer.size(0) - memory_size
|
| 280 |
+
num_global_tokens = max(1, int(seq_len * ratio))
|
| 281 |
+
|
| 282 |
+
# Skip memory tokens when scoring importance
|
| 283 |
+
content_keys = key_layer[memory_size:]
|
| 284 |
+
|
| 285 |
+
# Score tokens by L2 norm and recency (more recent = more important)
|
| 286 |
+
base_scores = torch.norm(content_keys, dim=-1).mean(dim=-1) # [seq_len]
|
| 287 |
+
|
| 288 |
+
# Add recency bias
|
| 289 |
+
seq_positions = torch.arange(seq_len, device=base_scores.device) / seq_len
|
| 290 |
+
recency_scores = 0.3 * seq_positions # Mild recency bias
|
| 291 |
+
final_scores = base_scores + recency_scores
|
| 292 |
+
|
| 293 |
+
# Select top-k indices
|
| 294 |
+
_, indices = torch.topk(final_scores, k=min(num_global_tokens, seq_len))
|
| 295 |
+
|
| 296 |
+
return indices.tolist()
|
| 297 |
+
|
| 298 |
+
def _apply_memory_augmented_attention(
    self,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None
) -> torch.Tensor:
    """Attention with learned persistent-memory tokens prepended for
    long-range context.

    Args:
        query, key, value: Sequence-first tensors; assumed
            [seq_len, batch, dim] from the .size() unpacking below —
            TODO confirm callers.
        attention_mask: Optional additive mask. NOTE(review): the
            not-None branch indexes it as a 2-D [seq, seq] mask —
            confirm callers never pass an HF-style 4-D mask here.

    Returns:
        Output tensor after attention [seq_len, batch, dim]
        (memory positions are stripped before the output projection).
    """
    seq_len, batch_size, _ = query.size()

    # Broadcast the learned memory bank across the batch dimension.
    memory_batch = self.persistent_memory.expand(-1, batch_size, -1)

    # Prepend memory tokens so they take part in attention.
    query_with_memory = torch.cat([memory_batch, query], dim=0)
    key_with_memory = torch.cat([memory_batch, key], dim=0)
    value_with_memory = torch.cat([memory_batch, value], dim=0)

    # Project query, key, value
    q = self.q_proj(query_with_memory)
    k = self.k_proj(key_with_memory)
    v = self.v_proj(value_with_memory)

    # Promote the most salient content positions to global attention,
    # but only when the sequence exceeds the local window.
    global_token_indices = None
    if self.use_global and seq_len > self.window_size:
        global_token_indices = self._select_global_tokens(k, memory_size=self.memory_tokens)

    memory_size = self.memory_tokens
    full_seq_len = seq_len + memory_size

    if attention_mask is None:
        # Build the sliding-window connectivity mask (True = allowed),
        # then convert it to an additive float mask (-1e9 where blocked).
        window_mask = self._create_sliding_window_mask(
            seq_len, self.window_size, global_token_indices, memory_size
        )
        attention_mask = ~window_mask
        attention_mask = attention_mask.to(q.device).unsqueeze(0).unsqueeze(0)
        attention_mask = attention_mask * -1e9  # Large negative for masked positions
    else:
        # Extend the caller's mask so memory tokens are never masked.
        memory_mask = torch.zeros(full_seq_len, full_seq_len, device=attention_mask.device)
        memory_mask[memory_size:, memory_size:] = attention_mask
        # Memory attends to everything and everything attends to memory.
        memory_mask[:memory_size, :] = 0  # 0 = attend (not masked)
        memory_mask[:, :memory_size] = 0
        attention_mask = memory_mask

    # [seq, batch, dim] -> [batch, heads, seq, head_dim]
    q = q.view(full_seq_len, batch_size, self.num_heads, self.head_dim).transpose(0, 1).transpose(1, 2)
    k = k.view(full_seq_len, batch_size, self.num_heads, self.head_dim).transpose(0, 1).transpose(1, 2)
    v = v.view(full_seq_len, batch_size, self.num_heads, self.head_dim).transpose(0, 1).transpose(1, 2)

    # Scaled dot-product attention with the additive mask.
    scores = torch.matmul(q, k.transpose(-1, -2)) * self.scale
    if attention_mask is not None:
        scores = scores + attention_mask
    attention_weights = torch.softmax(scores, dim=-1)
    context = torch.matmul(attention_weights, v)

    # [batch, heads, seq, head_dim] -> [seq, batch, dim]
    context = context.transpose(1, 2).transpose(0, 1).contiguous()
    context = context.view(full_seq_len, batch_size, self.dim)

    # Drop the memory positions; callers only see content tokens.
    context = context[memory_size:]

    # Final projection
    output = self.out_proj(context)
    return output
|
| 369 |
+
|
| 370 |
+
def _apply_hierarchical_attention(
    self,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor
) -> torch.Tensor:
    """Chunked attention for very long sequences.

    Splits the sequence into chunks of at most 512 positions and runs
    full attention independently inside each chunk.

    NOTE(review): despite the "first level" wording below, no second
    (cross-chunk) level is implemented — chunks never attend to each
    other. Confirm whether that is intended.

    Args:
        query, key, value: Sequence-first tensors; assumed
            [seq_len, batch, dim] from the .size() unpacking below.

    Returns:
        [seq_len, batch, dim] tensor after per-chunk attention and
        the output projection.
    """
    seq_len, batch_size, _ = query.size()
    chunk_size = min(512, seq_len)
    num_chunks = math.ceil(seq_len / chunk_size)

    # First level: process chunks independently
    chunk_outputs = []
    for i in range(num_chunks):
        start_idx = i * chunk_size
        end_idx = min((i + 1) * chunk_size, seq_len)  # last chunk may be short

        # Extract chunk
        q_chunk = query[start_idx:end_idx]
        k_chunk = key[start_idx:end_idx]
        v_chunk = value[start_idx:end_idx]

        # Project the chunk with the shared q/k/v projections.
        q_proj = self.q_proj(q_chunk)
        k_proj = self.k_proj(k_chunk)
        v_proj = self.v_proj(v_chunk)

        # [chunk, batch, dim] -> [batch, heads, chunk, head_dim]
        chunk_len = end_idx - start_idx
        q_proj = q_proj.view(chunk_len, batch_size, self.num_heads, self.head_dim).transpose(0, 1).transpose(1, 2)
        k_proj = k_proj.view(chunk_len, batch_size, self.num_heads, self.head_dim).transpose(0, 1).transpose(1, 2)
        v_proj = v_proj.view(chunk_len, batch_size, self.num_heads, self.head_dim).transpose(0, 1).transpose(1, 2)

        # Full scaled dot-product attention within the chunk (no mask).
        scores = torch.matmul(q_proj, k_proj.transpose(-1, -2)) * self.scale
        weights = torch.softmax(scores, dim=-1)
        context = torch.matmul(weights, v_proj)

        # Back to [chunk, batch, dim]
        context = context.transpose(1, 2).transpose(0, 1).contiguous()
        context = context.view(chunk_len, batch_size, self.dim)
        chunk_outputs.append(context)

    # Reassemble the full sequence and apply the output projection.
    output = torch.cat(chunk_outputs, dim=0)
    output = self.out_proj(output)
    return output
|
| 417 |
+
|
| 418 |
+
def forward(
    self,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    input_text: Optional[str] = None,  # New parameter for content detection
    context: Optional[Dict] = None,  # New parameter for additional context
    **kwargs
) -> torch.Tensor:
    """Forward pass for the enhanced smart hybrid attention layer.

    Routes between four strategies — standard full attention (short
    sequences), memory-augmented, hierarchical, and plain sliding-window
    attention — based on sequence length and, when ``input_text`` is
    supplied, content-aware profile selection.

    Args:
        query, key, value: Sequence-first tensors; assumed
            [seq_len, batch, dim] from the .size() unpacking below.
        attention_mask: Optional additive mask (used by the short and
            memory paths).
        input_text: Raw text forwarded to the attention-connector
            service and the strategy selector.
        context: Extra context dict forwarded to the strategy selector.

    Returns:
        [seq_len, batch, dim] attention output.

    NOTE(review): the ``context`` parameter is shadowed below by local
    tensors also named ``context``; it is only read before the
    shadowing, but renaming one of them would be safer.
    """
    seq_len, batch_size, _ = query.size()

    # If input_text is provided, try to notify the (optional, external)
    # attention connector so it can drive content-aware attention.
    if input_text:
        try:
            # Try multiple import paths to find attention_connector;
            # the module may live at top level or under data/.
            connector = None
            import_paths = [
                # Try direct import
                lambda: __import__('attention_connector').get_attention_connector(),
                # Try data subdirectory
                lambda: __import__('data.attention_connector').get_attention_connector(),
                # Try relative path
                lambda: __import__('.'.join(['..', 'data', 'attention_connector']),
                                   fromlist=['get_attention_connector']).get_attention_connector()
            ]

            for import_path in import_paths:
                try:
                    connector = import_path()
                    break
                except (ImportError, AttributeError):
                    continue

            if connector:
                connector.set_input_text(input_text)
        except Exception as e:
            # Silently continue if connector not available
            logger.debug(f"Error setting input text for content-aware attention: {e}")

    # Analyze sequence characteristics to choose strategy
    strategy_weights = self._get_attention_strategy(seq_len, input_text, context)

    # For very short sequences, use standard full attention.
    if seq_len < 128:
        q = self.q_proj(query)
        k = self.k_proj(key)
        v = self.v_proj(value)

        # [seq, batch, dim] -> [batch, heads, seq, head_dim]
        q = q.view(seq_len, batch_size, self.num_heads, self.head_dim).transpose(0, 1).transpose(1, 2)
        k = k.view(seq_len, batch_size, self.num_heads, self.head_dim).transpose(0, 1).transpose(1, 2)
        v = v.view(seq_len, batch_size, self.num_heads, self.head_dim).transpose(0, 1).transpose(1, 2)

        scores = torch.matmul(q, k.transpose(-1, -2)) * self.scale
        if attention_mask is not None:
            scores = scores + attention_mask
        attn_weights = torch.softmax(scores, dim=-1)
        context = torch.matmul(attn_weights, v)

        context = context.transpose(1, 2).transpose(0, 1).contiguous()
        context = context.view(seq_len, batch_size, self.dim)
        return self.out_proj(context)

    # For longer sequences, use memory-augmented attention
    if strategy_weights["memory"] > 0:
        return self._apply_memory_augmented_attention(query, key, value, attention_mask)

    # For very long sequences where memory doesn't fit, use hierarchical
    if strategy_weights["hierarchical"] > 0:
        return self._apply_hierarchical_attention(query, key, value)

    # Fallback: sliding-window attention without memory tokens.
    q = self.q_proj(query)
    k = self.k_proj(key)
    v = self.v_proj(value)

    # Optional global tokens, then window mask -> additive float mask.
    global_token_indices = self._select_global_tokens(k, memory_size=0) if self.use_global else None
    window_mask = self._create_sliding_window_mask(seq_len, self.window_size, global_token_indices, 0)
    masked_attn = ~window_mask
    masked_attn = masked_attn.to(query.device).unsqueeze(0).unsqueeze(0) * -1e9

    # Standard multi-head attention with the window mask.
    q = q.view(seq_len, batch_size, self.num_heads, self.head_dim).transpose(0, 1).transpose(1, 2)
    k = k.view(seq_len, batch_size, self.num_heads, self.head_dim).transpose(0, 1).transpose(1, 2)
    v = v.view(seq_len, batch_size, self.num_heads, self.head_dim).transpose(0, 1).transpose(1, 2)

    scores = torch.matmul(q, k.transpose(-1, -2)) * self.scale
    scores = scores + masked_attn
    attn_weights = torch.softmax(scores, dim=-1)
    context = torch.matmul(attn_weights, v)

    context = context.transpose(1, 2).transpose(0, 1).contiguous()
    context = context.view(seq_len, batch_size, self.dim)
    return self.out_proj(context)
|
| 514 |
+
|
| 515 |
+
def _get_attention_strategy(self, seq_len: int, input_text: Optional[str] = None, context: Optional[Dict] = None) -> Dict[str, float]:
|
| 516 |
+
"""Determine which attention strategy to use based on sequence length
|
| 517 |
+
and optional prompt analysis."""
|
| 518 |
+
weights = {
|
| 519 |
+
"standard": 0.0,
|
| 520 |
+
"sliding": 0.0,
|
| 521 |
+
"memory": 0.0,
|
| 522 |
+
"hierarchical": 0.0
|
| 523 |
+
}
|
| 524 |
+
|
| 525 |
+
# Try content-aware selection if available and input_text is provided
|
| 526 |
+
if self.profile_selector and input_text:
|
| 527 |
+
try:
|
| 528 |
+
profile_id, confidence = self.profile_selector.select_profile(input_text, context)
|
| 529 |
+
|
| 530 |
+
# If we have high confidence in the profile selection, use profile-specific weights
|
| 531 |
+
if confidence > 0.65:
|
| 532 |
+
attention_type = self.profile_selector.get_attention_type(profile_id)
|
| 533 |
+
|
| 534 |
+
if attention_type == "hierarchical":
|
| 535 |
+
weights["hierarchical"] = 0.7
|
| 536 |
+
weights["memory"] = 0.3
|
| 537 |
+
return weights
|
| 538 |
+
elif attention_type == "smartHybrid":
|
| 539 |
+
weights["memory"] = 0.5
|
| 540 |
+
weights["sliding"] = 0.5
|
| 541 |
+
return weights
|
| 542 |
+
elif attention_type == "recencyBiased":
|
| 543 |
+
weights["memory"] = 0.8
|
| 544 |
+
weights["sliding"] = 0.2
|
| 545 |
+
return weights
|
| 546 |
+
# Additional attention types can be added here
|
| 547 |
+
except Exception as e:
|
| 548 |
+
print(f"Warning: Error in content-based attention selection: {e}")
|
| 549 |
+
|
| 550 |
+
# Fall back to sequence length-based selection if content detection fails or has low confidence
|
| 551 |
+
if seq_len < 128:
|
| 552 |
+
weights["standard"] = 1.0
|
| 553 |
+
elif seq_len < 2048:
|
| 554 |
+
weights["sliding"] = 0.2
|
| 555 |
+
weights["memory"] = 0.8
|
| 556 |
+
elif seq_len < 8192:
|
| 557 |
+
weights["memory"] = 1.0
|
| 558 |
+
else:
|
| 559 |
+
weights["memory"] = 0.7
|
| 560 |
+
weights["hierarchical"] = 0.3
|
| 561 |
+
|
| 562 |
+
# Adjust based on prompt analyzer if available
|
| 563 |
+
if self.prompt_analyzer:
|
| 564 |
+
try:
|
| 565 |
+
analysis = self.prompt_analyzer.get_current_analysis()
|
| 566 |
+
if analysis:
|
| 567 |
+
complexity = analysis.get("complexity", 0.5)
|
| 568 |
+
structure = analysis.get("structure_score", 0.5)
|
| 569 |
+
|
| 570 |
+
# Adjust for highly structured content
|
| 571 |
+
if structure > 0.7:
|
| 572 |
+
weights["hierarchical"] = min(0.8, weights["hierarchical"] + 0.3)
|
| 573 |
+
weights["memory"] = max(0.2, weights["memory"] - 0.3)
|
| 574 |
+
|
| 575 |
+
# Adjust for high complexity content
|
| 576 |
+
if complexity > 0.8 and seq_len > 1024:
|
| 577 |
+
weights["memory"] = min(1.0, weights["memory"] + 0.2)
|
| 578 |
+
except:
|
| 579 |
+
logger.debug("Error in prompt analysis")
|
| 580 |
+
|
| 581 |
+
return weights
|
| 582 |
+
|
| 583 |
+
def to_hf_attention(self):
    """
    Wrap this attention module in a HuggingFace-compatible interface.

    The wrapper accepts batch-first hidden states ([batch, seq, dim])
    plus an optional HF-style 0/1 attention mask, converts both to the
    formats this module expects, and converts the output back to
    batch-first.

    Returns:
        An nn.Module whose forward(hidden_states, attention_mask=None,
        **kwargs) delegates to this attention instance.
    """
    class HFCompatibleAttention(nn.Module):
        def __init__(self, smart_attention):
            super().__init__()
            self.smart_attention = smart_attention

        # Define forward() rather than overriding __call__ so that
        # nn.Module's hook machinery runs and explicit .forward(...)
        # calls work (the original __call__ override bypassed both).
        def forward(self, hidden_states, attention_mask=None, **kwargs):
            # Convert HF-format mask: [batch, 1, 1, seq] 0/1 keep-mask
            # -> 2-D additive float mask with -10000 at padded positions.
            if attention_mask is not None:
                if attention_mask.dim() == 4:  # [batch, 1, 1, seq_len]
                    attention_mask = attention_mask.squeeze(1).squeeze(1)
                attention_mask = attention_mask.to(dtype=torch.float32)
                attention_mask = (1.0 - attention_mask) * -10000.0

            # [batch, seq, dim] -> [seq, batch, dim]; self-attention
            # uses the same tensor for query, key and value.
            seq_first = hidden_states.transpose(0, 1)

            output = self.smart_attention(
                seq_first, seq_first, seq_first,
                attention_mask=attention_mask,
                **kwargs
            )

            # Convert back to [batch, seq, dim]
            return output.transpose(0, 1)
    return HFCompatibleAttention(self)
|
| 613 |
+
|
| 614 |
+
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
    """
    Create an instance from a pretrained Hugging Face model configuration.

    Reads hidden size and head count from the HF config, derives
    attention-specific defaults, and lets user kwargs override any of
    them.

    Args:
        pretrained_model_name_or_path: HF hub id or local path whose
            config supplies hidden_size, num_attention_heads and
            max_position_embeddings.
        **kwargs: Overrides forwarded to the constructor
            (e.g. window_size, memory_tokens).

    Returns:
        A new instance of this attention class.

    Raises:
        ImportError: If the transformers library is not installed.
        ValueError: For any other initialization failure.
    """
    try:
        from transformers import AutoConfig

        # Load config from HF model
        config = AutoConfig.from_pretrained(pretrained_model_name_or_path)

        # Derive attention hyperparameters from the model config.
        attention_kwargs = {
            "dim": config.hidden_size,
            "num_heads": config.num_attention_heads,
            "window_size": kwargs.get("window_size", 512),
            # Memory scales with context length, capped at 32 tokens.
            "memory_tokens": kwargs.get("memory_tokens", min(32, config.max_position_embeddings // 64)),
        }

        # User-provided kwargs take precedence over derived values.
        attention_kwargs.update(kwargs)

        # Create instance
        return cls(**attention_kwargs)
    except ImportError:
        raise ImportError("transformers library required to load from pretrained model")
    except Exception as e:
        raise ValueError(f"Failed to initialize from pretrained model: {e}")
|
| 642 |
+
|
| 643 |
+
def create_smart_hybrid_attention(
    dim: int = 768,
    num_heads: int = 12,
    max_sequence_length: int = 8192,
    for_huggingface: bool = True,
    **kwargs
) -> Union[SmartHybridAttention, nn.Module]:
    """Factory for a SmartHybridAttention sized to the expected context.

    Args:
        dim: Hidden dimension size.
        num_heads: Number of attention heads.
        max_sequence_length: Maximum expected sequence length; drives the
            derived memory-token count and window size.
        for_huggingface: When True, return the HF-compatible wrapper.
        **kwargs: Extra constructor arguments forwarded verbatim.

    Returns:
        An attention module usable inside a transformer.
    """
    # Memory bank grows with context length, clamped to [16, 64] tokens.
    memory_tokens = min(max(16, max_sequence_length // 256), 64)

    # Local window scales with context length, clamped to [128, 512].
    window_size = min(512, max(128, max_sequence_length // 32))

    attention = SmartHybridAttention(
        dim=dim,
        num_heads=num_heads,
        window_size=window_size,
        memory_tokens=memory_tokens,
        **kwargs
    )
    return attention.to_hf_attention() if for_huggingface else attention
|
utils/tokenizer_utils.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Utility functions for tokenizer-related operations.
|
| 3 |
+
"""
|
| 4 |
+
import torch
|
| 5 |
+
import logging
|
| 6 |
+
from typing import Dict, List, Any, Union, Optional
|
| 7 |
+
from transformers import AutoTokenizer
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
|
| 11 |
+
def get_special_tokens_mask(tokenizer, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
    """
    Retrieve special tokens mask.

    Args:
        tokenizer: Tokenizer to use
        token_ids_0: First token IDs
        token_ids_1: Second token IDs (for pairs)
        already_has_special_tokens: Whether token_ids already contain special tokens

    Returns:
        List of 1s and 0s, where 1 indicates a special token
    """
    # The original three branches all made the identical delegated call
    # with the arguments passed through unchanged, so collapse them.
    return tokenizer.get_special_tokens_mask(
        token_ids_0,
        token_ids_1=token_ids_1,
        already_has_special_tokens=already_has_special_tokens
    )
|
| 43 |
+
|
| 44 |
+
def add_tokens_to_tokenizer(tokenizer, new_tokens):
    """
    Extend the tokenizer vocabulary with additional tokens.

    Args:
        tokenizer: Tokenizer whose vocabulary should be extended.
        new_tokens: List of tokens to register.

    Returns:
        Whatever the tokenizer's add_tokens reports — for HF tokenizers,
        the number of tokens actually added.
    """
    added_count = tokenizer.add_tokens(new_tokens)
    return added_count
|
| 56 |
+
|
| 57 |
+
def format_batch_for_model(
    batch: Dict[str, torch.Tensor],
    device: torch.device = None
) -> Dict[str, torch.Tensor]:
    """
    Move every tensor in a batch dict onto the target device.

    Non-tensor values are passed through untouched.

    Args:
        batch: Mapping of field name to tensor (or arbitrary value).
        device: Target device; defaults to CUDA when available, else CPU.

    Returns:
        New dict with all tensors on the chosen device.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    return {
        name: value.to(device) if isinstance(value, torch.Tensor) else value
        for name, value in batch.items()
    }
|
| 81 |
+
|
| 82 |
+
def batch_encode_plus(
    tokenizer,
    texts: List[str],
    batch_size: int = 32,
    max_length: int = 512,
    return_tensors: str = "pt",
    **kwargs
) -> List[Dict[str, torch.Tensor]]:
    """
    Tokenize a large list of texts in fixed-size chunks.

    Every chunk is encoded with max-length padding and truncation so
    all outputs share the same shape.

    Args:
        tokenizer: Tokenizer to use.
        texts: Texts to encode.
        batch_size: Number of texts per chunk.
        max_length: Maximum sequence length.
        return_tensors: Return format ('pt' for PyTorch).
        **kwargs: Additional encoding parameters forwarded verbatim.

    Returns:
        One encoded output per chunk, in order.
    """
    encoded_batches = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        encoded_batches.append(
            tokenizer(
                chunk,
                max_length=max_length,
                padding="max_length",
                truncation=True,
                return_tensors=return_tensors,
                **kwargs
            )
        )
    return encoded_batches
|
| 119 |
+
|
| 120 |
+
def get_tokenizer_info(tokenizer) -> Dict[str, Any]:
    """
    Summarize a tokenizer: vocabulary size, source name and the special
    tokens it defines.

    Args:
        tokenizer: Tokenizer to inspect.

    Returns:
        Dict with keys "vocab_size", "model_name" and "special_tokens"
        (the latter only includes tokens that are actually set).
    """
    summary: Dict[str, Any] = {
        "vocab_size": len(tokenizer),
        "model_name": getattr(tokenizer, "name_or_path", None),
        "special_tokens": {},
    }

    # Collect only the special-token attributes that are defined.
    for attr in ("pad_token", "unk_token", "sep_token",
                 "cls_token", "mask_token", "bos_token", "eos_token"):
        token = getattr(tokenizer, attr, None)
        if token is not None:
            summary["special_tokens"][attr] = token

    return summary
|
utils/transformer_utils.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Unified utilities for handling transformers, tokenizers, and embeddings.
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
import logging
|
| 6 |
+
import torch
|
| 7 |
+
from typing import Dict, Any, Optional, Union, List
|
| 8 |
+
from sentence_transformers import SentenceTransformer
|
| 9 |
+
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerBase, AutoModel
|
| 10 |
+
|
| 11 |
+
# Import the new sentence transformer utilities
|
| 12 |
+
from utils.sentence_transformer_utils import get_sentence_transformer as load_sentence_transformer
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
# Constants
|
| 17 |
+
DEFAULT_SENTENCE_TRANSFORMER = "sentence-transformers/Wildnerve-tlm01-0.05Bx12"
|
| 18 |
+
DEFAULT_TOKENIZER = "bert-base-uncased"
|
| 19 |
+
FALLBACK_TOKENIZERS = ["bert-base-uncased", "gpt2", "roberta-base"]
|
| 20 |
+
|
| 21 |
+
# Cache for loaded models to avoid reloading
|
| 22 |
+
_model_cache = {}
|
| 23 |
+
_tokenizer_cache = {}
|
| 24 |
+
_sentence_transformer_cache = {}
|
| 25 |
+
|
| 26 |
+
def get_sentence_transformer(model_name):
    """Load a SentenceTransformer, falling back to the default model.

    Args:
        model_name: Model identifier (hub id or local path) to load.

    Returns:
        Loaded SentenceTransformer instance.

    Raises:
        Re-raises the loading error when the requested model *is* the
        default (so a broken default cannot retry forever).
    """
    from sentence_transformers import SentenceTransformer
    try:
        return SentenceTransformer(model_name)
    except Exception as e:
        # Use the module logger, not the root logger, for consistency
        # with the rest of this module.
        logger.error(f"Failed to load sentence transformer {model_name}: {e}")
        if model_name == DEFAULT_SENTENCE_TRANSFORMER:
            raise  # Already the fallback target; nothing else to try.
        # Previous fallback id lacked the org prefix
        # ("Wildnerve-tlm01-0.05Bx12"); use the canonical default.
        logger.warning(f"Falling back to default model: {DEFAULT_SENTENCE_TRANSFORMER}")
        return SentenceTransformer(DEFAULT_SENTENCE_TRANSFORMER)
|
| 35 |
+
|
| 36 |
+
def get_tokenizer(model_name: str = "bert-base-uncased"):
    """Get a tokenizer with proper error handling.

    Tries to load a HuggingFace AutoTokenizer; on any failure returns a
    deterministic DummyTokenizer stub so downstream code keeps running.

    Args:
        model_name: HF hub id or local path of the tokenizer.

    Returns:
        A real tokenizer, or a DummyTokenizer fallback that produces
        pseudo-random (but deterministic per input text) token ids.
    """
    try:
        from transformers import AutoTokenizer
        logger.info(f"Loading tokenizer: {model_name}")
        return AutoTokenizer.from_pretrained(model_name)
    except Exception as e:
        logger.error(f"Failed to load tokenizer {model_name}: {e}")
        # Return a minimal dummy tokenizer that won't break everything
        logger.warning("Using dummy tokenizer as fallback")

        class DummyTokenizer:
            """Stand-in tokenizer producing deterministic fake encodings."""

            def __init__(self):
                # Constants mirror BERT's conventions so downstream
                # embedding tables sized for BERT keep working.
                self.vocab_size = 30522  # BERT vocab size
                self.pad_token_id = 0
                self.eos_token_id = 102
                self.bos_token_id = 101

            def __call__(self, text, **kwargs):
                """Convert text to a dict with dummy tensors.

                Ids are pseudo-random but deterministic: the RNG is
                seeded from an MD5 hash of each input text.
                """
                import torch

                # Handle batch vs single input
                is_batch = isinstance(text, list)
                texts = text if is_batch else [text]

                input_ids = []
                attention_mask = []

                for t in texts:
                    # Seed the RNG from the text so the same input
                    # always yields the same fake ids.
                    import hashlib
                    hash_obj = hashlib.md5(t.encode())
                    seed = int(hash_obj.hexdigest(), 16) % 10000

                    import random
                    random.seed(seed)

                    # Sequence length is word count, capped at max_length.
                    max_length = kwargs.get("max_length", 128)
                    length = min(len(t.split()), max_length)

                    # BOS + (length-2) random content ids + EOS.
                    ids = [self.bos_token_id] + [random.randint(1000, 30000) for _ in range(length-2)] + [self.eos_token_id]
                    mask = [1] * len(ids)

                    # NOTE(review): any truthy/falsy "padding" kwarg
                    # triggers padding — the value itself is not checked.
                    if "padding" in kwargs:
                        pad_length = max_length - len(ids)
                        if pad_length > 0:
                            ids.extend([self.pad_token_id] * pad_length)
                            mask.extend([0] * pad_length)

                    input_ids.append(torch.tensor(ids))
                    attention_mask.append(torch.tensor(mask))

                # Stack into batch tensors when PyTorch output requested.
                if "return_tensors" in kwargs and kwargs["return_tensors"] == "pt":
                    if is_batch or len(texts) > 1:
                        return {
                            "input_ids": torch.stack(input_ids),
                            "attention_mask": torch.stack(attention_mask)
                        }
                    else:
                        # Single text: add a leading batch dimension.
                        return {
                            "input_ids": input_ids[0].unsqueeze(0),
                            "attention_mask": attention_mask[0].unsqueeze(0)
                        }
                else:
                    # Without return_tensors: raw tensor (single input)
                    # or list of tensors (batch input).
                    return {
                        "input_ids": input_ids[0] if not is_batch and len(texts) == 1 else input_ids,
                        "attention_mask": attention_mask[0] if not is_batch and len(texts) == 1 else attention_mask
                    }

            def decode(self, token_ids, skip_special_tokens=True, **kwargs):
                """Convert token IDs back to text (placeholder output only)."""
                if isinstance(token_ids, (list, tuple)) and len(token_ids) > 0:
                    return f"Decoded text from {len(token_ids)} tokens"
                return "Decoded text"

        return DummyTokenizer()
|
| 118 |
+
|
| 119 |
+
def get_hybrid_attention_config():
    """Proxy to the smart-hybrid-attention configuration helper.

    Delegates to utils.smartHybridAttention.get_hybrid_attention_config;
    the import is done lazily inside the function body.
    """
    from utils.smartHybridAttention import get_hybrid_attention_config as _impl
    return _impl()
|
| 123 |
+
|
| 124 |
+
def load_transformer_model(model_name: str, device: Optional[torch.device] = None) -> AutoModel:
    """
    Load a HuggingFace transformer model, optionally moving it to a device.

    Args:
        model_name: HF hub id or local path of the model.
        device: Optional device to place the model on.

    Returns:
        The loaded (and possibly device-moved) model.

    Raises:
        Re-raises any failure after logging it at error level.
    """
    try:
        logger.info(f"Loading transformer model: {model_name}")
        loaded = AutoModel.from_pretrained(model_name)
        if device:
            loaded = loaded.to(device)
        logger.info(f"Successfully loaded model: {model_name}")
        return loaded
    except Exception as e:
        logger.error(f"Error loading model {model_name}: {e}")
        raise
|
| 147 |
+
|
| 148 |
+
def clear_cache():
    """Empty all module-level model/tokenizer caches to free memory."""
    global _model_cache, _tokenizer_cache, _sentence_transformer_cache
    for cache in (_model_cache, _tokenizer_cache, _sentence_transformer_cache):
        cache.clear()
    logger.info("Cleared transformer model and tokenizer caches")
|
| 155 |
+
|
| 156 |
+
def get_embedding(text: str, model: Optional[SentenceTransformer] = None) -> torch.Tensor:
    """Encode a text string into a dense embedding tensor.

    Args:
        text: Input text to embed.
        model: Sentence-transformer to use; when None, the default model
            is loaded via get_sentence_transformer.

    Returns:
        Embedding as a torch tensor.
    """
    encoder = model if model is not None else get_sentence_transformer(DEFAULT_SENTENCE_TRANSFORMER)
    return encoder.encode(text, convert_to_tensor=True)
|