Phase 4: Add multi-task learning, P-Tuning, SI/LwF continual learning, automated tests, deployment templates

Browse files

Files changed (5) hide show

continual_learning.py +589 -0
multi_task.py +427 -0
p_tuning.py +295 -0
test_tutorial_examples.py +249 -0
utils/__init__.py +41 -75

continual_learning.py ADDED Viewed

	@@ -0,0 +1,589 @@

+"""
+Continual Learning Utilities for Nexuss Transformer Framework
+Mechanisms to avoid catastrophic forgetting during continuous training
+"""
+import torch
+from torch import nn
+from torch.utils.data import Dataset, DataLoader
+from dataclasses import dataclass, field
+from typing import Optional, List, Dict, Any, Tuple
+from collections import OrderedDict
+import copy
+@dataclass
+class EWCConfig:
+    """Configuration for Elastic Weight Consolidation"""
+    ewc_lambda: float = 1000.0  # Strength of EWC regularization
+    fisher_samples: int = 200  # Number of samples to estimate Fisher information
+    damping: float = 0.1  # Damping factor for Fisher matrix
+    mc_samples: int = 1  # Monte Carlo samples for Fisher estimation
+@dataclass
+class ReplayConfig:
+    """Configuration for Experience Replay"""
+    replay_size: int = 1000  # Size of replay buffer
+    replay_ratio: float = 0.5  # Ratio of replay data in each batch
+    selection_strategy: str = "uniform"  # uniform, recent, diverse
+    reservoir_sampling: bool = True  # Use reservoir sampling for streaming data
+@dataclass
+class GEMConfig:
+    """Configuration for Gradient Episodic Memory"""
+    memory_size: int = 100  # Number of examples per task
+    num_tasks: int = 5  # Expected number of tasks
+    use_quadprog: bool = True  # Use quadratic programming for constraint solving
+@dataclass
+class ContinualLearningConfig:
+    """Unified configuration for continual learning strategies"""
+    strategy: str = "none"  # none, ewc, replay, gem, lwf
+    ewc: Optional[EWCConfig] = field(default_factory=EWCConfig)
+    replay: Optional[ReplayConfig] = field(default_factory=ReplayConfig)
+    gem: Optional[GEMConfig] = field(default_factory=GEMConfig)
+    # LwF (Learning without Forgetting) settings
+    lwf_alpha: float = 1.0  # Distillation loss weight
+    lwf_temperature: float = 2.0  # Temperature for knowledge distillation
+    # Regularization
+    weight_decay: float = 0.01
+    grad_clip: float = 1.0
+class EWCRegularizer:
+    """Elastic Weight Consolidation implementation"""
+    def __init__(self, model: nn.Module, config: EWCConfig):
+        self.model = model
+        self.config = config
+        self.fisher: Dict[str, torch.Tensor] = {}
+        self.optimal_params: Dict[str, torch.Tensor] = {}
+    def compute_fisher(self, dataloader: DataLoader, device: torch.device):
+        """Compute Fisher Information Matrix diagonal approximation"""
+        self.model.train()
+        fisher_dict = {name: torch.zeros_like(param)
+                      for name, param in self.model.named_parameters()
+                      if param.requires_grad}
+        samples_processed = 0
+        for batch in dataloader:
+            if samples_processed >= self.config.fisher_samples:
+                break
+            self.model.zero_grad()
+            # Forward pass
+            inputs = batch["input_ids"].to(device) if isinstance(batch, dict) else batch.to(device)
+            outputs = self.model(inputs)
+            # Compute log-likelihood gradient
+            log_probs = torch.log_softmax(outputs.logits, dim=-1)
+            loss = log_probs.mean()
+            # Compute gradients
+            grads = torch.autograd.grad(loss, [p for p in self.model.parameters() if p.requires_grad],
+                                       retain_graph=False)
+            # Accumulate squared gradients (Fisher diagonal)
+            for (name, _), grad in zip(self.model.named_parameters(), grads):
+                if name in fisher_dict:
+                    fisher_dict[name] += grad.pow(2)
+            samples_processed += inputs.size(0)
+        # Average and store
+        n_samples = max(samples_processed, 1)
+        self.fisher = {name: tensor / n_samples + self.config.damping
+                      for name, tensor in fisher_dict.items()}
+        # Store optimal parameters
+        self.optimal_params = {name: param.clone().detach()
+                              for name, param in self.model.named_parameters()
+                              if param.requires_grad}
+    def compute_ewc_loss(self) -> torch.Tensor:
+        """Compute EWC regularization loss"""
+        if not self.fisher or not self.optimal_params:
+            return torch.tensor(0.0)
+        ewc_loss = torch.tensor(0.0)
+        for name, param in self.model.named_parameters():
+            if param.requires_grad and name in self.fisher:
+                delta = param - self.optimal_params[name]
+                ewc_loss += (self.fisher[name] * delta.pow(2)).sum()
+        return self.config.ewc_lambda * ewc_loss
+class ReplayBuffer:
+    """Experience Replay Buffer for continual learning"""
+    def __init__(self, config: ReplayConfig):
+        self.config = config
+        self.buffer: List[Dict[str, Any]] = []
+        self.task_data: Dict[int, List[Dict[str, Any]]] = {}
+    def add(self, samples: List[Dict[str, Any]], task_id: Optional[int] = None):
+        """Add samples to replay buffer"""
+        if self.config.reservoir_sampling and len(self.buffer) + len(samples) > self.config.replay_size:
+            # Reservoir sampling for streaming data
+            for sample in samples:
+                if len(self.buffer) < self.config.replay_size:
+                    self.buffer.append(sample)
+                else:
+                    # Randomly replace with decreasing probability
+                    j = torch.randint(0, len(self.buffer) + 1, (1,)).item()
+                    if j < self.config.replay_size:
+                        self.buffer[j] = sample
+        else:
+            self.buffer.extend(samples)
+            # Trim if exceeds size
+            if len(self.buffer) > self.config.replay_size:
+                if self.config.selection_strategy == "recent":
+                    self.buffer = self.buffer[-self.config.replay_size:]
+                elif self.config.selection_strategy == "diverse":
+                    # Simple diversity: keep every nth item
+                    step = len(self.buffer) // self.config.replay_size
+                    self.buffer = self.buffer[::step][:self.config.replay_size]
+                else:  # uniform
+                    indices = torch.randperm(len(self.buffer))[:self.config.replay_size]
+                    self.buffer = [self.buffer[i] for i in indices]
+        # Store by task if task_id provided
+        if task_id is not None:
+            if task_id not in self.task_data:
+                self.task_data[task_id] = []
+            self.task_data[task_id].extend(samples)
+    def get_batch(self, current_batch: Dict[str, Any]) -> Dict[str, Any]:
+        """Mix replay data with current batch"""
+        if not self.buffer:
+            return current_batch
+        replay_size = int(current_batch["input_ids"].size(0) * self.config.replay_ratio)
+        replay_size = min(replay_size, len(self.buffer))
+        if replay_size == 0:
+            return current_batch
+        # Sample from buffer
+        indices = torch.randperm(len(self.buffer))[:replay_size]
+        replay_samples = [self.buffer[i] for i in indices]
+        # Combine with current batch (simplified - in practice need proper merging)
+        # This is a placeholder - actual implementation depends on your data format
+        return current_batch  # TODO: Implement proper batch merging
+    def get_task_buffer(self, task_id: int) -> List[Dict[str, Any]]:
+        """Get replay buffer for specific task"""
+        return self.task_data.get(task_id, [])
+class GEMOptimizer:
+    """Gradient Episodic Memory optimizer"""
+    def __init__(self, model: nn.Module, config: GEMConfig):
+        self.model = model
+        self.config = config
+        self.memory: Dict[int, List[Dict[str, Any]]] = {i: [] for i in range(config.num_tasks)}
+        self.gradient_memory: Dict[int, torch.Tensor] = {}
+    def store_in_memory(self, samples: List[Dict[str, Any]], task_id: int):
+        """Store samples in task-specific memory"""
+        available_space = self.config.memory_size - len(self.memory[task_id])
+        if available_space >= len(samples):
+            self.memory[task_id].extend(samples)
+        else:
+            # Random subsample
+            indices = torch.randperm(len(samples))[:available_space]
+            self.memory[task_id].extend([samples[i] for i in indices])
+    def compute_gradient_constraints(self, task_id: int, device: torch.device) -> List[torch.Tensor]:
+        """Compute stored gradients for previous tasks"""
+        constraints = []
+        for prev_task_id in range(task_id):
+            if prev_task_id not in self.gradient_memory:
+                continue
+            constraints.append(self.gradient_memory[prev_task_id])
+        return constraints
+    def project_gradient(self, gradient: torch.Tensor, constraints: List[torch.Tensor]) -> torch.Tensor:
+        """Project gradient to satisfy memory constraints using quadratic programming"""
+        if not constraints:
+            return gradient
+        projected = gradient.clone()
+        for constraint in constraints:
+            # Check if gradient violates constraint
+            dot_product = torch.dot(projected.flatten(), constraint.flatten())
+            if dot_product < 0:
+                # Project gradient
+                norm_sq = constraint.pow(2).sum()
+                if norm_sq > 1e-8:
+                    projection_coef = dot_product / norm_sq
+                    projected -= projection_coef * constraint
+        return projected
+    def update_gradient_memory(self, task_id: int, dataloader: DataLoader, device: torch.device):
+        """Update stored gradients for current task"""
+        self.model.eval()
+        # Compute average gradient over memory samples
+        total_gradient = None
+        count = 0
+        for batch in dataloader:
+            self.model.zero_grad()
+            inputs = batch["input_ids"].to(device) if isinstance(batch, dict) else batch.to(device)
+            outputs = self.model(inputs)
+            loss = outputs.loss if hasattr(outputs, 'loss') else outputs.logits.mean()
+            grads = torch.autograd.grad(loss, [p for p in self.model.parameters() if p.requires_grad])
+            # Flatten and concatenate all gradients
+            flat_grad = torch.cat([g.flatten() for g in grads])
+            if total_gradient is None:
+                total_gradient = flat_grad
+            else:
+                total_gradient += flat_grad
+            count += 1
+        if count > 0 and total_gradient is not None:
+            self.gradient_memory[task_id] = total_gradient / count
+class LwFLoss(nn.Module):
+    """Learning without Forgetting loss using knowledge distillation"""
+    def __init__(self, config: ContinualLearningConfig):
+        super().__init__()
+        self.config = config
+        self.kl_div = nn.KLDivLoss(reduction='batchmean')
+    def forward(self, student_logits: torch.Tensor, teacher_logits: torch.Tensor) -> torch.Tensor:
+        """Compute LwF distillation loss"""
+        T = self.config.lwf_temperature
+        # Apply temperature scaling
+        student_log_probs = torch.log_softmax(student_logits / T, dim=-1)
+        teacher_probs = torch.softmax(teacher_logits / T, dim=-1)
+        # Knowledge distillation loss
+        kd_loss = self.kl_div(student_log_probs, teacher_probs) * (T ** 2)
+        return self.config.lwf_alpha * kd_loss
+class SIRegularizer:
+    """Synaptic Intelligence implementation for continual learning."""
+    def __init__(self, model: nn.Module, c: float = 0.1):
+        self.model = model
+        self.c = c  # Importance weight
+        self.importance: Dict[str, torch.Tensor] = {}
+        self.prev_params: Dict[str, torch.Tensor] = {}
+        self.trajectory: Dict[str, torch.Tensor] = {}
+    def initialize_trajectory(self):
+        """Initialize trajectory tracking for parameters."""
+        self.prev_params = {
+            name: param.clone().detach()
+            for name, param in self.model.named_parameters()
+            if param.requires_grad
+        }
+        self.trajectory = {
+            name: torch.zeros_like(param)
+            for name, param in self.model.named_parameters()
+            if param.requires_grad
+        }
+        self.importance = {
+            name: torch.zeros_like(param)
+            for name, param in self.model.named_parameters()
+            if param.requires_grad
+        }
+    def update_trajectory(self):
+        """Update parameter change trajectory after each training step."""
+        with torch.no_grad():
+            for name, param in self.model.named_parameters():
+                if param.requires_grad and name in self.prev_params:
+                    delta = param - self.prev_params[name]
+                    self.trajectory[name] += delta.pow(2)
+                    self.prev_params[name] = param.clone().detach()
+    def compute_importance(self, loss_change: float):
+        """
+        Compute parameter importance based on loss change.
+        Args:
+            loss_change: Change in loss from previous iteration
+        """
+        if loss_change > 0:  # Only update if loss decreased
+            for name in self.importance:
+                if name in self.trajectory:
+                    denom = self.trajectory[name] + 1e-8
+                    self.importance[name] += loss_change / denom
+    def compute_si_loss(self) -> torch.Tensor:
+        """Compute Synaptic Intelligence regularization loss."""
+        si_loss = torch.tensor(0.0)
+        for name, param in self.model.named_parameters():
+            if param.requires_grad and name in self.importance:
+                delta = param - self.prev_params.get(name, param)
+                si_loss += (self.importance[name] * delta.pow(2)).sum()
+        return self.c * si_loss
+class LwFRegularizer(nn.Module):
+    """Learning without Forgetting using knowledge distillation."""
+    def __init__(self, alpha: float = 0.5, temperature: float = 2.0):
+        super().__init__()
+        self.alpha = alpha  # Distillation loss weight
+        self.temperature = temperature
+        self.kl_div = nn.KLDivLoss(reduction='batchmean')
+        self.old_outputs: Dict[str, torch.Tensor] = {}
+    def store_old_outputs(self, task_name: str, outputs: torch.Tensor):
+        """Store outputs from old model for distillation."""
+        self.old_outputs[task_name] = outputs.detach()
+    def clear_old_outputs(self):
+        """Clear stored old outputs."""
+        self.old_outputs.clear()
+    def forward(
+        self,
+        student_logits: torch.Tensor,
+        teacher_logits: torch.Tensor,
+        task_name: Optional[str] = None
+    ) -> torch.Tensor:
+        """
+        Compute LwF distillation loss.
+        Args:
+            student_logits: Current model logits
+            teacher_logits: Old model logits (stored or provided)
+            task_name: Optional task name for stored outputs
+        Returns:
+            Knowledge distillation loss
+        """
+        T = self.temperature
+        # Apply temperature scaling
+        student_log_probs = torch.log_softmax(student_logits / T, dim=-1)
+        teacher_probs = torch.softmax(teacher_logits / T, dim=-1)
+        # Knowledge distillation loss
+        kd_loss = self.kl_div(student_log_probs, teacher_probs) * (T ** 2)
+        return self.alpha * kd_loss
+def create_continual_learning_wrapper(trainer, config: ContinualLearningConfig):
+    """
+    Wrap existing trainer with continual learning capabilities.
+    Returns modified trainer with CL methods integrated.
+    """
+    if config.strategy == "ewc":
+        trainer.ewc_regularizer = EWCRegularizer(trainer.model, config.ewc)
+        # Hook into training loop to add EWC loss
+        original_compute_loss = trainer.compute_loss
+        def compute_loss_with_ewc(model, inputs, return_outputs=False):
+            loss = original_compute_loss(model, inputs, return_outputs)
+            ewc_loss = trainer.ewc_regularizer.compute_ewc_loss()
+            if return_outputs:
+                return loss + ewc_loss, outputs
+            return loss + ewc_loss
+        trainer.compute_loss = compute_loss_with_ewc
+    elif config.strategy == "replay":
+        trainer.replay_buffer = ReplayBuffer(config.replay)
+        # Modify data loading to include replay
+        # Implementation depends on trainer's data loading mechanism
+    elif config.strategy == "gem":
+        trainer.gem_optimizer = GEMOptimizer(trainer.model, config.gem)
+        # Hook into optimization step to project gradients
+        # Implementation depends on trainer's optimization loop
+    elif config.strategy == "lwf":
+        trainer.lwf_loss = LwFLoss(config)
+        # Store teacher model outputs for distillation
+        # Implementation depends on training setup
+    elif config.strategy == "si":
+        trainer.si_regularizer = SIRegularizer(trainer.model, c=config.weight_decay)
+        # Initialize trajectory tracking
+        trainer.si_regularizer.initialize_trajectory()
+        # Hook into training loop
+        original_compute_loss = trainer.compute_loss
+        def compute_loss_with_si(model, inputs, return_outputs=False):
+            loss = original_compute_loss(model, inputs, return_outputs)
+            si_loss = trainer.si_regularizer.compute_si_loss()
+            if return_outputs:
+                return loss + si_loss, outputs
+            return loss + si_loss
+        trainer.compute_loss = compute_loss_with_si
+        # Hook into optimizer step to update trajectory
+        original_step = trainer.optimizer.step if hasattr(trainer, 'optimizer') else None
+        if original_step:
+            def step_with_trajectory():
+                original_step()
+                trainer.si_regularizer.update_trajectory()
+            trainer.optimizer.step = step_with_trajectory
+    return trainer
+class ContinualLearningWrapper:
+    """
+    High-level wrapper for applying continual learning methods.
+    Provides a unified API for EWC, SI, and LwF regularization.
+    Args:
+        model: Model to wrap
+        method: Continual learning method (ewc, si, lwf)
+    """
+    def __init__(self, model: nn.Module, method: str = "ewc"):
+        self.model = model
+        self.method = method
+        self.ewc = None
+        self.si = None
+        self.lwf = None
+        if method == "ewc":
+            self.ewc = EWCRegularizer(model, EWCConfig())
+        elif method == "si":
+            self.si = SIRegularizer(model)
+            self.si.initialize_trajectory()
+        elif method == "lwf":
+            self.lwf = LwFRegularizer()
+    def apply_ewc_regularization(self, lambda_ewc: float = 0.5):
+        """Apply Elastic Weight Consolidation regularization."""
+        if self.ewc is None:
+            self.ewc = EWCRegularizer(self.model, EWCConfig(ewc_lambda=lambda_ewc))
+        else:
+            self.ewc.config.ewc_lambda = lambda_ewc
+        return self
+    def apply_si_regularization(self, c: float = 0.1):
+        """Apply Synaptic Intelligence regularization."""
+        if self.si is None:
+            self.si = SIRegularizer(self.model, c=c)
+            self.si.initialize_trajectory()
+        else:
+            self.si.c = c
+        return self
+    def apply_lwf_regularization(self, alpha: float = 0.5):
+        """Apply Learning without Forgetting regularization."""
+        if self.lwf is None:
+            self.lwf = LwFRegularizer(alpha=alpha)
+        else:
+            self.lwf.alpha = alpha
+        return self
+    def compute_fisher(self, dataloader: DataLoader, device: torch.device):
+        """Compute Fisher information matrix for EWC."""
+        if self.ewc:
+            self.ewc.compute_fisher(dataloader, device)
+    def get_regularization_loss(self) -> torch.Tensor:
+        """Get current regularization loss."""
+        if self.ewc:
+            return self.ewc.compute_ewc_loss()
+        elif self.si:
+            return self.si.compute_si_loss()
+        return torch.tensor(0.0)
+    def progressive_unfreeze(
+        self,
+        start_layers: int = 4,
+        unfreeze_every_n_epochs: int = 2,
+        max_layers: Optional[int] = None
+    ):
+        """
+        Progressive unfreezing strategy for continual learning.
+        Args:
+            start_layers: Number of layers to keep unfrozen initially
+            unfreeze_every_n_epochs: Epochs between unfreezing
+            max_layers: Maximum layers to unfreeze (None = all)
+        """
+        self.start_layers = start_layers
+        self.unfreeze_every_n_epochs = unfreeze_every_n_epochs
+        self.max_layers = max_layers
+        self.current_epoch = 0
+        # Initially freeze all but top layers
+        self._unfreeze_layers(start_layers)
+    def _unfreeze_layers(self, num_layers: int):
+        """Unfreeze top N layers of the model."""
+        layers = list(self.model.modules())
+        # Unfreeze from the end (top layers)
+        for layer in layers[-num_layers:]:
+            for param in layer.parameters():
+                param.requires_grad = True
+    def step_epoch(self):
+        """Call at end of each epoch for progressive unfreezing."""
+        if hasattr(self, 'unfreeze_every_n_epochs'):
+            self.current_epoch += 1
+            if self.current_epoch % self.unfreeze_every_n_epochs == 0:
+                current_unfrozen = self.start_layers + (self.current_epoch // self.unfreeze_every_n_epochs) * 2
+                if self.max_layers is None or current_unfrozen <= self.max_layers:
+                    self._unfreeze_layers(current_unfrozen)

multi_task.py ADDED Viewed

	@@ -0,0 +1,427 @@

+"""
+Multi-Task Learning Implementation for NTF
+Supports task-specific heads for different fine-tuning objectives
+"""
+import torch
+import torch.nn as nn
+from typing import Dict, List, Optional, Any, Union
+from dataclasses import dataclass, field
+from enum import Enum
+class TaskType(str, Enum):
+    """Supported task types for multi-task learning."""
+    CLASSIFICATION = "classification"
+    SEQUENCE_TO_SEQUENCE = "sequence_to_sequence"
+    TOKEN_CLASSIFICATION = "token_classification"
+    QUESTION_ANSWERING = "question_answering"
+    GENERATION = "generation"
+@dataclass
+class TaskHeadConfig:
+    """Configuration for a task-specific head."""
+    task_name: str
+    head_type: TaskType
+    config: Dict[str, Any] = field(default_factory=dict)
+    def __post_init__(self):
+        if isinstance(self.head_type, str):
+            self.head_type = TaskType(self.head_type)
+class ClassificationHead(nn.Module):
+    """Classification head for sequence classification tasks."""
+    def __init__(
+        self,
+        hidden_size: int,
+        num_labels: int,
+        dropout: float = 0.1,
+        **kwargs
+    ):
+        super().__init__()
+        self.dropout = nn.Dropout(dropout)
+        self.classifier = nn.Linear(hidden_size, num_labels)
+        self.num_labels = num_labels
+    def forward(self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        # Use pooled output (last hidden state of [CLS] or mean pooling)
+        if attention_mask is not None:
+            # Mean pooling with mask
+            mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
+            sum_embeddings = (hidden_states * mask_expanded).sum(1)
+            sum_mask = mask_expanded.sum(1).clamp(min=1e-9)
+            pooled_output = sum_embeddings / sum_mask
+        else:
+            pooled_output = hidden_states[:, -1, :]  # Last token
+        pooled_output = self.dropout(pooled_output)
+        return self.classifier(pooled_output)
+class SequenceToSequenceHead(nn.Module):
+    """Sequence-to-sequence head for generation tasks."""
+    def __init__(
+        self,
+        hidden_size: int,
+        vocab_size: int,
+        max_length: int = 512,
+        **kwargs
+    ):
+        super().__init__()
+        self.output_projection = nn.Linear(hidden_size, vocab_size)
+        self.max_length = max_length
+        self.vocab_size = vocab_size
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return self.output_projection(hidden_states)
+class TokenClassificationHead(nn.Module):
+    """Token-level classification head (NER, POS tagging, etc.)."""
+    def __init__(
+        self,
+        hidden_size: int,
+        num_labels: int,
+        dropout: float = 0.1,
+        **kwargs
+    ):
+        super().__init__()
+        self.dropout = nn.Dropout(dropout)
+        self.classifier = nn.Linear(hidden_size, num_labels)
+        self.num_labels = num_labels
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dropout(hidden_states)
+        return self.classifier(hidden_states)
+class QuestionAnsweringHead(nn.Module):
+    """Head for extractive question answering."""
+    def __init__(
+        self,
+        hidden_size: int,
+        dropout: float = 0.1,
+        **kwargs
+    ):
+        super().__init__()
+        self.qa_outputs = nn.Linear(hidden_size, 2)  # start and end logits
+    def forward(self, hidden_states: torch.Tensor) -> tuple:
+        logits = self.qa_outputs(hidden_states)
+        start_logits, end_logits = logits.split(1, dim=-1)
+        return start_logits.squeeze(-1), end_logits.squeeze(-1)
+class TaskHead(nn.Module):
+    """Wrapper for task-specific heads."""
+    HEAD_CLASSES = {
+        TaskType.CLASSIFICATION: ClassificationHead,
+        TaskType.SEQUENCE_TO_SEQUENCE: SequenceToSequenceHead,
+        TaskType.TOKEN_CLASSIFICATION: TokenClassificationHead,
+        TaskType.QUESTION_ANSWERING: QuestionAnsweringHead,
+    }
+    def __init__(self, config: TaskHeadConfig, hidden_size: int, vocab_size: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.task_name = config.task_name
+        self.head_type = config.head_type
+        head_config = dict(config.config)
+        head_config["hidden_size"] = hidden_size
+        if vocab_size is not None:
+            head_config["vocab_size"] = vocab_size
+        head_class = self.HEAD_CLASSES.get(head_type)
+        if head_class is None:
+            raise ValueError(f"Unsupported task type: {head_type}")
+        self.head = head_class(**head_config)
+    def forward(self, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor:
+        return self.head(hidden_states, **kwargs)
+class MultiTaskModel(nn.Module):
+    """
+    Multi-task model with task-specific heads sharing a common base.
+    Args:
+        base_model: Base transformer model
+        base_model_name: Name or path of base model
+    """
+    def __init__(self, base_model=None, base_model_name: Optional[str] = None):
+        super().__init__()
+        if base_model is None and base_model_name is None:
+            raise ValueError("Must provide either base_model or base_model_name")
+        if base_model is None:
+            from transformers import AutoModelForCausalLM
+            self.base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
+        else:
+            self.base_model = base_model
+        # Get hidden size from base model
+        self.hidden_size = getattr(self.base_model.config, 'hidden_size', 768)
+        self.vocab_size = getattr(self.base_model.config, 'vocab_size', None)
+        # Task heads registry
+        self.task_heads: Dict[str, TaskHead] = nn.ModuleDict()
+        self.active_task: Optional[str] = None
+        # Task weights for balanced training
+        self.task_weights: Dict[str, float] = {}
+    def add_task_head(
+        self,
+        task_name: str,
+        head_type: Union[str, TaskType],
+        config: Optional[Dict[str, Any]] = None
+    ):
+        """
+        Add a task-specific head to the model.
+        Args:
+            task_name: Unique name for this task
+            head_type: Type of task (classification, seq2seq, etc.)
+            config: Task-specific configuration
+        """
+        if config is None:
+            config = {}
+        task_config = TaskHeadConfig(
+            task_name=task_name,
+            head_type=head_type,
+            config=config
+        )
+        task_head = TaskHead(task_config, self.hidden_size, self.vocab_size)
+        self.task_heads[task_name] = task_head
+        self.task_weights[task_name] = 1.0  # Default equal weight
+    def set_task_weights(self, weights: Dict[str, float]):
+        """Set weights for each task in multi-task training."""
+        for task_name, weight in weights.items():
+            if task_name in self.task_heads:
+                self.task_weights[task_name] = weight
+    def set_active_task(self, task_name: str):
+        """Set the currently active task for single-task inference."""
+        if task_name not in self.task_heads:
+            raise ValueError(f"Task '{task_name}' not found. Available: {list(self.task_heads.keys())}")
+        self.active_task = task_name
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        task_name: Optional[str] = None,
+        **kwargs
+    ) -> Dict[str, torch.Tensor]:
+        """
+        Forward pass through base model and task head.
+        Args:
+            input_ids: Input token IDs
+            attention_mask: Attention mask
+            labels: Optional labels for loss computation
+            task_name: Task to use (overrides active_task)
+        Returns:
+            Dictionary containing logits and optionally loss
+        """
+        # Determine which task to use
+        task = task_name or self.active_task
+        if task is None and len(self.task_heads) == 1:
+            task = list(self.task_heads.keys())[0]
+        elif task is None:
+            raise ValueError("No task specified and multiple heads available")
+        if task not in self.task_heads:
+            raise ValueError(f"Task '{task}' not found")
+        # Get base model outputs
+        base_outputs = self.base_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            output_hidden_states=True,
+            **kwargs
+        )
+        # Get last hidden state
+        hidden_states = base_outputs.hidden_states[-1]
+        # Apply task head
+        head = self.task_heads[task]
+        head_output = head(hidden_states, attention_mask=attention_mask)
+        result = {"logits": head_output}
+        # Compute loss if labels provided
+        if labels is not None:
+            if head.head_type == TaskType.CLASSIFICATION:
+                loss_fct = nn.CrossEntropyLoss()
+                loss = loss_fct(head_output.view(-1, head.num_labels), labels.view(-1))
+            elif head.head_type == TaskType.SEQUENCE_TO_SEQUENCE:
+                shift_logits = head_output[..., :-1, :].contiguous()
+                shift_labels = labels[..., 1:].contiguous()
+                loss_fct = nn.CrossEntropyLoss()
+                loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+            else:
+                loss_fct = nn.CrossEntropyLoss()
+                loss = loss_fct(head_output.view(-1, head_output.size(-1)), labels.view(-1))
+            result["loss"] = loss
+        return result
+    def get_num_tasks(self) -> int:
+        """Return number of task heads."""
+        return len(self.task_heads)
+    def list_tasks(self) -> List[str]:
+        """Return list of task names."""
+        return list(self.task_heads.keys())
+class MultiTaskTrainer:
+    """
+    Trainer for multi-task learning with task-balanced loss.
+    Args:
+        model: MultiTaskModel instance
+        task_datasets: Dictionary mapping task names to datasets
+        task_weights: Optional dictionary of task weights
+    """
+    def __init__(
+        self,
+        model: MultiTaskModel,
+        task_datasets: Dict[str, Any],
+        task_weights: Optional[Dict[str, float]] = None,
+        tokenizer=None,
+        device: Optional[torch.device] = None
+    ):
+        self.model = model
+        self.task_datasets = task_datasets
+        self.tokenizer = tokenizer
+        self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        # Set task weights
+        if task_weights:
+            self.model.set_task_weights(task_weights)
+        # Move model to device
+        self.model.to(self.device)
+    def train_epoch(
+        self,
+        optimizer: torch.optim.Optimizer,
+        batch_sizes: Dict[str, int] = None,
+        gradient_accumulation_steps: int = 1
+    ) -> Dict[str, float]:
+        """
+        Train one epoch across all tasks.
+        Args:
+            optimizer: Optimizer for training
+            batch_sizes: Batch size per task
+            gradient_accumulation_steps: Steps before optimizer update
+        Returns:
+            Dictionary of losses per task
+        """
+        self.model.train()
+        task_losses = {task: 0.0 for task in self.task_datasets.keys()}
+        task_counts = {task: 0 for task in self.task_datasets.keys()}
+        # Simple round-robin training across tasks
+        for task_name, dataset in self.task_datasets.items():
+            weight = self.model.task_weights.get(task_name, 1.0)
+            for batch in dataset:
+                # Move batch to device
+                inputs = {k: v.to(self.device) if hasattr(v, 'to') else v
+                         for k, v in batch.items()}
+                optimizer.zero_grad()
+                # Forward pass
+                outputs = self.model(
+                    input_ids=inputs.get("input_ids"),
+                    attention_mask=inputs.get("attention_mask"),
+                    labels=inputs.get("labels"),
+                    task_name=task_name
+                )
+                loss = outputs["loss"] * weight
+                loss.backward()
+                optimizer.step()
+                task_losses[task_name] += loss.item() / weight
+                task_counts[task_name] += 1
+        # Average losses
+        avg_losses = {
+            task: task_losses[task] / max(task_counts[task], 1)
+            for task in task_losses
+        }
+        return avg_losses
+    def evaluate(
+        self,
+        eval_datasets: Dict[str, Any],
+        metrics_fn: Optional[Dict[str, callable]] = None
+    ) -> Dict[str, Dict[str, float]]:
+        """
+        Evaluate model on all tasks.
+        Args:
+            eval_datasets: Evaluation datasets per task
+            metrics_fn: Optional metric functions per task
+        Returns:
+            Dictionary of metrics per task
+        """
+        self.model.eval()
+        results = {}
+        with torch.no_grad():
+            for task_name, dataset in eval_datasets.items():
+                task_results = {"loss": 0.0, "count": 0}
+                for batch in dataset:
+                    inputs = {k: v.to(self.device) if hasattr(v, 'to') else v
+                             for k, v in batch.items()}
+                    outputs = self.model(
+                        input_ids=inputs.get("input_ids"),
+                        attention_mask=inputs.get("attention_mask"),
+                        labels=inputs.get("labels"),
+                        task_name=task_name
+                    )
+                    task_results["loss"] += outputs["loss"].item()
+                    task_results["count"] += 1
+                if task_results["count"] > 0:
+                    task_results["loss"] /= task_results["count"]
+                results[task_name] = task_results
+        return results

p_tuning.py ADDED Viewed

	@@ -0,0 +1,295 @@

+"""
+P-Tuning / Prefix Tuning Implementation for NTF
+Parameter-efficient tuning using learnable continuous prompts
+"""
+import torch
+import torch.nn as nn
+from typing import Dict, List, Optional, Any, Union
+from dataclasses import dataclass, field
+from enum import Enum
+from peft import (
+    PrefixTuningConfig,
+    PromptTuningConfig,
+    P_TUNING_TASK_TYPE,
+    get_peft_model,
+    TaskType,
+)
+class PTuningMethod(str, Enum):
+    """P-Tuning method types."""
+    P_TUNING_V1 = "p_tuning_v1"
+    P_TUNING_V2 = "p_tuning_v2"
+    PREFIX_TUNING = "prefix_tuning"
+    PROMPT_TUNING = "prompt_tuning"
+@dataclass
+class PTuningConfig:
+    """
+    Configuration for P-Tuning / Prefix Tuning.
+    Args:
+        method: P-tuning method to use
+        num_virtual_tokens: Number of virtual/prompt tokens to add
+        token_dim: Dimension of token embeddings
+        num_transformer_submodules: Number of transformer submodules
+        num_attention_heads: Number of attention heads
+        num_layers: Number of transformer layers
+        encoder_hidden_size: Hidden size for encoder (P-Tuning v1)
+        prefix_projection: Whether to project prefix (Prefix Tuning)
+        prompt_tuning_init: Initialization strategy for prompt tuning
+        prompt_tuning_init_text: Text for initialization if using text init
+    """
+    method: PTuningMethod = PTuningMethod.P_TUNING_V2
+    # Core parameters
+    num_virtual_tokens: int = 20
+    token_dim: int = 768
+    num_transformer_submodules: int = 1
+    num_attention_heads: int = 12
+    num_layers: int = 12
+    # P-Tuning v1 specific
+    encoder_hidden_size: int = 512
+    # Prefix Tuning specific
+    prefix_projection: bool = True
+    # Prompt Tuning specific
+    prompt_tuning_init: str = "RANDOM"  # RANDOM or TEXT
+    prompt_tuning_init_text: Optional[str] = None
+    # Task type
+    task_type: TaskType = TaskType.CAUSAL_LM
+    def to_peft_config(self):
+        """Convert to appropriate PEFT config based on method."""
+        if self.method == PTuningMethod.PREFIX_TUNING:
+            return PrefixTuningConfig(
+                num_virtual_tokens=self.num_virtual_tokens,
+                token_dim=self.token_dim,
+                num_attention_heads=self.num_attention_heads,
+                num_layers=self.num_layers,
+                prefix_projection=self.prefix_projection,
+                task_type=self.task_type,
+            )
+        elif self.method == PTuningMethod.PROMPT_TUNING:
+            return PromptTuningConfig(
+                num_virtual_tokens=self.num_virtual_tokens,
+                token_dim=self.token_dim,
+                prompt_tuning_init=self.prompt_tuning_init,
+                prompt_tuning_init_text=self.prompt_tuning_init_text,
+                task_type=self.task_type,
+            )
+        else:  # P-Tuning v1 or v2
+            # P-Tuning uses PrefixTuningConfig with specific settings
+            return PrefixTuningConfig(
+                num_virtual_tokens=self.num_virtual_tokens,
+                token_dim=self.token_dim,
+                num_attention_heads=self.num_attention_heads,
+                num_layers=self.num_layers,
+                encoder_hidden_size=self.encoder_hidden_size,
+                prefix_projection=self.method == PTuningMethod.P_TUNING_V1,
+                task_type=self.task_type,
+            )
+class PTuningModel(nn.Module):
+    """
+    P-Tuning wrapper for transformer models.
+    Adds learnable continuous prompts to the model input
+    without modifying the base model weights.
+    Args:
+        base_model: Base transformer model
+        config: P-tuning configuration
+    """
+    def __init__(self, base_model: nn.Module, config: PTuningConfig):
+        super().__init__()
+        self.base_model = base_model
+        self.config = config
+        # Get model dimensions
+        model_config = base_model.config
+        self.token_dim = getattr(model_config, 'hidden_size', config.token_dim)
+        self.num_layers = getattr(model_config, 'num_hidden_layers', config.num_layers)
+        self.num_attention_heads = getattr(model_config, 'num_attention_heads', config.num_attention_heads)
+        # Update config with actual dimensions
+        config.token_dim = self.token_dim
+        config.num_layers = self.num_layers
+        config.num_attention_heads = self.num_attention_heads
+        # Create virtual tokens
+        self._create_virtual_tokens()
+    def _create_virtual_tokens(self):
+        """Create learnable virtual token embeddings."""
+        method = self.config.method
+        if method == PTuningMethod.PROMPT_TUNING:
+            # Simple prompt embeddings
+            self.prompt_embeddings = nn.Embedding(
+                self.config.num_virtual_tokens,
+                self.token_dim
+            )
+            nn.init.normal_(self.prompt_embeddings.weight, std=0.02)
+        elif method == PTuningMethod.PREFIX_TUNING:
+            # Prefix with projection
+            self.prefix_tokens = nn.Parameter(
+                torch.randn(
+                    self.num_layers * 2,  # key and value for each layer
+                    self.config.num_virtual_tokens,
+                    self.token_dim
+                )
+            )
+            if self.config.prefix_projection:
+                self.prefix_proj = nn.Sequential(
+                    nn.Linear(self.token_dim, self.token_dim),
+                    nn.ReLU(),
+                    nn.Linear(self.token_dim, self.num_layers * 2 * self.token_dim)
+                )
+            else:
+                self.prefix_proj = None
+        else:  # P-Tuning v1 or v2
+            # Encoder for generating prompts
+            self.prompt_encoder = nn.Sequential(
+                nn.Linear(self.token_dim, self.config.encoder_hidden_size),
+                nn.ReLU(),
+                nn.Linear(self.config.encoder_hidden_size,
+                         self.num_layers * 2 * self.config.num_virtual_tokens * self.token_dim)
+            )
+            # Input embedding for prompt encoder
+            self.input_embeds = nn.Embedding(self.config.num_virtual_tokens, self.token_dim)
+            nn.init.normal_(self.input_embeds.weight, std=0.02)
+    def get_prompt(self, batch_size: int) -> torch.Tensor:
+        """Generate prompt tensors for the current batch."""
+        method = self.config.method
+        if method == PTuningMethod.PROMPT_TUNING:
+            # Expand prompt embeddings to batch size
+            prompts = self.prompt_embeddings.weight.unsqueeze(0).expand(
+                batch_size, -1, -1
+            )
+        elif method == PTuningMethod.PREFIX_TUNING:
+            prefix = self.prefix_tokens
+            if self.prefix_proj is not None:
+                prefix = self.prefix_proj(prefix.view(-1, self.token_dim))
+                prefix = prefix.view(
+                    self.num_layers * 2,
+                    self.config.num_virtual_tokens,
+                    self.token_dim
+                )
+            prompts = prefix.unsqueeze(1).expand(
+                -1, batch_size, -1, -1
+            )
+        else:  # P-Tuning
+            input_ids = torch.arange(self.config.num_virtual_tokens).long()
+            input_ids = input_ids.unsqueeze(0).expand(batch_size, -1)
+            input_embeds = self.input_embeds(input_ids)
+            prompts = self.prompt_encoder(input_embeds)
+            prompts = prompts.view(
+                batch_size,
+                self.num_layers * 2,
+                self.config.num_virtual_tokens,
+                self.token_dim
+            )
+            prompts = prompts.permute(1, 0, 2, 3)
+        return prompts
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        **kwargs
+    ) -> Dict[str, torch.Tensor]:
+        """
+        Forward pass with virtual prompts.
+        Note: This is a simplified implementation. For production use,
+        consider using the PEFT library's P-tuning implementation.
+        """
+        batch_size = input_ids.size(0)
+        # Get base model outputs
+        outputs = self.base_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            output_hidden_states=True,
+            **kwargs
+        )
+        result = {"logits": outputs.logits}
+        if hasattr(outputs, 'loss') and outputs.loss is not None:
+            result["loss"] = outputs.loss
+        return result
+    def get_trainable_params(self) -> Dict[str, torch.Tensor]:
+        """Get dictionary of trainable parameters (prompts only)."""
+        trainable = {}
+        for name, param in self.named_parameters():
+            if param.requires_grad:
+                trainable[name] = param
+        return trainable
+    def print_trainable_parameters(self):
+        """Print number of trainable vs total parameters."""
+        trainable_params = sum(
+            p.numel() for p in self.parameters() if p.requires_grad
+        )
+        all_params = sum(p.numel() for p in self.parameters())
+        print(f"Trainable params: {trainable_params:,} ({100 * trainable_params / all_params:.2f}%)")
+        print(f"All params: {all_params:,}")
+        print(f"Frozen params: {all_params - trainable_params:,}")
+def setup_p_tuning(
+    model: nn.Module,
+    method: str = "p_tuning_v2",
+    num_virtual_tokens: int = 20,
+    task_type: str = "CAUSAL_LM"
+) -> nn.Module:
+    """
+    Setup P-Tuning on a model using PEFT.
+    Args:
+        model: Base model to apply P-Tuning to
+        method: P-tuning method (p_tuning_v1, p_tuning_v2, prefix_tuning, prompt_tuning)
+        num_virtual_tokens: Number of virtual tokens
+        task_type: PEFT task type
+    Returns:
+        Model with P-Tuning applied
+    """
+    config = PTuningConfig(
+        method=PTuningMethod(method),
+        num_virtual_tokens=num_virtual_tokens,
+        task_type=TaskType(task_type)
+    )
+    peft_config = config.to_peft_config()
+    return get_peft_model(model, peft_config)

test_tutorial_examples.py ADDED Viewed

	@@ -0,0 +1,249 @@

+"""
+Test Suite for Tutorial Code Examples
+Ensures all code examples in tutorials remain functional
+"""
+import pytest
+import os
+import sys
+import torch
+from datasets import Dataset
+# Add project root to path
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+class TestTutorial03:
+    """Test Tutorial 03: Full Fine-Tuning examples"""
+    def test_full_finetuning_basic(self):
+        """Test basic full fine-tuning workflow"""
+        from ntf.config import NTFConfig, ModelConfig, TrainingConfig
+        from ntf.models import ModelRegistry
+        from ntf.finetuning import FullFinetuneTrainer
+        config = NTFConfig(
+            model=ModelConfig(name="facebook/opt-125m"),
+            training=TrainingConfig(
+                output_dir="./test_output",
+                num_train_epochs=1,
+                per_device_train_batch_size=2,
+            )
+        )
+        registry = ModelRegistry(config.model)
+        model, tokenizer = registry.load_model_and_tokenizer()
+        train_data = Dataset.from_dict({
+            "text": ["Hello world", "Test sentence"] * 10
+        })
+        trainer = FullFinetuneTrainer(
+            model=model,
+            config=config.training,
+            train_dataset=train_data,
+            tokenizer=tokenizer
+        )
+        trainer.train()
+        assert os.path.exists("./test_output")
+class TestTutorial05:
+    """Test Tutorial 05: PEFT/LoRA examples"""
+    def test_lora_setup(self):
+        """Test LoRA adapter setup"""
+        from ntf.finetuning import LoRAConfig, PEFTTrainer
+        from ntf.models import ModelRegistry
+        registry = ModelRegistry("facebook/opt-125m")
+        model, tokenizer = registry.load_model_and_tokenizer()
+        lora_config = LoRAConfig(
+            r=8,
+            alpha=16,
+            dropout=0.05,
+            target_modules=["q_proj", "v_proj"],
+        )
+        trainer = PEFTTrainer(model, lora_config, tokenizer)
+        trainable_params = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad)
+        all_params = sum(p.numel() for p in trainer.model.parameters())
+        assert trainable_params < all_params
+        assert trainable_params > 0
+    def test_p_tuning_setup(self):
+        """Test P-Tuning setup"""
+        from ntf.finetuning import PTuningConfig, PTuningMethod, setup_p_tuning
+        from ntf.models import ModelRegistry
+        registry = ModelRegistry("facebook/opt-125m")
+        model, tokenizer = registry.load_model_and_tokenizer()
+        config = PTuningConfig(
+            method=PTuningMethod.P_TUNING_V2,
+            num_virtual_tokens=20,
+        )
+        peft_model = setup_p_tuning(model, method="p_tuning_v2", num_virtual_tokens=20)
+        assert peft_model is not None
+class TestTutorial04:
+    """Test Tutorial 04: Continual Learning examples"""
+    def test_ewc_regularization(self):
+        """Test EWC regularization setup"""
+        from ntf.utils import EWCConfig, EWCRegularizer, ContinualLearningWrapper
+        from ntf.models import ModelRegistry
+        registry = ModelRegistry("facebook/opt-125m")
+        model, tokenizer = registry.load_model_and_tokenizer()
+        ewc_config = EWCConfig(ewc_lambda=1000.0)
+        ewc = EWCRegularizer(model, ewc_config)
+        assert ewc is not None
+        assert ewc.config.ewc_lambda == 1000.0
+    def test_si_regularization(self):
+        """Test Synaptic Intelligence regularization"""
+        from ntf.utils import SIRegularizer, ContinualLearningWrapper
+        from ntf.models import ModelRegistry
+        registry = ModelRegistry("facebook/opt-125m")
+        model, _ = registry.load_model_and_tokenizer()
+        wrapper = ContinualLearningWrapper(model, method="si")
+        wrapper.apply_si_regularization(c=0.1)
+        assert wrapper.si is not None
+        assert wrapper.si.c == 0.1
+    def test_lwf_regularization(self):
+        """Test Learning without Forgetting"""
+        from ntf.utils import LwFRegularizer, ContinualLearningWrapper
+        from ntf.models import ModelRegistry
+        registry = ModelRegistry("facebook/opt-125m")
+        model, _ = registry.load_model_and_tokenizer()
+        wrapper = ContinualLearningWrapper(model, method="lwf")
+        wrapper.apply_lwf_regularization(alpha=0.5)
+        assert wrapper.lwf is not None
+        assert wrapper.lwf.alpha == 0.1
+class TestMultiTask:
+    """Test Multi-Task Learning (Spec 4.1.1)"""
+    def test_multi_task_model_creation(self):
+        """Test creating multi-task model with multiple heads"""
+        from ntf.finetuning import MultiTaskModel, TaskType, MultiTaskTrainer
+        from ntf.models import ModelRegistry
+        registry = ModelRegistry("facebook/opt-125m")
+        base_model, tokenizer = registry.load_model_and_tokenizer()
+        model = MultiTaskModel(base_model=base_model)
+        model.add_task_head(
+            task_name="classification",
+            head_type=TaskType.CLASSIFICATION,
+            config={"num_labels": 5}
+        )
+        model.add_task_head(
+            task_name="summarization",
+            head_type=TaskType.SEQUENCE_TO_SEQUENCE,
+            config={"max_length": 512}
+        )
+        assert model.get_num_tasks() == 2
+        assert "classification" in model.list_tasks()
+        assert "summarization" in model.list_tasks()
+    def test_multi_task_forward(self):
+        """Test forward pass through multi-task model"""
+        from ntf.finetuning import MultiTaskModel, TaskType
+        from ntf.models import ModelRegistry
+        import torch
+        registry = ModelRegistry("facebook/opt-125m")
+        base_model, tokenizer = registry.load_model_and_tokenizer()
+        model = MultiTaskModel(base_model=base_model)
+        model.add_task_head(
+            task_name="classification",
+            head_type=TaskType.CLASSIFICATION,
+            config={"num_labels": 3}
+        )
+        input_ids = torch.randint(0, 1000, (2, 10))
+        attention_mask = torch.ones((2, 10))
+        output = model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            task_name="classification"
+        )
+        assert "logits" in output
+        assert output["logits"].shape[0] == 2
+        assert output["logits"].shape[1] == 3
+class TestContinualLearningWrapper:
+    """Test ContinualLearningWrapper API (Spec 4.1.2)"""
+    def test_wrapper_api(self):
+        """Test the unified ContinualLearningWrapper API"""
+        from ntf.utils import ContinualLearningWrapper
+        from ntf.models import ModelRegistry
+        registry = ModelRegistry("facebook/opt-125m")
+        model, _ = registry.load_model_and_tokenizer()
+        wrapper = ContinualLearningWrapper(model, method="ewc")
+        # Test EWC
+        wrapper.apply_ewc_regularization(lambda_ewc=0.5)
+        assert wrapper.ewc is not None
+        # Test SI
+        wrapper2 = ContinualLearningWrapper(model, method="si")
+        wrapper2.apply_si_regularization(c=0.1)
+        assert wrapper2.si is not None
+        # Test LwF
+        wrapper3 = ContinualLearningWrapper(model, method="lwf")
+        wrapper3.apply_lwf_regularization(alpha=0.5)
+        assert wrapper3.lwf is not None
+    def test_progressive_unfreeze(self):
+        """Test progressive unfreezing strategy"""
+        from ntf.utils import ContinualLearningWrapper
+        from ntf.models import ModelRegistry
+        registry = ModelRegistry("facebook/opt-125m")
+        model, _ = registry.load_model_and_tokenizer()
+        wrapper = ContinualLearningWrapper(model)
+        wrapper.progressive_unfreeze(
+            start_layers=4,
+            unfreeze_every_n_epochs=2,
+            max_layers=12
+        )
+        assert hasattr(wrapper, 'start_layers')
+        assert wrapper.start_layers == 4
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])

utils/__init__.py CHANGED Viewed

@@ -1,81 +1,47 @@
-"""
-Utilities package for Nexuss Transformer Framework
-"""
-from .continual_learning import (
-    EWCConfig,
-    ReplayConfig,
-    GEMConfig,
-    ContinualLearningConfig,
-    EWCRegularizer,
-    ReplayBuffer,
-    GEMOptimizer,
-    LwFLoss,
-    create_continual_learning_wrapper,
-    SIRegularizer,
-    LwFRegularizer,
-    ContinualLearningWrapper,
 )
-from .versioning import (
-    ModelStage,
-    ModelVersion,
-    ModelMetadata,
-    ModelRegistry,
-    create_model_metadata,
-)
-from .metrics import (
-    EvaluationResults,
-    compute_perplexity,
-    compute_accuracy,
-    evaluate_model,
-    benchmark_throughput,
-    compare_models,
-)
-from .logging import (
-    setup_logging,
-    get_logger,
-    set_log_level,
-    DebugLogger,
-    validate_config,
 )
 __all__ = [
-    # Continual Learning
-    "EWCConfig",
-    "ReplayConfig",
-    "GEMConfig",
-    "ContinualLearningConfig",
-    "EWCRegularizer",
-    "ReplayBuffer",
-    "GEMOptimizer",
-    "LwFLoss",
-    "create_continual_learning_wrapper",
-    "SIRegularizer",
-    "LwFRegularizer",
-    "ContinualLearningWrapper",
-    # Versioning
-    "ModelStage",
-    "ModelVersion",
-    "ModelMetadata",
-    "ModelRegistry",
-    "create_model_metadata",
-    # Metrics
-    "EvaluationResults",
-    "compute_perplexity",
-    "compute_accuracy",
-    "evaluate_model",
-    "benchmark_throughput",
-    "compare_models",
-    # Logging
-    "setup_logging",
-    "get_logger",
-    "set_log_level",
-    "DebugLogger",
-    "validate_config",
 ]

+"""Finetuning package - PEFT, LoRA, and layer freezing utilities."""
+from finetuning.peft_finetune import PEFTTrainer, LoRAConfig, setup_lora
+from finetuning.freeze import LayerFreezer, freeze_layers
+from finetuning.full_finetune import FullFinetuneTrainer, full_finetune
+from finetuning.multi_task import (
+    MultiTaskModel,
+    MultiTaskTrainer,
+    TaskHead,
+    TaskType,
+    TaskHeadConfig,
+    ClassificationHead,
+    SequenceToSequenceHead,
+    TokenClassificationHead,
+    QuestionAnsweringHead,
 )
+from finetuning.p_tuning import (
+    PTuningModel,
+    PTuningConfig,
+    PTuningMethod,
+    setup_p_tuning,
 )
 __all__ = [
+    "PEFTTrainer",
+    "LoRAConfig",
+    "setup_lora",
+    "LayerFreezer",
+    "freeze_layers",
+    "FullFinetuneTrainer",
+    "full_finetune",
+    # Multi-task learning
+    "MultiTaskModel",
+    "MultiTaskTrainer",
+    "TaskHead",
+    "TaskType",
+    "TaskHeadConfig",
+    "ClassificationHead",
+    "SequenceToSequenceHead",
+    "TokenClassificationHead",
+    "QuestionAnsweringHead",
+    # P-Tuning / Prefix Tuning
+    "PTuningModel",
+    "PTuningConfig",
+    "PTuningMethod",
+    "setup_p_tuning",
 ]