import torch import torch.nn as nn from transformers import AutoModel from peft import get_peft_model, LoraConfig, TaskType class VanillaBaselineModel(nn.Module): """ A simple baseline deep learning model that takes the entire trajectory and predicts if a hallucination occurred. It does NOT use the AgentSight Context Encoder or Dual Heads. """ def __init__(self, encoder_name="microsoft/deberta-v3-base"): super().__init__() base_encoder = AutoModel.from_pretrained(encoder_name, torch_dtype=torch.float32) base_encoder.gradient_checkpointing_enable() # Apply LoRA to the base encoder peft_config = LoraConfig( task_type=TaskType.FEATURE_EXTRACTION, r=8, lora_alpha=32, lora_dropout=0.1, target_modules=["query_proj", "value_proj"] ) self.encoder = get_peft_model(base_encoder, peft_config) self.encoder.print_trainable_parameters() enc_dim = self.encoder.config.hidden_size # Simple binary classification head self.classifier = nn.Sequential( nn.Linear(enc_dim, 64), nn.ReLU(), nn.Dropout(0.1), nn.Linear(64, 1) # Raw logits for BCEWithLogitsLoss ) def forward(self, input_ids, attention_mask): # Forward pass through DeBERTa (N_steps, max_len) outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask) # Use the representation of the [CLS] token (the first token) for each step cls_repr = outputs.last_hidden_state[:, 0, :] # Shape: (N_steps, enc_dim) # MEAN POOLING across all steps to create a single Trajectory Representation # This guarantees gradients flow backwards through ALL steps, unlike max() pooling! traj_repr = cls_repr.mean(dim=0).unsqueeze(0) # Shape: (1, enc_dim) # Predict hallucination for the entire sequence logits = self.classifier(traj_repr).squeeze(-1) # Shape: (1,) return logits