Spaces:
Running
Running
| import torch | |
| import torch.nn as nn | |
| from transformers import AutoModel | |
| from peft import get_peft_model, LoraConfig, TaskType | |
class AgentSightModel(nn.Module):
    """Step-level hallucination classifier for agent trajectories.

    Architecture
    ------------
    1. Pretrained text encoder (DeBERTa-v3-base by default) wrapped with
       LoRA adapters through PEFT. The adapters target every module named
       ``query_proj``, ``key_proj``, ``value_proj`` or ``dense``.
       NOTE(review): in the Hugging Face DeBERTa implementation ``dense``
       presumably matches the feed-forward layers as well as the attention
       output projection - confirm against the output of
       ``print_trainable_parameters()``.
    2. Cross-step context encoder: a shallow Pre-LN Transformer
       (``n_context_layers`` layers, 3 by default) that reads the sequence
       of per-step first-token (CLS) representations and lets steps attend
       to each other - capturing error-accumulation dynamics.
       NOTE(review): no positional encoding and no attention mask are
       applied to the step sequence here, so every step can attend to all
       others (including later ones) without explicit order information.
       Confirm this is intended.
    3. Projection head -> one binary hallucination logit per step.

    Input
    -----
    A single trajectory encoded as N (input_ids, attention_mask) pairs, one
    per agent step. Each step's text is a concatenated string:

        [QUERY] <task> [THOUGHT] <reasoning> [ACTION] <calls> [OBS] <results>

    The ablation study (A1) shows that removing [ACTION]+[OBS] drops step
    localisation accuracy by ~28.8 points, confirming that tool execution
    content is the dominant signal.

    Output
    ------
    hal_logits: (N,) float tensor - raw logits, NOT sigmoids.
    Apply torch.sigmoid() at inference; use BCEWithLogitsLoss during training.
    """

    def __init__(
        self,
        encoder_name: str = "microsoft/deberta-v3-base",
        hidden_dim: int = 256,
        dropout: float = 0.2,
        n_context_layers: int = 3,
        lora_r: int = 16,
        lora_alpha: int = 64,
        *,
        lora_dropout: float = 0.1,
        n_context_heads: int = 8,
    ) -> None:
        """Build the encoder, the cross-step context encoder and the head.

        Args:
            encoder_name: Hugging Face model id or path given to
                ``AutoModel.from_pretrained``.
            hidden_dim: Width of the fusion projection; the context
                encoder's feed-forward width is ``hidden_dim * 4``.
            dropout: Dropout used in the context encoder and fusion layer;
                the classification head uses ``dropout / 2``.
            n_context_layers: Number of context-encoder layers.
            lora_r: LoRA rank.
            lora_alpha: LoRA scaling factor.
            lora_dropout: Dropout applied inside the LoRA adapters.
            n_context_heads: Attention heads in the context encoder; must
                divide the encoder's hidden size.

        Raises:
            ValueError: If ``n_context_heads`` is not a positive divisor of
                the encoder's hidden size.
        """
        super().__init__()

        # -- 1. Backbone ---------------------------------------------------
        base_encoder = AutoModel.from_pretrained(encoder_name, dtype=torch.float32)
        # Trade compute for memory: activations are recomputed in backward.
        base_encoder.gradient_checkpointing_enable()
        peft_config = LoraConfig(
            task_type=TaskType.FEATURE_EXTRACTION,
            r=lora_r,
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            target_modules=["query_proj", "key_proj", "value_proj", "dense"],
        )
        self.encoder = get_peft_model(base_encoder, peft_config)
        self.encoder.print_trainable_parameters()

        enc_dim = self.encoder.config.hidden_size  # 768 for deberta-v3-base
        # Fail with an actionable message instead of an assertion raised
        # from inside nn.MultiheadAttention.
        if n_context_heads <= 0 or enc_dim % n_context_heads != 0:
            raise ValueError(
                f"n_context_heads={n_context_heads} must be a positive divisor "
                f"of the encoder hidden size ({enc_dim})"
            )

        # -- 2. Cross-step context encoder ---------------------------------
        ctx_layer = nn.TransformerEncoderLayer(
            d_model=enc_dim,
            nhead=n_context_heads,
            dim_feedforward=hidden_dim * 4,
            dropout=dropout,
            batch_first=True,
            norm_first=True,  # Pre-LN for better gradient flow
        )
        self.context_encoder = nn.TransformerEncoder(
            ctx_layer,
            num_layers=n_context_layers,
            enable_nested_tensor=False,
        )

        # -- 3. Projection + classification head ---------------------------
        self.fusion = nn.Sequential(
            nn.Linear(enc_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
        )
        self.cls_head = nn.Sequential(
            nn.Linear(hidden_dim, 64),
            nn.GELU(),
            nn.Dropout(dropout / 2),
            nn.Linear(64, 1),
            # NOTE: no Sigmoid - use BCEWithLogitsLoss for numerical stability
        )

    # -- forward -----------------------------------------------------------
    def forward(
        self, input_ids: torch.Tensor, attention_mask: torch.Tensor
    ) -> torch.Tensor:
        """Score every step of one trajectory.

        Args:
            input_ids: (N, seq_len) token ids, one row per trajectory step.
            attention_mask: (N, seq_len) mask matching ``input_ids``.

        Returns:
            hal_logits: (N,) raw pre-sigmoid hallucination logits. An empty
            trajectory (N == 0) yields an empty float32 tensor.

        Raises:
            ValueError: If ``input_ids`` is not 2-D, or ``attention_mask``
                does not have the same shape as ``input_ids``.
        """
        if input_ids.dim() != 2:
            raise ValueError(
                "input_ids must have shape (N, seq_len); "
                f"got {tuple(input_ids.shape)}"
            )
        # A None mask is passed through untouched so callers that relied on
        # the encoder's own default keep working.
        if attention_mask is not None and attention_mask.shape != input_ids.shape:
            raise ValueError(
                f"attention_mask shape {tuple(attention_mask.shape)} does not "
                f"match input_ids shape {tuple(input_ids.shape)}"
            )
        if input_ids.size(0) == 0:
            # Nothing to score; skip the encoder, which cannot run on an
            # empty batch.
            return input_ids.new_zeros(0, dtype=torch.float32)

        # Encode each step independently with the backbone and keep the
        # first-token (CLS) vector of every step.
        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        step_repr = out.last_hidden_state[:, 0, :].to(torch.float32)  # (N, enc_dim)

        # Cross-step attention: treat the N steps as one sequence of length N
        # (add and remove a dummy batch dim of 1).
        context = self.context_encoder(step_repr.unsqueeze(0)).squeeze(0)  # (N, enc_dim)

        fused = self.fusion(context)  # (N, hidden_dim)
        hal_logits = self.cls_head(fused).squeeze(-1)  # (N,)
        return hal_logits