agentsight-api / src /models /agentsight.py
Minato Namikaze
Deploy to Hugging Face Spaces
2aed081
Raw
History Blame Contribute Delete
4.67 kB
import torch
import torch.nn as nn
from transformers import AutoModel
from peft import get_peft_model, LoraConfig, TaskType
class AgentSightModel(nn.Module):
"""
AgentSight: Step-level hallucination classifier for agent trajectories.
Architecture
────────────
1. DeBERTa-v3-base encoder (frozen backbone) with LoRA adapters
injected into all attention projections.
2. Transformer Context-Encoder: a shallow 3-layer Transformer that reads
the sequence of per-step CLS representations and lets steps attend
to each other β€” capturing error-accumulation dynamics.
3. Projection head β†’ binary hallucination logit per step.
Input
─────
A single trajectory encoded as N (input_ids, attention_mask) pairs, one
per agent step. Each step's text is a concatenated string:
[QUERY] <task> [THOUGHT] <reasoning> [ACTION] <calls> [OBS] <results>
The ablation study (A1) shows that removing [ACTION]+[OBS] drops step
localisation accuracy by ~28.8 points, confirming that tool execution
content is the dominant signal.
Output
──────
hal_logits: (N,) float tensor β€” raw logits, NOT sigmoids.
Apply torch.sigmoid() at inference; use BCEWithLogitsLoss during training.
"""
def __init__(
self,
encoder_name: str = "microsoft/deberta-v3-base",
hidden_dim: int = 256,
dropout: float = 0.2,
n_context_layers: int = 3,
lora_r: int = 16,
lora_alpha: int = 64,
):
super().__init__()
# ── 1. Backbone ───────────────────────────────────────────────────
base_encoder = AutoModel.from_pretrained(encoder_name, dtype=torch.float32)
base_encoder.gradient_checkpointing_enable()
peft_config = LoraConfig(
task_type=TaskType.FEATURE_EXTRACTION,
r=lora_r,
lora_alpha=lora_alpha,
lora_dropout=0.1,
target_modules=["query_proj", "key_proj", "value_proj", "dense"],
)
self.encoder = get_peft_model(base_encoder, peft_config)
self.encoder.print_trainable_parameters()
enc_dim = self.encoder.config.hidden_size # 768 for deberta-v3-base
# ── 2. Cross-step Context Encoder ─────────────────────────────────
ctx_layer = nn.TransformerEncoderLayer(
d_model=enc_dim,
nhead=8,
dim_feedforward=hidden_dim * 4,
dropout=dropout,
batch_first=True,
norm_first=True, # Pre-LN for better gradient flow
)
self.context_encoder = nn.TransformerEncoder(
ctx_layer, num_layers=n_context_layers,
enable_nested_tensor=False,
)
# ── 3. Projection + Classification head ───────────────────────────
self.fusion = nn.Sequential(
nn.Linear(enc_dim, hidden_dim),
nn.GELU(),
nn.Dropout(dropout),
)
self.cls_head = nn.Sequential(
nn.Linear(hidden_dim, 64),
nn.GELU(),
nn.Dropout(dropout / 2),
nn.Linear(64, 1),
# NOTE: no Sigmoid β€” use BCEWithLogitsLoss for numerical stability
)
# ── forward ───────────────────────────────────────────────────────────────
def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor):
"""
Args
────
input_ids : (N, seq_len) β€” one row per trajectory step
attention_mask : (N, seq_len)
Returns
───────
hal_logits : (N,) β€” raw pre-sigmoid hallucination logits
"""
# Encode each step independently with DeBERTa
out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
step_repr = out.last_hidden_state[:, 0, :].to(torch.float32) # (N, enc_dim)
# Cross-step attention: treat the N steps as a sequence of length N
# (add and remove a dummy batch dim of 1)
context = self.context_encoder(step_repr.unsqueeze(0)).squeeze(0) # (N, enc_dim)
fused = self.fusion(context) # (N, hidden_dim)
hal_logits = self.cls_head(fused).squeeze(-1) # (N,)
return hal_logits