Spaces:

talha1234567
/

agentsight-api

Running

agentsight-api / src /models /agentsight.py

Minato Namikaze

Deploy to Hugging Face Spaces

2aed081 7 days ago

4.67 kB

	from typing import Sequence

	import torch
	import torch.nn as nn
	from transformers import AutoModel
	from peft import get_peft_model, LoraConfig, TaskType


	class AgentSightModel(nn.Module):
	    """
	    AgentSight: Step-level hallucination classifier for agent trajectories.

	    Architecture
	    ────────────
	    1. Pretrained encoder (DeBERTa-v3-base by default, frozen backbone) with
	       LoRA adapters injected into every sub-module whose name matches one
	       of ``lora_target_modules``.
	       NOTE(review): the default list contains the generic suffix "dense",
	       which presumably matches the feed-forward dense layers as well as the
	       attention output projection — confirm against the summary printed by
	       ``print_trainable_parameters()``.
	    2. Transformer Context-Encoder: a shallow Transformer (3 layers by
	       default) that reads the sequence of per-step CLS representations and
	       lets steps attend to each other — capturing error-accumulation
	       dynamics.
	    3. Projection head → binary hallucination logit per step.

	    Input
	    ─────
	    A single trajectory encoded as N (input_ids, attention_mask) pairs, one
	    per agent step. Each step's text is a concatenated string:
	        [QUERY] <task> [THOUGHT] <reasoning> [ACTION] <calls> [OBS] <results>

	    The ablation study (A1) shows that removing [ACTION]+[OBS] drops step
	    localisation accuracy by ~28.8 points, confirming that tool execution
	    content is the dominant signal.

	    Output
	    ──────
	    hal_logits: (N,) float tensor — raw logits, NOT sigmoids.
	    Apply torch.sigmoid() at inference; use BCEWithLogitsLoss during training.
	    """

	    def __init__(
	        self,
	        encoder_name: str = "microsoft/deberta-v3-base",
	        hidden_dim: int = 256,
	        dropout: float = 0.2,
	        n_context_layers: int = 3,
	        lora_r: int = 16,
	        lora_alpha: int = 64,
	        *,
	        n_context_heads: int = 8,
	        lora_dropout: float = 0.1,
	        lora_target_modules: Sequence[str] = (
	            "query_proj",
	            "key_proj",
	            "value_proj",
	            "dense",
	        ),
	    ):
	        """
	        Args
	        ────
	        encoder_name        : model id or path handed to AutoModel.from_pretrained
	        hidden_dim          : width of the fusion projection; the context
	                              encoder's feed-forward width is 4 * hidden_dim
	        dropout             : dropout for the context encoder and fusion block
	                              (the classification head uses dropout / 2)
	        n_context_layers    : number of layers in the cross-step Transformer
	        lora_r              : LoRA rank
	        lora_alpha          : LoRA scaling factor
	        n_context_heads     : attention heads in the cross-step Transformer;
	                              must divide the encoder hidden size
	        lora_dropout        : dropout applied inside the LoRA adapters
	        lora_target_modules : module names (or a single pattern string) that
	                              receive LoRA adapters; the defaults are the
	                              DeBERTa names, override for other backbones

	        Raises
	        ──────
	        ValueError if the encoder hidden size is not divisible by
	        ``n_context_heads``.
	        """
	        super().__init__()

	        # ── 1. Backbone ───────────────────────────────────────────────────
	        base_encoder = AutoModel.from_pretrained(encoder_name, dtype=torch.float32)
	        # Activation checkpointing on the backbone: less memory, more compute.
	        base_encoder.gradient_checkpointing_enable()

	        # A plain string is passed through untouched so it is not split
	        # into single characters by list().
	        if isinstance(lora_target_modules, str):
	            target_modules = lora_target_modules
	        else:
	            target_modules = list(lora_target_modules)

	        peft_config = LoraConfig(
	            task_type=TaskType.FEATURE_EXTRACTION,
	            r=lora_r,
	            lora_alpha=lora_alpha,
	            lora_dropout=lora_dropout,
	            target_modules=target_modules,
	        )
	        self.encoder = get_peft_model(base_encoder, peft_config)
	        # Side effect kept from the original: prints the parameter summary.
	        self.encoder.print_trainable_parameters()

	        enc_dim = self.encoder.config.hidden_size  # 768 for deberta-v3-base

	        # Fail early with a readable message instead of an assertion raised
	        # from inside the attention module.
	        if n_context_heads <= 0 or enc_dim % n_context_heads != 0:
	            raise ValueError(
	                f"encoder hidden size ({enc_dim}) must be divisible by "
	                f"n_context_heads ({n_context_heads})"
	            )

	        # ── 2. Cross-step Context Encoder ─────────────────────────────────
	        ctx_layer = nn.TransformerEncoderLayer(
	            d_model=enc_dim,
	            nhead=n_context_heads,
	            dim_feedforward=hidden_dim * 4,
	            dropout=dropout,
	            batch_first=True,
	            norm_first=True,  # Pre-LN for better gradient flow
	        )
	        self.context_encoder = nn.TransformerEncoder(
	            ctx_layer,
	            num_layers=n_context_layers,
	            enable_nested_tensor=False,
	        )

	        # ── 3. Projection + Classification head ───────────────────────────
	        self.fusion = nn.Sequential(
	            nn.Linear(enc_dim, hidden_dim),
	            nn.GELU(),
	            nn.Dropout(dropout),
	        )

	        self.cls_head = nn.Sequential(
	            nn.Linear(hidden_dim, 64),
	            nn.GELU(),
	            nn.Dropout(dropout / 2),
	            nn.Linear(64, 1),
	            # NOTE: no Sigmoid — use BCEWithLogitsLoss for numerical stability
	        )

	    # ── forward ───────────────────────────────────────────────────────────────
	    def forward(
	        self, input_ids: torch.Tensor, attention_mask: torch.Tensor
	    ) -> torch.Tensor:
	        """
	        Score every step of ONE trajectory.

	        Args
	        ────
	        input_ids      : (N, seq_len) — one row per trajectory step
	        attention_mask : (N, seq_len)

	        Returns
	        ───────
	        hal_logits : (N,) — raw pre-sigmoid hallucination logits

	        Raises
	        ──────
	        ValueError if ``input_ids`` is not a 2-D tensor.
	        """
	        if input_ids.dim() != 2:
	            raise ValueError(
	                "input_ids must have shape (N, seq_len); "
	                f"got {tuple(input_ids.shape)}"
	            )

	        # Encode each step independently with the backbone and keep only the
	        # first-token (CLS) vector of every step.
	        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
	        # The cast ensures the context encoder always receives float32.
	        step_repr = out.last_hidden_state[:, 0, :].to(torch.float32)  # (N, enc_dim)

	        # Cross-step attention: treat the N steps as a sequence of length N
	        # (add and remove a dummy batch dim of 1)
	        context = self.context_encoder(step_repr.unsqueeze(0)).squeeze(0)  # (N, enc_dim)

	        fused = self.fusion(context)  # (N, hidden_dim)
	        hal_logits = self.cls_head(fused).squeeze(-1)  # (N,)

	        return hal_logits