Spaces:

talha1234567
/

agentsight-api

Running

agentsight-api / src /models /baseline_model.py

Minato Namikaze

Deploy to Hugging Face Spaces

2aed081 8 days ago

2.1 kB

	import torch
	import torch.nn as nn
	from transformers import AutoModel
	from peft import get_peft_model, LoraConfig, TaskType

	class VanillaBaselineModel(nn.Module):
	    """
	    Baseline trajectory-level hallucination classifier.

	    Each step of a trajectory is encoded by a LoRA-adapted pretrained
	    encoder, the first-token ([CLS]) vectors are averaged across steps, and a
	    small MLP head maps that average to a single raw logit.
	    It does NOT use the AgentSight Context Encoder or Dual Heads.
	    """

	    def __init__(
	        self,
	        encoder_name: str = "microsoft/deberta-v3-base",
	        *,
	        lora_r: int = 8,
	        lora_alpha: int = 32,
	        lora_dropout: float = 0.1,
	        lora_target_modules=("query_proj", "value_proj"),
	        head_hidden_dim: int = 64,
	        head_dropout: float = 0.1,
	        gradient_checkpointing: bool = True,
	    ) -> None:
	        """
	        Build the LoRA-wrapped encoder and the binary classification head.

	        All keyword-only arguments default to the values that were
	        previously hard-coded, so ``VanillaBaselineModel()`` and
	        ``VanillaBaselineModel(name)`` build the same architecture as before.

	        Args:
	            encoder_name: Identifier passed to ``AutoModel.from_pretrained``.
	            lora_r: LoRA rank.
	            lora_alpha: LoRA scaling factor.
	            lora_dropout: Dropout probability inside the LoRA layers.
	            lora_target_modules: Names of the encoder sub-modules that get
	                LoRA adapters (an iterable of names, or a single string that
	                is forwarded to ``LoraConfig`` untouched). The default names
	                are the ones used with the default DeBERTa encoder; override
	                them when ``encoder_name`` points at an architecture whose
	                projection layers are named differently.
	            head_hidden_dim: Width of the hidden layer in the classifier head.
	            head_dropout: Dropout probability in the classifier head.
	            gradient_checkpointing: Whether to enable gradient checkpointing
	                on the base encoder before it is wrapped with LoRA.
	        """
	        super().__init__()

	        base_encoder = AutoModel.from_pretrained(
	            encoder_name, torch_dtype=torch.float32
	        )
	        if gradient_checkpointing:
	            base_encoder.gradient_checkpointing_enable()

	        # A plain string is handed over as-is; any other iterable is
	        # materialised into a list so tuples/sets are accepted as well.
	        if isinstance(lora_target_modules, str):
	            target_modules = lora_target_modules
	        else:
	            target_modules = list(lora_target_modules)

	        # Apply LoRA to the base encoder
	        peft_config = LoraConfig(
	            task_type=TaskType.FEATURE_EXTRACTION,
	            r=lora_r,
	            lora_alpha=lora_alpha,
	            lora_dropout=lora_dropout,
	            target_modules=target_modules,
	        )
	        self.encoder = get_peft_model(base_encoder, peft_config)
	        # Prints the trainable-parameter summary at construction time.
	        self.encoder.print_trainable_parameters()

	        enc_dim = self.encoder.config.hidden_size

	        # Simple binary classification head
	        self.classifier = nn.Sequential(
	            nn.Linear(enc_dim, head_hidden_dim),
	            nn.ReLU(),
	            nn.Dropout(head_dropout),
	            nn.Linear(head_hidden_dim, 1),  # Raw logits for BCEWithLogitsLoss
	        )

	    def forward(
	        self, input_ids: torch.Tensor, attention_mask: torch.Tensor
	    ) -> torch.Tensor:
	        """
	        Score one trajectory.

	        Args:
	            input_ids: Token ids of every step, shape ``(N_steps, max_len)``.
	            attention_mask: Attention mask passed to the encoder together
	                with ``input_ids``.

	        Returns:
	            Raw (pre-sigmoid) hallucination logit, shape ``(1,)``.

	        Raises:
	            ValueError: If ``input_ids`` contains no elements. Averaging over
	                zero steps would otherwise yield a silent NaN logit.
	        """
	        if input_ids.numel() == 0:
	            raise ValueError(
	                "VanillaBaselineModel.forward received an empty input_ids "
	                "tensor; a trajectory must contain at least one tokenized step."
	            )

	        # Forward pass through the encoder, one row per step
	        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)

	        # Representation of the first ([CLS]) token of each step
	        cls_repr = outputs.last_hidden_state[:, 0, :]  # (N_steps, enc_dim)

	        # Mean pooling across steps gives one trajectory representation and
	        # lets gradients reach every step (max pooling would only reach the
	        # arg-max entries).
	        traj_repr = cls_repr.mean(dim=0, keepdim=True)  # (1, enc_dim)

	        # Predict hallucination for the entire sequence
	        logits = self.classifier(traj_repr).squeeze(-1)  # (1,)
	        return logits