Spaces:

A-R-F
/

Agentic-Reliability-Framework-v4

Running

App Files Files Community

Agentic-Reliability-Framework-v4 / hallucination_detective.py

petter2025

Update hallucination_detective.py

c45e983 verified about 2 months ago

raw

history blame

2.49 kB

	import logging
	from typing import Dict, Any, Optional
	from agentic_reliability_framework.runtime.agents.base import BaseAgent, AgentSpecialization
	from ai_event import AIEvent
	from nli_detector import NLIDetector

	# Module-level logger named after this module; analyze() reports failures through it.
	logger = logging.getLogger(__name__)

	class HallucinationDetectiveAgent(BaseAgent):
	    """
	    Detects potential hallucinations in generated text by combining:
	    - Model confidence score (lower confidence → higher risk)
	    - Natural Language Inference (NLI) entailment score (lower entailment → higher risk)

	    Each signal that falls below its threshold adds a flag and multiplies
	    ``risk_score`` (which starts at 1.0) by a fixed factor, so a smaller
	    ``risk_score`` means more signals fired. The top-level ``confidence``
	    returned by ``analyze`` is ``1 - risk_score`` when at least one flag is
	    set, and 0 otherwise.
	    """

	    # Multipliers applied to ``risk_score`` when the matching flag fires.
	    _LOW_CONFIDENCE_FACTOR = 0.5
	    _LOW_ENTAILMENT_FACTOR = 0.6

	    def __init__(
	        self,
	        nli_detector: Optional[NLIDetector] = None,
	        *,
	        confidence_threshold: float = 0.7,
	        entailment_threshold: float = 0.6,
	    ):
	        """
	        Args:
	            nli_detector: Detector used for the entailment check. A default
	                ``NLIDetector()`` is created when this is None.
	            confidence_threshold: Events whose ``confidence`` is strictly
	                below this value are flagged ``low_confidence``.
	            entailment_threshold: Entailment scores strictly below this
	                value are flagged ``low_entailment``.
	        """
	        super().__init__(AgentSpecialization.DETECTIVE)
	        self._thresholds = {
	            'confidence': confidence_threshold,
	            'entailment': entailment_threshold,
	        }
	        # Compare against None explicitly so a caller-supplied detector is
	        # never discarded just because it happens to evaluate as falsy.
	        self.nli = nli_detector if nli_detector is not None else NLIDetector()

	    async def analyze(self, event: AIEvent) -> Dict[str, Any]:
	        """
	        Score a single event for hallucination risk.

	        Args:
	            event: Event whose ``confidence``, ``prompt`` and ``response``
	                attributes are inspected.

	        Returns:
	            A dict with the keys ``specialization``, ``confidence``,
	            ``findings`` and ``recommendations``. If anything raises during
	            the analysis, the error is logged and a neutral result
	            (confidence 0.0, empty findings, no recommendations) is returned
	            instead of propagating the exception.
	        """
	        try:
	            flags = []
	            risk_score = 1.0
	            entail_prob = None

	            # A missing confidence is treated as "no signal": comparing None
	            # with a float would raise and throw away the NLI check below.
	            model_confidence = event.confidence
	            if (
	                model_confidence is not None
	                and model_confidence < self._thresholds['confidence']
	            ):
	                flags.append('low_confidence')
	                risk_score *= self._LOW_CONFIDENCE_FACTOR

	            # The entailment check needs both texts and a detector whose
	            # ``pipeline`` attribute is not None.
	            if event.prompt and event.response and self.nli.pipeline is not None:
	                entail_prob = self.nli.check(event.prompt, event.response)
	                if (
	                    entail_prob is not None
	                    and entail_prob < self._thresholds['entailment']
	                ):
	                    flags.append('low_entailment')
	                    risk_score *= self._LOW_ENTAILMENT_FACTOR

	            is_hallucination = bool(flags)

	            return {
	                'specialization': 'ai_hallucination',
	                'confidence': 1 - risk_score if is_hallucination else 0,
	                'findings': {
	                    'is_hallucination': is_hallucination,
	                    'flags': flags,
	                    'risk_score': risk_score,
	                    'confidence': event.confidence,
	                    'entailment': entail_prob,
	                },
	                'recommendations': [
	                    "Regenerate with lower temperature",
	                    "Provide more context",
	                    "Use a different model",
	                ] if is_hallucination else [],
	            }
	        except Exception as e:
	            # Deliberate catch-all: a failing analysis is logged with its
	            # traceback and reported as a neutral result rather than raised.
	            logger.exception("HallucinationDetective error: %s", e)
	            return {
	                'specialization': 'ai_hallucination',
	                'confidence': 0.0,
	                'findings': {},
	                'recommendations': [],
	            }