| import logging |
| from typing import Dict, Any, Optional |
| from agentic_reliability_framework.runtime.agents.base import BaseAgent, AgentSpecialization |
| from ai_event import AIEvent |
| from nli_detector import NLIDetector |
|
|
| logger = logging.getLogger(__name__) |
|
|
class HallucinationDetectiveAgent(BaseAgent):
    """
    Detects potential hallucinations in generated text by combining:
    - Model confidence score (lower confidence → higher risk)
    - Natural Language Inference (NLI) entailment score (lower entailment → higher risk)

    Each signal that falls below its threshold raises a flag and multiplies
    an internal "trust" value (starting at 1.0) by a penalty factor. The
    reported risk is ``1 - trust`` when at least one flag was raised and
    ``0.0`` otherwise.
    """

    # Multiplicative penalties applied to the trust value per raised flag.
    _LOW_CONFIDENCE_FACTOR = 0.5
    _LOW_ENTAILMENT_FACTOR = 0.6

    def __init__(self, nli_detector: Optional[NLIDetector] = None):
        """
        Create the agent.

        Args:
            nli_detector: Detector used for the entailment check. When
                ``None`` a default ``NLIDetector()`` is constructed.
        """
        super().__init__(AgentSpecialization.DETECTIVE)
        # A signal is flagged when its value is strictly below its threshold.
        self._thresholds = {
            'confidence': 0.7,
            'entailment': 0.6,
        }
        # Explicit None test so an injected detector is never discarded just
        # because it happens to evaluate as falsy.
        self.nli = nli_detector if nli_detector is not None else NLIDetector()

    async def analyze(self, event: AIEvent) -> Dict[str, Any]:
        """
        Score ``event`` for hallucination risk.

        Args:
            event: The event to inspect. Reads ``event.confidence``,
                ``event.prompt`` and ``event.response``.

        Returns:
            A dict with keys ``specialization``, ``confidence``, ``findings``
            and ``recommendations``. ``confidence`` equals the computed risk
            (``0.0`` when no flag was raised). ``findings`` carries
            ``is_hallucination``, ``flags``, ``risk_score``, the event's own
            ``confidence`` and the ``entailment`` value (``None`` when the
            NLI check did not run). On any unexpected error the exception is
            logged and a result with ``confidence`` 0.0, empty ``findings``
            and empty ``recommendations`` is returned instead of raising.
        """
        try:
            flags = []
            trust = 1.0
            entail_prob = None

            # A missing confidence value must not abort the whole analysis:
            # skip only this signal and still run the entailment check.
            model_confidence = event.confidence
            if (
                model_confidence is not None
                and model_confidence < self._thresholds['confidence']
            ):
                flags.append('low_confidence')
                trust *= self._LOW_CONFIDENCE_FACTOR

            # The NLI check needs both texts and a detector whose pipeline
            # attribute is set.
            # NOTE(review): nli.check is called synchronously; if it runs
            # model inference it will block the event loop — consider
            # asyncio.to_thread. TODO confirm against NLIDetector.
            if event.prompt and event.response and self.nli.pipeline is not None:
                entail_prob = self.nli.check(event.prompt, event.response)
                if (
                    entail_prob is not None
                    and entail_prob < self._thresholds['entailment']
                ):
                    flags.append('low_entailment')
                    trust *= self._LOW_ENTAILMENT_FACTOR

            is_hallucination = bool(flags)
            # Risk grows as trust shrinks; no flags means no measured risk.
            risk_score = 1 - trust if is_hallucination else 0.0

            return {
                'specialization': 'ai_hallucination',
                'confidence': risk_score,
                'findings': {
                    'is_hallucination': is_hallucination,
                    'flags': flags,
                    'risk_score': risk_score,
                    'confidence': event.confidence,
                    'entailment': entail_prob,
                },
                'recommendations': [
                    "Regenerate with lower temperature",
                    "Provide more context",
                    "Use a different model",
                ] if is_hallucination else [],
            }
        except Exception as e:
            # Deliberate best-effort boundary: a failing detector must not
            # take down the caller, so log with traceback and return a
            # neutral result.
            logger.exception("HallucinationDetective error: %s", e)
            return {
                'specialization': 'ai_hallucination',
                'confidence': 0.0,
                'findings': {},
                'recommendations': [],
            }