petter2025 committed on
Commit
f6265f6
·
verified ·
1 Parent(s): 48e9ee1

Update hallucination_detective.py

Browse files
Files changed (1) hide show
  1. hallucination_detective.py +31 -8
hallucination_detective.py CHANGED
@@ -7,19 +7,39 @@ from nli_detector import NLIDetector
7
  logger = logging.getLogger(__name__)
8
 
9
  class HallucinationDetectiveAgent(BaseAgent):
10
- """Detects hallucinations using confidence and NLI consistency."""
 
 
 
 
 
11
  def __init__(self, nli_detector: Optional[NLIDetector] = None):
12
  super().__init__(AgentSpecialization.DETECTIVE)
 
13
  self._thresholds = {
14
- 'confidence': 0.7,
15
- 'entailment': 0.6
16
  }
17
  self.nli = nli_detector or NLIDetector()
18
 
19
  async def analyze(self, event: AIEvent) -> Dict[str, Any]:
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  try:
21
  flags = []
22
  risk_score = 1.0
 
23
 
24
  # 1. Check confidence
25
  if event.confidence < self._thresholds['confidence']:
@@ -32,11 +52,9 @@ class HallucinationDetectiveAgent(BaseAgent):
32
  if entail_prob is not None and entail_prob < self._thresholds['entailment']:
33
  flags.append('low_entailment')
34
  risk_score *= 0.6
35
- else:
36
- # No NLI, so just use confidence
37
- pass
38
 
39
  is_hallucination = len(flags) > 0
 
40
  return {
41
  'specialization': 'ai_hallucination',
42
  'confidence': 1 - risk_score if is_hallucination else 0,
@@ -45,7 +63,7 @@ class HallucinationDetectiveAgent(BaseAgent):
45
  'flags': flags,
46
  'risk_score': risk_score,
47
  'confidence': event.confidence,
48
- 'entailment': entail_prob if 'entail_prob' in locals() else None
49
  },
50
  'recommendations': [
51
  "Regenerate with lower temperature",
@@ -55,4 +73,9 @@ class HallucinationDetectiveAgent(BaseAgent):
55
  }
56
  except Exception as e:
57
  logger.error(f"HallucinationDetective error: {e}", exc_info=True)
58
- return {'specialization': 'ai_hallucination', 'confidence': 0.0, 'findings': {}, 'recommendations': []}
 
 
 
 
 
 
7
  logger = logging.getLogger(__name__)
8
 
9
  class HallucinationDetectiveAgent(BaseAgent):
10
+ """
11
+ Detects potential hallucinations in generated text by combining:
12
+ - Model confidence score (lower confidence → higher risk)
13
+ - Natural Language Inference (NLI) entailment score (lower entailment → higher risk)
14
+ """
15
+
16
  def __init__(self, nli_detector: Optional[NLIDetector] = None):
17
  super().__init__(AgentSpecialization.DETECTIVE)
18
+ # Thresholds for flagging – can be overridden by subclass or config
19
  self._thresholds = {
20
+ 'confidence': 0.7, # below this → low confidence
21
+ 'entailment': 0.6 # below this → low entailment (possible hallucination)
22
  }
23
  self.nli = nli_detector or NLIDetector()
24
 
25
  async def analyze(self, event: AIEvent) -> Dict[str, Any]:
26
+ """
27
+ Analyze an AIEvent and return hallucination risk assessment.
28
+
29
+ Args:
30
+ event: AIEvent containing prompt, response, and confidence.
31
+
32
+ Returns:
33
+ Dictionary with keys:
34
+ - specialization: str
35
+ - confidence: float (0‑1, where higher means more likely hallucination)
36
+ - findings: dict with detailed flags
37
+ - recommendations: list of strings
38
+ """
39
  try:
40
  flags = []
41
  risk_score = 1.0
42
+ entail_prob = None
43
 
44
  # 1. Check confidence
45
  if event.confidence < self._thresholds['confidence']:
 
52
  if entail_prob is not None and entail_prob < self._thresholds['entailment']:
53
  flags.append('low_entailment')
54
  risk_score *= 0.6
 
 
 
55
 
56
  is_hallucination = len(flags) > 0
57
+
58
  return {
59
  'specialization': 'ai_hallucination',
60
  'confidence': 1 - risk_score if is_hallucination else 0,
 
63
  'flags': flags,
64
  'risk_score': risk_score,
65
  'confidence': event.confidence,
66
+ 'entailment': entail_prob
67
  },
68
  'recommendations': [
69
  "Regenerate with lower temperature",
 
73
  }
74
  except Exception as e:
75
  logger.error(f"HallucinationDetective error: {e}", exc_info=True)
76
+ return {
77
+ 'specialization': 'ai_hallucination',
78
+ 'confidence': 0.0,
79
+ 'findings': {},
80
+ 'recommendations': []
81
+ }