petter2025 committed on
Commit
c45e983
·
verified ·
1 Parent(s): 3240a85

Update hallucination_detective.py

Browse files
Files changed (1) hide show
  1. hallucination_detective.py +2 -18
hallucination_detective.py CHANGED
@@ -15,38 +15,22 @@ class HallucinationDetectiveAgent(BaseAgent):
15
 
16
  def __init__(self, nli_detector: Optional[NLIDetector] = None):
17
  super().__init__(AgentSpecialization.DETECTIVE)
18
- # Thresholds for flagging – can be overridden by subclass or config
19
  self._thresholds = {
20
- 'confidence': 0.7, # below this → low confidence
21
- 'entailment': 0.6 # below this → low entailment (possible hallucination)
22
  }
23
  self.nli = nli_detector or NLIDetector()
24
 
25
  async def analyze(self, event: AIEvent) -> Dict[str, Any]:
26
- """
27
- Analyze an AIEvent and return hallucination risk assessment.
28
-
29
- Args:
30
- event: AIEvent containing prompt, response, and confidence.
31
-
32
- Returns:
33
- Dictionary with keys:
34
- - specialization: str
35
- - confidence: float (0‑1, where higher means more likely hallucination)
36
- - findings: dict with detailed flags
37
- - recommendations: list of strings
38
- """
39
  try:
40
  flags = []
41
  risk_score = 1.0
42
  entail_prob = None
43
 
44
- # 1. Check confidence
45
  if event.confidence < self._thresholds['confidence']:
46
  flags.append('low_confidence')
47
  risk_score *= 0.5
48
 
49
- # 2. Check NLI entailment (if available)
50
  if event.prompt and event.response and self.nli.pipeline is not None:
51
  entail_prob = self.nli.check(event.prompt, event.response)
52
  if entail_prob is not None and entail_prob < self._thresholds['entailment']:
 
15
 
16
  def __init__(self, nli_detector: Optional[NLIDetector] = None):
17
  super().__init__(AgentSpecialization.DETECTIVE)
 
18
  self._thresholds = {
19
+ 'confidence': 0.7,
20
+ 'entailment': 0.6
21
  }
22
  self.nli = nli_detector or NLIDetector()
23
 
24
  async def analyze(self, event: AIEvent) -> Dict[str, Any]:
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  try:
26
  flags = []
27
  risk_score = 1.0
28
  entail_prob = None
29
 
 
30
  if event.confidence < self._thresholds['confidence']:
31
  flags.append('low_confidence')
32
  risk_score *= 0.5
33
 
 
34
  if event.prompt and event.response and self.nli.pipeline is not None:
35
  entail_prob = self.nli.check(event.prompt, event.response)
36
  if entail_prob is not None and entail_prob < self._thresholds['entailment']: