File size: 2,727 Bytes
53bb779
 
 
 
 
 
29809c8
 
53bb779
 
 
 
29809c8
53bb779
 
 
 
 
 
29809c8
53bb779
 
 
 
 
 
f7a4d18
53bb779
29809c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
"""
MEXAR - Evaluation Metrics Helper
Calculates common metrics across different baselines and experiments.
"""
import math
import os
import sys
from typing import Any, Dict, Optional

# Append the directory one level above this file's own directory to sys.path
# before the project import below runs (presumably so the `utils` package
# resolves when this file is executed directly -- confirm against repo layout).
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from utils.faithfulness import FaithfulnessScorer, BartNLIScorer, FActScoreCompat


class MetricsRunner:
    """Compute scorer-based metrics and extract scores from response payloads.

    ``evaluate_all`` runs the three scorers created in ``__init__`` on an
    answer/context pair. The ``extract_*`` helpers read already-computed
    scores out of response dictionaries, accepting numbers, numeric strings
    and percentage strings, and always return a value in ``[0.0, 1.0]`` or
    ``None`` when nothing usable is present.
    """

    def __init__(self):
        # One instance of each scorer is created up front and reused by
        # every evaluate_all() call.
        self.faith_scorer = FaithfulnessScorer()
        self.bart_nli = BartNLIScorer()
        self.factscore = FActScoreCompat()

    def evaluate_all(self, answer: str, context: str) -> Dict[str, float]:
        """Score ``answer`` against ``context`` with all three scorers.

        Args:
            answer: Text to be scored.
            context: Text the answer is scored against.

        Returns:
            A dict with the keys ``"faithfulness"``, ``"bart_nli"`` and
            ``"factscore"``; each value is the ``.score`` attribute of the
            result returned by the corresponding scorer's ``score()`` call.
        """
        faith_res = self.faith_scorer.score(answer, context)
        bart_res = self.bart_nli.score(answer, context)
        fact_res = self.factscore.score(answer, context)
        return {
            "faithfulness": faith_res.score,
            "bart_nli": bart_res.score,
            "factscore": fact_res.score,
        }

    def extract_faithfulness(self, response: Dict[str, Any]) -> Optional[float]:
        """Extract faithfulness score from response payloads across formats.

        Looks at ``response["explainability"]["confidence_breakdown"]
        ["faithfulness"]`` first and falls back to
        ``response["explainability"]["faithfulness"]``. The first candidate
        that parses as a number wins.

        Args:
            response: Response payload; anything that is not a dict yields
                ``None``.

        Returns:
            The score clamped to ``[0.0, 1.0]``, or ``None`` when no
            candidate is present or parseable. Nested values that are not
            dicts are treated as missing rather than raising.
        """
        if not isinstance(response, dict):
            return None

        explainability = response.get("explainability")
        if not isinstance(explainability, dict):
            # Missing, null, or an unexpected type (e.g. a plain string):
            # there is nowhere to look for a faithfulness value.
            return None

        candidates = []
        breakdown = explainability.get("confidence_breakdown")
        if isinstance(breakdown, dict):
            # The nested breakdown takes precedence over the flat key.
            candidates.append(breakdown.get("faithfulness"))
        candidates.append(explainability.get("faithfulness"))

        for candidate in candidates:
            parsed = self._parse_numeric(candidate)
            if parsed is not None:
                return self._clamp(parsed)

        return None

    def extract_confidence(self, response: Dict[str, Any]) -> Optional[float]:
        """Extract numeric confidence score if available.

        Args:
            response: Response payload; anything that is not a dict yields
                ``None``.

        Returns:
            ``response["confidence"]`` parsed and clamped to ``[0.0, 1.0]``,
            or ``None`` when the key is absent or its value is not parseable.
        """
        if not isinstance(response, dict):
            return None

        parsed = self._parse_numeric(response.get("confidence"))
        if parsed is None:
            return None
        return self._clamp(parsed)

    @staticmethod
    def _clamp(value: float) -> float:
        """Clamp ``value`` into the closed interval ``[0.0, 1.0]``.

        A NaN argument would come back as ``1.0`` (comparisons with NaN are
        false, so ``min`` keeps its first argument); ``_parse_numeric``
        filters NaN out so the extractors never pass one in.
        """
        return max(0.0, min(1.0, value))

    @staticmethod
    def _parse_numeric(value: Any) -> Optional[float]:
        """Convert ``value`` to a float, or return ``None`` if it is not numeric.

        Accepted inputs:
            * ``int`` / ``float`` (``bool`` is an ``int`` subclass, so
              ``True``/``False`` parse as ``1.0``/``0.0``);
            * strings accepted by ``float()`` after stripping whitespace;
            * strings ending in ``%``, which are divided by 100.

        Returns:
            The parsed float, or ``None`` for ``None``, empty/blank strings,
            unparseable strings, unsupported types, and NaN (which carries
            no usable score and would otherwise clamp to ``1.0``).
        """
        if isinstance(value, (int, float)):
            parsed = float(value)
        elif isinstance(value, str):
            cleaned = value.strip()
            is_percent = cleaned.endswith("%")
            if is_percent:
                cleaned = cleaned[:-1].strip()
            if not cleaned:
                return None
            try:
                parsed = float(cleaned)
            except ValueError:
                return None
            if is_percent:
                parsed /= 100.0
        else:
            # None and every other type are not numeric.
            return None

        if math.isnan(parsed):
            return None
        return parsed