# Scraped page header (not part of the module): commit b6f9fa8, file size 3,681 bytes.
"""
src/modules/base.py — Shared EvalResult dataclass.
Used as the standard output schema by all 4 evaluation modules.
Details shape per module is fully specified here (SRS Section 5).
"""
from __future__ import annotations

import logging
import math
from dataclasses import dataclass, field
from typing import Any, Optional

logger = logging.getLogger(__name__)


@dataclass
class EvalResult:
    """
    Shared output schema for all evaluation modules.

    Attributes:
        module_name : Identifier string, e.g. "faithfulness"
        score       : Module score in [0.0, 1.0] — clipped automatically;
                      a NaN score is replaced with 0.0
        details     : Module-specific dict (see DETAILS SHAPES below)
        error       : None if successful; error message string if module failed
        latency_ms  : Wall-clock milliseconds for this module's execution
    """

    module_name: str
    score: float
    details: dict[str, Any] = field(default_factory=dict)
    error: Optional[str] = None
    latency_ms: int = 0

    def __post_init__(self) -> None:
        """Clip score to [0.0, 1.0] as required by SRS 4.2.

        A NaN score is handled separately and reset to 0.0. Every
        comparison with NaN is false, so ``max(0.0, min(1.0, nan))``
        evaluates to 1.0 and would turn an undefined score into a
        perfect one.
        """
        if math.isnan(self.score):
            logger.warning(
                "%s: score is NaN, resetting to 0.0.",
                self.module_name,
            )
            self.score = 0.0
        elif not (0.0 <= self.score <= 1.0):
            logger.warning(
                "%s: score %.4f out of [0,1], clipping.",
                self.module_name,
                self.score,
            )
            self.score = max(0.0, min(1.0, self.score))
# -------------------------------------------------------------------------
# DETAILS SHAPE REFERENCE (SRS Section 5)
# -------------------------------------------------------------------------
#
# faithfulness.details:
# {
#     "total_claims": int,
#     "entailed_count": int,
#     "neutral_count": int,
#     "contradicted_count": int,
#     "claims": [
#         {
#             "claim": str,
#             "status": "ENTAILED" | "NEUTRAL" | "CONTRADICTED",
#             "best_chunk_id": str,  # chunk with highest NLI score
#             "nli_score": float
#         }
#     ]
# }
#
# entity_verifier.details:
# {
#     "total_entities": int,
#     "verified_count": int,
#     "flagged_count": int,
#     "entities": [
#         {
#             "entity": str,
#             "type": "DRUG" | "DOSAGE" | "CONDITION" | "PROCEDURE",
#             "status": "VERIFIED" | "FLAGGED" | "NOT_FOUND",
#             "severity": "CRITICAL" | "MODERATE" | "MINOR" | null,
#             "answer_value": str,
#             "context_value": str | null,
#             "rxcui": str | null
#         }
#     ]
# }
#
# source_credibility.details:
# {
#     "method_used": "keyword" | "metadata",
#     "chunks": [
#         {
#             "chunk_id": str,
#             "tier": int,  # 1–5
#             "tier_weight": float,
#             "pub_type": str,
#             "title": str,
#             "matched_keyword": str | null
#         }
#     ]
# }
#
# contradiction.details:
# {
#     "total_sentences": int,
#     "checked_pairs": int,
#     "contradicted_pairs": int,
#     "pairs": [
#         {
#             "sentence_a": str,
#             "sentence_b": str,
#             "contradiction_score": float,
#             "flagged": bool
#         }
#     ]
# }
#
# aggregator.details:
# {
#     "weights_used": {
#         "faithfulness": float,
#         "entity_accuracy": float,
#         "source_credibility": float,
#         "contradiction_risk": float
#     },
#     "weighted_composite": float,
#     "component_contributions": {
#         "faithfulness_contribution": float,
#         "entity_contribution": float,
#         "source_contribution": float,
#         "contradiction_contribution": float
#     }
# }