Spaces:
Sleeping
Sleeping
def final_confidence(
    robustness: float,
    verifiability: float,
    calibration: float,
    contradiction: float,
    task_type: str = "factual"
) -> float:
    """
    Combine the individual signals into one confidence value in [0.0, 1.0].

    For ``task_type == "creative"`` the calibration signal alone is the
    score: variation between samples and unverifiable content are expected
    for creative output, so robustness and verifiability are not consulted.

    For every other task type the score is a weighted blend
    (40% robustness, 40% verifiability, 20% calibration) reduced by
    20% of the contradiction signal.

    The result is clamped so callers always receive a value between
    0.0 and 1.0 inclusive.
    """
    if task_type == "creative":
        # Calibration (specificity, low hedging) stands in for quality here;
        # the other signals carry no useful information for creative text.
        raw = calibration
    else:
        blended = 0.4 * robustness + 0.4 * verifiability + 0.2 * calibration
        # NOTE(review): the contradiction penalty is applied on the
        # factual/analytical path only, matching the docstring's statement
        # that creative scoring relies entirely on calibration — confirm.
        raw = blended - 0.2 * contradiction

    # Cap at 1.0 first, then floor at 0.0.
    capped = min(raw, 1.0)
    return max(0.0, capped)
def get_confidence_level_and_reason(
    score: float,
    contradiction: float,
    task_type: str = "factual"
) -> tuple[str, str]:
    """
    Translate a numeric confidence score into a (level, reason) pair.

    The level is one of "High", "Medium" or "Low"; the reason is a short
    human-readable explanation for that level.

    Creative tasks are judged on the score alone, with thresholds of
    > 0.65 for High and > 0.40 for Medium. All other task types need
    score > 0.85 together with contradiction < 0.1 for High, and
    score > 0.60 for Medium. Anything below those thresholds is Low.
    """
    if task_type == "creative":
        # Bands are ordered from strictest to loosest; first match wins.
        creative_bands = (
            (0.65, "High", "The response is well-formed, specific, and stylistically coherent."),
            (0.40, "Medium", "The response is reasonable but could be more specific or polished."),
        )
        for floor, level, reason in creative_bands:
            if score > floor:
                return level, reason
        return "Low", "The response is vague or heavily hedged — quality may be limited."

    # Factual / analytical path: a High rating also requires that almost no
    # contradiction was detected; otherwise a high score falls to Medium.
    if score > 0.85 and contradiction < 0.1:
        return "High", "The model was highly consistent, specific, and self-verified its factual accuracy."
    if score > 0.60:
        return "Medium", "Moderate variation detected across prompts, or the answer contains some unverifiable elements."
    return "Low", "Significant disagreement, high uncertainty, or contradiction detected. Trust with caution."