def final_confidence(
    robustness: float,
    verifiability: float,
    calibration: float,
    contradiction: float,
    task_type: str = "factual",
) -> float:
    """Compute the weighted confidence score, clamped to ``[0.0, 1.0]``.

    Creative tasks rely entirely on calibration (fluency/quality), since
    variation and unverifiability are expected -- not signs of failure.
    Factual / analytical tasks (every ``task_type`` other than the exact
    string ``"creative"``) use the full multi-signal formula::

        0.4 * robustness + 0.4 * verifiability + 0.2 * calibration
            - 0.2 * contradiction

    Args:
        robustness: Robustness signal; weighted 0.4 on the factual path.
        verifiability: Verifiability signal; weighted 0.4 on the factual path.
        calibration: Calibration signal; weighted 0.2 on the factual path and
            used as the whole score on the creative path.
        contradiction: Contradiction signal; subtracted with weight 0.2 on
            the factual path.
        task_type: ``"creative"`` selects the calibration-only score; any
            other value selects the multi-signal formula.

    Returns:
        The score limited to the closed interval ``[0.0, 1.0]``.
    """
    if task_type == "creative":
        # Verification and robustness are meaningless for creative output.
        # Calibration (specificity, low hedging) is the best proxy for quality.
        score = calibration
    else:
        score = (
            0.4 * robustness
            + 0.4 * verifiability
            + 0.2 * calibration
        )
        # NOTE(review): the penalty is applied on this path only, following
        # the docstring's statement that creative tasks rely entirely on
        # calibration -- confirm against the intended design.
        score -= 0.2 * contradiction

    # Clamp: the weighted sum can leave [0, 1] (e.g. a large penalty).
    return max(0.0, min(score, 1.0))


def get_confidence_level_and_reason(
    score: float,
    contradiction: float,
    task_type: str = "factual",
) -> tuple[str, str]:
    """Map a confidence score to a ``(level, reason)`` pair.

    Args:
        score: Confidence score to classify.
        contradiction: Contradiction signal. Only consulted on the
            factual / analytical path, where "High" requires it to be
            below 0.1; ignored for creative tasks.
        task_type: ``"creative"`` selects the creative thresholds
            (> 0.65 High, > 0.40 Medium); any other value selects the
            factual thresholds (> 0.85 High, > 0.60 Medium).

    Returns:
        A tuple of the level (``"High"``, ``"Medium"`` or ``"Low"``) and a
        human-readable explanation. All comparisons are strict, so a score
        exactly on a threshold falls into the lower level.
    """
    if task_type == "creative":
        if score > 0.65:
            return "High", "The response is well-formed, specific, and stylistically coherent."
        elif score > 0.40:
            return "Medium", "The response is reasonable but could be more specific or polished."
        else:
            return "Low", "The response is vague or heavily hedged — quality may be limited."

    # Factual / analytical path
    if score > 0.85 and contradiction < 0.1:
        return "High", "The model was highly consistent, specific, and self-verified its factual accuracy."
    elif score > 0.60:
        # Also reached by scores above 0.85 whose contradiction is >= 0.1.
        return "Medium", "Moderate variation detected across prompts, or the answer contains some unverifiable elements."
    else:
        return "Low", "Significant disagreement, high uncertainty, or contradiction detected. Trust with caution."