| import math
|
| from typing import List, Dict, Any
|
| from services.core_engine.verification_module import calculate_symbolic_score
|
|
|
def normalize_answers(answers: List[str]) -> Dict[str, List[int]]:
    """Group textually-divergent answers whose normalized forms match.

    Normalization strips spaces and backslashes and lowercases, so e.g.
    "\\frac{1}{2}" and "frac{1}{2}" land in the same group.  Each group is
    keyed by the first answer seen for that normalized form (original,
    un-normalized text) and maps to the list of indices into ``answers``
    of all members, in input order.

    Fallback implementation for Windows to avoid WinError 6 from
    math_verify multiprocessing.

    Args:
        answers: Raw answer strings, one per agent.

    Returns:
        Mapping of representative answer -> sorted member indices.
    """
    normalized_groups: Dict[str, List[int]] = {}
    # Normalized form -> representative (first-seen) answer.  Lets each
    # incoming answer match its group in O(1) instead of rescanning and
    # re-normalizing every existing group key (previously O(n^2)).
    rep_by_clean: Dict[str, str] = {}

    for idx, ans in enumerate(answers):
        clean_ans = ans.replace(" ", "").replace("\\", "").lower()
        rep = rep_by_clean.get(clean_ans)
        if rep is None:
            # First time we see this normalized form: the original text
            # becomes the group's representative key.
            rep_by_clean[clean_ans] = ans
            normalized_groups[ans] = [idx]
        else:
            normalized_groups[rep].append(idx)

    return normalized_groups
|
|
|
def evaluate_consensus(agent_responses: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Apply the Adaptive Consensus scoring algorithm from the MVM2 paper:

        Score_j = 0.40 * V^{sym}_j + 0.35 * L^{logic}_j + 0.25 * C^{clf}_j

    Each agent gets a composite score; agents whose answers normalize to
    the same form (per ``normalize_answers``) are pooled, the pooled sum
    earns a +10% consistency bonus per additional supporting agent, and
    the highest weighted group supplies the final answer.
    """
    raw_answers = [entry["response"].get("Answer", "") for entry in agent_responses]
    answer_groups = normalize_answers(raw_answers)

    scores: List[Dict[str, Any]] = []
    for entry in agent_responses:
        payload = entry["response"]
        steps = payload.get("Reasoning Trace", [])

        # Symbolic verification component, delegated to the core engine.
        v_sym = calculate_symbolic_score(steps)
        # Logic-depth component: saturates at 1.0 once the trace has >= 5 steps.
        l_logic = min(len(steps) / 5.0, 1.0)
        # Classifier proxy: strong-certainty wording earns full credit.
        explanation = payload.get("Confidence Explanation", "").lower()
        c_clf = 1.0 if ("guaranteed" in explanation or "proof" in explanation) else 0.8

        composite = 0.40 * v_sym + 0.35 * l_logic + 0.25 * c_clf
        scores.append({
            "agent": entry["agent"],
            "raw_answer": payload.get("Answer"),
            "V_sym": v_sym,
            "L_logic": round(l_logic, 2),
            "C_clf": round(c_clf, 2),
            "Score_j": round(composite, 3),
        })

    best_answer = "Error: Unresolvable Divergence"
    top_score = -1.0
    final_consensus: Dict[str, Any] = {}

    for rep_ans, member_indices in answer_groups.items():
        pooled = sum(scores[i]["Score_j"] for i in member_indices)
        # Consistency bonus: +10% for every supporter beyond the first.
        weighted = pooled * (1.0 + 0.1 * (len(member_indices) - 1))

        if weighted > top_score:
            top_score, best_answer = weighted, rep_ans

        final_consensus[rep_ans] = {
            "agent_indices": member_indices,
            "agents_supporting": [scores[i]["agent"] for i in member_indices],
            "aggregate_score": round(weighted, 3),
        }

    return {
        "final_verified_answer": best_answer,
        "winning_score": top_score,
        "detail_scores": scores,
        "divergence_groups": final_consensus,
    }
|
|
|