import math
from typing import Any, Dict, List

from services.core_engine.verification_module import calculate_symbolic_score


def normalize_answers(answers: List[str]) -> Dict[str, List[int]]:
    """Group equivalent answer strings and return ``{representative: [indices]}``.

    Fallback implementation for Windows to avoid WinError 6 from
    math_verify multiprocessing.

    Two answers are considered equivalent when they are identical after
    stripping spaces, removing backslashes, and lowercasing.  The key of
    each group is the first *original* (un-normalized) answer seen for
    that group, so reports keep the author's exact text.

    Args:
        answers: Final answer strings, one per agent, in agent order.

    Returns:
        Mapping of representative answer -> list of indices into
        ``answers`` that normalize to the same form.
    """
    normalized_groups: Dict[str, List[int]] = {}
    # clean form -> representative key.  Each answer is normalized exactly
    # once; the original implementation re-normalized every representative
    # for every incoming answer (O(n^2) string work) with the same result.
    rep_by_clean: Dict[str, str] = {}
    for idx, ans in enumerate(answers):
        # Very simple normalization: strip spaces/backslashes, lowercase.
        # In a real scenario this would use SymPy or more robust logic.
        clean = ans.replace(" ", "").replace("\\", "").lower()
        rep = rep_by_clean.get(clean)
        if rep is None:
            # First time we see this normalized form: start a new group
            # keyed by the original text.
            rep_by_clean[clean] = ans
            normalized_groups[ans] = [idx]
        else:
            normalized_groups[rep].append(idx)
    return normalized_groups


def evaluate_consensus(agent_responses: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Apply the Adaptive Consensus scoring algorithm from the MVM2 paper.

    Per-agent score:
        Score_j = 0.40 * V^{sym}_j + 0.35 * L^{logic}_j + 0.25 * C^{clf}_j

    Agents are then grouped by normalized final answer; each group's summed
    score receives a consistency multiplier (+10% per additional agreeing
    agent), and the highest-scoring group's representative answer wins.

    Args:
        agent_responses: One dict per agent with keys ``"agent"`` (name)
            and ``"response"`` (dict holding ``"Answer"``,
            ``"Reasoning Trace"``, ``"Confidence Explanation"``).

    Returns:
        Dict with the winning answer, its score, per-agent score details,
        and the divergence groups.  With no input, the answer is the
        "Error: Unresolvable Divergence" sentinel and the score is -1.0.
    """
    # 1. Normalize final answers across agents.
    answers = [res["response"].get("Answer", "") for res in agent_responses]
    answer_groups = normalize_answers(answers)

    # 2. Evaluate individual agent execution paths.
    scores: List[Dict[str, Any]] = []
    for agent_data in agent_responses:
        res = agent_data["response"]
        trace = res.get("Reasoning Trace", [])

        # V^{sym}_j : SymPy / QWED logical validation (weight 0.40).
        v_sym = calculate_symbolic_score(trace)

        # L^{logic}_j : trace density & semantic logical flow (weight 0.35).
        # Placeholder mapping: more steps usually imply deeper logical
        # breakdown; saturates at 5 steps.
        l_logic = min(len(trace) / 5.0, 1.0)

        # C^{clf}_j : classifier confidence output (weight 0.25).
        # Placeholder keyword mapping on the confidence explanation string.
        conf_exp = res.get("Confidence Explanation", "").lower()
        c_clf = 1.0 if ("guaranteed" in conf_exp or "proof" in conf_exp) else 0.8

        # Core neuro-symbolic scoring formula.
        score_j = (0.40 * v_sym) + (0.35 * l_logic) + (0.25 * c_clf)

        scores.append({
            "agent": agent_data["agent"],
            "raw_answer": res.get("Answer"),
            "V_sym": v_sym,
            "L_logic": round(l_logic, 2),
            "C_clf": round(c_clf, 2),
            "Score_j": round(score_j, 3),
        })

    # 3. Aggregate consensus by matching normalized answer groups.
    final_consensus: Dict[str, Dict[str, Any]] = {}
    top_score = -1.0
    best_answer = "Error: Unresolvable Divergence"

    for rep_ans, indices in answer_groups.items():
        group_score = sum(scores[i]["Score_j"] for i in indices)

        # MVM2 applies a divergence consistency multiplier: if multiple
        # agents independently arrive at the same normalized answer, the
        # group's score is boosted by 10% per additional agent.
        consistency_multiplier = 1.0 + (0.1 * (len(indices) - 1))
        weighted_group_score = group_score * consistency_multiplier

        if weighted_group_score > top_score:
            top_score = weighted_group_score
            best_answer = rep_ans

        final_consensus[rep_ans] = {
            "agent_indices": indices,
            "agents_supporting": [scores[i]["agent"] for i in indices],
            "aggregate_score": round(weighted_group_score, 3),
        }

    return {
        "final_verified_answer": best_answer,
        # Rounded to 3 places for consistency with the per-group
        # "aggregate_score" values (avoids float-noise in reports).
        "winning_score": round(top_score, 3),
        "detail_scores": scores,
        "divergence_groups": final_consensus,
    }