Spaces:

Varshithdharmajv
/

mvm2-math-verification

Sleeping

App Files Files Community

Varshith dharmaj committed 23 days ago

Commit

cd17d1f

verified ·

1 Parent(s): 1d7be9f

Upload services/core_engine/consensus_module.py with huggingface_hub

Browse files

Files changed (1) hide show

services/core_engine/consensus_module.py +99 -0

services/core_engine/consensus_module.py ADDED Viewed

	@@ -0,0 +1,99 @@

+import math
+from typing import List, Dict, Any
+from verification_module import calculate_symbolic_score
def normalize_answers(answers: List[str]) -> Dict[str, List[int]]:
    """
    Group answers that are textually identical after light normalization.

    Fallback implementation for Windows to avoid WinError 6 from math_verify multiprocessing.

    Two answers land in the same group when they are equal after removing
    space characters and backslashes and lower-casing. This is purely
    textual: no symbolic or numeric equivalence is attempted, so "1/2" and
    "0.5" stay in separate groups.

    Args:
        answers: Final answers, one per agent, in agent order. ``None`` is
            treated as an empty answer and any other non-string value is
            converted with ``str()`` instead of raising.

    Returns:
        A dict mapping each group's representative (the first raw answer seen
        for that group) to the list of indices into ``answers`` belonging to
        the group. Keys appear in order of first occurrence.
    """
    normalized_groups: Dict[str, List[int]] = {}
    # Normalized form -> representative raw answer. Looking the normalized
    # form up directly avoids re-normalizing and rescanning every existing
    # group key for each new answer.
    representative_for: Dict[str, str] = {}
    for idx, ans in enumerate(answers):
        raw_ans = "" if ans is None else str(ans)
        # Very simple normalization: strip spaces/backslashes and lowercase.
        clean_ans = raw_ans.replace(" ", "").replace("\\", "").lower()
        representative = representative_for.setdefault(clean_ans, raw_ans)
        normalized_groups.setdefault(representative, []).append(idx)
    return normalized_groups
def evaluate_consensus(agent_responses: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Score every agent response and pick the best-supported final answer.

    Implements the Adaptive Consensus scoring formula from the MVM2 paper:
    Score_j = 0.40 * V^{sym}_j + 0.35 * L^{logic}_j + 0.25 * C^{clf}_j

    Agents whose answers normalize to the same text are grouped; a group's
    score is the sum of its members' (rounded) Score_j values multiplied by
    a consistency bonus of +10% per additional agreeing agent.

    Args:
        agent_responses: One dict per agent. Each must contain an "agent"
            entry and a "response" dict; the response is read for the
            optional keys "Answer", "Reasoning Trace" and
            "Confidence Explanation". Missing or ``None`` values for those
            optional keys are treated as empty.

    Returns:
        A dict with:
          - "final_verified_answer": representative answer of the
            highest-scoring group, or "Error: Unresolvable Divergence" when
            there are no responses.
          - "winning_score": that group's weighted score (not rounded), or
            -1.0 when there are no responses.
          - "detail_scores": per-agent score breakdown, in input order.
          - "divergence_groups": per-group supporters and rounded score.

    Raises:
        KeyError: if an entry lacks the "response" or "agent" key.
    """
    scores = []

    # 1. Normalize final answers across agents.
    # dict.get's default only covers a *missing* key; a key that is present
    # with a None value would otherwise crash the string normalization.
    answers = []
    for agent_data in agent_responses:
        answer = agent_data["response"].get("Answer")
        answers.append("" if answer is None else str(answer))
    answer_groups = normalize_answers(answers)

    # 2. Evaluate individual agent execution paths.
    for agent_data in agent_responses:
        res = agent_data["response"]
        trace = res.get("Reasoning Trace") or []

        # V^{sym}_j : SymPy / QWED Logical Validation (weight 0.40)
        # NOTE(review): presumably a number in [0, 1] -- confirm against
        # verification_module.calculate_symbolic_score.
        v_sym = calculate_symbolic_score(trace)

        # L^{logic}_j : Trace density & semantic logical flow (weight 0.35)
        # Placeholder mapping: step count, saturating at 5 steps.
        l_logic = min(len(trace) / 5.0, 1.0)

        # C^{clf}_j : Classifier Confidence output (weight 0.25)
        # Placeholder mapping: keyword match on the confidence explanation.
        conf_exp = str(res.get("Confidence Explanation") or "").lower()
        c_clf = 1.0 if ("guaranteed" in conf_exp or "proof" in conf_exp) else 0.8

        # Core Neuro-Symbolic Scoring Formula
        score_j = (0.40 * v_sym) + (0.35 * l_logic) + (0.25 * c_clf)
        scores.append({
            "agent": agent_data["agent"],
            "raw_answer": res.get("Answer"),
            "V_sym": v_sym,
            "L_logic": round(l_logic, 2),
            "C_clf": round(c_clf, 2),
            "Score_j": round(score_j, 3)
        })

    # 3. Aggregate consensus by matching normalized answer groups.
    final_consensus = {}
    top_score = -1.0
    best_answer = "Error: Unresolvable Divergence"
    for rep_ans, indices in answer_groups.items():
        group_score = sum(scores[i]["Score_j"] for i in indices)
        # MVM2 applies a divergence consistency multiplier: if multiple
        # agents independently arrive at the same normalized answer, boost
        # the group's score.
        consistency_multiplier = 1.0 + (0.1 * (len(indices) - 1))
        weighted_group_score = group_score * consistency_multiplier
        # Strict comparison: on a tie the earliest group keeps the win.
        if weighted_group_score > top_score:
            top_score = weighted_group_score
            best_answer = rep_ans
        final_consensus[rep_ans] = {
            "agent_indices": indices,
            "agents_supporting": [scores[i]["agent"] for i in indices],
            "aggregate_score": round(weighted_group_score, 3)
        }

    return {
        "final_verified_answer": best_answer,
        "winning_score": top_score,
        "detail_scores": scores,
        "divergence_groups": final_consensus
    }