""" Calculates McNemar's test for significance between two models, using the stated binarization threshold. """ import sys import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from modules.reasoning_engine import ReasoningEngine THRESHOLD = ReasoningEngine.MCNEMAR_BINARIZATION_THRESHOLD def mcnemars_test(scores_model_A: list, scores_model_B: list): """ Computes McNemar's test p-value for paired nominal data. scores are lists of float faithfulness scores. """ if len(scores_model_A) != len(scores_model_B): raise ValueError("Must have same number of scores") # Binarize bin_A = [1 if s >= THRESHOLD else 0 for s in scores_model_A] bin_B = [1 if s >= THRESHOLD else 0 for s in scores_model_B] # Contingency table # B correct | B wrong # A correct | a | b # A wrong | c | d a, b, c, d = 0, 0, 0, 0 for a_val, b_val in zip(bin_A, bin_B): if a_val == 1 and b_val == 1: a += 1 elif a_val == 1 and b_val == 0: b += 1 elif a_val == 0 and b_val == 1: c += 1 else: d += 1 # Chi-square statistic: (b - c)^2 / (b + c) if b + c == 0: print("Models are identical given the threshold.") return 1.0 # No difference chi_square = ((abs(b - c) - 1)**2) / (b + c) # with continuity correction print(f"McNemar's Test Results:") print(f"Binarization Threshold: {THRESHOLD}") print(f"Contingency Table: a={a}, b={b}, c={c}, d={d}") print(f"Chi-square: {chi_square:.3f}") try: from scipy.stats import chi2 p_value = 1 - chi2.cdf(chi_square, 1) print(f"p-value: {p_value:.4f}") return p_value except ImportError: print("Note: Install scipy ('pip install scipy') to automatically calculate the p-value.") return chi_square if __name__ == "__main__": # Mock data scores_mexar = [0.8, 0.9, 0.4, 0.7, 0.65, 0.8] scores_baseline = [0.5, 0.7, 0.6, 0.4, 0.55, 0.8] mcnemars_test(scores_mexar, scores_baseline)