"""
Evaluate LLM knowledge tracing predictions against FKT benchmark tasks.

Tasks evaluated:
- Task 1 (FKT): Foundational Knowledge Tracing - predict whether the student
  answers the question correctly (question-level).
- Task 1 Variant 2: Cognitive Student Modeling - predict the student's actual
  response.

Usage:
    python evaluate_kt.py results.jsonl
"""

import argparse
import json
import math

from sklearn.metrics import roc_auc_score
|
|
|
|
def normalize_mcq_answer(answer_str: str) -> str:
    """
    Normalize an MCQ answer string for consistent comparison.

    Handles variations like:
    - 'C, A' -> 'A, C' (order normalization)
    - 'A,C'  -> 'A, C' (spacing normalization)
    - 'a, c' -> 'A, C' (case normalization)

    Args:
        answer_str: Answer string to normalize.

    Returns:
        Normalized answer string, or the original string if it is not in
        MCQ format.
    """
    # Split on commas, then strip whitespace and uppercase each part.
    parts = [p.strip().upper() for p in answer_str.split(',')]
    # Drop empty parts left by stray commas (e.g. 'A,,C').
    parts = [p for p in parts if p]
    # Treat the answer as MCQ only if every part is a single letter;
    # deduplicate and sort so option order does not affect the comparison.
    if parts and all(len(p) == 1 and p.isalpha() for p in parts):
        return ', '.join(sorted(set(parts)))
    return answer_str
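
# Illustrative behavior of the function above (examples, not benchmark data):
#   normalize_mcq_answer('C, A')  -> 'A, C'   (order)
#   normalize_mcq_answer('a,c')   -> 'A, C'   (case and spacing)
#   normalize_mcq_answer('A, A')  -> 'A'      (duplicates collapse via set())
#   normalize_mcq_answer('3.14')  -> '3.14'   (non-MCQ input returned unchanged)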
|
|
|
|
def numerical_match(answer1: str, answer2: str, atol: float = 0.01, rtol: float = 0.01) -> bool:
    """
    Check whether two answers are numerically close within tolerance.

    Uses math.isclose for a robust comparison that handles both absolute
    and relative tolerance.

    Args:
        answer1: First answer string.
        answer2: Second answer string.
        atol: Absolute tolerance (default: 0.01).
        rtol: Relative tolerance (default: 0.01).

    Returns:
        True if the answers are numerically close, False otherwise.
    """
    try:
        a = float(answer1.strip())
        b = float(answer2.strip())
        return math.isclose(a, b, abs_tol=atol, rel_tol=rtol)
    except (ValueError, AttributeError):
        # Non-numeric strings or non-string inputs cannot match numerically.
        return False
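
# Illustrative behavior: math.isclose passes when EITHER tolerance is met,
# i.e. |a - b| <= max(rtol * max(|a|, |b|), atol).
#   numerical_match('0.333', '0.3333')  -> True   (within tolerance)
#   numerical_match('100', '102')       -> False  (2 > max(1.02, 0.01))
#   numerical_match('abc', '1.0')       -> False  (non-numeric input)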
|
|
|
|
def answers_match(pred, actual):
    """Check whether a predicted answer matches the actual answer."""
    if pred is None or actual is None:
        return False

    pred_str = str(pred).strip()
    actual_str = str(actual).strip()

    # 1. Exact string match.
    if pred_str == actual_str:
        return True

    # 2. Match after MCQ normalization (order, spacing, case).
    pred_normalized = normalize_mcq_answer(pred_str)
    actual_normalized = normalize_mcq_answer(actual_str)
    if pred_normalized == actual_normalized:
        return True

    # 3. Fall back to a numeric comparison with tolerance.
    return numerical_match(pred_str, actual_str)
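
# Illustrative behavior of the three-stage comparison above:
#   answers_match('A,C', 'C, A')  -> True   (MCQ normalization)
#   answers_match('0.50', '0.5')  -> True   (numeric match; unequal as strings)
#   answers_match(None, 'A')      -> False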
|
|
|
|
def load_results(jsonl_path):
    """Load results from a JSONL file (one JSON object per line)."""
    results = []
    with open(jsonl_path, 'r') as f:
        for line in f:
            if line.strip():  # skip blank lines
                results.append(json.loads(line))
    return results
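
# Sketch of one input record, inferred from the fields evaluate() reads
# below; the values shown are illustrative, not taken from the benchmark:
#   {"actual_score": 1,                       # 1 = student answered correctly
#    "predicted_question_level": 1,           # Task 1 binary prediction
#    "predicted_student_answer": "A, C",      # Task 1 Variant 2 prediction
#    "actual_answer": "A, C",                 # student's actual response
#    "problem_type": "Multiple Choice (select all)",
#    "prediction_type": "correct"}            # ground-truth bucket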
|
|
|
|
def evaluate(results):
    """Compute evaluation metrics aligned with FKT benchmark tasks."""
    total = len(results)
    if total == 0:
        print("No results to evaluate.")
        return

    # Ground-truth class distribution.
    n_correct = sum(1 for r in results if r.get('actual_score') == 1)
    n_incorrect = total - n_correct

    # Task 1: keep only rows where both the label and the question-level
    # prediction are present.
    valid_q = [
        (r.get('actual_score'), r.get('predicted_question_level'))
        for r in results
        if r.get('actual_score') is not None
        and r.get('predicted_question_level') is not None
    ]

    if valid_q:
        y_true, y_pred = zip(*valid_q)
        question_correct = sum(1 for t, p in valid_q if t == p)
        question_acc = question_correct / len(valid_q)
        # Predictions are hard 0/1 labels, so this is AUC over binary scores;
        # roc_auc_score raises ValueError when y_true contains a single class.
        try:
            auc_roc = roc_auc_score(y_true, y_pred)
        except ValueError:
            auc_roc = None
    else:
        question_correct = 0
        question_acc = 0.0
        auc_roc = None

    # Task 1 Variant 2: exact / normalized / numeric answer matching.
    answer_correct = sum(
        1 for r in results
        if answers_match(r.get('predicted_student_answer'), r.get('actual_answer'))
    )

    # Baselines: a hard-coded prior baseline for Task 1, plus the
    # majority-class rate of this particular result set.
    prior_baseline = 0.615
    majority_baseline = max(n_correct, n_incorrect) / total

    print("=" * 60)
    print(f"Evaluation Results ({total} predictions)")
    print("=" * 60)
    print()
    print(f"Class distribution: {n_correct} correct, {n_incorrect} incorrect")
    print()

    print("Task 1: Foundational Knowledge Tracing (FKT) - Question-Level")
    print(f"  Accuracy: {question_correct}/{len(valid_q)} = {question_acc:.3f}")
    if auc_roc is not None:
        print(f"  AUC-ROC:  {auc_roc:.3f}")
    else:
        print("  AUC-ROC:  N/A (single class)")
    print(f"  Baselines: Prior={prior_baseline:.3f}, Majority={majority_baseline:.3f}")
    print()

    print("Task 1 Variant 2: Cognitive Student Modeling")
    print(f"  Overall Accuracy: {answer_correct}/{total} = {answer_correct / total:.3f}")

    # Per-problem-type breakdown, each further split by ground truth.
    problem_types = ['Multiple Choice (select 1)', 'Multiple Choice (select all)', 'Fill-in-the-blank(s)']
    has_problem_type = any(r.get('problem_type') for r in results)
    if has_problem_type:
        print("  By problem type:")
        for ptype in problem_types:
            subset = [r for r in results if r.get('problem_type') == ptype]
            if subset:
                n = len(subset)
                a_acc = sum(1 for r in subset
                            if answers_match(r.get('predicted_student_answer'), r.get('actual_answer'))) / n
                label = ptype.replace('Multiple Choice ', 'MC ')
                print(f"    {label:20s}: n={n:4d}, acc={a_acc:.3f}")
                for gt in ['correct', 'incorrect']:
                    gt_subset = [r for r in subset if r.get('prediction_type') == gt]
                    if gt_subset:
                        gt_n = len(gt_subset)
                        gt_acc = sum(1 for r in gt_subset
                                     if answers_match(r.get('predicted_student_answer'), r.get('actual_answer'))) / gt_n
                        print(f"      {gt:18s}: n={gt_n:4d}, acc={gt_acc:.3f}")
    print()

    # Accuracy split by whether the student actually answered correctly.
    print("By ground truth (prediction_type):")
    for ptype in ['correct', 'incorrect']:
        subset = [r for r in results if r.get('prediction_type') == ptype]
        if subset:
            n = len(subset)
            q_acc = sum(1 for r in subset
                        if r.get('predicted_question_level') == r.get('actual_score')) / n
            a_acc = sum(1 for r in subset
                        if answers_match(r.get('predicted_student_answer'), r.get('actual_answer'))) / n
            print(f"  {ptype:10s}: n={n:4d}, FKT_acc={q_acc:.3f}, cognitive_acc={a_acc:.3f}")
|
|
|
|
def main():
    parser = argparse.ArgumentParser(description="Evaluate LLM knowledge tracing predictions")
    parser.add_argument("jsonl_file", help="Path to JSONL results file")
    args = parser.parse_args()

    results = load_results(args.jsonl_file)
    evaluate(results)
|
|
|
|
if __name__ == "__main__":
    main()
|
|