{ "string_answer_scored": 1760, "numeric_answer_scored": 1760, "skipped_no_stage3": 0, "skipped_no_answer_span": 0, "skipped_missing_answer_text": 0, "skipped_missing_numeric_target": 14, "strict_exact_match_count": 1156, "normalized_string_match_count": 1176, "numeric_match_count": 1177, "multi_value_answer_count": 1196, "numeric_predicted_value_count": 2948, "numeric_target_value_count": 2950, "numeric_value_match_count": 2377, "multi_value_exact_set_match_count": 857, "multi_value_partial_match_count": 338, "multi_value_unmatched_count": 1, "answer_length_distribution": { "17-64": 1182, "0-4": 477, "5-16": 101 }, "failure_mode_normalized_match_but_not_exact": 20, "failure_mode_numeric_miss_but_string_match": 10, "skipped_ambiguous_numeric": 14, "numeric_abs_tolerance": 1e-06, "symbolic_eval_attempt_count": 80, "symbolic_eval_success_count": 6, "symbolic_eval_failure_count": 74, "symbolic_match_count": 0, "symbolic_answer_accuracy": 0.0, "numeric_multi_value_rule": "strict_set", "notes": "Answer metrics decode only tokens in answer_mask/final_answer_mask (answer span, excluding the literal 'Final Answer:' header). stage_3_token_accuracy still uses the full stage3_mask section. Symbolic equivalence is attempted only for expression-like answers; parse failures are counted explicitly." }