| { | |
| "string_answer_scored": 1760, | |
| "numeric_answer_scored": 1760, | |
| "skipped_no_stage3": 0, | |
| "skipped_no_answer_span": 0, | |
| "skipped_missing_answer_text": 0, | |
| "skipped_missing_numeric_target": 14, | |
| "strict_exact_match_count": 1156, | |
| "normalized_string_match_count": 1176, | |
| "numeric_match_count": 1177, | |
| "multi_value_answer_count": 1196, | |
| "numeric_predicted_value_count": 2948, | |
| "numeric_target_value_count": 2950, | |
| "numeric_value_match_count": 2377, | |
| "multi_value_exact_set_match_count": 857, | |
| "multi_value_partial_match_count": 338, | |
| "multi_value_unmatched_count": 1, | |
| "answer_length_distribution": { | |
| "17-64": 1182, | |
| "0-4": 477, | |
| "5-16": 101 | |
| }, | |
| "failure_mode_normalized_match_but_not_exact": 20, | |
| "failure_mode_numeric_miss_but_string_match": 10, | |
| "skipped_ambiguous_numeric": 14, | |
| "numeric_abs_tolerance": 1e-06, | |
| "symbolic_eval_attempt_count": 80, | |
| "symbolic_eval_success_count": 6, | |
| "symbolic_eval_failure_count": 74, | |
| "symbolic_match_count": 0, | |
| "symbolic_answer_accuracy": 0.0, | |
| "numeric_multi_value_rule": "strict_set", | |
| "notes": "Answer metrics decode only tokens in answer_mask/final_answer_mask (answer span, excluding the literal 'Final Answer:' header). stage_3_token_accuracy still uses the full stage3_mask section. Symbolic equivalence is attempted only for expression-like answers; parse failures are counted explicitly." | |
| } |