File size: 1,410 Bytes
780d17a | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 | {
"string_answer_scored": 1760,
"numeric_answer_scored": 1760,
"skipped_no_stage3": 0,
"skipped_no_answer_span": 0,
"skipped_missing_answer_text": 0,
"skipped_missing_numeric_target": 14,
"strict_exact_match_count": 1156,
"normalized_string_match_count": 1176,
"numeric_match_count": 1177,
"multi_value_answer_count": 1196,
"numeric_predicted_value_count": 2948,
"numeric_target_value_count": 2950,
"numeric_value_match_count": 2377,
"multi_value_exact_set_match_count": 857,
"multi_value_partial_match_count": 338,
"multi_value_unmatched_count": 1,
"answer_length_distribution": {
"17-64": 1182,
"0-4": 477,
"5-16": 101
},
"failure_mode_normalized_match_but_not_exact": 20,
"failure_mode_numeric_miss_but_string_match": 10,
"skipped_ambiguous_numeric": 14,
"numeric_abs_tolerance": 1e-06,
"symbolic_eval_attempt_count": 80,
"symbolic_eval_success_count": 6,
"symbolic_eval_failure_count": 74,
"symbolic_match_count": 0,
"symbolic_answer_accuracy": 0.0,
"numeric_multi_value_rule": "strict_set",
"notes": "Answer metrics decode only tokens in answer_mask/final_answer_mask (answer span, excluding the literal 'Final Answer:' header). stage_3_token_accuracy still uses the full stage3_mask section. Symbolic equivalence is attempted only for expression-like answers; parse failures are counted explicitly."
} |