recurrent-staged-loras-model / answer_eval_diagnostics.json
jeffreywallphd's picture
Publish run artifacts
780d17a verified
{
"string_answer_scored": 1760,
"numeric_answer_scored": 1760,
"skipped_no_stage3": 0,
"skipped_no_answer_span": 0,
"skipped_missing_answer_text": 0,
"skipped_missing_numeric_target": 14,
"strict_exact_match_count": 1156,
"normalized_string_match_count": 1176,
"numeric_match_count": 1177,
"multi_value_answer_count": 1196,
"numeric_predicted_value_count": 2948,
"numeric_target_value_count": 2950,
"numeric_value_match_count": 2377,
"multi_value_exact_set_match_count": 857,
"multi_value_partial_match_count": 338,
"multi_value_unmatched_count": 1,
"answer_length_distribution": {
"17-64": 1182,
"0-4": 477,
"5-16": 101
},
"failure_mode_normalized_match_but_not_exact": 20,
"failure_mode_numeric_miss_but_string_match": 10,
"skipped_ambiguous_numeric": 14,
"numeric_abs_tolerance": 1e-06,
"symbolic_eval_attempt_count": 80,
"symbolic_eval_success_count": 6,
"symbolic_eval_failure_count": 74,
"symbolic_match_count": 0,
"symbolic_answer_accuracy": 0.0,
"numeric_multi_value_rule": "strict_set",
"notes": "Answer metrics decode only tokens in answer_mask/final_answer_mask (answer span, excluding the literal 'Final Answer:' header). stage_3_token_accuracy still uses the full stage3_mask section. Symbolic equivalence is attempted only for expression-like answers; parse failures are counted explicitly."
}