recurrent-staged-loras-model / answer_eval_diagnostics.json

Publish run artifacts

780d17a verified about 1 month ago

1.41 kB

	{
	"string_answer_scored": 1760,
	"numeric_answer_scored": 1760,
	"skipped_no_stage3": 0,
	"skipped_no_answer_span": 0,
	"skipped_missing_answer_text": 0,
	"skipped_missing_numeric_target": 14,
	"strict_exact_match_count": 1156,
	"normalized_string_match_count": 1176,
	"numeric_match_count": 1177,
	"multi_value_answer_count": 1196,
	"numeric_predicted_value_count": 2948,
	"numeric_target_value_count": 2950,
	"numeric_value_match_count": 2377,
	"multi_value_exact_set_match_count": 857,
	"multi_value_partial_match_count": 338,
	"multi_value_unmatched_count": 1,
	"answer_length_distribution": {
	"17-64": 1182,
	"0-4": 477,
	"5-16": 101
	},
	"failure_mode_normalized_match_but_not_exact": 20,
	"failure_mode_numeric_miss_but_string_match": 10,
	"skipped_ambiguous_numeric": 14,
	"numeric_abs_tolerance": 1e-06,
	"symbolic_eval_attempt_count": 80,
	"symbolic_eval_success_count": 6,
	"symbolic_eval_failure_count": 74,
	"symbolic_match_count": 0,
	"symbolic_answer_accuracy": 0.0,
	"numeric_multi_value_rule": "strict_set",
	"notes": "Answer metrics decode only tokens in answer_mask/final_answer_mask (answer span, excluding the literal 'Final Answer:' header). stage_3_token_accuracy still uses the full stage3_mask section. Symbolic equivalence is attempted only for expression-like answers; parse failures are counted explicitly."
	}