{ "reproducibility": { "seed": 42, "dataset": "fancyzhx/ag_news", "dataset_config": null, "train_split": "train", "eval_split": "test", "text_column": "text", "label_column": "label", "max_train_samples": 3000, "max_eval_samples": 600, "note": "Train and eval rows are the first N after shuffle(seed) of each split; see texts/eval-reproducibility.md." }, "metrics": { "accuracy": 0.538333, "macro_f1": 0.455368, "weighted_f1": 0.452694, "per_class_f1": { "World": 0.536585, "Sports": 0.730964, "Business": 0.0, "Sci/Tech": 0.553922 }, "confusion_matrix": [ [ 66, 67, 0, 15 ], [ 1, 144, 0, 2 ], [ 12, 15, 0, 125 ], [ 19, 21, 0, 113 ] ], "confusion_matrix_axis": "rows=true class, columns=predicted class", "label_order": [ "World", "Sports", "Business", "Sci/Tech" ] }, "dataset_quality": { "class_distribution": { "train": { "counts_by_label": { "World": 771, "Sports": 742, "Business": 691, "Sci/Tech": 796 }, "proportions_by_label": { "World": 0.257, "Sports": 0.247333, "Business": 0.230333, "Sci/Tech": 0.265333 }, "total": 3000 }, "eval": { "counts_by_label": { "World": 148, "Sports": 147, "Business": 152, "Sci/Tech": 153 }, "proportions_by_label": { "World": 0.246667, "Sports": 0.245, "Business": 0.253333, "Sci/Tech": 0.255 }, "total": 600 } } }, "error_analysis": { "top_confusions": [ { "true_label": "Business", "predicted_label": "Sci/Tech", "count": 125 }, { "true_label": "World", "predicted_label": "Sports", "count": 67 }, { "true_label": "Sci/Tech", "predicted_label": "Sports", "count": 21 }, { "true_label": "Sci/Tech", "predicted_label": "World", "count": 19 }, { "true_label": "World", "predicted_label": "Sci/Tech", "count": 15 }, { "true_label": "Business", "predicted_label": "Sports", "count": 15 }, { "true_label": "Business", "predicted_label": "World", "count": 12 }, { "true_label": "Sports", "predicted_label": "Sci/Tech", "count": 2 }, { "true_label": "Sports", "predicted_label": "World", "count": 1 } ] }, "calibration": { "max_prob_histogram": { "num_bins": 10, "bins": [ { "bin_low": 0.0, "bin_high": 0.1, "count": 0 }, { "bin_low": 0.1, "bin_high": 0.2, "count": 0 }, { "bin_low": 0.2, "bin_high": 0.3, "count": 1 }, { "bin_low": 0.3, "bin_high": 0.4, "count": 27 }, { "bin_low": 0.4, "bin_high": 0.5, "count": 156 }, { "bin_low": 0.5, "bin_high": 0.6, "count": 237 }, { "bin_low": 0.6, "bin_high": 0.7, "count": 171 }, { "bin_low": 0.7, "bin_high": 0.8, "count": 8 }, { "bin_low": 0.8, "bin_high": 0.9, "count": 0 }, { "bin_low": 0.9, "bin_high": 1.0, "count": 0 } ], "note": "Each eval example contributes one max softmax probability (winner class)." } }, "routing": { "fallback_behavior": "At inference, if the maximum softmax probability is below `min_confidence`, treat the prediction as low-confidence: route to human review, a secondary model, or a safe default class\u2014choose per product.", "min_confidence": null, "comment": "`min_confidence` is not set by training; typical starting range is 0.5\u20130.7 for routing. Tune on a validation set using `max_prob` histogram and error analysis." } }