{
  "reproducibility": {
    "seed": 42,
    "dataset": "fancyzhx/ag_news",
    "dataset_config": null,
    "train_split": "train",
    "eval_split": "test",
    "text_column": "text",
    "label_column": "label",
    "max_train_samples": 3000,
    "max_eval_samples": 600,
    "note": "Train and eval rows are the first N after shuffle(seed) of each split; see texts/eval-reproducibility.md."
  },
  "metrics": {
    "accuracy": 0.538333,
    "macro_f1": 0.455368,
    "weighted_f1": 0.452694,
    "per_class_f1": {
      "World": 0.536585,
      "Sports": 0.730964,
      "Business": 0.0,
      "Sci/Tech": 0.553922
    },
    "confusion_matrix": [
      [
        66,
        67,
        0,
        15
      ],
      [
        1,
        144,
        0,
        2
      ],
      [
        12,
        15,
        0,
        125
      ],
      [
        19,
        21,
        0,
        113
      ]
    ],
    "confusion_matrix_axis": "rows=true class, columns=predicted class",
    "label_order": [
      "World",
      "Sports",
      "Business",
      "Sci/Tech"
    ]
  },
  "dataset_quality": {
    "class_distribution": {
      "train": {
        "counts_by_label": {
          "World": 771,
          "Sports": 742,
          "Business": 691,
          "Sci/Tech": 796
        },
        "proportions_by_label": {
          "World": 0.257,
          "Sports": 0.247333,
          "Business": 0.230333,
          "Sci/Tech": 0.265333
        },
        "total": 3000
      },
      "eval": {
        "counts_by_label": {
          "World": 148,
          "Sports": 147,
          "Business": 152,
          "Sci/Tech": 153
        },
        "proportions_by_label": {
          "World": 0.246667,
          "Sports": 0.245,
          "Business": 0.253333,
          "Sci/Tech": 0.255
        },
        "total": 600
      }
    }
  },
  "error_analysis": {
    "top_confusions": [
      {
        "true_label": "Business",
        "predicted_label": "Sci/Tech",
        "count": 125
      },
      {
        "true_label": "World",
        "predicted_label": "Sports",
        "count": 67
      },
      {
        "true_label": "Sci/Tech",
        "predicted_label": "Sports",
        "count": 21
      },
      {
        "true_label": "Sci/Tech",
        "predicted_label": "World",
        "count": 19
      },
      {
        "true_label": "World",
        "predicted_label": "Sci/Tech",
        "count": 15
      },
      {
        "true_label": "Business",
        "predicted_label": "Sports",
        "count": 15
      },
      {
        "true_label": "Business",
        "predicted_label": "World",
        "count": 12
      },
      {
        "true_label": "Sports",
        "predicted_label": "Sci/Tech",
        "count": 2
      },
      {
        "true_label": "Sports",
        "predicted_label": "World",
        "count": 1
      }
    ]
  },
  "calibration": {
    "max_prob_histogram": {
      "num_bins": 10,
      "bins": [
        {
          "bin_low": 0.0,
          "bin_high": 0.1,
          "count": 0
        },
        {
          "bin_low": 0.1,
          "bin_high": 0.2,
          "count": 0
        },
        {
          "bin_low": 0.2,
          "bin_high": 0.3,
          "count": 1
        },
        {
          "bin_low": 0.3,
          "bin_high": 0.4,
          "count": 27
        },
        {
          "bin_low": 0.4,
          "bin_high": 0.5,
          "count": 156
        },
        {
          "bin_low": 0.5,
          "bin_high": 0.6,
          "count": 237
        },
        {
          "bin_low": 0.6,
          "bin_high": 0.7,
          "count": 171
        },
        {
          "bin_low": 0.7,
          "bin_high": 0.8,
          "count": 8
        },
        {
          "bin_low": 0.8,
          "bin_high": 0.9,
          "count": 0
        },
        {
          "bin_low": 0.9,
          "bin_high": 1.0,
          "count": 0
        }
      ],
      "note": "Each eval example contributes one max softmax probability (winner class)."
    }
  },
  "routing": {
    "fallback_behavior": "At inference, if the maximum softmax probability is below `min_confidence`, treat the prediction as low-confidence: route to human review, a secondary model, or a safe default class\u2014choose per product.",
    "min_confidence": null,
    "comment": "`min_confidence` is not set by training; typical starting range is 0.5\u20130.7 for routing. Tune on a validation set using `max_prob` histogram and error analysis."
  }
}