TinyModel1 / eval_report.json
anriltine's picture
Deploy TinyModel1 from GitHub Actions
b717780 verified
Raw
History Blame Contribute Delete
4.48 kB
{
"reproducibility": {
"seed": 42,
"dataset": "fancyzhx/ag_news",
"dataset_config": null,
"train_split": "train",
"eval_split": "test",
"text_column": "text",
"label_column": "label",
"max_train_samples": 3000,
"max_eval_samples": 600,
"note": "Train and eval rows are the first N after shuffle(seed) of each split; see texts/eval-reproducibility.md."
},
"metrics": {
"accuracy": 0.538333,
"macro_f1": 0.455368,
"weighted_f1": 0.452694,
"per_class_f1": {
"World": 0.536585,
"Sports": 0.730964,
"Business": 0.0,
"Sci/Tech": 0.553922
},
"confusion_matrix": [
[
66,
67,
0,
15
],
[
1,
144,
0,
2
],
[
12,
15,
0,
125
],
[
19,
21,
0,
113
]
],
"confusion_matrix_axis": "rows=true class, columns=predicted class",
"label_order": [
"World",
"Sports",
"Business",
"Sci/Tech"
]
},
"dataset_quality": {
"class_distribution": {
"train": {
"counts_by_label": {
"World": 771,
"Sports": 742,
"Business": 691,
"Sci/Tech": 796
},
"proportions_by_label": {
"World": 0.257,
"Sports": 0.247333,
"Business": 0.230333,
"Sci/Tech": 0.265333
},
"total": 3000
},
"eval": {
"counts_by_label": {
"World": 148,
"Sports": 147,
"Business": 152,
"Sci/Tech": 153
},
"proportions_by_label": {
"World": 0.246667,
"Sports": 0.245,
"Business": 0.253333,
"Sci/Tech": 0.255
},
"total": 600
}
}
},
"error_analysis": {
"top_confusions": [
{
"true_label": "Business",
"predicted_label": "Sci/Tech",
"count": 125
},
{
"true_label": "World",
"predicted_label": "Sports",
"count": 67
},
{
"true_label": "Sci/Tech",
"predicted_label": "Sports",
"count": 21
},
{
"true_label": "Sci/Tech",
"predicted_label": "World",
"count": 19
},
{
"true_label": "World",
"predicted_label": "Sci/Tech",
"count": 15
},
{
"true_label": "Business",
"predicted_label": "Sports",
"count": 15
},
{
"true_label": "Business",
"predicted_label": "World",
"count": 12
},
{
"true_label": "Sports",
"predicted_label": "Sci/Tech",
"count": 2
},
{
"true_label": "Sports",
"predicted_label": "World",
"count": 1
}
]
},
"calibration": {
"max_prob_histogram": {
"num_bins": 10,
"bins": [
{
"bin_low": 0.0,
"bin_high": 0.1,
"count": 0
},
{
"bin_low": 0.1,
"bin_high": 0.2,
"count": 0
},
{
"bin_low": 0.2,
"bin_high": 0.3,
"count": 1
},
{
"bin_low": 0.3,
"bin_high": 0.4,
"count": 27
},
{
"bin_low": 0.4,
"bin_high": 0.5,
"count": 156
},
{
"bin_low": 0.5,
"bin_high": 0.6,
"count": 237
},
{
"bin_low": 0.6,
"bin_high": 0.7,
"count": 171
},
{
"bin_low": 0.7,
"bin_high": 0.8,
"count": 8
},
{
"bin_low": 0.8,
"bin_high": 0.9,
"count": 0
},
{
"bin_low": 0.9,
"bin_high": 1.0,
"count": 0
}
],
"note": "Each eval example contributes one max softmax probability (winner class)."
}
},
"routing": {
"fallback_behavior": "At inference, if the maximum softmax probability is below `min_confidence`, treat the prediction as low-confidence: route to human review, a secondary model, or a safe default class\u2014choose per product.",
"min_confidence": null,
"comment": "`min_confidence` is not set by training; typical starting range is 0.5\u20130.7 for routing. Tune on a validation set using `max_prob` histogram and error analysis."
}
}