File size: 2,683 Bytes
0584798
53d5d9f
 
 
0584798
1519226
0584798
 
53d5d9f
 
 
0584798
53d5d9f
 
0584798
 
53d5d9f
 
 
0584798
53d5d9f
 
0584798
 
53d5d9f
 
 
0584798
53d5d9f
 
0584798
 
53d5d9f
0584798
53d5d9f
1519226
0584798
 
53d5d9f
0584798
53d5d9f
 
 
 
 
0584798
53d5d9f
 
 
 
 
 
0584798
 
1519226
53d5d9f
1519226
53d5d9f
 
 
 
 
1519226
53d5d9f
 
 
 
 
 
1519226
0584798
53d5d9f
0584798
53d5d9f
 
 
 
 
0584798
53d5d9f
 
 
 
 
 
 
 
0584798
 
1519226
0584798
1519226
 
 
 
0584798
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
{
  "accepted_accuracy": 0.32,
  "accepted_coverage": 0.8013,
  "accuracy": 0.2564,
  "count": 156,
  "dataset_path": "/content/agentic-intent-classifier/data/iab_benchmark.jsonl",
  "difficulty_breakdown": {
    "easy": {
      "accepted_accuracy": 0.35,
      "accepted_coverage": 0.7692,
      "accuracy": 0.2692,
      "count": 52,
      "fallback_rate": 0.2308,
      "macro_f1": 0.153
    },
    "hard": {
      "accepted_accuracy": 0.275,
      "accepted_coverage": 0.7692,
      "accuracy": 0.2115,
      "count": 52,
      "fallback_rate": 0.2308,
      "macro_f1": 0.1108
    },
    "medium": {
      "accepted_accuracy": 0.3333,
      "accepted_coverage": 0.8654,
      "accuracy": 0.2885,
      "count": 52,
      "fallback_rate": 0.1346,
      "macro_f1": 0.1491
    }
  },
  "fallback_rate": 0.1987,
  "head": "iab_content",
  "macro_f1": 0.105,
  "primary_source": "supervised_classifier",
  "suite": "difficulty_benchmark",
  "tier_metrics": {
    "average_prediction_depth": 1.7564,
    "error_buckets": {
      "exact_match": 40,
      "parent_safe_stop": 11,
      "right_tier1_wrong_tier2": 58,
      "wrong_deep_leaf": 1,
      "wrong_tier1": 46
    },
    "exact_path_accuracy": 0.2564,
    "parent_safe_accuracy": 0.6218,
    "tier1_accuracy": 0.7051,
    "tier2_accuracy": 0.3333,
    "tier3_accuracy": 0.2315,
    "tier4_accuracy": 0.0
  },
  "view_metrics": {
    "classifier": {
      "average_prediction_depth": 1.7564,
      "error_buckets": {
        "exact_match": 40,
        "parent_safe_stop": 11,
        "right_tier1_wrong_tier2": 58,
        "wrong_deep_leaf": 1,
        "wrong_tier1": 46
      },
      "exact_path_accuracy": 0.2564,
      "parent_safe_accuracy": 0.6218,
      "tier1_accuracy": 0.7051,
      "tier2_accuracy": 0.3333,
      "tier3_accuracy": 0.2315,
      "tier4_accuracy": 0.0
    },
    "combined_path": {
      "average_prediction_depth": 1.7564,
      "error_buckets": {
        "exact_match": 40,
        "parent_safe_stop": 11,
        "right_tier1_wrong_tier2": 58,
        "wrong_deep_leaf": 1,
        "wrong_tier1": 46
      },
      "exact_path_accuracy": 0.2564,
      "fallback_overuse_count": 13,
      "fallback_rate": 0.0833,
      "parent_safe_accuracy": 0.6218,
      "tier1_accuracy": 0.7051,
      "tier2_accuracy": 0.3333,
      "tier3_accuracy": 0.2315,
      "tier4_accuracy": 0.0
    },
    "disagreements": {
      "classifier_vs_combined": 0
    },
    "shadow_embedding_retrieval": {
      "hint": "Set IAB_EVAL_INCLUDE_SHADOW_RETRIEVAL=1 to run shadow embedding retrieval (downloads/loads gte-Qwen2 when index is present).",
      "reason": "disabled_by_default",
      "skipped": true
    }
  }
}