diff --git a/README.md b/README.md index 62d78c4435b2a05beac8d5400752cd3e5c655b0e..2954c57cb3712d16b125a77c8b116c6c53c35163 100644 --- a/README.md +++ b/README.md @@ -1,57 +1,10 @@ ---- -language: -- en -library_name: transformers -pipeline_tag: text-classification -base_model: distilbert-base-uncased -metrics: -- accuracy -- f1 -tags: -- intent-classification -- multitask -- iab -- conversational-ai -- adtech -- calibrated-confidence -license: apache-2.0 ---- - -# admesh/agentic-intent-classifier - -Production-ready intent + IAB classifier bundle for conversational traffic. - -Combines multitask intent modeling, supervised IAB content classification, and per-head confidence calibration to support safe monetization decisions in real time. - -## Links - -- Hugging Face: https://huggingface.co/admesh/agentic-intent-classifier -- GitHub: https://github.com/GouniManikumar12/agentic-intent-classifier +# Agentic Intent Classifier -## What It Predicts +`agentic-intent-classifier` is a multi-head query classification stack for conversational traffic. -| Field | Description | -|---|---| -| `intent.type` | `commercial`, `informational`, `navigational`, `transactional`, … | -| `intent.subtype` | `product_discovery`, `comparison`, `how_to`, … | -| `intent.decision_phase` | `awareness`, `consideration`, `decision`, … | -| `iab_content` | IAB Content Taxonomy 3.0 tier1 / tier2 / tier3 labels | -| `component_confidence` | Per-head calibrated confidence with threshold flags | -| `system_decision` | Monetization eligibility, opportunity type, policy | - ---- - -## Deployment Options - -### 0. 
Colab / Kaggle Quickstart (copy/paste) - -```python -!pip -q install -U pip -!pip -q install -U "torch==2.10.0" "torchvision==0.25.0" "torchaudio==2.10.0" -!pip -q install -U "transformers>=4.36.0" "huggingface_hub>=0.20.0" "safetensors>=0.4.0" -``` +## Quickstart (recommended): run from Hugging Face Hub -Restart the runtime after installs (**Runtime → Restart runtime**) so the new Torch version is actually used. +This is the easiest way for developers to test the full production stack (multitask intent + IAB + calibration) without training locally. ```python from transformers import pipeline @@ -59,212 +12,539 @@ from transformers import pipeline clf = pipeline( "admesh-intent", model="admesh/agentic-intent-classifier", - trust_remote_code=True, # required (custom pipeline + multi-model bundle) + trust_remote_code=True, ) out = clf("Which laptop should I buy for college?") -print(out["meta"]) print(out["model_output"]["classification"]["intent"]) +print(out["model_output"]["classification"]["iab_content"]) +print(out["meta"]) ``` ---- +If you’re running in Colab/Kaggle and see Torch version conflicts, follow `COLAB_SETUP.md`. + +## Latency / inference timing (developer quick check) -## Latency / inference timing (quick check) +The first call includes model/code loading; measure latency after a warm-up call. -The first call includes model/code loading. Warm up once, then measure: +Single query: ```python import time +from transformers import pipeline + +clf = pipeline("admesh-intent", model="admesh/agentic-intent-classifier", trust_remote_code=True) q = "Which laptop should I buy for college?" _ = clf("warm up") t0 = time.perf_counter() out = clf(q) -print(f"latency_ms={(time.perf_counter() - t0) * 1000:.1f}") +dt_ms = (time.perf_counter() - t0) * 1000 + +print(f"latency_ms={dt_ms:.1f}") +print(out["model_output"]["classification"]["intent"]) ``` -### 1. 
`transformers.pipeline()` — anywhere (Python) +Warm p50 / p95 over 20 runs: ```python -from transformers import pipeline +import time, statistics -clf = pipeline( - "admesh-intent", - model="admesh/agentic-intent-classifier", - trust_remote_code=True, +times = [] +for _ in range(20): + t0 = time.perf_counter() + _ = clf(q) + times.append((time.perf_counter() - t0) * 1000) + +times_sorted = sorted(times) +print(f"p50={statistics.median(times):.1f}ms p95={times_sorted[int(0.95*len(times))-1]:.1f}ms mean={statistics.mean(times):.1f}ms") +``` + +It currently produces: + +- `intent.type` +- `intent.subtype` +- `intent.decision_phase` +- `iab_content` +- calibrated confidence per head +- combined fallback / policy / opportunity decisions + +The repo is beyond the original v0.1 baseline. It now includes: + +- shared config and label ownership +- reusable model runtime +- calibrated confidence and threshold gating +- combined inference with fallback/policy logic +- request/response validation in the demo API +- repeatable evaluation and regression suites +- full-TSV IAB taxonomy retrieval support through tier4 +- a local embedding index for taxonomy-node retrieval over IAB content paths +- a separate synthetic full-intent-taxonomy augmentation dataset for non-IAB heads +- a dedicated intent-type difficulty dataset and held-out benchmark with `easy`, `medium`, and `hard` cases +- a dedicated decision-phase difficulty dataset and held-out benchmark with `easy`, `medium`, and `hard` cases + +Generated model weights are intentionally not committed. 
+ +## Current Taxonomy + +### `intent.type` + +- `informational` +- `exploratory` +- `commercial` +- `transactional` +- `support` +- `personal_reflection` +- `creative_generation` +- `chit_chat` +- `ambiguous` +- `prohibited` + +### `intent.decision_phase` + +- `awareness` +- `research` +- `consideration` +- `decision` +- `action` +- `post_purchase` +- `support` + +### `intent.subtype` + +- `education` +- `product_discovery` +- `comparison` +- `evaluation` +- `deal_seeking` +- `provider_selection` +- `signup` +- `purchase` +- `booking` +- `download` +- `contact_sales` +- `task_execution` +- `onboarding_setup` +- `troubleshooting` +- `account_help` +- `billing_help` +- `follow_up` +- `emotional_reflection` + +### `iab_content` + +- candidates are derived from every row in [data/iab-content/Content Taxonomy 3.0.tsv](data/iab-content/Content%20Taxonomy%203.0.tsv) +- retrieval output supports `tier1`, `tier2`, `tier3`, and optional `tier4` + +## What The System Does + +- runs three classifier heads: + - `intent_type` + - `intent_subtype` + - `decision_phase` +- resolves `iab_content` through a local embedding index over taxonomy nodes plus generic label/path reranking +- applies calibration artifacts when present +- computes `commercial_score` +- applies fallback when confidence is too weak or policy-safe blocking is required +- emits a schema-validated combined envelope + +## What The System Does Not Do + +- it is not a multi-turn memory system +- it is not a production-optimized low-latency serving path +- it is not yet trained on large real-traffic human-labeled intent data +- combined decision logic is still heuristic, even though it is materially stronger than the original baseline + +## Project Layout + +- [config.py](config.py): labels, thresholds, artifact paths, model paths +- [model_runtime.py](model_runtime.py): 
shared calibrated inference runtime +- [combined_inference.py](combined_inference.py): composed system response +- [inference_intent_type.py](inference_intent_type.py): direct `intent_type` inference entrypoint +- [inference_iab_classifier.py](inference_iab_classifier.py): direct supervised `iab_content` inference entrypoint +- [schemas.py](schemas.py): request/response validation +- [demo_api.py](demo_api.py): local validated API +- [iab_taxonomy.py](iab_taxonomy.py): full taxonomy parser/index +- [iab_classifier.py](iab_classifier.py): supervised IAB runtime with taxonomy-aware parent fallback +- [iab_retrieval.py](iab_retrieval.py): optional shadow retrieval baseline +- [training/build_full_intent_taxonomy_dataset.py](training/build_full_intent_taxonomy_dataset.py): separate synthetic intent augmentation dataset +- [training/build_intent_type_difficulty_dataset.py](training/build_intent_type_difficulty_dataset.py): extra `intent_type` augmentation plus held-out difficulty benchmark +- [training/build_decision_phase_difficulty_dataset.py](training/build_decision_phase_difficulty_dataset.py): extra `decision_phase` augmentation plus held-out difficulty benchmark +- 
[training/build_subtype_difficulty_dataset.py](training/build_subtype_difficulty_dataset.py): extra `intent_subtype` augmentation plus held-out difficulty benchmark +- [training/build_subtype_dataset.py](training/build_subtype_dataset.py): subtype dataset generation from existing corpora +- [training/train_iab.py](training/train_iab.py): train the supervised IAB classifier head +- [training/build_iab_taxonomy_embeddings.py](training/build_iab_taxonomy_embeddings.py): build local IAB node embedding artifacts +- [training/run_full_training_pipeline.py](training/run_full_training_pipeline.py): full multi-head training/calibration/eval pipeline +- [evaluation/run_evaluation.py](evaluation/run_evaluation.py): repeatable benchmark runner +- [evaluation/run_regression_suite.py](evaluation/run_regression_suite.py): known-failure regression runner +- [evaluation/run_iab_mapping_suite.py](evaluation/run_iab_mapping_suite.py): IAB behavior-lock regression runner +- [evaluation/run_iab_quality_suite.py](evaluation/run_iab_quality_suite.py): curated IAB quality-target runner +- [known_limitations.md](known_limitations.md): current gaps and caveats + +## Quickstart: Run From Hugging Face + +Download the trained bundle and run inference in three lines — no local training 
required. + +```python +import sys +from huggingface_hub import snapshot_download + +# Download the full bundle (models + calibration + code) +local_dir = snapshot_download( + repo_id="admesh/agentic-intent-classifier", + repo_type="model", ) +sys.path.insert(0, local_dir) + +# Import and instantiate +from pipeline import AdmeshIntentPipeline +clf = AdmeshIntentPipeline() +# Classify +import json result = clf("Which laptop should I buy for college?") +print(json.dumps(result, indent=2)) +``` + +Or use the one-liner factory method: + +```python +from pipeline import AdmeshIntentPipeline # after sys.path.insert above + +clf = AdmeshIntentPipeline.from_pretrained("admesh/agentic-intent-classifier") +result = clf("I need a CRM for a 5-person startup") ``` -Batch and custom thresholds: +Batch mode and custom thresholds are also supported: ```python -# batch +# Batch results = clf([ "Best running shoes under $100", - "How does TCP work?", + "How does gradient descent work?", "Buy noise-cancelling headphones", ]) -# custom confidence thresholds +# Custom confidence thresholds result = clf( - "Buy headphones", + "Buy noise-cancelling headphones", threshold_overrides={"intent_type": 0.6, "intent_subtype": 0.35}, ) ``` +Verify artifacts and run a smoke test from the CLI: + +```bash +cd "/path/to/local_dir" +python3 training/pipeline_verify.py +python3 combined_inference.py "Which CRM should I buy for a 3-person startup?" +``` + +Pin a specific revision for reproducibility: + +```python +local_dir = snapshot_download( + repo_id="admesh/agentic-intent-classifier", + repo_type="model", + revision="0584798f8efee6beccd778b0afa06782ab5add60", +) +``` + --- -### 2. HF Inference Endpoints (managed, deploy to AWS / Azure / GCP) +## Setup (for local training) -1. Go to https://ui.endpoints.huggingface.co -2. **New Endpoint** → select `admesh/agentic-intent-classifier` -3. Framework: **PyTorch** — Task: **Text Classification** -4. Enable **"Load with trust_remote_code"** -5. 
Deploy +```bash +python3 -m venv .venv +source .venv/bin/activate +pip install -r agentic-intent-classifier/requirements.txt +``` + +## Inference (local training path) -The endpoint serves the same `pipeline()` interface above via REST: +Run one query locally: ```bash -curl https://.endpoints.huggingface.cloud \ - -H "Authorization: Bearer $HF_TOKEN" \ - -H "Content-Type: application/json" \ - -d '{"inputs": "Which laptop should I buy for college?"}' +cd agentic-intent-classifier +python3 training/train_iab.py +python3 training/calibrate_confidence.py --head iab_content +python3 combined_inference.py "Which CRM should I buy for a 3-person startup?" ``` ---- +Run only the `intent_type` head: -### 3. HF Spaces (Gradio / Streamlit demo) +```bash +cd agentic-intent-classifier +python3 inference_intent_type.py "best shoes under 100" +``` -```python -# app.py for a Gradio Space -import gradio as gr -from transformers import pipeline +Run the demo API: -clf = pipeline( - "admesh-intent", - model="admesh/agentic-intent-classifier", - trust_remote_code=True, -) +```bash +cd agentic-intent-classifier +python3 demo_api.py +``` -def classify(text): - return clf(text) +Example request: -gr.Interface(fn=classify, inputs="text", outputs="json").launch() +```bash +curl -sS -X POST http://127.0.0.1:8008/classify \ + -H 'Content-Type: application/json' \ + -d '{"text":"I cannot log into my account"}' ``` ---- +Infra endpoints: -### 4. 
Local / notebook via `snapshot_download` +```bash +curl -sS http://127.0.0.1:8008/health +curl -sS http://127.0.0.1:8008/version +``` -```python -import sys -from huggingface_hub import snapshot_download +Train only the IAB classifier head: -local_dir = snapshot_download( - repo_id="admesh/agentic-intent-classifier", - repo_type="model", -) -sys.path.insert(0, local_dir) +```bash +cd agentic-intent-classifier +python3 training/train_iab.py +python3 training/calibrate_confidence.py --head iab_content +``` -from pipeline import AdmeshIntentPipeline -clf = AdmeshIntentPipeline() -result = clf("I need a CRM for a 5-person startup") +The online `iab_content` path now uses the compact supervised classifier. Retrieval is still available as an optional shadow baseline. + +Build the optional retrieval shadow index: + +```bash +cd agentic-intent-classifier +python3 training/build_iab_taxonomy_embeddings.py ``` -Or the one-liner factory: +By default the shadow retrieval path uses `Alibaba-NLP/gte-Qwen2-1.5B-instruct`. The retrieval runtime applies the model's query-side instruction format and last-token pooling, matching the Hugging Face usage guidance. If you want to point retrieval at a different embedding model, set `IAB_RETRIEVAL_MODEL_NAME_OVERRIDE` before building the index. 
-```python -from pipeline import AdmeshIntentPipeline -clf = AdmeshIntentPipeline.from_pretrained("admesh/agentic-intent-classifier") +Open-source users can swap in their own embedding model, but the contract is: + +- query embeddings and taxonomy-node embeddings must be produced by the same model and model revision +- after changing models, you must rebuild `artifacts/iab/taxonomy_embeddings.pt` +- the repository only tests and supports the default model path out of the box +- not every Hugging Face embedding model is drop-in compatible with this runtime; some require custom pooling, query instructions, or `trust_remote_code` + +Example override: + +```bash +cd agentic-intent-classifier +export IAB_RETRIEVAL_MODEL_NAME_OVERRIDE=mixedbread-ai/mxbai-embed-large-v1 +python3 training/build_iab_taxonomy_embeddings.py ``` ---- +This writes: + +- `artifacts/iab/taxonomy_nodes.json` +- `artifacts/iab/taxonomy_embeddings.pt` + +## Training + +### Full local pipeline + +```bash +cd agentic-intent-classifier +python3 training/run_full_training_pipeline.py +``` + +This pipeline now does: + +1. build separate full-intent-taxonomy augmentation data +2. build separate `intent_type` difficulty augmentation + benchmark +3. train `intent_type` +4. build subtype corpus +5. build separate `intent_subtype` difficulty augmentation + benchmark +6. train `intent_subtype` +7. build separate `decision_phase` difficulty augmentation + benchmark +8. train `decision_phase` +9. train `iab_content` +10. calibrate all classifier heads, including `iab_content` +11. 
run regression/evaluation unless `--skip-full-eval` is used + +### Build datasets individually + +Separate full-intent augmentation: + +```bash +cd agentic-intent-classifier +python3 training/build_full_intent_taxonomy_dataset.py +``` + +Intent-type difficulty augmentation and benchmark: + +```bash +cd agentic-intent-classifier +python3 training/build_intent_type_difficulty_dataset.py +``` + +Decision-phase difficulty augmentation and benchmark: + +```bash +cd agentic-intent-classifier +python3 training/build_decision_phase_difficulty_dataset.py +``` + +Subtype difficulty augmentation and benchmark: + +```bash +cd agentic-intent-classifier +python3 training/build_subtype_difficulty_dataset.py +``` + +Subtype dataset: + +```bash +cd agentic-intent-classifier +python3 training/build_subtype_dataset.py +``` + +IAB embedding index: + +```bash +cd agentic-intent-classifier +python3 training/build_iab_taxonomy_embeddings.py +``` + +### Train heads individually + +```bash +cd agentic-intent-classifier +python3 training/train.py +python3 training/train_subtype.py +python3 training/train_decision_phase.py +``` + +### Calibration + +```bash +cd agentic-intent-classifier +python3 training/calibrate_confidence.py --head intent_type +python3 training/calibrate_confidence.py --head intent_subtype +python3 training/calibrate_confidence.py --head decision_phase +``` + +## Evaluation + +Full evaluation: + +```bash +cd agentic-intent-classifier +python3 evaluation/run_evaluation.py +``` -## Troubleshooting (avoid environment errors) +Known-failure regression: -### `No module named 'combined_inference'` (or similar) +```bash +cd agentic-intent-classifier +python3 evaluation/run_regression_suite.py +``` -This means the Hub repo root is missing required Python files. 
Ensure these exist at the **root of the model repo** (same level as `pipeline.py`): +IAB behavior-lock regression: + +```bash +cd agentic-intent-classifier +python3 evaluation/run_iab_mapping_suite.py +``` -- `pipeline.py`, `config.json`, `config.py` -- `combined_inference.py`, `schemas.py` -- `model_runtime.py`, `multitask_runtime.py`, `multitask_model.py` -- `inference_intent_type.py`, `inference_subtype.py`, `inference_decision_phase.py`, `inference_iab_classifier.py` -- `iab_classifier.py`, `iab_taxonomy.py` +IAB quality-target evaluation: -### `does not appear to have a file named model.safetensors` +```bash +cd agentic-intent-classifier +python3 evaluation/run_iab_quality_suite.py +``` + +Threshold sweeps: + +```bash +cd agentic-intent-classifier +python3 evaluation/sweep_intent_threshold.py +``` -Transformers requires a standard checkpoint at the repo root for `pipeline()` to initialize. This repo includes a **small dummy** `model.safetensors` + tokenizer files at the root for compatibility; the *real* production weights live in: +Artifacts are written to: -- `multitask_intent_model_output/` -- `iab_classifier_model_output/` - `artifacts/calibration/` +- `artifacts/evaluation/latest/` ---- +## Google Colab -## Example Output - -```json -{ - "model_output": { - "classification": { - "iab_content": { - "taxonomy": "IAB Content Taxonomy", - "taxonomy_version": "3.0", - "tier1": {"id": "552", "label": "Style & Fashion"}, - "tier2": {"id": "579", "label": "Men's Fashion"}, - "mapping_mode": "exact", - "mapping_confidence": 0.73 - }, - "intent": { - "type": "commercial", - "subtype": "product_discovery", - "decision_phase": "consideration", - "confidence": 0.9549, - "commercial_score": 0.656 - } - } - }, - "system_decision": { - "policy": { - "monetization_eligibility": "allowed_with_caution", - "eligibility_reason": "commercial_discovery_signal_present" - }, - "opportunity": {"type": "soft_recommendation", "strength": "medium"} - }, - "meta": { - 
"system_version": "0.6.0-phase4", - "calibration_enabled": true, - "iab_mapping_is_placeholder": false - } -} -``` - -## Reproducible Revision +Use Colab for the full retraining pass if local memory is limited. -```python -from huggingface_hub import snapshot_download -local_dir = snapshot_download( - repo_id="admesh/agentic-intent-classifier", - repo_type="model", - revision="0584798f8efee6beccd778b0afa06782ab5add60", -) +Clone once: + +```bash +%cd /content +!git clone https://github.com/GouniManikumar12/agentic-intent-classifier.git +%cd /content/agentic-intent-classifier +``` + +If the repo is already cloned and you want the latest code, pull manually: + +```bash +!git pull origin main ``` -## Included Artifacts +Full pipeline: + +```bash +!python training/run_full_training_pipeline.py +``` + +If full evaluation is too heavy for the current Colab runtime: + +```bash +!python training/run_full_training_pipeline.py \ + --iab-embedding-batch-size 32 \ + --skip-full-eval +``` + +Then run eval separately after training: + +```bash +!python evaluation/run_regression_suite.py +!python evaluation/run_iab_mapping_suite.py +!python evaluation/run_iab_quality_suite.py +!python evaluation/run_evaluation.py +``` + +## Current Saved Metrics + +Generate fresh metrics with: + +```bash +cd agentic-intent-classifier +python3 evaluation/run_evaluation.py +``` + +Do not treat any checked-in summary as canonical unless it was regenerated after the current code and artifacts were built. The online `iab_content` path now uses the supervised classifier (retrieval is only an optional shadow baseline), so older saved reports from the deleted hierarchy stack are not meaningful. + +## Latency Note + +`combined_inference.py` is a debugging/offline path, not a production latency path. 
+ +Current production truth: + +- per-request CLI execution is not a sub-50ms architecture +- production serving should use a long-lived API process with preloaded models +- if sub-50ms becomes a hard requirement, the serving path will need: + - persistent loaded models + - runtime optimization + - likely fewer model passes or a shared multi-head model + +## Current Status + +Current repo status: -| Path | Contents | -|---|---| -| `multitask_intent_model_output/` | DistilBERT multitask weights + tokenizer | -| `iab_classifier_model_output/` | IAB content classifier weights + tokenizer | -| `artifacts/calibration/` | Per-head temperature + threshold JSONs | -| `pipeline.py` | `AdmeshIntentPipeline` (transformers.Pipeline subclass) | -| `combined_inference.py` | Core inference logic | +- full 10-class `intent.type` taxonomy is wired +- subtype and phase heads are present +- difficulty benchmarks are wired for `intent_type`, `intent_subtype`, and `decision_phase` +- full-TSV IAB taxonomy retrieval is wired through tier4 +- separate full-intent augmentation dataset is in place +- evaluation/runtime memory handling is improved for large IAB splits -## Notes +The main remaining gap is not basic infrastructure anymore. It is improving real-world robustness, especially for: -- `trust_remote_code=True` is required because this model uses a custom multi-head architecture that does not map to a single standard `AutoModel` checkpoint. -- `meta.iab_mapping_is_placeholder: true` means IAB artifacts were missing or skipped; train and calibrate IAB for full production accuracy. -- For long-running servers, instantiate once and reuse — models are cached in memory after the first call. 
+- `decision_phase` +- `intent_subtype` +- confidence quality on borderline commercial queries +- real-traffic supervision beyond synthetic data diff --git a/artifacts/calibration/decision_phase.json b/artifacts/calibration/decision_phase.json index 06b02a1a1b4b84af5804899ce505f8536fd5c41b..059c90fb3995abd233e041d70ff9430aa692b2e5 100644 --- a/artifacts/calibration/decision_phase.json +++ b/artifacts/calibration/decision_phase.json @@ -1,31 +1,31 @@ { "calibrated": true, "confidence_threshold": 0.22, - "generated_at": "2026-03-25T20:15:26.091588+00:00", + "generated_at": "2026-03-25T21:20:02.753657+00:00", "head": "decision_phase", "metrics": { - "calibrated_accuracy": 0.8621, - "calibrated_expected_calibration_error": 0.1012, - "calibrated_negative_log_likelihood": 0.4275, - "mean_calibrated_confidence": 0.8431, - "mean_raw_confidence": 0.8274, - "raw_accuracy": 0.8621, - "raw_expected_calibration_error": 0.0985, - "raw_negative_log_likelihood": 0.4338 + "calibrated_accuracy": 0.8276, + "calibrated_expected_calibration_error": 0.0811, + "calibrated_negative_log_likelihood": 0.5525, + "mean_calibrated_confidence": 0.8739, + "mean_raw_confidence": 0.8828, + "raw_accuracy": 0.8276, + "raw_expected_calibration_error": 0.0757, + "raw_negative_log_likelihood": 0.5551 }, "minimum_threshold_floor": 0.22, - "optimized_temperature_candidate": 0.940009, + "optimized_temperature_candidate": 1.032831, "selected_threshold_before_floor": { - "accepted_accuracy": 0.8621, + "accepted_accuracy": 0.8276, "coverage": 1.0, "threshold": 0.0 }, "selection_split": "val", "selection_target_precision": 0.75, - "temperature": 0.940009, + "temperature": 1.032831, "temperature_scaling_applied": true, "threshold_summary": { - "accepted_accuracy": 0.8621, + "accepted_accuracy": 0.8276, "coverage": 1.0, "threshold": 0.22 } diff --git a/artifacts/calibration/iab_content.json b/artifacts/calibration/iab_content.json index 
ce93446814fa41d584fe9d70db4d48c58b2e4092..cc16ff2e932fe16bcac0d254be8192dbbbb41540 100644 --- a/artifacts/calibration/iab_content.json +++ b/artifacts/calibration/iab_content.json @@ -1,32 +1,32 @@ { "calibrated": true, "confidence_threshold": 0.12, - "generated_at": "2026-03-25T20:39:18.586053+00:00", + "generated_at": "2026-03-25T21:21:46.770447+00:00", "head": "iab_content", "metrics": { - "calibrated_accuracy": 0.9485, - "calibrated_expected_calibration_error": 0.2692, - "calibrated_negative_log_likelihood": 0.5281, - "mean_calibrated_confidence": 0.6793, - "mean_raw_confidence": 0.1987, - "raw_accuracy": 0.9485, - "raw_expected_calibration_error": 0.7498, - "raw_negative_log_likelihood": 1.7931 + "calibrated_accuracy": 0.9321, + "calibrated_expected_calibration_error": 0.2607, + "calibrated_negative_log_likelihood": 0.5642, + "mean_calibrated_confidence": 0.6714, + "mean_raw_confidence": 0.1481, + "raw_accuracy": 0.9321, + "raw_expected_calibration_error": 0.7839, + "raw_negative_log_likelihood": 2.103 }, "minimum_threshold_floor": 0.12, - "optimized_temperature_candidate": 0.573651, + "optimized_temperature_candidate": 0.502066, "selected_threshold_before_floor": { - "accepted_accuracy": 0.9485, + "accepted_accuracy": 0.9321, "coverage": 1.0, "threshold": 0.0 }, "selection_split": "val", "selection_target_precision": 0.7, - "temperature": 0.573651, + "temperature": 0.502066, "temperature_scaling_applied": true, "threshold_summary": { - "accepted_accuracy": 0.9553, - "coverage": 0.9875, + "accepted_accuracy": 0.9472, + "coverage": 0.975, "threshold": 0.12 } } diff --git a/artifacts/calibration/intent_subtype.json b/artifacts/calibration/intent_subtype.json index 27f1c5055f81fd986302160619cfeeb6f4b943c3..3e4237120819a25ff36be0234d5e40bc3c183d32 100644 --- a/artifacts/calibration/intent_subtype.json +++ b/artifacts/calibration/intent_subtype.json @@ -1,20 +1,20 @@ { "calibrated": true, "confidence_threshold": 0.25, - "generated_at": 
"2026-03-25T20:15:16.332284+00:00", + "generated_at": "2026-03-25T21:19:52.208134+00:00", "head": "intent_subtype", "metrics": { "calibrated_accuracy": 0.8625, - "calibrated_expected_calibration_error": 0.0827, - "calibrated_negative_log_likelihood": 0.3943, - "mean_calibrated_confidence": 0.8131, - "mean_raw_confidence": 0.7338, + "calibrated_expected_calibration_error": 0.0584, + "calibrated_negative_log_likelihood": 0.4288, + "mean_calibrated_confidence": 0.8389, + "mean_raw_confidence": 0.7787, "raw_accuracy": 0.8625, - "raw_expected_calibration_error": 0.152, - "raw_negative_log_likelihood": 0.4841 + "raw_expected_calibration_error": 0.0904, + "raw_negative_log_likelihood": 0.4795 }, "minimum_threshold_floor": 0.25, - "optimized_temperature_candidate": 0.789295, + "optimized_temperature_candidate": 0.861152, "selected_threshold_before_floor": { "accepted_accuracy": 0.8625, "coverage": 1.0, @@ -22,7 +22,7 @@ }, "selection_split": "val", "selection_target_precision": 0.75, - "temperature": 0.789295, + "temperature": 0.861152, "temperature_scaling_applied": true, "threshold_summary": { "accepted_accuracy": 0.8625, diff --git a/artifacts/calibration/intent_type.json b/artifacts/calibration/intent_type.json index 9601552ffc8cf86244a0cad32057b3f7e76897ac..faa6f28b51b2f87500961305845fedff611b80ca 100644 --- a/artifacts/calibration/intent_type.json +++ b/artifacts/calibration/intent_type.json @@ -1,32 +1,32 @@ { "calibrated": true, "confidence_threshold": 0.4, - "generated_at": "2026-03-25T20:15:05.272668+00:00", + "generated_at": "2026-03-25T21:19:41.302013+00:00", "head": "intent_type", "metrics": { - "calibrated_accuracy": 0.8936, - "calibrated_expected_calibration_error": 0.0915, - "calibrated_negative_log_likelihood": 0.2696, - "mean_calibrated_confidence": 0.9148, - "mean_raw_confidence": 0.8914, - "raw_accuracy": 0.8936, - "raw_expected_calibration_error": 0.0842, - "raw_negative_log_likelihood": 0.2831 + "calibrated_accuracy": 0.8723, + 
"calibrated_expected_calibration_error": 0.0798, + "calibrated_negative_log_likelihood": 0.2692, + "mean_calibrated_confidence": 0.8962, + "mean_raw_confidence": 0.8671, + "raw_accuracy": 0.8723, + "raw_expected_calibration_error": 0.1073, + "raw_negative_log_likelihood": 0.2907 }, "minimum_threshold_floor": 0.4, - "optimized_temperature_candidate": 0.918544, + "optimized_temperature_candidate": 0.889496, "selected_threshold_before_floor": { - "accepted_accuracy": 0.8936, + "accepted_accuracy": 0.8723, "coverage": 1.0, "threshold": 0.0 }, "selection_split": "val", "selection_target_precision": 0.8, - "temperature": 0.918544, + "temperature": 0.889496, "temperature_scaling_applied": true, "threshold_summary": { - "accepted_accuracy": 0.8936, - "coverage": 1.0, + "accepted_accuracy": 0.8913, + "coverage": 0.9787, "threshold": 0.4 } } diff --git a/artifacts/evaluation/latest/combined_demo_benchmark.json b/artifacts/evaluation/latest/combined_demo_benchmark.json index 1b8515d1b9d8fb2e9a0e212a6554af6b18f71f1d..27570895691ea783aa71c9f1a001299a8240f150 100644 --- a/artifacts/evaluation/latest/combined_demo_benchmark.json +++ b/artifacts/evaluation/latest/combined_demo_benchmark.json @@ -11,21 +11,13 @@ "model_output": { "classification": { "iab_content": { - "mapping_confidence": 0.5429, + "mapping_confidence": 0.3078, "mapping_mode": "exact", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", "tier1": { "id": "596", "label": "Technology & Computing" - }, - "tier2": { - "id": "599", - "label": "Computing" - }, - "tier3": { - "id": "602", - "label": "Software and Applications" } }, "intent": { @@ -33,31 +25,31 @@ "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.962, + "confidence": 0.9548, "confidence_threshold": 0.22, "label": "awareness", "meets_threshold": true, - "raw_confidence": 0.9633 + "raw_confidence": 0.9611 }, "intent_subtype": { "calibrated": true, - "confidence": 0.9805, + "confidence": 0.9731, 
"confidence_threshold": 0.25, "label": "education", "meets_threshold": true, - "raw_confidence": 0.9549 + "raw_confidence": 0.9378 }, "intent_type": { "calibrated": true, - "confidence": 0.9817, + "confidence": 0.9816, "confidence_threshold": 0.4, "label": "informational", "meets_threshold": true, - "raw_confidence": 0.9658 + "raw_confidence": 0.9644 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.962, + "confidence": 0.9548, "decision_phase": "awareness", "subtype": "education", "summary": "Classified as informational intent with subtype education in the awareness phase.", @@ -102,7 +94,7 @@ "model_output": { "classification": { "iab_content": { - "mapping_confidence": 0.4784, + "mapping_confidence": 0.2281, "mapping_mode": "exact", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", @@ -116,31 +108,31 @@ "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.9277, + "confidence": 0.9159, "confidence_threshold": 0.22, "label": "awareness", "meets_threshold": true, - "raw_confidence": 0.9297 + "raw_confidence": 0.9256 }, "intent_subtype": { "calibrated": true, - "confidence": 0.9749, + "confidence": 0.9671, "confidence_threshold": 0.25, "label": "education", "meets_threshold": true, - "raw_confidence": 0.9445 + "raw_confidence": 0.9273 }, "intent_type": { "calibrated": true, - "confidence": 0.9797, + "confidence": 0.9771, "confidence_threshold": 0.4, "label": "informational", "meets_threshold": true, - "raw_confidence": 0.9626 + "raw_confidence": 0.957 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.9277, + "confidence": 0.9159, "decision_phase": "awareness", "subtype": "education", "summary": "Classified as informational intent with subtype education in the awareness phase.", @@ -185,17 +177,13 @@ "model_output": { "classification": { "iab_content": { - "mapping_confidence": 0.2179, - "mapping_mode": "exact", + "mapping_confidence": 0.6271, + "mapping_mode": 
"nearest_equivalent", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", "tier1": { "id": "483", "label": "Sports" - }, - "tier2": { - "id": "496", - "label": "Equine Sports" } }, "intent": { @@ -203,31 +191,31 @@ "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.9444, + "confidence": 0.9469, "confidence_threshold": 0.22, "label": "consideration", "meets_threshold": true, - "raw_confidence": 0.9461 + "raw_confidence": 0.954 }, "intent_subtype": { "calibrated": true, - "confidence": 0.4804, + "confidence": 0.4849, "confidence_threshold": 0.25, "label": "comparison", "meets_threshold": true, - "raw_confidence": 0.4327 + "raw_confidence": 0.4322 }, "intent_type": { "calibrated": true, - "confidence": 0.981, + "confidence": 0.9863, "confidence_threshold": 0.4, "label": "commercial", "meets_threshold": true, - "raw_confidence": 0.9653 + "raw_confidence": 0.9724 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.4804, + "confidence": 0.4849, "decision_phase": "consideration", "subtype": "comparison", "summary": "Classified as commercial intent with subtype comparison in the consideration phase.", @@ -272,17 +260,13 @@ "model_output": { "classification": { "iab_content": { - "mapping_confidence": 0.3122, - "mapping_mode": "exact", + "mapping_confidence": 0.1648, + "mapping_mode": "nearest_equivalent", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", "tier1": { "id": "596", "label": "Technology & Computing" - }, - "tier2": { - "id": "638", - "label": "Robotics" } }, "intent": { @@ -290,31 +274,31 @@ "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.8858, + "confidence": 0.9303, "confidence_threshold": 0.22, "label": "consideration", "meets_threshold": true, - "raw_confidence": 0.8885 + "raw_confidence": 0.9389 }, "intent_subtype": { "calibrated": true, - "confidence": 0.9538, + "confidence": 0.9598, "confidence_threshold": 0.25, "label": "comparison", 
"meets_threshold": true, - "raw_confidence": 0.9083 + "raw_confidence": 0.9157 }, "intent_type": { "calibrated": true, - "confidence": 0.9676, + "confidence": 0.9746, "confidence_threshold": 0.4, "label": "commercial", "meets_threshold": true, - "raw_confidence": 0.9435 + "raw_confidence": 0.953 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.8858, + "confidence": 0.9303, "decision_phase": "consideration", "subtype": "comparison", "summary": "Classified as commercial intent with subtype comparison in the consideration phase.", @@ -359,13 +343,21 @@ "model_output": { "classification": { "iab_content": { - "mapping_confidence": 0.5309, + "mapping_confidence": 0.1701, "mapping_mode": "exact", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", "tier1": { - "id": "596", - "label": "Technology & Computing" + "id": "52", + "label": "Business and Finance" + }, + "tier2": { + "id": "53", + "label": "Business" + }, + "tier3": { + "id": "61", + "label": "Startups" } }, "intent": { @@ -373,31 +365,31 @@ "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.6077, + "confidence": 0.6389, "confidence_threshold": 0.22, "label": "decision", "meets_threshold": true, - "raw_confidence": 0.6097 + "raw_confidence": 0.6498 }, "intent_subtype": { "calibrated": true, - "confidence": 0.7801, + "confidence": 0.7851, "confidence_threshold": 0.25, "label": "provider_selection", "meets_threshold": true, - "raw_confidence": 0.6968 + "raw_confidence": 0.6921 }, "intent_type": { "calibrated": true, - "confidence": 0.9843, + "confidence": 0.9784, "confidence_threshold": 0.4, "label": "commercial", "meets_threshold": true, - "raw_confidence": 0.9703 + "raw_confidence": 0.9591 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.6077, + "confidence": 0.6389, "decision_phase": "decision", "subtype": "provider_selection", "summary": "Classified as commercial intent with subtype provider_selection in 
the decision phase.", @@ -442,17 +434,13 @@ "model_output": { "classification": { "iab_content": { - "mapping_confidence": 0.2299, - "mapping_mode": "exact", + "mapping_confidence": 0.24, + "mapping_mode": "nearest_equivalent", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", "tier1": { - "id": "v9i3On", - "label": "Sensitive Topics" - }, - "tier2": { - "id": "XtODT3", - "label": "Crime & Harmful Acts to Individuals, Society & Human Right Violations" + "id": "483", + "label": "Sports" } }, "intent": { @@ -460,31 +448,31 @@ "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.9662, + "confidence": 0.937, "confidence_threshold": 0.22, "label": "action", "meets_threshold": true, - "raw_confidence": 0.9674 + "raw_confidence": 0.9451 }, "intent_subtype": { "calibrated": true, - "confidence": 0.9473, + "confidence": 0.9242, "confidence_threshold": 0.25, "label": "signup", "meets_threshold": true, - "raw_confidence": 0.8993 + "raw_confidence": 0.8636 }, "intent_type": { "calibrated": true, - "confidence": 0.9788, + "confidence": 0.9809, "confidence_threshold": 0.4, "label": "transactional", "meets_threshold": true, - "raw_confidence": 0.9614 + "raw_confidence": 0.9633 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.9473, + "confidence": 0.9242, "decision_phase": "action", "subtype": "signup", "summary": "Classified as transactional intent with subtype signup in the action phase.", @@ -529,7 +517,7 @@ "model_output": { "classification": { "iab_content": { - "mapping_confidence": 0.8304, + "mapping_confidence": 0.4786, "mapping_mode": "exact", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", @@ -547,31 +535,31 @@ "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.9595, + "confidence": 0.9436, "confidence_threshold": 0.22, "label": "action", "meets_threshold": true, - "raw_confidence": 0.9608 + "raw_confidence": 0.951 }, "intent_subtype": { 
"calibrated": true, - "confidence": 0.8434, + "confidence": 0.8891, "confidence_threshold": 0.25, "label": "booking", "meets_threshold": true, - "raw_confidence": 0.7616 + "raw_confidence": 0.8107 }, "intent_type": { "calibrated": true, - "confidence": 0.9805, + "confidence": 0.9715, "confidence_threshold": 0.4, "label": "transactional", "meets_threshold": true, - "raw_confidence": 0.9649 + "raw_confidence": 0.9481 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.8434, + "confidence": 0.8891, "decision_phase": "action", "subtype": "booking", "summary": "Classified as transactional intent with subtype booking in the action phase.", @@ -616,8 +604,8 @@ "model_output": { "classification": { "iab_content": { - "mapping_confidence": 0.5261, - "mapping_mode": "exact", + "mapping_confidence": 0.3826, + "mapping_mode": "nearest_equivalent", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", "tier1": { @@ -630,31 +618,31 @@ "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.9573, + "confidence": 0.9613, "confidence_threshold": 0.22, "label": "post_purchase", "meets_threshold": true, - "raw_confidence": 0.9587 + "raw_confidence": 0.9669 }, "intent_subtype": { "calibrated": true, - "confidence": 0.967, + "confidence": 0.965, "confidence_threshold": 0.25, "label": "onboarding_setup", "meets_threshold": true, - "raw_confidence": 0.9306 + "raw_confidence": 0.9235 }, "intent_type": { "calibrated": true, - "confidence": 0.5834, + "confidence": 0.5393, "confidence_threshold": 0.4, "label": "transactional", "meets_threshold": true, - "raw_confidence": 0.5253 + "raw_confidence": 0.4772 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.5834, + "confidence": 0.5393, "decision_phase": "post_purchase", "subtype": "onboarding_setup", "summary": "Classified as transactional intent with subtype onboarding_setup in the post_purchase phase.", @@ -699,21 +687,13 @@ "model_output": { 
"classification": { "iab_content": { - "mapping_confidence": 0.272, - "mapping_mode": "exact", + "mapping_confidence": 0.3628, + "mapping_mode": "nearest_equivalent", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", "tier1": { - "id": "52", - "label": "Business and Finance" - }, - "tier2": { - "id": "53", - "label": "Business" - }, - "tier3": { - "id": "72", - "label": "Business I.T." + "id": "391", + "label": "Personal Finance" } }, "intent": { @@ -721,31 +701,31 @@ "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.9589, + "confidence": 0.9481, "confidence_threshold": 0.22, "label": "support", "meets_threshold": true, - "raw_confidence": 0.9603 + "raw_confidence": 0.9551 }, "intent_subtype": { "calibrated": true, - "confidence": 0.8859, + "confidence": 0.934, "confidence_threshold": 0.25, "label": "account_help", "meets_threshold": true, - "raw_confidence": 0.8147 + "raw_confidence": 0.8749 }, "intent_type": { "calibrated": true, - "confidence": 0.9699, + "confidence": 0.9542, "confidence_threshold": 0.4, "label": "support", "meets_threshold": true, - "raw_confidence": 0.9476 + "raw_confidence": 0.9232 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.8859, + "confidence": 0.934, "decision_phase": "support", "subtype": "account_help", "summary": "Classified as support intent with subtype account_help in the support phase.", @@ -796,8 +776,8 @@ "model_output": { "classification": { "iab_content": { - "mapping_confidence": 0.7892, - "mapping_mode": "exact", + "mapping_confidence": 0.3231, + "mapping_mode": "nearest_equivalent", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", "tier1": { @@ -810,31 +790,31 @@ "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.9219, + "confidence": 0.8468, "confidence_threshold": 0.22, "label": "awareness", "meets_threshold": true, - "raw_confidence": 0.9239 + "raw_confidence": 0.8606 }, "intent_subtype": { 
"calibrated": true, - "confidence": 0.9492, + "confidence": 0.9639, "confidence_threshold": 0.25, "label": "emotional_reflection", "meets_threshold": true, - "raw_confidence": 0.9021 + "raw_confidence": 0.9211 }, "intent_type": { "calibrated": true, - "confidence": 0.9388, + "confidence": 0.9627, "confidence_threshold": 0.4, "label": "personal_reflection", "meets_threshold": true, - "raw_confidence": 0.9059 + "raw_confidence": 0.9348 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.9219, + "confidence": 0.8468, "decision_phase": "awareness", "subtype": "emotional_reflection", "summary": "Classified as personal_reflection intent with subtype emotional_reflection in the awareness phase.", @@ -885,7 +865,7 @@ "model_output": { "classification": { "iab_content": { - "mapping_confidence": 0.2238, + "mapping_confidence": 0.3327, "mapping_mode": "exact", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", @@ -899,31 +879,31 @@ "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.8763, + "confidence": 0.8651, "confidence_threshold": 0.22, "label": "research", "meets_threshold": true, - "raw_confidence": 0.8791 + "raw_confidence": 0.8781 }, "intent_subtype": { "calibrated": true, - "confidence": 0.9683, + "confidence": 0.9652, "confidence_threshold": 0.25, "label": "follow_up", "meets_threshold": true, - "raw_confidence": 0.9314 + "raw_confidence": 0.9229 }, "intent_type": { "calibrated": true, - "confidence": 0.9623, + "confidence": 0.9746, "confidence_threshold": 0.4, "label": "ambiguous", "meets_threshold": true, - "raw_confidence": 0.9367 + "raw_confidence": 0.9541 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.8763, + "confidence": 0.8651, "decision_phase": "research", "subtype": "follow_up", "summary": "Classified as ambiguous intent with subtype follow_up in the research phase.", @@ -974,17 +954,13 @@ "model_output": { "classification": { "iab_content": { - 
"mapping_confidence": 0.2371, + "mapping_confidence": 0.1481, "mapping_mode": "exact", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", "tier1": { "id": "391", "label": "Personal Finance" - }, - "tier2": { - "id": "396", - "label": "Financial Planning" } }, "intent": { @@ -992,31 +968,31 @@ "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.9225, + "confidence": 0.9177, "confidence_threshold": 0.22, "label": "research", "meets_threshold": true, - "raw_confidence": 0.9246 + "raw_confidence": 0.9273 }, "intent_subtype": { "calibrated": true, - "confidence": 0.9586, + "confidence": 0.9506, "confidence_threshold": 0.25, "label": "follow_up", "meets_threshold": true, - "raw_confidence": 0.9146 + "raw_confidence": 0.8983 }, "intent_type": { "calibrated": true, - "confidence": 0.9488, + "confidence": 0.9628, "confidence_threshold": 0.4, "label": "ambiguous", "meets_threshold": true, - "raw_confidence": 0.9179 + "raw_confidence": 0.9356 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.9225, + "confidence": 0.9177, "decision_phase": "research", "subtype": "follow_up", "summary": "Classified as ambiguous intent with subtype follow_up in the research phase.", @@ -1067,13 +1043,13 @@ "model_output": { "classification": { "iab_content": { - "mapping_confidence": 0.2131, + "mapping_confidence": 0.0729, "mapping_mode": "nearest_equivalent", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", "tier1": { - "id": "42", - "label": "Books and Literature" + "id": "123", + "label": "Careers" } }, "intent": { @@ -1081,31 +1057,31 @@ "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.9861, + "confidence": 0.9739, "confidence_threshold": 0.22, "label": "action", "meets_threshold": true, - "raw_confidence": 0.9867 + "raw_confidence": 0.9781 }, "intent_subtype": { "calibrated": true, - "confidence": 0.7335, + "confidence": 0.7259, "confidence_threshold": 0.25, "label": 
"signup", "meets_threshold": true, - "raw_confidence": 0.6454 + "raw_confidence": 0.6331 }, "intent_type": { "calibrated": true, - "confidence": 0.9628, + "confidence": 0.9763, "confidence_threshold": 0.4, "label": "transactional", "meets_threshold": true, - "raw_confidence": 0.938 + "raw_confidence": 0.9557 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.7335, + "confidence": 0.7259, "decision_phase": "action", "subtype": "signup", "summary": "Classified as transactional intent with subtype signup in the action phase.", @@ -1150,17 +1126,17 @@ "model_output": { "classification": { "iab_content": { - "mapping_confidence": 0.3327, + "mapping_confidence": 0.1383, "mapping_mode": "exact", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", "tier1": { - "id": "596", - "label": "Technology & Computing" + "id": "123", + "label": "Careers" }, "tier2": { - "id": "639", - "label": "Virtual Reality" + "id": "127", + "label": "Job Search" } }, "intent": { @@ -1168,31 +1144,31 @@ "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.9295, + "confidence": 0.9578, "confidence_threshold": 0.22, "label": "consideration", "meets_threshold": true, - "raw_confidence": 0.9315 + "raw_confidence": 0.9639 }, "intent_subtype": { "calibrated": true, - "confidence": 0.9374, + "confidence": 0.9095, "confidence_threshold": 0.25, "label": "comparison", "meets_threshold": true, - "raw_confidence": 0.8838 + "raw_confidence": 0.8429 }, "intent_type": { "calibrated": true, - "confidence": 0.9602, + "confidence": 0.9747, "confidence_threshold": 0.4, "label": "commercial", "meets_threshold": true, - "raw_confidence": 0.9329 + "raw_confidence": 0.953 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.9295, + "confidence": 0.9095, "decision_phase": "consideration", "subtype": "comparison", "summary": "Classified as commercial intent with subtype comparison in the consideration phase.", @@ -1237,7 
+1213,7 @@ "model_output": { "classification": { "iab_content": { - "mapping_confidence": 0.3227, + "mapping_confidence": 0.1608, "mapping_mode": "exact", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", @@ -1251,31 +1227,31 @@ "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.9535, + "confidence": 0.9436, "confidence_threshold": 0.22, "label": "awareness", "meets_threshold": true, - "raw_confidence": 0.955 + "raw_confidence": 0.951 }, "intent_subtype": { "calibrated": true, - "confidence": 0.9793, + "confidence": 0.9692, "confidence_threshold": 0.25, "label": "education", "meets_threshold": true, - "raw_confidence": 0.9527 + "raw_confidence": 0.931 }, "intent_type": { "calibrated": true, - "confidence": 0.9769, + "confidence": 0.9775, "confidence_threshold": 0.4, "label": "informational", "meets_threshold": true, - "raw_confidence": 0.9584 + "raw_confidence": 0.9578 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.9535, + "confidence": 0.9436, "decision_phase": "awareness", "subtype": "education", "summary": "Classified as informational intent with subtype education in the awareness phase.", diff --git a/artifacts/evaluation/latest/decision_phase_difficulty_benchmark_confusion_matrix.csv b/artifacts/evaluation/latest/decision_phase_difficulty_benchmark_confusion_matrix.csv index d8629f9e7297b88be3145c27c471f3daf9a612e3..7202242ef2e3a9c8e69609744a3de88723b07f62 100644 --- a/artifacts/evaluation/latest/decision_phase_difficulty_benchmark_confusion_matrix.csv +++ b/artifacts/evaluation/latest/decision_phase_difficulty_benchmark_confusion_matrix.csv @@ -1,8 +1,8 @@ ,awareness,research,consideration,decision,action,post_purchase,support awareness,14,1,0,0,0,0,0 research,0,15,0,0,0,0,0 -consideration,0,2,13,0,0,0,0 +consideration,0,1,14,0,0,0,0 decision,0,1,0,14,0,0,0 -action,0,0,0,1,14,0,0 +action,0,1,0,0,14,0,0 post_purchase,0,0,0,0,0,15,0 support,0,0,0,0,0,0,15 diff --git 
a/artifacts/evaluation/latest/decision_phase_difficulty_benchmark_report.json b/artifacts/evaluation/latest/decision_phase_difficulty_benchmark_report.json index 6611eedc4620df66249f5d22a22d6faec44d5e48..acc5ea446813bf822b75ceb38fb4248b18d3a87e 100644 --- a/artifacts/evaluation/latest/decision_phase_difficulty_benchmark_report.json +++ b/artifacts/evaluation/latest/decision_phase_difficulty_benchmark_report.json @@ -1,7 +1,7 @@ { - "accepted_accuracy": 0.9524, + "accepted_accuracy": 0.9619, "accepted_coverage": 1.0, - "accuracy": 0.9524, + "accuracy": 0.9619, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_difficulty_benchmark_confusion_matrix.csv", "count": 105, "dataset_path": "/content/agentic-intent-classifier/data/decision_phase_benchmark.jsonl", @@ -15,12 +15,12 @@ "macro_f1": 0.9711 }, "hard": { - "accepted_accuracy": 0.8857, + "accepted_accuracy": 0.9143, "accepted_coverage": 1.0, - "accuracy": 0.8857, + "accuracy": 0.9143, "count": 35, "fallback_rate": 0.0, - "macro_f1": 0.8908 + "macro_f1": 0.9194 }, "medium": { "accepted_accuracy": 1.0, @@ -33,9 +33,9 @@ }, "fallback_rate": 0.0, "head": "decision_phase", - "macro_f1": 0.9536, + "macro_f1": 0.9635, "per_class_metrics": { - "accuracy": 0.9523809523809523, + "accuracy": 0.9619047619047619, "action": { "f1-score": 0.9655172413793104, "precision": 1.0, @@ -49,21 +49,21 @@ "support": 15.0 }, "consideration": { - "f1-score": 0.9285714285714286, + "f1-score": 0.9655172413793104, "precision": 1.0, - "recall": 0.8666666666666667, + "recall": 0.9333333333333333, "support": 15.0 }, "decision": { - "f1-score": 0.9333333333333333, - "precision": 0.9333333333333333, + "f1-score": 0.9655172413793104, + "precision": 1.0, "recall": 0.9333333333333333, "support": 15.0 }, "macro avg": { - "f1-score": 0.9536131694056934, - "precision": 0.9604010025062657, - "recall": 0.9523809523809524, + "f1-score": 0.9634888438133874, + "precision": 0.9699248120300752, + "recall": 
0.9619047619047619, "support": 105.0 }, "post_purchase": { @@ -85,9 +85,9 @@ "support": 15.0 }, "weighted avg": { - "f1-score": 0.9536131694056934, - "precision": 0.9604010025062656, - "recall": 0.9523809523809523, + "f1-score": 0.9634888438133875, + "precision": 0.9699248120300752, + "recall": 0.9619047619047619, "support": 105.0 } }, diff --git a/artifacts/evaluation/latest/decision_phase_final_wave_cases_confusion_matrix.csv b/artifacts/evaluation/latest/decision_phase_final_wave_cases_confusion_matrix.csv index 7066b2cd2062314bb54e438130f1d5a1cb7ecb9c..5eec12148ff61e8dc414ad8803de0fd9b41e9e95 100644 --- a/artifacts/evaluation/latest/decision_phase_final_wave_cases_confusion_matrix.csv +++ b/artifacts/evaluation/latest/decision_phase_final_wave_cases_confusion_matrix.csv @@ -1,7 +1,7 @@ ,awareness,research,consideration,decision,action,post_purchase,support awareness,5,0,0,0,0,0,0 -research,1,3,0,0,0,0,0 -consideration,0,2,3,0,0,0,0 +research,3,1,0,0,0,0,0 +consideration,0,1,4,0,0,0,0 decision,0,0,0,5,0,0,0 action,0,0,0,0,0,0,0 post_purchase,0,0,0,0,0,4,0 diff --git a/artifacts/evaluation/latest/decision_phase_final_wave_cases_report.json b/artifacts/evaluation/latest/decision_phase_final_wave_cases_report.json index 3eb90650ac8929e108c96942ba6b2f26bed04c42..323d2f8a894f7fe1444bf5df8afa2f2041773640 100644 --- a/artifacts/evaluation/latest/decision_phase_final_wave_cases_report.json +++ b/artifacts/evaluation/latest/decision_phase_final_wave_cases_report.json @@ -1,15 +1,15 @@ { - "accepted_accuracy": 0.8889, + "accepted_accuracy": 0.8519, "accepted_coverage": 1.0, - "accuracy": 0.8889, + "accuracy": 0.8519, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_final_wave_cases_confusion_matrix.csv", "count": 27, "dataset_path": "/content/agentic-intent-classifier/data/decision_phase/final_wave_cases.jsonl", "fallback_rate": 0.0, "head": "decision_phase", - "macro_f1": 0.8876, + "macro_f1": 0.8319, 
"per_class_metrics": { - "accuracy": 0.8888888888888888, + "accuracy": 0.8518518518518519, "action": { "f1-score": 0.0, "precision": 0.0, @@ -17,15 +17,15 @@ "support": 0.0 }, "awareness": { - "f1-score": 0.9090909090909091, - "precision": 0.8333333333333334, + "f1-score": 0.7692307692307693, + "precision": 0.625, "recall": 1.0, "support": 5.0 }, "consideration": { - "f1-score": 0.75, + "f1-score": 0.8888888888888888, "precision": 1.0, - "recall": 0.6, + "recall": 0.8, "support": 5.0 }, "decision": { @@ -35,9 +35,9 @@ "support": 5.0 }, "macro avg": { - "f1-score": 0.7608225108225108, - "precision": 0.7761904761904762, - "recall": 0.7642857142857142, + "f1-score": 0.7130647130647131, + "precision": 0.7321428571428571, + "recall": 0.7214285714285714, "support": 27.0 }, "post_purchase": { @@ -47,9 +47,9 @@ "support": 4.0 }, "research": { - "f1-score": 0.6666666666666666, - "precision": 0.6, - "recall": 0.75, + "f1-score": 0.3333333333333333, + "precision": 0.5, + "recall": 0.25, "support": 4.0 }, "support": { @@ -59,9 +59,9 @@ "support": 4.0 }, "weighted avg": { - "f1-score": 0.8874859708193041, - "precision": 0.9098765432098765, - "recall": 0.8888888888888888, + "f1-score": 0.8379233934789491, + "precision": 0.8564814814814815, + "recall": 0.8518518518518519, "support": 27.0 } }, diff --git a/artifacts/evaluation/latest/decision_phase_test_confusion_matrix.csv b/artifacts/evaluation/latest/decision_phase_test_confusion_matrix.csv index 7ee810e23bfb8a0cc3747e0962776fc940f083ed..dc7f621094e1b20427da3662992b82da4002a7b5 100644 --- a/artifacts/evaluation/latest/decision_phase_test_confusion_matrix.csv +++ b/artifacts/evaluation/latest/decision_phase_test_confusion_matrix.csv @@ -1,8 +1,8 @@ ,awareness,research,consideration,decision,action,post_purchase,support awareness,3,0,0,0,0,0,0 -research,3,2,0,0,0,0,0 +research,4,1,0,0,0,0,0 consideration,0,2,3,0,0,0,0 -decision,0,0,0,4,0,1,0 +decision,0,0,0,5,0,0,0 action,0,0,0,0,3,0,0 post_purchase,0,0,0,0,0,4,0 
support,0,0,0,0,0,1,3 diff --git a/artifacts/evaluation/latest/decision_phase_test_report.json b/artifacts/evaluation/latest/decision_phase_test_report.json index e87d6de18080785defa8596fe97d628346bbe418..cd8290a56c2f38f42ad074f35a6e108595a77ebd 100644 --- a/artifacts/evaluation/latest/decision_phase_test_report.json +++ b/artifacts/evaluation/latest/decision_phase_test_report.json @@ -7,7 +7,7 @@ "dataset_path": "/content/agentic-intent-classifier/data/decision_phase/test.jsonl", "fallback_rate": 0.0, "head": "decision_phase", - "macro_f1": 0.7724, + "macro_f1": 0.7637, "per_class_metrics": { "accuracy": 0.7586206896551724, "action": { @@ -17,8 +17,8 @@ "support": 3.0 }, "awareness": { - "f1-score": 0.6666666666666666, - "precision": 0.5, + "f1-score": 0.6, + "precision": 0.42857142857142855, "recall": 1.0, "support": 3.0 }, @@ -29,27 +29,27 @@ "support": 5.0 }, "decision": { - "f1-score": 0.8888888888888888, + "f1-score": 1.0, "precision": 1.0, - "recall": 0.8, + "recall": 1.0, "support": 5.0 }, "macro avg": { - "f1-score": 0.7724489795918367, - "precision": 0.8095238095238095, + "f1-score": 0.763718820861678, + "precision": 0.7945578231292517, "recall": 0.7928571428571428, "support": 29.0 }, "post_purchase": { - "f1-score": 0.8, - "precision": 0.6666666666666666, + "f1-score": 0.8888888888888888, + "precision": 0.8, "recall": 1.0, "support": 4.0 }, "research": { - "f1-score": 0.4444444444444444, - "precision": 0.5, - "recall": 0.4, + "f1-score": 0.25, + "precision": 0.3333333333333333, + "recall": 0.2, "support": 5.0 }, "support": { @@ -59,8 +59,8 @@ "support": 4.0 }, "weighted avg": { - "f1-score": 0.7601806239737274, - "precision": 0.8160919540229885, + "f1-score": 0.7511767925561028, + "precision": 0.7983579638752052, "recall": 0.7586206896551724, "support": 29.0 } diff --git a/artifacts/evaluation/latest/decision_phase_train_confusion_matrix.csv b/artifacts/evaluation/latest/decision_phase_train_confusion_matrix.csv index 
ead354eb5bbc296162fab26b8847d38e9f27e6ab..236c876baa9dbcafc7278f834bce97012e94edf6 100644 --- a/artifacts/evaluation/latest/decision_phase_train_confusion_matrix.csv +++ b/artifacts/evaluation/latest/decision_phase_train_confusion_matrix.csv @@ -1,8 +1,8 @@ ,awareness,research,consideration,decision,action,post_purchase,support awareness,16,0,0,0,0,0,0 research,1,14,0,0,0,0,0 -consideration,0,4,13,0,0,0,0 -decision,0,0,1,15,0,0,0 +consideration,0,2,15,0,0,0,0 +decision,0,0,0,16,0,0,0 action,0,0,0,0,10,0,0 post_purchase,0,0,0,0,0,14,0 support,0,0,0,0,0,0,14 diff --git a/artifacts/evaluation/latest/decision_phase_train_report.json b/artifacts/evaluation/latest/decision_phase_train_report.json index ca4b8e52b24daa95d504bffa5e1f7ec961247fa9..7677c764664c9c3998fd1bc3e899e0597d3130de 100644 --- a/artifacts/evaluation/latest/decision_phase_train_report.json +++ b/artifacts/evaluation/latest/decision_phase_train_report.json @@ -1,15 +1,15 @@ { - "accepted_accuracy": 0.9412, + "accepted_accuracy": 0.9706, "accepted_coverage": 1.0, - "accuracy": 0.9412, + "accuracy": 0.9706, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_train_confusion_matrix.csv", "count": 102, "dataset_path": "/content/agentic-intent-classifier/data/decision_phase/train.jsonl", "fallback_rate": 0.0, "head": "decision_phase", - "macro_f1": 0.9464, + "macro_f1": 0.9729, "per_class_metrics": { - "accuracy": 0.9411764705882353, + "accuracy": 0.9705882352941176, "action": { "f1-score": 1.0, "precision": 1.0, @@ -23,21 +23,21 @@ "support": 16.0 }, "consideration": { - "f1-score": 0.8387096774193549, - "precision": 0.9285714285714286, - "recall": 0.7647058823529411, + "f1-score": 0.9375, + "precision": 1.0, + "recall": 0.8823529411764706, "support": 17.0 }, "decision": { - "f1-score": 0.967741935483871, + "f1-score": 1.0, "precision": 1.0, - "recall": 0.9375, + "recall": 1.0, "support": 16.0 }, "macro avg": { - "f1-score": 0.9463762044407206, - 
"precision": 0.9496465252767774, - "recall": 0.9479341736694679, + "f1-score": 0.9729175394497975, + "precision": 0.9737394957983193, + "recall": 0.9736694677871148, "support": 102.0 }, "post_purchase": { @@ -47,8 +47,8 @@ "support": 14.0 }, "research": { - "f1-score": 0.8484848484848485, - "precision": 0.7777777777777778, + "f1-score": 0.9032258064516129, + "precision": 0.875, "recall": 0.9333333333333333, "support": 15.0 }, @@ -59,9 +59,9 @@ "support": 14.0 }, "weighted avg": { - "f1-score": 0.9410231345715216, - "precision": 0.946188279233262, - "recall": 0.9411764705882353, + "f1-score": 0.9705984177639775, + "precision": 0.9723904267589389, + "recall": 0.9705882352941176, "support": 102.0 } }, diff --git a/artifacts/evaluation/latest/decision_phase_val_confusion_matrix.csv b/artifacts/evaluation/latest/decision_phase_val_confusion_matrix.csv index 36b4bb778c2261e6bdf697a55cfcf935669db80a..cc0caef96905f8e129271e3425b69d3b226cffa3 100644 --- a/artifacts/evaluation/latest/decision_phase_val_confusion_matrix.csv +++ b/artifacts/evaluation/latest/decision_phase_val_confusion_matrix.csv @@ -1,8 +1,8 @@ ,awareness,research,consideration,decision,action,post_purchase,support awareness,5,0,0,0,0,0,0 -research,1,3,0,0,0,0,0 +research,2,2,0,0,0,0,0 consideration,0,0,5,0,0,0,0 decision,0,0,1,3,0,0,0 action,0,0,0,0,3,0,0 post_purchase,0,1,0,0,0,3,0 -support,0,0,0,0,1,0,3 +support,0,0,0,0,0,1,3 diff --git a/artifacts/evaluation/latest/decision_phase_val_report.json b/artifacts/evaluation/latest/decision_phase_val_report.json index a22554f814b4084ed90614f08a4ad33fbec443be..ca55fe08985e1ca262142d1969b0e8e3e1163a4d 100644 --- a/artifacts/evaluation/latest/decision_phase_val_report.json +++ b/artifacts/evaluation/latest/decision_phase_val_report.json @@ -1,24 +1,24 @@ { - "accepted_accuracy": 0.8621, + "accepted_accuracy": 0.8276, "accepted_coverage": 1.0, - "accuracy": 0.8621, + "accuracy": 0.8276, "confusion_matrix_path": 
"/content/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_val_confusion_matrix.csv", "count": 29, "dataset_path": "/content/agentic-intent-classifier/data/decision_phase/val.jsonl", "fallback_rate": 0.0, "head": "decision_phase", - "macro_f1": 0.8567, + "macro_f1": 0.8254, "per_class_metrics": { - "accuracy": 0.8620689655172413, + "accuracy": 0.8275862068965517, "action": { - "f1-score": 0.8571428571428571, - "precision": 0.75, + "f1-score": 1.0, + "precision": 1.0, "recall": 1.0, "support": 3.0 }, "awareness": { - "f1-score": 0.9090909090909091, - "precision": 0.8333333333333334, + "f1-score": 0.8333333333333334, + "precision": 0.7142857142857143, "recall": 1.0, "support": 5.0 }, @@ -35,21 +35,21 @@ "support": 4.0 }, "macro avg": { - "f1-score": 0.8566790352504637, - "precision": 0.880952380952381, - "recall": 0.8571428571428571, + "f1-score": 0.8254483611626469, + "precision": 0.8520408163265306, + "recall": 0.8214285714285714, "support": 29.0 }, "post_purchase": { - "f1-score": 0.8571428571428571, - "precision": 1.0, + "f1-score": 0.75, + "precision": 0.75, "recall": 0.75, "support": 4.0 }, "research": { - "f1-score": 0.75, - "precision": 0.75, - "recall": 0.75, + "f1-score": 0.5714285714285714, + "precision": 0.6666666666666666, + "recall": 0.5, "support": 4.0 }, "support": { @@ -59,9 +59,9 @@ "support": 4.0 }, "weighted avg": { - "f1-score": 0.8602776533811015, - "precision": 0.8821839080459771, - "recall": 0.8620689655172413, + "f1-score": 0.822585460516495, + "precision": 0.8415435139573071, + "recall": 0.8275862068965517, "support": 29.0 } }, diff --git a/artifacts/evaluation/latest/iab_behavior_lock_regression.json b/artifacts/evaluation/latest/iab_behavior_lock_regression.json index 53e99132e43ed85484e31771835d74752e087a91..8998938437c0d692651a727f43f935f4c94cecf7 100644 --- a/artifacts/evaluation/latest/iab_behavior_lock_regression.json +++ b/artifacts/evaluation/latest/iab_behavior_lock_regression.json @@ -13,7 +13,7 @@ "results": [ 
{ "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Automotive", "model_output.classification.iab_content.tier2.label": null }, @@ -24,11 +24,6 @@ }, "id": "car-buying-maps-to-automotive-buying", "mismatches": [ - { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, { "actual": null, "expected": "Auto Type", @@ -106,8 +101,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -121,6 +116,16 @@ "actual": "exact", "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "CRM education should resolve to the closest business/sales path, not generic software.", @@ -130,9 +135,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Robotics", + "model_output.classification.iab_content.tier2.label": null, 
"model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -144,12 +149,7 @@ "id": "crm-comparison-maps-to-sales", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Robotics", + "actual": null, "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -166,9 +166,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Careers", - "model_output.classification.iab_content.tier2.label": "Job Search", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -185,12 +185,7 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Job Search", + "actual": null, "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -208,7 +203,7 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Science" + "model_output.classification.iab_content.tier1.label": "Real Estate" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -217,7 +212,7 @@ "id": "ml-explanation-maps-to-ai", "mismatches": [ { - "actual": "Science", + "actual": "Real Estate", "expected": "Technology & Computing", "path": "model_output.classification.iab_content.tier1.label" } @@ -229,10 +224,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - 
"model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Information and Network Security" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Personal Finance", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -243,12 +238,17 @@ "id": "support-credential-help-maps-to-business-it", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": "Personal Finance", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Information and Network Security", + "actual": null, "expected": "Internet", "path": "model_output.classification.iab_content.tier3.label" } @@ -284,9 +284,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Sensitive Topics", - "model_output.classification.iab_content.tier2.label": "Crime & Harmful Acts to Individuals, Society & Human Right Violations", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Sports", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -298,17 +298,12 @@ "id": "trial-signup-maps-to-software", "mismatches": [ { - "actual": "Sensitive Topics", + "actual": "Sports", "expected": "Hobbies & Interests", "path": "model_output.classification.iab_content.tier1.label" }, { - 
"actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Crime & Harmful Acts to Individuals, Society & Human Right Violations", + "actual": null, "expected": "Content Production", "path": "model_output.classification.iab_content.tier2.label" }, diff --git a/artifacts/evaluation/latest/iab_content_cross_vertical_benchmark_report.json b/artifacts/evaluation/latest/iab_content_cross_vertical_benchmark_report.json index 0afa8ca3e8e1857c41d097763a88551bc37cac7c..cd86f797b235c0670b111933adb49e888554cba0 100644 --- a/artifacts/evaluation/latest/iab_content_cross_vertical_benchmark_report.json +++ b/artifacts/evaluation/latest/iab_content_cross_vertical_benchmark_report.json @@ -1,90 +1,90 @@ { - "accepted_accuracy": 0.427, - "accepted_coverage": 0.9889, - "accuracy": 0.4222, + "accepted_accuracy": 0.3108, + "accepted_coverage": 0.8222, + "accuracy": 0.2556, "count": 90, "dataset_path": "/content/agentic-intent-classifier/data/iab_cross_vertical_benchmark.jsonl", "difficulty_breakdown": { "easy": { - "accepted_accuracy": 0.4138, - "accepted_coverage": 0.9667, - "accuracy": 0.4, + "accepted_accuracy": 0.3636, + "accepted_coverage": 0.7333, + "accuracy": 0.2667, "count": 30, - "fallback_rate": 0.0333, - "macro_f1": 0.2727 + "fallback_rate": 0.2667, + "macro_f1": 0.1778 }, "hard": { - "accepted_accuracy": 0.4667, - "accepted_coverage": 1.0, - "accuracy": 0.4667, + "accepted_accuracy": 0.3077, + "accepted_coverage": 0.8667, + "accuracy": 0.2667, "count": 30, - "fallback_rate": 0.0, - "macro_f1": 0.3106 + "fallback_rate": 0.1333, + "macro_f1": 0.1562 }, "medium": { - "accepted_accuracy": 0.4, - "accepted_coverage": 1.0, - "accuracy": 0.4, + "accepted_accuracy": 0.2692, + "accepted_coverage": 0.8667, + "accuracy": 0.2333, "count": 30, - "fallback_rate": 0.0, - "macro_f1": 0.2667 + "fallback_rate": 0.1333, + "macro_f1": 0.1591 } }, - "fallback_rate": 0.0111, + "fallback_rate": 0.1778, "head": 
"iab_content", - "macro_f1": 0.227, + "macro_f1": 0.1228, "primary_source": "supervised_classifier", "suite": "cross_vertical_benchmark", "tier_metrics": { - "average_prediction_depth": 2.4, + "average_prediction_depth": 1.9222, "error_buckets": { - "exact_match": 38, - "parent_safe_stop": 1, - "right_tier1_wrong_tier2": 14, + "exact_match": 23, + "parent_safe_stop": 3, + "right_tier1_wrong_tier2": 23, "wrong_deep_leaf": 8, - "wrong_tier1": 29 + "wrong_tier1": 33 }, - "exact_path_accuracy": 0.4222, - "parent_safe_accuracy": 0.4444, - "tier1_accuracy": 0.6778, - "tier2_accuracy": 0.4881, - "tier3_accuracy": 0.5238, - "tier4_accuracy": 0.5 + "exact_path_accuracy": 0.2556, + "parent_safe_accuracy": 0.4222, + "tier1_accuracy": 0.6333, + "tier2_accuracy": 0.3571, + "tier3_accuracy": 0.2381, + "tier4_accuracy": 0.0 }, "view_metrics": { "classifier": { - "average_prediction_depth": 2.4, + "average_prediction_depth": 1.9222, "error_buckets": { - "exact_match": 37, - "parent_safe_stop": 1, - "right_tier1_wrong_tier2": 14, - "wrong_deep_leaf": 9, - "wrong_tier1": 29 + "exact_match": 23, + "parent_safe_stop": 3, + "right_tier1_wrong_tier2": 23, + "wrong_deep_leaf": 8, + "wrong_tier1": 33 }, - "exact_path_accuracy": 0.4111, - "parent_safe_accuracy": 0.4333, - "tier1_accuracy": 0.6778, - "tier2_accuracy": 0.4881, - "tier3_accuracy": 0.4762, - "tier4_accuracy": 0.5 + "exact_path_accuracy": 0.2556, + "parent_safe_accuracy": 0.4222, + "tier1_accuracy": 0.6333, + "tier2_accuracy": 0.3571, + "tier3_accuracy": 0.2381, + "tier4_accuracy": 0.0 }, "combined_path": { - "average_prediction_depth": 2.4, + "average_prediction_depth": 1.9222, "error_buckets": { - "exact_match": 37, - "parent_safe_stop": 1, - "right_tier1_wrong_tier2": 14, - "wrong_deep_leaf": 9, - "wrong_tier1": 29 + "exact_match": 23, + "parent_safe_stop": 3, + "right_tier1_wrong_tier2": 23, + "wrong_deep_leaf": 8, + "wrong_tier1": 33 }, - "exact_path_accuracy": 0.4111, - "fallback_overuse_count": 25, - "fallback_rate": 
0.2778, - "parent_safe_accuracy": 0.4333, - "tier1_accuracy": 0.6778, - "tier2_accuracy": 0.4881, - "tier3_accuracy": 0.4762, - "tier4_accuracy": 0.5 + "exact_path_accuracy": 0.2556, + "fallback_overuse_count": 19, + "fallback_rate": 0.2111, + "parent_safe_accuracy": 0.4222, + "tier1_accuracy": 0.6333, + "tier2_accuracy": 0.3571, + "tier3_accuracy": 0.2381, + "tier4_accuracy": 0.0 }, "disagreements": { "classifier_vs_combined": 0 diff --git a/artifacts/evaluation/latest/iab_content_difficulty_benchmark_report.json b/artifacts/evaluation/latest/iab_content_difficulty_benchmark_report.json index 4e5d6b9465b37fb5007a3b6237115dba9b3e1eb9..1d8534bf2659e4204b8e20a7115ad0c54bfc9906 100644 --- a/artifacts/evaluation/latest/iab_content_difficulty_benchmark_report.json +++ b/artifacts/evaluation/latest/iab_content_difficulty_benchmark_report.json @@ -1,90 +1,90 @@ { - "accepted_accuracy": 0.4231, - "accepted_coverage": 1.0, - "accuracy": 0.4231, + "accepted_accuracy": 0.32, + "accepted_coverage": 0.8013, + "accuracy": 0.2564, "count": 156, "dataset_path": "/content/agentic-intent-classifier/data/iab_benchmark.jsonl", "difficulty_breakdown": { "easy": { - "accepted_accuracy": 0.4615, - "accepted_coverage": 1.0, - "accuracy": 0.4615, + "accepted_accuracy": 0.35, + "accepted_coverage": 0.7692, + "accuracy": 0.2692, "count": 52, - "fallback_rate": 0.0, - "macro_f1": 0.2359 + "fallback_rate": 0.2308, + "macro_f1": 0.153 }, "hard": { - "accepted_accuracy": 0.3654, - "accepted_coverage": 1.0, - "accuracy": 0.3654, + "accepted_accuracy": 0.275, + "accepted_coverage": 0.7692, + "accuracy": 0.2115, "count": 52, - "fallback_rate": 0.0, - "macro_f1": 0.1892 + "fallback_rate": 0.2308, + "macro_f1": 0.1108 }, "medium": { - "accepted_accuracy": 0.4423, - "accepted_coverage": 1.0, - "accuracy": 0.4423, + "accepted_accuracy": 0.3333, + "accepted_coverage": 0.8654, + "accuracy": 0.2885, "count": 52, - "fallback_rate": 0.0, - "macro_f1": 0.2338 + "fallback_rate": 0.1346, + "macro_f1": 0.1491 } 
}, - "fallback_rate": 0.0, + "fallback_rate": 0.1987, "head": "iab_content", - "macro_f1": 0.1524, + "macro_f1": 0.105, "primary_source": "supervised_classifier", "suite": "difficulty_benchmark", "tier_metrics": { - "average_prediction_depth": 2.4103, + "average_prediction_depth": 1.7564, "error_buckets": { - "exact_match": 66, - "parent_safe_stop": 1, - "right_tier1_wrong_tier2": 42, - "wrong_deep_leaf": 8, - "wrong_tier1": 39 + "exact_match": 40, + "parent_safe_stop": 11, + "right_tier1_wrong_tier2": 58, + "wrong_deep_leaf": 1, + "wrong_tier1": 46 }, - "exact_path_accuracy": 0.4231, - "parent_safe_accuracy": 0.5385, - "tier1_accuracy": 0.75, - "tier2_accuracy": 0.4808, - "tier3_accuracy": 0.5093, - "tier4_accuracy": 0.4583 + "exact_path_accuracy": 0.2564, + "parent_safe_accuracy": 0.6218, + "tier1_accuracy": 0.7051, + "tier2_accuracy": 0.3333, + "tier3_accuracy": 0.2315, + "tier4_accuracy": 0.0 }, "view_metrics": { "classifier": { - "average_prediction_depth": 2.4103, + "average_prediction_depth": 1.7564, "error_buckets": { - "exact_match": 59, - "parent_safe_stop": 1, - "right_tier1_wrong_tier2": 42, - "wrong_deep_leaf": 15, - "wrong_tier1": 39 + "exact_match": 40, + "parent_safe_stop": 11, + "right_tier1_wrong_tier2": 58, + "wrong_deep_leaf": 1, + "wrong_tier1": 46 }, - "exact_path_accuracy": 0.3782, - "parent_safe_accuracy": 0.4936, - "tier1_accuracy": 0.75, - "tier2_accuracy": 0.4808, - "tier3_accuracy": 0.4259, - "tier4_accuracy": 0.1667 + "exact_path_accuracy": 0.2564, + "parent_safe_accuracy": 0.6218, + "tier1_accuracy": 0.7051, + "tier2_accuracy": 0.3333, + "tier3_accuracy": 0.2315, + "tier4_accuracy": 0.0 }, "combined_path": { - "average_prediction_depth": 2.4103, + "average_prediction_depth": 1.7564, "error_buckets": { - "exact_match": 59, - "parent_safe_stop": 1, - "right_tier1_wrong_tier2": 42, - "wrong_deep_leaf": 15, - "wrong_tier1": 39 + "exact_match": 40, + "parent_safe_stop": 11, + "right_tier1_wrong_tier2": 58, + "wrong_deep_leaf": 1, + 
"wrong_tier1": 46 }, - "exact_path_accuracy": 0.3782, - "fallback_overuse_count": 15, - "fallback_rate": 0.0962, - "parent_safe_accuracy": 0.4936, - "tier1_accuracy": 0.75, - "tier2_accuracy": 0.4808, - "tier3_accuracy": 0.4259, - "tier4_accuracy": 0.1667 + "exact_path_accuracy": 0.2564, + "fallback_overuse_count": 13, + "fallback_rate": 0.0833, + "parent_safe_accuracy": 0.6218, + "tier1_accuracy": 0.7051, + "tier2_accuracy": 0.3333, + "tier3_accuracy": 0.2315, + "tier4_accuracy": 0.0 }, "disagreements": { "classifier_vs_combined": 0 diff --git a/artifacts/evaluation/latest/iab_content_extended_cases_report.json b/artifacts/evaluation/latest/iab_content_extended_cases_report.json index e1ac1e327074bcccf079f5bb70a957a2aa80065f..171c7f93e8f9ef6932c330103c5aebd6251a691b 100644 --- a/artifacts/evaluation/latest/iab_content_extended_cases_report.json +++ b/artifacts/evaluation/latest/iab_content_extended_cases_report.json @@ -1,58 +1,58 @@ { - "accepted_accuracy": 0.5, - "accepted_coverage": 1.0, - "accuracy": 0.5, + "accepted_accuracy": 0.6, + "accepted_coverage": 0.625, + "accuracy": 0.375, "count": 8, "dataset_path": "/content/agentic-intent-classifier/data/iab/extended_cases.jsonl", - "fallback_rate": 0.0, + "fallback_rate": 0.375, "head": "iab_content", - "macro_f1": 0.3333, + "macro_f1": 0.2308, "primary_source": "supervised_classifier", "suite": "extended_cases", "tier_metrics": { - "average_prediction_depth": 2.125, + "average_prediction_depth": 1.75, "error_buckets": { - "exact_match": 4, - "right_tier1_wrong_tier2": 2, + "exact_match": 3, + "right_tier1_wrong_tier2": 1, "wrong_deep_leaf": 1, - "wrong_tier1": 1 + "wrong_tier1": 3 }, - "exact_path_accuracy": 0.5, - "parent_safe_accuracy": 0.5, - "tier1_accuracy": 0.875, + "exact_path_accuracy": 0.375, + "parent_safe_accuracy": 0.375, + "tier1_accuracy": 0.625, "tier2_accuracy": 0.5714, "tier3_accuracy": 0.0, "tier4_accuracy": 0.0 }, "view_metrics": { "classifier": { - "average_prediction_depth": 2.125, + 
"average_prediction_depth": 1.75, "error_buckets": { - "exact_match": 4, - "right_tier1_wrong_tier2": 2, + "exact_match": 3, + "right_tier1_wrong_tier2": 1, "wrong_deep_leaf": 1, - "wrong_tier1": 1 + "wrong_tier1": 3 }, - "exact_path_accuracy": 0.5, - "parent_safe_accuracy": 0.5, - "tier1_accuracy": 0.875, + "exact_path_accuracy": 0.375, + "parent_safe_accuracy": 0.375, + "tier1_accuracy": 0.625, "tier2_accuracy": 0.5714, "tier3_accuracy": 0.0, "tier4_accuracy": 0.0 }, "combined_path": { - "average_prediction_depth": 2.125, + "average_prediction_depth": 1.75, "error_buckets": { - "exact_match": 4, - "right_tier1_wrong_tier2": 2, + "exact_match": 3, + "right_tier1_wrong_tier2": 1, "wrong_deep_leaf": 1, - "wrong_tier1": 1 + "wrong_tier1": 3 }, - "exact_path_accuracy": 0.5, + "exact_path_accuracy": 0.375, "fallback_overuse_count": 2, "fallback_rate": 0.25, - "parent_safe_accuracy": 0.5, - "tier1_accuracy": 0.875, + "parent_safe_accuracy": 0.375, + "tier1_accuracy": 0.625, "tier2_accuracy": 0.5714, "tier3_accuracy": 0.0, "tier4_accuracy": 0.0 diff --git a/artifacts/evaluation/latest/iab_content_hard_cases_report.json b/artifacts/evaluation/latest/iab_content_hard_cases_report.json index fa18dce2744f7b21ccc3bf1bede556a9d8f93a8d..7a85669e782b4334395fda563e76d7fd4453360a 100644 --- a/artifacts/evaluation/latest/iab_content_hard_cases_report.json +++ b/artifacts/evaluation/latest/iab_content_hard_cases_report.json @@ -1,16 +1,16 @@ { - "accepted_accuracy": 0.4286, - "accepted_coverage": 0.875, + "accepted_accuracy": 0.6, + "accepted_coverage": 0.625, "accuracy": 0.375, "count": 8, "dataset_path": "/content/agentic-intent-classifier/data/iab/hard_cases.jsonl", - "fallback_rate": 0.125, + "fallback_rate": 0.375, "head": "iab_content", "macro_f1": 0.2308, "primary_source": "supervised_classifier", "suite": "hard_cases", "tier_metrics": { - "average_prediction_depth": 2.25, + "average_prediction_depth": 1.75, "error_buckets": { "exact_match": 3, "right_tier1_wrong_tier2": 1, 
@@ -25,7 +25,7 @@ }, "view_metrics": { "classifier": { - "average_prediction_depth": 2.25, + "average_prediction_depth": 1.75, "error_buckets": { "exact_match": 3, "right_tier1_wrong_tier2": 1, @@ -39,7 +39,7 @@ "tier4_accuracy": 0.0 }, "combined_path": { - "average_prediction_depth": 2.25, + "average_prediction_depth": 1.75, "error_buckets": { "exact_match": 3, "right_tier1_wrong_tier2": 1, diff --git a/artifacts/evaluation/latest/iab_content_test_report.json b/artifacts/evaluation/latest/iab_content_test_report.json index 27c4e2a3dadfb29307c90dd87a8b951326f852c6..5571768b6fdfe56eef349f0fd54706d3fbb81041 100644 --- a/artifacts/evaluation/latest/iab_content_test_report.json +++ b/artifacts/evaluation/latest/iab_content_test_report.json @@ -1,46 +1,46 @@ { - "accepted_accuracy": 0.943, - "accepted_coverage": 1.0, - "accuracy": 0.943, + "accepted_accuracy": 0.9278, + "accepted_coverage": 0.996, + "accuracy": 0.9247, "count": 3282, "dataset_path": "/content/agentic-intent-classifier/data/iab/test.jsonl", - "fallback_rate": 0.0, + "fallback_rate": 0.004, "head": "iab_content", - "macro_f1": 0.911, + "macro_f1": 0.8814, "primary_source": "supervised_classifier", "suite": "test", "tier_metrics": { - "average_prediction_depth": 2.213, + "average_prediction_depth": 2.1706, "error_buckets": { - "exact_match": 3095, - "parent_safe_stop": 45, - "right_tier1_wrong_tier2": 41, - "wrong_deep_leaf": 72, - "wrong_tier1": 29 + "exact_match": 3035, + "parent_safe_stop": 87, + "right_tier1_wrong_tier2": 56, + "wrong_deep_leaf": 69, + "wrong_tier1": 35 }, - "exact_path_accuracy": 0.943, - "parent_safe_accuracy": 0.958, - "tier1_accuracy": 0.9912, - "tier2_accuracy": 0.9776, - "tier3_accuracy": 0.9078, - "tier4_accuracy": 0.7 + "exact_path_accuracy": 0.9247, + "parent_safe_accuracy": 0.961, + "tier1_accuracy": 0.9893, + "tier2_accuracy": 0.9707, + "tier3_accuracy": 0.8487, + "tier4_accuracy": 0.5714 }, "view_metrics": { "classifier": { - "average_prediction_depth": 2.213, + 
"average_prediction_depth": 2.1706, "error_buckets": { - "exact_match": 3052, - "parent_safe_stop": 44, - "right_tier1_wrong_tier2": 53, - "wrong_deep_leaf": 104, - "wrong_tier1": 29 + "exact_match": 3004, + "parent_safe_stop": 84, + "right_tier1_wrong_tier2": 68, + "wrong_deep_leaf": 91, + "wrong_tier1": 35 }, - "exact_path_accuracy": 0.9299, - "parent_safe_accuracy": 0.9445, - "tier1_accuracy": 0.9912, - "tier2_accuracy": 0.9734, - "tier3_accuracy": 0.8725, - "tier4_accuracy": 0.5 + "exact_path_accuracy": 0.9153, + "parent_safe_accuracy": 0.9506, + "tier1_accuracy": 0.9893, + "tier2_accuracy": 0.9665, + "tier3_accuracy": 0.8259, + "tier4_accuracy": 0.4429 }, "combined_path": { "count": 3282, diff --git a/artifacts/evaluation/latest/iab_content_train_report.json b/artifacts/evaluation/latest/iab_content_train_report.json index c71b063661998121cf55f61491790f628b8be659..ddfc220c4eb4ef19336c5788c65a8d5fd66d7e40 100644 --- a/artifacts/evaluation/latest/iab_content_train_report.json +++ b/artifacts/evaluation/latest/iab_content_train_report.json @@ -1,46 +1,46 @@ { - "accepted_accuracy": 0.9459, - "accepted_coverage": 1.0, - "accuracy": 0.9459, + "accepted_accuracy": 0.9314, + "accepted_coverage": 0.9972, + "accuracy": 0.9295, "count": 13211, "dataset_path": "/content/agentic-intent-classifier/data/iab/train.jsonl", - "fallback_rate": 0.0, + "fallback_rate": 0.0028, "head": "iab_content", - "macro_f1": 0.9194, + "macro_f1": 0.8927, "primary_source": "supervised_classifier", "suite": "train", "tier_metrics": { - "average_prediction_depth": 2.2105, + "average_prediction_depth": 2.1683, "error_buckets": { - "exact_match": 12496, - "parent_safe_stop": 162, - "right_tier1_wrong_tier2": 144, - "wrong_deep_leaf": 284, - "wrong_tier1": 125 + "exact_match": 12280, + "parent_safe_stop": 312, + "right_tier1_wrong_tier2": 215, + "wrong_deep_leaf": 288, + "wrong_tier1": 116 }, - "exact_path_accuracy": 0.9459, - "parent_safe_accuracy": 0.9585, - "tier1_accuracy": 0.9905, - 
"tier2_accuracy": 0.9805, - "tier3_accuracy": 0.9135, - "tier4_accuracy": 0.7268 + "exact_path_accuracy": 0.9295, + "parent_safe_accuracy": 0.9618, + "tier1_accuracy": 0.9912, + "tier2_accuracy": 0.9737, + "tier3_accuracy": 0.8557, + "tier4_accuracy": 0.6107 }, "view_metrics": { "classifier": { - "average_prediction_depth": 2.2105, + "average_prediction_depth": 2.1683, "error_buckets": { - "exact_match": 12323, - "parent_safe_stop": 157, - "right_tier1_wrong_tier2": 192, - "wrong_deep_leaf": 414, - "wrong_tier1": 125 + "exact_match": 12145, + "parent_safe_stop": 300, + "right_tier1_wrong_tier2": 263, + "wrong_deep_leaf": 387, + "wrong_tier1": 116 }, - "exact_path_accuracy": 0.9328, - "parent_safe_accuracy": 0.945, - "tier1_accuracy": 0.9905, - "tier2_accuracy": 0.9764, - "tier3_accuracy": 0.8777, - "tier4_accuracy": 0.525 + "exact_path_accuracy": 0.9193, + "parent_safe_accuracy": 0.9507, + "tier1_accuracy": 0.9912, + "tier2_accuracy": 0.9695, + "tier3_accuracy": 0.8301, + "tier4_accuracy": 0.475 }, "combined_path": { "count": 13211, diff --git a/artifacts/evaluation/latest/iab_content_val_report.json b/artifacts/evaluation/latest/iab_content_val_report.json index e6180d848166b6ebaf680a7c1d583dd7004e9aaf..07f7b7c308a572fcacf4bfda0c00cdefd92f7d3a 100644 --- a/artifacts/evaluation/latest/iab_content_val_report.json +++ b/artifacts/evaluation/latest/iab_content_val_report.json @@ -1,46 +1,46 @@ { - "accepted_accuracy": 0.9442, - "accepted_coverage": 1.0, - "accuracy": 0.9442, + "accepted_accuracy": 0.9273, + "accepted_coverage": 0.9973, + "accuracy": 0.9254, "count": 3282, "dataset_path": "/content/agentic-intent-classifier/data/iab/val.jsonl", - "fallback_rate": 0.0, + "fallback_rate": 0.0027, "head": "iab_content", - "macro_f1": 0.9166, + "macro_f1": 0.8864, "primary_source": "supervised_classifier", "suite": "val", "tier_metrics": { - "average_prediction_depth": 2.2151, + "average_prediction_depth": 2.1709, "error_buckets": { - "exact_match": 3099, - 
"parent_safe_stop": 35, - "right_tier1_wrong_tier2": 45, - "wrong_deep_leaf": 72, - "wrong_tier1": 31 + "exact_match": 3037, + "parent_safe_stop": 80, + "right_tier1_wrong_tier2": 55, + "wrong_deep_leaf": 74, + "wrong_tier1": 36 }, - "exact_path_accuracy": 0.9442, - "parent_safe_accuracy": 0.9576, - "tier1_accuracy": 0.9906, - "tier2_accuracy": 0.9769, - "tier3_accuracy": 0.9088, - "tier4_accuracy": 0.7286 + "exact_path_accuracy": 0.9254, + "parent_safe_accuracy": 0.9613, + "tier1_accuracy": 0.989, + "tier2_accuracy": 0.9713, + "tier3_accuracy": 0.8549, + "tier4_accuracy": 0.6071 }, "view_metrics": { "classifier": { - "average_prediction_depth": 2.2151, + "average_prediction_depth": 2.1709, "error_buckets": { - "exact_match": 3056, - "parent_safe_stop": 34, - "right_tier1_wrong_tier2": 57, - "wrong_deep_leaf": 104, - "wrong_tier1": 31 + "exact_match": 3002, + "parent_safe_stop": 78, + "right_tier1_wrong_tier2": 67, + "wrong_deep_leaf": 99, + "wrong_tier1": 36 }, - "exact_path_accuracy": 0.9311, - "parent_safe_accuracy": 0.9442, - "tier1_accuracy": 0.9906, - "tier2_accuracy": 0.9727, - "tier3_accuracy": 0.8736, - "tier4_accuracy": 0.5286 + "exact_path_accuracy": 0.9147, + "parent_safe_accuracy": 0.95, + "tier1_accuracy": 0.989, + "tier2_accuracy": 0.9672, + "tier3_accuracy": 0.829, + "tier4_accuracy": 0.4643 }, "combined_path": { "count": 3282, diff --git a/artifacts/evaluation/latest/iab_cross_vertical_behavior_lock_regression.json b/artifacts/evaluation/latest/iab_cross_vertical_behavior_lock_regression.json index b50acd3aa09bda431eba62ef3ebf6ccac57ec156..6de4c767990c75ed447c3089aee7236c8fc47b5a 100644 --- a/artifacts/evaluation/latest/iab_cross_vertical_behavior_lock_regression.json +++ b/artifacts/evaluation/latest/iab_cross_vertical_behavior_lock_regression.json @@ -1,21 +1,21 @@ { "by_status": { "must_fix": { - "failed": 90, - "passed": 0, + "failed": 88, + "passed": 2, "total": 90 } }, "cases_path": 
"/content/agentic-intent-classifier/examples/iab_cross_vertical_behavior_lock_cases.json", "count": 90, - "failed": 90, - "passed": 0, + "failed": 88, + "passed": 2, "results": [ { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Personal Finance", - "model_output.classification.iab_content.tier2.label": "Insurance" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Travel", + "model_output.classification.iab_content.tier2.label": "Travel Type" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -23,31 +23,15 @@ "model_output.classification.iab_content.tier2.label": "Travel Type" }, "id": "auto-buying-easy", - "mismatches": [ - { - "actual": "Personal Finance", - "expected": "Travel", - "path": "model_output.classification.iab_content.tier1.label" - }, - { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Insurance", - "expected": "Travel Type", - "path": "model_output.classification.iab_content.tier2.label" - } - ], + "mismatches": [], "notes": "Cross-vertical easy IAB mapping case for Automotive > Auto Buying and Selling.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Which car should I buy for commuting?" 
}, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Automotive", "model_output.classification.iab_content.tier2.label": "Auto Body Styles" }, @@ -57,23 +41,17 @@ "model_output.classification.iab_content.tier2.label": "Auto Body Styles" }, "id": "auto-buying-medium", - "mismatches": [ - { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical medium IAB mapping case for Automotive > Auto Buying and Selling.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Best used SUV for a family of four" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Automotive", - "model_output.classification.iab_content.tier2.label": "Car Culture" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -83,12 +61,7 @@ "id": "auto-buying-hard", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Car Culture", + "actual": null, "expected": "Auto Type", "path": "model_output.classification.iab_content.tier2.label" } @@ -102,8 +75,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier2.label": null, + 
"model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -117,6 +90,16 @@ "actual": "exact", "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical easy IAB mapping case for Business and Finance > Business > Sales.", @@ -126,9 +109,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Robotics", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -140,12 +123,7 @@ "id": "sales-crm-medium", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Robotics", + "actual": null, "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -165,7 +143,7 @@ "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Business and Finance", "model_output.classification.iab_content.tier2.label": "Business", - "model_output.classification.iab_content.tier3.label": "Sales" + "model_output.classification.iab_content.tier3.label": "Startups" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -179,6 +157,11 @@ "actual": "exact", "expected": "nearest_equivalent", "path": 
"model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Startups", + "expected": "Sales", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical hard IAB mapping case for Business and Finance > Business > Sales.", @@ -188,9 +171,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Careers", - "model_output.classification.iab_content.tier2.label": "Job Search", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Technology & Computing", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -202,17 +185,7 @@ "id": "marketing-tools-easy", "mismatches": [ { - "actual": "Careers", - "expected": "Technology & Computing", - "path": "model_output.classification.iab_content.tier1.label" - }, - { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Job Search", + "actual": null, "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -229,9 +202,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Sensitive Topics", - "model_output.classification.iab_content.tier2.label": "Terrorism" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Technology & Computing", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -241,17 +214,12 @@ "id": "marketing-tools-medium", "mismatches": [ { - "actual": "Sensitive Topics", + "actual": "Technology & 
Computing", "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Terrorism", + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" } @@ -263,9 +231,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Personal Finance", - "model_output.classification.iab_content.tier2.label": "Home Utilities", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -277,17 +245,12 @@ "id": "marketing-tools-hard", "mismatches": [ { - "actual": "Personal Finance", + "actual": "Careers", "expected": "Technology & Computing", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Home Utilities", + "actual": null, "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -304,10 +267,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Information and Network Security" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": null, + 
"model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -318,12 +281,17 @@ "id": "business-it-easy", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Information and Network Security", + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, "expected": "Internet", "path": "model_output.classification.iab_content.tier3.label" } @@ -335,9 +303,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Careers", - "model_output.classification.iab_content.tier2.label": "Job Search" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Personal Finance", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -347,9 +315,14 @@ "id": "business-it-medium", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": "Personal Finance", + "expected": "Careers", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": "Job Search", + "path": "model_output.classification.iab_content.tier2.label" } ], "notes": "Cross-vertical medium IAB mapping case for Business and Finance > Business > Business I.T..", @@ -415,8 +388,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": 
"Attractions", - "model_output.classification.iab_content.tier2.label": "Bars & Restaurants" + "model_output.classification.iab_content.tier1.label": "Food & Drink", + "model_output.classification.iab_content.tier2.label": "Dining Out" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -425,20 +398,10 @@ }, "id": "dining-out-medium", "mismatches": [ - { - "actual": "Attractions", - "expected": "Food & Drink", - "path": "model_output.classification.iab_content.tier1.label" - }, { "actual": "exact", "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Bars & Restaurants", - "expected": "Dining Out", - "path": "model_output.classification.iab_content.tier2.label" } ], "notes": "Cross-vertical medium IAB mapping case for Food & Drink > Dining Out.", @@ -449,8 +412,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Attractions", - "model_output.classification.iab_content.tier2.label": "Bars & Restaurants" + "model_output.classification.iab_content.tier1.label": "Food & Drink", + "model_output.classification.iab_content.tier2.label": "Dining Out" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -459,20 +422,10 @@ }, "id": "dining-out-hard", "mismatches": [ - { - "actual": "Attractions", - "expected": "Food & Drink", - "path": "model_output.classification.iab_content.tier1.label" - }, { "actual": "exact", "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Bars & Restaurants", - "expected": "Dining Out", - "path": "model_output.classification.iab_content.tier2.label" } ], "notes": "Cross-vertical hard IAB mapping case for Food & Drink > Dining Out.", @@ -549,7 +502,7 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": 
"nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Science" + "model_output.classification.iab_content.tier1.label": "Real Estate" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -558,7 +511,7 @@ "id": "artificial-intelligence-easy", "mismatches": [ { - "actual": "Science", + "actual": "Real Estate", "expected": "Technology & Computing", "path": "model_output.classification.iab_content.tier1.label" } @@ -570,9 +523,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Education", - "model_output.classification.iab_content.tier2.label": "Language Learning" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -582,9 +535,9 @@ "id": "artificial-intelligence-medium", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": null, + "expected": "Language Learning", + "path": "model_output.classification.iab_content.tier2.label" } ], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Artificial Intelligence.", @@ -619,8 +572,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing" + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": "Job Search" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -630,7 +583,7 @@ "id": "software-apps-easy", "mismatches": [ { - "actual": "Technology & Computing", + "actual": 
"Careers", "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, @@ -640,7 +593,7 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Computing", + "actual": "Job Search", "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" } @@ -654,8 +607,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null, "model_output.classification.iab_content.tier4.label": null }, "expected": { @@ -673,7 +626,12 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Software and Applications", + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, "expected": "Internet", "path": "model_output.classification.iab_content.tier3.label" }, @@ -691,8 +649,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Virtual Reality", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": "Job Search", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -703,13 +661,18 @@ }, "id": "software-apps-hard", "mismatches": [ + { + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + }, { "actual": "exact", "expected": "nearest_equivalent", "path": 
"model_output.classification.iab_content.mapping_mode" }, { - "actual": "Virtual Reality", + "actual": "Job Search", "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -774,10 +737,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Information and Network Security", + "model_output.classification.iab_content.tier3.label": null, "model_output.classification.iab_content.tier4.label": null }, "expected": { @@ -790,12 +753,7 @@ "id": "communication-software-medium", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Information and Network Security", + "actual": null, "expected": "Software and Applications", "path": "model_output.classification.iab_content.tier3.label" }, @@ -812,9 +770,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Virtual Reality", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null, "model_output.classification.iab_content.tier4.label": null }, @@ -828,12 +786,12 @@ "id": "communication-software-hard", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": "Careers", + "expected": 
"Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Virtual Reality", + "actual": null, "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -858,8 +816,8 @@ "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet", - "model_output.classification.iab_content.tier4.label": "Web Hosting" + "model_output.classification.iab_content.tier3.label": "Data Storage and Warehousing", + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -874,6 +832,16 @@ "actual": "exact", "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Data Storage and Warehousing", + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Web Hosting", + "path": "model_output.classification.iab_content.tier4.label" } ], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Computing > Internet > Web Hosting.", @@ -883,11 +851,11 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet", - "model_output.classification.iab_content.tier4.label": "Web Hosting" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null, + 
"model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -899,9 +867,19 @@ "id": "web-hosting-medium", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Web Hosting", + "path": "model_output.classification.iab_content.tier4.label" } ], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Internet > Web Hosting.", @@ -911,11 +889,11 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet", - "model_output.classification.iab_content.tier4.label": "Web Hosting" + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -927,9 +905,14 @@ "id": "web-hosting-hard", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": null, + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Web Hosting", + "path": "model_output.classification.iab_content.tier4.label" } ], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing 
> Computing > Internet > Web Hosting.", @@ -993,8 +976,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Laptops" + "model_output.classification.iab_content.tier2.label": "Consumer Electronics", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1010,12 +993,7 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Computing", - "expected": "Consumer Electronics", - "path": "model_output.classification.iab_content.tier2.label" - }, - { - "actual": "Laptops", + "actual": null, "expected": "Smartphones", "path": "model_output.classification.iab_content.tier3.label" } @@ -1027,11 +1005,11 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications", - "model_output.classification.iab_content.tier4.label": "Computer Animation" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1043,12 +1021,17 @@ "id": "desktops-easy", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": null, + "expected": "Computing", + "path": 
"model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" }, { - "actual": "Computer Animation", + "actual": null, "expected": "Photo Editing Software", "path": "model_output.classification.iab_content.tier4.label" } @@ -1060,10 +1043,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Desktops" + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1074,9 +1057,9 @@ "id": "desktops-medium", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": null, + "expected": "Desktops", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Desktops.", @@ -1086,10 +1069,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Desktops" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1100,9 +1083,14 @@ "id": "desktops-hard", "mismatches": [ 
{ - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Desktops", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Desktops.", @@ -1188,10 +1176,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": "Men's Fashion", - "model_output.classification.iab_content.tier3.label": "Men's Shoes and Footwear" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Shopping", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1202,17 +1190,17 @@ "id": "style-fashion-parent-easy", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": "Shopping", + "expected": "Style & Fashion", + "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Men's Fashion", + "actual": null, "expected": "Women's Fashion", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Men's Shoes and Footwear", + "actual": null, "expected": "Women's Shoes and Footwear", "path": "model_output.classification.iab_content.tier3.label" } @@ -1286,9 +1274,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Sports", - 
"model_output.classification.iab_content.tier2.label": "Bodybuilding", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Style & Fashion", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -1300,17 +1288,7 @@ "id": "womens-shoes-easy", "mismatches": [ { - "actual": "Sports", - "expected": "Style & Fashion", - "path": "model_output.classification.iab_content.tier1.label" - }, - { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Bodybuilding", + "actual": null, "expected": "Women's Fashion", "path": "model_output.classification.iab_content.tier2.label" }, @@ -1327,9 +1305,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Sports", - "model_output.classification.iab_content.tier2.label": "Walking", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Style & Fashion", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -1341,17 +1319,7 @@ "id": "womens-shoes-medium", "mismatches": [ { - "actual": "Sports", - "expected": "Style & Fashion", - "path": "model_output.classification.iab_content.tier1.label" - }, - { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Walking", + "actual": null, "expected": "Women's Fashion", "path": "model_output.classification.iab_content.tier2.label" }, @@ -1396,8 +1364,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style 
& Fashion", - "model_output.classification.iab_content.tier2.label": "Children's Clothing", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier2.label": "Women's Fashion", + "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1413,12 +1381,12 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Children's Clothing", + "actual": "Women's Fashion", "expected": "Men's Fashion", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": null, + "actual": "Women's Shoes and Footwear", "expected": "Men's Clothing", "path": "model_output.classification.iab_content.tier3.label" } @@ -1432,8 +1400,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": "Men's Fashion", - "model_output.classification.iab_content.tier3.label": "Men's Shoes and Footwear" + "model_output.classification.iab_content.tier2.label": "Women's Fashion", + "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1449,7 +1417,12 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Men's Shoes and Footwear", + "actual": "Women's Fashion", + "expected": "Men's Fashion", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": "Women's Shoes and Footwear", "expected": "Men's Clothing", "path": "model_output.classification.iab_content.tier3.label" } @@ -1463,8 +1436,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", - 
"model_output.classification.iab_content.tier2.label": "Children's Clothing", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier2.label": "Women's Fashion", + "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1480,12 +1453,12 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Children's Clothing", + "actual": "Women's Fashion", "expected": "Men's Fashion", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": null, + "actual": "Women's Shoes and Footwear", "expected": "Men's Shoes and Footwear", "path": "model_output.classification.iab_content.tier3.label" } @@ -1531,10 +1504,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Travel", - "model_output.classification.iab_content.tier2.label": "Travel Type", - "model_output.classification.iab_content.tier3.label": "Hotels and Motels" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1545,9 +1518,14 @@ "id": "hotels-medium", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": null, + "expected": "Travel Type", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Hotels and Motels", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical medium IAB mapping case for Travel > Travel Type > Hotels and Motels.", @@ -1559,7 +1537,7 @@ "actual": { 
"model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Travel", - "model_output.classification.iab_content.tier2.label": "Travel Type" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1572,6 +1550,11 @@ "actual": "exact", "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Travel Type", + "path": "model_output.classification.iab_content.tier2.label" } ], "notes": "Cross-vertical hard IAB mapping case for Travel > Travel Type > Hotels and Motels.", @@ -1672,8 +1655,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Healthy Living", - "model_output.classification.iab_content.tier2.label": "Fitness and Exercise", - "model_output.classification.iab_content.tier3.label": "Running and Jogging" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1694,12 +1677,12 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Fitness and Exercise", + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Running and Jogging", + "actual": null, "expected": "Green Solutions", "path": "model_output.classification.iab_content.tier3.label" } @@ -1713,7 +1696,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Sports", - "model_output.classification.iab_content.tier2.label": "Walking", + "model_output.classification.iab_content.tier2.label": "Bodybuilding", 
"model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -1735,7 +1718,7 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Walking", + "actual": "Bodybuilding", "expected": "Fitness and Exercise", "path": "model_output.classification.iab_content.tier2.label" }, @@ -1897,9 +1880,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Travel", - "model_output.classification.iab_content.tier2.label": "Travel Type" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1914,12 +1897,7 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Travel Type", + "actual": null, "expected": "Fiction", "path": "model_output.classification.iab_content.tier2.label" } @@ -1932,7 +1910,7 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Books and Literature" + "model_output.classification.iab_content.tier1.label": "Genres" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1940,6 +1918,11 @@ }, "id": "fiction-hard", "mismatches": [ + { + "actual": "Genres", + "expected": "Books and Literature", + "path": "model_output.classification.iab_content.tier1.label" + }, { "actual": "exact", "expected": "nearest_equivalent", @@ -1955,7 +1938,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Home & Garden", - "model_output.classification.iab_content.tier2.label": "Interior Decorating" + 
"model_output.classification.iab_content.tier2.label": "Remodeling & Construction" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1968,11 +1951,6 @@ "actual": "exact", "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Interior Decorating", - "expected": "Remodeling & Construction", - "path": "model_output.classification.iab_content.tier2.label" } ], "notes": "Cross-vertical easy IAB mapping case for Home & Garden > Home Improvement.", @@ -1983,9 +1961,9 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Home & Garden", - "model_output.classification.iab_content.tier2.label": "Interior Decorating", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier1.label": "Style & Fashion", + "model_output.classification.iab_content.tier2.label": "Personal Care", + "model_output.classification.iab_content.tier3.label": "Bath and Shower" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1995,25 +1973,10 @@ }, "id": "home-improvement-medium", "mismatches": [ - { - "actual": "Home & Garden", - "expected": "Style & Fashion", - "path": "model_output.classification.iab_content.tier1.label" - }, { "actual": "exact", "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Interior Decorating", - "expected": "Personal Care", - "path": "model_output.classification.iab_content.tier2.label" - }, - { - "actual": null, - "expected": "Bath and Shower", - "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical medium IAB mapping case for Home & Garden > Home Improvement.", @@ -2057,9 +2020,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - 
"model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Augmented Reality" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2069,17 +2032,12 @@ "id": "online-education-easy", "mismatches": [ { - "actual": "Technology & Computing", + "actual": "Careers", "expected": "Education", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Augmented Reality", + "actual": null, "expected": "Language Learning", "path": "model_output.classification.iab_content.tier2.label" } @@ -2218,10 +2176,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Medical Health", - "model_output.classification.iab_content.tier2.label": "Diseases and Conditions", - "model_output.classification.iab_content.tier3.label": "Allergies" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Food & Drink", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2232,9 +2190,19 @@ "id": "medical-health-easy", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": "Food & Drink", + "expected": "Medical Health", + "path": "model_output.classification.iab_content.tier1.label" + }, + 
{ + "actual": null, + "expected": "Diseases and Conditions", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Allergies", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical easy IAB mapping case for Medical Health.", @@ -2244,10 +2212,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Medical Health", "model_output.classification.iab_content.tier2.label": "Diseases and Conditions", - "model_output.classification.iab_content.tier3.label": "Bone and Joint Conditions", + "model_output.classification.iab_content.tier3.label": null, "model_output.classification.iab_content.tier4.label": null }, "expected": { @@ -2260,12 +2228,7 @@ "id": "medical-health-medium", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Bone and Joint Conditions", + "actual": null, "expected": "Injuries", "path": "model_output.classification.iab_content.tier3.label" }, @@ -2284,7 +2247,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Medical Health", - "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier2.label": "Surgery", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -2306,7 +2269,7 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": null, + "actual": "Surgery", "expected": "Wellness", "path": "model_output.classification.iab_content.tier2.label" }, @@ -2407,9 +2370,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + 
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Holidays", - "model_output.classification.iab_content.tier2.label": "National & Civic Holidays" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2424,12 +2387,7 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "National & Civic Holidays", + "actual": null, "expected": "Food Movements", "path": "model_output.classification.iab_content.tier2.label" } @@ -2530,9 +2488,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Genres", - "model_output.classification.iab_content.tier2.label": "Family/Children" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Shopping", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2542,17 +2500,12 @@ "id": "parenting-medium", "mismatches": [ { - "actual": "Genres", + "actual": "Shopping", "expected": "Family and Relationships", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Family/Children", + "actual": null, "expected": "Parenting", "path": "model_output.classification.iab_content.tier2.label" } @@ -2567,7 +2520,7 @@ "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Family and Relationships", 
"model_output.classification.iab_content.tier2.label": "Parenting", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier3.label": "Parenting Babies and Toddlers" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2583,7 +2536,7 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": null, + "actual": "Parenting Babies and Toddlers", "expected": "Special Needs Kids", "path": "model_output.classification.iab_content.tier3.label" } @@ -2665,7 +2618,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Entertainment", - "model_output.classification.iab_content.tier2.label": "Movies" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2678,6 +2631,11 @@ "actual": "exact", "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Movies", + "path": "model_output.classification.iab_content.tier2.label" } ], "notes": "Cross-vertical easy IAB mapping case for Entertainment > Movies.", @@ -2688,8 +2646,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Entertainment", - "model_output.classification.iab_content.tier2.label": "Movies", + "model_output.classification.iab_content.tier1.label": "Genres", + "model_output.classification.iab_content.tier2.label": "Horror", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -2701,7 +2659,7 @@ "id": "movies-medium", "mismatches": [ { - "actual": "Entertainment", + "actual": "Genres", "expected": "Video Gaming", "path": "model_output.classification.iab_content.tier1.label" }, @@ -2711,7 +2669,7 @@ "path": 
"model_output.classification.iab_content.mapping_mode" }, { - "actual": "Movies", + "actual": "Horror", "expected": "Video Game Genres", "path": "model_output.classification.iab_content.tier2.label" }, diff --git a/artifacts/evaluation/latest/iab_cross_vertical_quality_target_eval.json b/artifacts/evaluation/latest/iab_cross_vertical_quality_target_eval.json index b269b2367fdf2ce6764627276bb75045080a0aeb..675bcce768bf0edd39d0d34e05d363b99d3a9492 100644 --- a/artifacts/evaluation/latest/iab_cross_vertical_quality_target_eval.json +++ b/artifacts/evaluation/latest/iab_cross_vertical_quality_target_eval.json @@ -1,21 +1,21 @@ { "by_status": { "must_fix": { - "failed": 49, - "passed": 41, + "failed": 64, + "passed": 26, "total": 90 } }, "cases_path": "/content/agentic-intent-classifier/examples/iab_cross_vertical_mapping_cases.json", "count": 90, - "failed": 49, - "passed": 41, + "failed": 64, + "passed": 26, "results": [ { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Personal Finance", - "model_output.classification.iab_content.tier2.label": "Insurance" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Travel", + "model_output.classification.iab_content.tier2.label": "Travel Type" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -25,17 +25,12 @@ "id": "auto-buying-easy", "mismatches": [ { - "actual": "Personal Finance", + "actual": "Travel", "expected": "Automotive", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Insurance", + "actual": "Travel Type", "expected": "Auto Buying and Selling", "path": "model_output.classification.iab_content.tier2.label" } @@ -47,7 +42,7 @@ }, { 
"actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Automotive", "model_output.classification.iab_content.tier2.label": "Auto Body Styles" }, @@ -58,11 +53,6 @@ }, "id": "auto-buying-medium", "mismatches": [ - { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, { "actual": "Auto Body Styles", "expected": "Auto Buying and Selling", @@ -76,9 +66,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Automotive", - "model_output.classification.iab_content.tier2.label": "Car Culture" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -88,12 +78,7 @@ "id": "auto-buying-hard", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Car Culture", + "actual": null, "expected": "Auto Buying and Selling", "path": "model_output.classification.iab_content.tier2.label" } @@ -107,8 +92,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -124,12 +109,12 @@ "path": 
"model_output.classification.iab_content.tier1.label" }, { - "actual": "Computing", + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Software and Applications", + "actual": null, "expected": "Sales", "path": "model_output.classification.iab_content.tier3.label" } @@ -141,9 +126,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Robotics", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -160,7 +145,12 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Robotics", + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, @@ -180,7 +170,7 @@ "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Business and Finance", "model_output.classification.iab_content.tier2.label": "Business", - "model_output.classification.iab_content.tier3.label": "Sales" + "model_output.classification.iab_content.tier3.label": "Startups" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -189,17 +179,23 @@ "model_output.classification.iab_content.tier3.label": "Sales" }, "id": "sales-crm-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "Startups", + "expected": "Sales", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Business and Finance > Business > Sales.", - "pass": true, + 
"pass": false, "status": "must_fix", "text": "Need software to manage leads and pipeline for a startup sales team" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Careers", - "model_output.classification.iab_content.tier2.label": "Job Search", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Technology & Computing", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -211,12 +207,17 @@ "id": "marketing-tools-easy", "mismatches": [ { - "actual": "Careers", + "actual": "Technology & Computing", "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Job Search", + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, @@ -233,9 +234,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Sensitive Topics", - "model_output.classification.iab_content.tier2.label": "Terrorism", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Technology & Computing", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -247,12 +248,17 @@ "id": "marketing-tools-medium", "mismatches": [ { - "actual": "Sensitive Topics", + "actual": "Technology & Computing", "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Terrorism", + "actual": "nearest_equivalent", + 
"expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, @@ -269,9 +275,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Personal Finance", - "model_output.classification.iab_content.tier2.label": "Home Utilities", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -283,12 +289,17 @@ "id": "marketing-tools-hard", "mismatches": [ { - "actual": "Personal Finance", + "actual": "Careers", "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Home Utilities", + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, @@ -305,10 +316,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Information and Network Security" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -319,17 +330,22 @@ "id": "business-it-easy", 
"mismatches": [ { - "actual": "Technology & Computing", + "actual": "Careers", "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Computing", + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Information and Network Security", + "actual": null, "expected": "Business I.T.", "path": "model_output.classification.iab_content.tier3.label" } @@ -341,9 +357,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Careers", - "model_output.classification.iab_content.tier2.label": "Job Search", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Personal Finance", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -355,12 +371,17 @@ "id": "business-it-medium", "mismatches": [ { - "actual": "Careers", + "actual": "Personal Finance", "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Job Search", + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, @@ -432,8 +453,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Attractions", - "model_output.classification.iab_content.tier2.label": "Bars & Restaurants" + "model_output.classification.iab_content.tier1.label": "Food & Drink", + 
"model_output.classification.iab_content.tier2.label": "Dining Out" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -441,28 +462,17 @@ "model_output.classification.iab_content.tier2.label": "Dining Out" }, "id": "dining-out-medium", - "mismatches": [ - { - "actual": "Attractions", - "expected": "Food & Drink", - "path": "model_output.classification.iab_content.tier1.label" - }, - { - "actual": "Bars & Restaurants", - "expected": "Dining Out", - "path": "model_output.classification.iab_content.tier2.label" - } - ], + "mismatches": [], "notes": "Cross-vertical medium IAB mapping case for Food & Drink > Dining Out.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Good restaurants for a client dinner downtown" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Attractions", - "model_output.classification.iab_content.tier2.label": "Bars & Restaurants" + "model_output.classification.iab_content.tier1.label": "Food & Drink", + "model_output.classification.iab_content.tier2.label": "Dining Out" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -470,20 +480,9 @@ "model_output.classification.iab_content.tier2.label": "Dining Out" }, "id": "dining-out-hard", - "mismatches": [ - { - "actual": "Attractions", - "expected": "Food & Drink", - "path": "model_output.classification.iab_content.tier1.label" - }, - { - "actual": "Bars & Restaurants", - "expected": "Dining Out", - "path": "model_output.classification.iab_content.tier2.label" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Food & Drink > Dining Out.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Need a place to eat tonight where I can make a reservation online" }, @@ -550,7 +549,7 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - 
"model_output.classification.iab_content.tier1.label": "Science", + "model_output.classification.iab_content.tier1.label": "Real Estate", "model_output.classification.iab_content.tier2.label": null }, "expected": { @@ -561,7 +560,7 @@ "id": "artificial-intelligence-easy", "mismatches": [ { - "actual": "Science", + "actual": "Real Estate", "expected": "Technology & Computing", "path": "model_output.classification.iab_content.tier1.label" }, @@ -583,9 +582,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Education", - "model_output.classification.iab_content.tier2.label": "Language Learning" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -600,7 +599,12 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Language Learning", + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, "expected": "Artificial Intelligence", "path": "model_output.classification.iab_content.tier2.label" } @@ -642,9 +646,9 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": "Job Search", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -653,9 +657,25 @@ "model_output.classification.iab_content.tier3.label": "Software and 
Applications" }, "id": "software-apps-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "Job Search", + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Computing > Computer Software and Applications.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best workflow software for a small operations team" }, @@ -663,8 +683,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -673,17 +693,28 @@ "model_output.classification.iab_content.tier3.label": "Software and Applications" }, "id": "software-apps-medium", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Computer Software and Applications.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need project management software for a distributed team" }, { "actual": { 
"model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Virtual Reality", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": "Job Search", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -695,7 +726,12 @@ "id": "software-apps-hard", "mismatches": [ { - "actual": "Virtual Reality", + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "Job Search", "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -755,10 +791,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Information and Network Security", + "model_output.classification.iab_content.tier3.label": null, "model_output.classification.iab_content.tier4.label": null }, "expected": { @@ -771,7 +807,12 @@ "id": "communication-software-medium", "mismatches": [ { - "actual": "Information and Network Security", + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, "expected": "Software and Applications", "path": "model_output.classification.iab_content.tier3.label" }, @@ -788,9 +829,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Virtual 
Reality", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null, "model_output.classification.iab_content.tier4.label": null }, @@ -804,7 +845,17 @@ "id": "communication-software-hard", "mismatches": [ { - "actual": "Virtual Reality", + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -829,8 +880,8 @@ "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet", - "model_output.classification.iab_content.tier4.label": "Web Hosting" + "model_output.classification.iab_content.tier3.label": "Data Storage and Warehousing", + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -840,19 +891,30 @@ "model_output.classification.iab_content.tier4.label": "Web Hosting" }, "id": "web-hosting-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "Data Storage and Warehousing", + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Web Hosting", + "path": "model_output.classification.iab_content.tier4.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Computing > Internet > Web Hosting.", - "pass": true, + "pass": 
false, "status": "must_fix", "text": "Vercel vs Netlify for website hosting" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet", - "model_output.classification.iab_content.tier4.label": "Web Hosting" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -862,19 +924,40 @@ "model_output.classification.iab_content.tier4.label": "Web Hosting" }, "id": "web-hosting-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Web Hosting", + "path": "model_output.classification.iab_content.tier4.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Internet > Web Hosting.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best hosting platform for a startup website" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - 
"model_output.classification.iab_content.tier3.label": "Internet", - "model_output.classification.iab_content.tier4.label": "Web Hosting" + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -884,9 +967,25 @@ "model_output.classification.iab_content.tier4.label": "Web Hosting" }, "id": "web-hosting-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Web Hosting", + "path": "model_output.classification.iab_content.tier4.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Internet > Web Hosting.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need a managed hosting provider to deploy and run our marketing site" }, @@ -934,8 +1033,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Laptops" + "model_output.classification.iab_content.tier2.label": "Consumer Electronics", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -944,18 +1043,29 @@ "model_output.classification.iab_content.tier3.label": "Laptops" }, "id": "laptops-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "Consumer Electronics", + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Laptops", 
+ "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Laptops.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need a portable computer with good battery life for everyday work" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -966,7 +1076,17 @@ "id": "desktops-easy", "mismatches": [ { - "actual": "Software and Applications", + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, "expected": "Desktops", "path": "model_output.classification.iab_content.tier3.label" } @@ -978,10 +1098,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Desktops" + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -990,18 +1110,29 @@ "model_output.classification.iab_content.tier3.label": 
"Desktops" }, "id": "desktops-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Desktops", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Desktops.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Which desktop computer should I buy for a home office?" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Desktops" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1010,9 +1141,25 @@ "model_output.classification.iab_content.tier3.label": "Desktops" }, "id": "desktops-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Desktops", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Desktops.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need a desktop PC with strong performance for creative work" }, @@ -1078,8 +1225,8 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - 
"model_output.classification.iab_content.tier1.label": "Style & Fashion" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Shopping" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1088,9 +1235,9 @@ "id": "style-fashion-parent-easy", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": "Shopping", + "expected": "Style & Fashion", + "path": "model_output.classification.iab_content.tier1.label" } ], "notes": "Cross-vertical easy IAB mapping case for Style & Fashion.", @@ -1144,9 +1291,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Sports", - "model_output.classification.iab_content.tier2.label": "Bodybuilding", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Style & Fashion", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -1158,12 +1305,12 @@ "id": "womens-shoes-easy", "mismatches": [ { - "actual": "Sports", - "expected": "Style & Fashion", - "path": "model_output.classification.iab_content.tier1.label" + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Bodybuilding", + "actual": null, "expected": "Women's Fashion", "path": "model_output.classification.iab_content.tier2.label" }, @@ -1180,9 +1327,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Sports", - "model_output.classification.iab_content.tier2.label": "Walking", + 
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Style & Fashion", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -1194,12 +1341,12 @@ "id": "womens-shoes-medium", "mismatches": [ { - "actual": "Sports", - "expected": "Style & Fashion", - "path": "model_output.classification.iab_content.tier1.label" + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Walking", + "actual": null, "expected": "Women's Fashion", "path": "model_output.classification.iab_content.tier2.label" }, @@ -1238,8 +1385,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": "Children's Clothing", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier2.label": "Women's Fashion", + "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1250,12 +1397,12 @@ "id": "mens-shoes-easy", "mismatches": [ { - "actual": "Children's Clothing", + "actual": "Women's Fashion", "expected": "Men's Fashion", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": null, + "actual": "Women's Shoes and Footwear", "expected": "Men's Shoes and Footwear", "path": "model_output.classification.iab_content.tier3.label" } @@ -1269,8 +1416,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": "Men's Fashion", - 
"model_output.classification.iab_content.tier3.label": "Men's Shoes and Footwear" + "model_output.classification.iab_content.tier2.label": "Women's Fashion", + "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1279,9 +1426,20 @@ "model_output.classification.iab_content.tier3.label": "Men's Shoes and Footwear" }, "id": "mens-shoes-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "Women's Fashion", + "expected": "Men's Fashion", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": "Women's Shoes and Footwear", + "expected": "Men's Shoes and Footwear", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Style & Fashion > Men's Fashion > Men's Shoes and Footwear.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Good men's dress shoes for office use" }, @@ -1289,8 +1447,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": "Children's Clothing", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier2.label": "Women's Fashion", + "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1301,12 +1459,12 @@ "id": "mens-shoes-hard", "mismatches": [ { - "actual": "Children's Clothing", + "actual": "Women's Fashion", "expected": "Men's Fashion", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": null, + "actual": "Women's Shoes and Footwear", "expected": "Men's Shoes and Footwear", "path": "model_output.classification.iab_content.tier3.label" } @@ -1338,10 +1496,10 @@ }, { 
"actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Travel", - "model_output.classification.iab_content.tier2.label": "Travel Type", - "model_output.classification.iab_content.tier3.label": "Hotels and Motels" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1350,9 +1508,25 @@ "model_output.classification.iab_content.tier3.label": "Hotels and Motels" }, "id": "hotels-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Travel Type", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Hotels and Motels", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Travel > Travel Type > Hotels and Motels.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best hotels near Times Square for a weekend trip" }, @@ -1360,7 +1534,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Travel", - "model_output.classification.iab_content.tier2.label": "Travel Type", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -1371,6 +1545,11 @@ }, "id": "hotels-hard", "mismatches": [ + { + "actual": null, + "expected": "Travel Type", + "path": "model_output.classification.iab_content.tier2.label" + }, { "actual": null, "expected": "Hotels and Motels", @@ -1457,8 +1636,8 @@ "actual": { 
"model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Healthy Living", - "model_output.classification.iab_content.tier2.label": "Fitness and Exercise", - "model_output.classification.iab_content.tier3.label": "Running and Jogging" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1467,9 +1646,20 @@ "model_output.classification.iab_content.tier3.label": "Running and Jogging" }, "id": "running-and-jogging-easy", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Fitness and Exercise", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Running and Jogging", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Healthy Living > Fitness and Exercise > Running and Jogging.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best running plan for a first 10k" }, @@ -1477,7 +1667,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Sports", - "model_output.classification.iab_content.tier2.label": "Walking", + "model_output.classification.iab_content.tier2.label": "Bodybuilding", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -1494,7 +1684,7 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Walking", + "actual": "Bodybuilding", "expected": "Fitness and Exercise", "path": "model_output.classification.iab_content.tier2.label" }, @@ -1630,9 +1820,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", 
"model_output.classification.iab_content.tier1.label": "Travel", - "model_output.classification.iab_content.tier2.label": "Travel Type" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1647,12 +1837,7 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Travel Type", + "actual": null, "expected": "Fiction", "path": "model_output.classification.iab_content.tier2.label" } @@ -1665,8 +1850,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Books and Literature", - "model_output.classification.iab_content.tier2.label": "Fiction" + "model_output.classification.iab_content.tier1.label": "Genres", + "model_output.classification.iab_content.tier2.label": "Romance" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1674,9 +1859,20 @@ "model_output.classification.iab_content.tier2.label": "Fiction" }, "id": "fiction-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "Genres", + "expected": "Books and Literature", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "Romance", + "expected": "Fiction", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Books and Literature > Fiction.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Looking for a character-driven novel, not comics or poetry" }, @@ -1684,7 +1880,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Home & Garden", - "model_output.classification.iab_content.tier2.label": "Interior Decorating" + 
"model_output.classification.iab_content.tier2.label": "Remodeling & Construction" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1694,7 +1890,7 @@ "id": "home-improvement-easy", "mismatches": [ { - "actual": "Interior Decorating", + "actual": "Remodeling & Construction", "expected": "Home Improvement", "path": "model_output.classification.iab_content.tier2.label" } @@ -1707,8 +1903,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Home & Garden", - "model_output.classification.iab_content.tier2.label": "Interior Decorating" + "model_output.classification.iab_content.tier1.label": "Style & Fashion", + "model_output.classification.iab_content.tier2.label": "Personal Care" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1718,7 +1914,12 @@ "id": "home-improvement-medium", "mismatches": [ { - "actual": "Interior Decorating", + "actual": "Style & Fashion", + "expected": "Home & Garden", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "Personal Care", "expected": "Home Improvement", "path": "model_output.classification.iab_content.tier2.label" } @@ -1759,9 +1960,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Augmented Reality" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1771,12 +1972,17 @@ "id": "online-education-easy", "mismatches": [ { - "actual": "Technology & Computing", + "actual": "Careers", "expected": "Education", "path": 
"model_output.classification.iab_content.tier1.label" }, { - "actual": "Augmented Reality", + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, "expected": "Online Education", "path": "model_output.classification.iab_content.tier2.label" } @@ -1906,23 +2112,34 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Medical Health" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Food & Drink" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Medical Health" }, "id": "medical-health-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "Food & Drink", + "expected": "Medical Health", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical easy IAB mapping case for Medical Health.", - "pass": true, + "pass": false, "status": "must_fix", "text": "what do these allergy symptoms mean" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Medical Health" }, "expected": { @@ -1930,9 +2147,15 @@ "model_output.classification.iab_content.tier1.label": "Medical Health" }, "id": "medical-health-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping case for Medical Health.", - "pass": true, + "pass": false, 
"status": "must_fix", "text": "when should i see a doctor for persistent knee pain" }, @@ -2036,9 +2259,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Holidays", - "model_output.classification.iab_content.tier2.label": "National & Civic Holidays" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -2053,7 +2276,12 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "National & Civic Holidays", + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, "expected": "Financial Planning", "path": "model_output.classification.iab_content.tier2.label" } @@ -2131,9 +2359,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Genres", - "model_output.classification.iab_content.tier2.label": "Family/Children" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Shopping", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -2143,12 +2371,17 @@ "id": "parenting-medium", "mismatches": [ { - "actual": "Genres", + "actual": "Shopping", "expected": "Family and Relationships", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Family/Children", + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, "expected": "Parenting", "path": "model_output.classification.iab_content.tier2.label" } @@ -2234,7 +2467,7 @@ 
"actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Entertainment", - "model_output.classification.iab_content.tier2.label": "Movies" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -2242,17 +2475,23 @@ "model_output.classification.iab_content.tier2.label": "Movies" }, "id": "movies-easy", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Movies", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Entertainment > Movies.", - "pass": true, + "pass": false, "status": "must_fix", "text": "What movie should we watch tonight?" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Entertainment", - "model_output.classification.iab_content.tier2.label": "Movies" + "model_output.classification.iab_content.tier1.label": "Genres", + "model_output.classification.iab_content.tier2.label": "Horror" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -2260,9 +2499,20 @@ "model_output.classification.iab_content.tier2.label": "Movies" }, "id": "movies-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "Genres", + "expected": "Entertainment", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "Horror", + "expected": "Movies", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Entertainment > Movies.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best thriller movies from the last few years" }, diff --git a/artifacts/evaluation/latest/iab_quality_target_eval.json b/artifacts/evaluation/latest/iab_quality_target_eval.json index 
78aa1dece0ad471d7121e942adfff34294db59a2..490411ecf947ac8d5a222bf5e42b71d6f025cfd7 100644 --- a/artifacts/evaluation/latest/iab_quality_target_eval.json +++ b/artifacts/evaluation/latest/iab_quality_target_eval.json @@ -13,7 +13,7 @@ "results": [ { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Automotive", "model_output.classification.iab_content.tier2.label": null }, @@ -28,6 +28,11 @@ "actual": null, "expected": "Auto Buying and Selling", "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "Vehicle shopping queries should map into the automotive buying branch, not business sales.", @@ -90,8 +95,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -107,12 +112,12 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Computing", + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Software and Applications", + "actual": null, "expected": "Sales", "path": "model_output.classification.iab_content.tier3.label" }, @@ -129,9 +134,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": 
"nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Robotics", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -148,7 +153,7 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Robotics", + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, @@ -156,6 +161,11 @@ "actual": null, "expected": "Sales", "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "Direct CRM vendor comparison should map cleanly into the sales domain.", @@ -165,9 +175,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Careers", - "model_output.classification.iab_content.tier2.label": "Job Search", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -184,7 +194,7 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Job Search", + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, @@ -192,6 +202,11 @@ "actual": null, "expected": "Marketing and Advertising", "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "Marketing tool discovery should map to the marketing and advertising branch.", @@ -202,7 +217,7 @@ { "actual": { 
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Science", + "model_output.classification.iab_content.tier1.label": "Real Estate", "model_output.classification.iab_content.tier2.label": null }, "expected": { @@ -213,7 +228,7 @@ "id": "ml-explanation-maps-to-ai", "mismatches": [ { - "actual": "Science", + "actual": "Real Estate", "expected": "Technology & Computing", "path": "model_output.classification.iab_content.tier1.label" }, @@ -235,10 +250,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Information and Network Security" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Personal Finance", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -249,24 +264,19 @@ "id": "support-credential-help-maps-to-business-it", "mismatches": [ { - "actual": "Technology & Computing", + "actual": "Personal Finance", "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Computing", + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Information and Network Security", + "actual": null, "expected": "Business I.T.", "path": "model_output.classification.iab_content.tier3.label" - }, - { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "Credential and account help should map to 
business IT rather than generic business.", @@ -294,9 +304,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Sensitive Topics", - "model_output.classification.iab_content.tier2.label": "Crime & Harmful Acts to Individuals, Society & Human Right Violations", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Sports", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -308,12 +318,12 @@ "id": "trial-signup-maps-to-software", "mismatches": [ { - "actual": "Sensitive Topics", + "actual": "Sports", "expected": "Technology & Computing", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Crime & Harmful Acts to Individuals, Society & Human Right Violations", + "actual": null, "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -321,11 +331,6 @@ "actual": null, "expected": "Software and Applications", "path": "model_output.classification.iab_content.tier3.label" - }, - { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "Software action queries should map to the software/application branch.", diff --git a/artifacts/evaluation/latest/intent_subtype_difficulty_benchmark_confusion_matrix.csv b/artifacts/evaluation/latest/intent_subtype_difficulty_benchmark_confusion_matrix.csv index 16ff6b8456f51d6f9c8270dcbe9a79564117e0ba..f770438e5f907d7b0905ee503cb17c3020cca501 100644 --- a/artifacts/evaluation/latest/intent_subtype_difficulty_benchmark_confusion_matrix.csv +++ b/artifacts/evaluation/latest/intent_subtype_difficulty_benchmark_confusion_matrix.csv @@ -1,19 +1,19 @@ 
,education,product_discovery,comparison,evaluation,deal_seeking,provider_selection,signup,purchase,booking,download,contact_sales,task_execution,onboarding_setup,troubleshooting,account_help,billing_help,follow_up,emotional_reflection education,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -product_discovery,0,14,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -comparison,2,1,11,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 -evaluation,1,0,0,13,0,0,0,0,0,0,0,0,1,0,0,0,0,0 -deal_seeking,0,1,0,0,13,1,0,0,0,0,0,0,0,0,0,0,0,0 +product_discovery,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +comparison,2,0,12,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +evaluation,1,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +deal_seeking,0,2,0,0,12,1,0,0,0,0,0,0,0,0,0,0,0,0 provider_selection,0,0,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0 -signup,0,0,0,0,0,0,15,1,0,0,0,0,0,0,0,0,0,0 -purchase,0,0,0,0,0,0,0,13,0,0,0,0,2,0,0,0,0,0 -booking,0,0,0,0,0,0,1,0,13,0,1,0,0,0,0,0,0,0 -download,0,0,0,0,0,0,0,0,0,13,1,1,0,0,0,0,0,0 +signup,0,0,0,0,0,0,14,0,0,0,0,2,0,0,0,0,0,0 +purchase,0,0,0,0,0,0,1,12,0,0,0,0,2,0,0,0,0,0 +booking,0,0,0,0,0,0,3,0,9,0,1,2,0,0,0,0,0,0 +download,0,0,0,0,0,0,0,0,0,14,0,1,0,0,0,0,0,0 contact_sales,0,0,0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0 -task_execution,0,0,0,0,0,0,0,0,0,0,0,18,0,0,0,0,0,0 -onboarding_setup,0,0,0,0,0,0,0,0,0,0,0,1,16,0,0,0,0,0 -troubleshooting,0,0,0,0,0,0,0,0,0,0,0,0,0,12,1,2,0,0 -account_help,0,0,0,0,0,0,2,0,0,0,0,1,0,3,8,1,0,0 -billing_help,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,13,0,0 -follow_up,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,14,0 +task_execution,0,0,0,0,0,0,0,0,0,0,0,17,1,0,0,0,0,0 +onboarding_setup,0,0,0,0,0,0,0,0,0,0,0,0,17,0,0,0,0,0 +troubleshooting,0,0,0,0,0,0,0,0,0,0,0,0,0,11,1,2,0,1 +account_help,0,0,0,0,0,0,0,0,0,0,1,1,0,3,10,0,0,0 +billing_help,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,13,0,0 +follow_up,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,1 emotional_reflection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15 diff --git a/artifacts/evaluation/latest/intent_subtype_difficulty_benchmark_report.json 
b/artifacts/evaluation/latest/intent_subtype_difficulty_benchmark_report.json index 4b70173425fd552225eb16e2466fd8636e06340f..1be4d328caca9d99263f2e7a21bcfc32a15de83f 100644 --- a/artifacts/evaluation/latest/intent_subtype_difficulty_benchmark_report.json +++ b/artifacts/evaluation/latest/intent_subtype_difficulty_benchmark_report.json @@ -1,7 +1,7 @@ { - "accepted_accuracy": 0.9104, - "accepted_coverage": 0.9675, - "accuracy": 0.8917, + "accepted_accuracy": 0.8901, + "accepted_coverage": 0.9856, + "accuracy": 0.8845, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_difficulty_benchmark_confusion_matrix.csv", "count": 277, "dataset_path": "/content/agentic-intent-classifier/data/subtype_benchmark.jsonl", @@ -12,52 +12,52 @@ "accuracy": 0.913, "count": 92, "fallback_rate": 0.0, - "macro_f1": 0.9109 + "macro_f1": 0.9124 }, "hard": { - "accepted_accuracy": 0.8554, - "accepted_coverage": 0.9121, - "accuracy": 0.8132, + "accepted_accuracy": 0.8295, + "accepted_coverage": 0.967, + "accuracy": 0.8242, "count": 91, - "fallback_rate": 0.0879, - "macro_f1": 0.8025 + "fallback_rate": 0.033, + "macro_f1": 0.8183 }, "medium": { - "accepted_accuracy": 0.957, + "accepted_accuracy": 0.9247, "accepted_coverage": 0.9894, - "accuracy": 0.9468, + "accuracy": 0.9149, "count": 94, "fallback_rate": 0.0106, - "macro_f1": 0.9469 + "macro_f1": 0.9117 } }, - "fallback_rate": 0.0325, + "fallback_rate": 0.0144, "head": "intent_subtype", - "macro_f1": 0.8886, + "macro_f1": 0.8824, "per_class_metrics": { "account_help": { - "f1-score": 0.64, - "precision": 0.8, - "recall": 0.5333333333333333, + "f1-score": 0.7142857142857143, + "precision": 0.7692307692307693, + "recall": 0.6666666666666666, "support": 15.0 }, - "accuracy": 0.8916967509025271, + "accuracy": 0.8844765342960289, "billing_help": { - "f1-score": 0.8387096774193549, - "precision": 0.8125, + "f1-score": 0.8666666666666667, + "precision": 0.8666666666666667, "recall": 
0.8666666666666667, "support": 15.0 }, "booking": { - "f1-score": 0.9285714285714286, + "f1-score": 0.75, "precision": 1.0, - "recall": 0.8666666666666667, + "recall": 0.6, "support": 15.0 }, "comparison": { - "f1-score": 0.8148148148148148, - "precision": 0.9166666666666666, - "recall": 0.7333333333333333, + "f1-score": 0.8888888888888888, + "precision": 1.0, + "recall": 0.8, "support": 15.0 }, "contact_sales": { @@ -67,15 +67,15 @@ "support": 15.0 }, "deal_seeking": { - "f1-score": 0.896551724137931, - "precision": 0.9285714285714286, - "recall": 0.8666666666666667, + "f1-score": 0.8888888888888888, + "precision": 1.0, + "recall": 0.8, "support": 15.0 }, "download": { - "f1-score": 0.9285714285714286, + "f1-score": 0.9655172413793104, "precision": 1.0, - "recall": 0.8666666666666667, + "recall": 0.9333333333333333, "support": 15.0 }, "education": { @@ -85,15 +85,15 @@ "support": 15.0 }, "emotional_reflection": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.9375, + "precision": 0.8823529411764706, "recall": 1.0, "support": 15.0 }, "evaluation": { - "f1-score": 0.896551724137931, - "precision": 0.9285714285714286, - "recall": 0.8666666666666667, + "f1-score": 0.9333333333333333, + "precision": 0.9333333333333333, + "recall": 0.9333333333333333, "support": 15.0 }, "follow_up": { @@ -103,21 +103,21 @@ "support": 15.0 }, "macro avg": { - "f1-score": 0.8886471209737711, - "precision": 0.8965122159975102, - "recall": 0.8895561002178651, + "f1-score": 0.8824228919733669, + "precision": 0.8968567719420234, + "recall": 0.8825617283950618, "support": 277.0 }, "onboarding_setup": { - "f1-score": 0.8648648648648649, - "precision": 0.8, - "recall": 0.9411764705882353, + "f1-score": 0.918918918918919, + "precision": 0.85, + "recall": 1.0, "support": 17.0 }, "product_discovery": { - "f1-score": 0.9032258064516129, - "precision": 0.875, - "recall": 0.9333333333333333, + "f1-score": 0.9375, + "precision": 0.8823529411764706, + "recall": 1.0, "support": 15.0 }, 
"provider_selection": { @@ -127,33 +127,33 @@ "support": 16.0 }, "purchase": { - "f1-score": 0.896551724137931, - "precision": 0.9285714285714286, - "recall": 0.8666666666666667, + "f1-score": 0.8888888888888888, + "precision": 1.0, + "recall": 0.8, "support": 15.0 }, "signup": { - "f1-score": 0.8823529411764706, - "precision": 0.8333333333333334, - "recall": 0.9375, + "f1-score": 0.8235294117647058, + "precision": 0.7777777777777778, + "recall": 0.875, "support": 16.0 }, "task_execution": { - "f1-score": 0.9230769230769231, - "precision": 0.8571428571428571, - "recall": 1.0, + "f1-score": 0.8292682926829268, + "precision": 0.7391304347826086, + "recall": 0.9444444444444444, "support": 18.0 }, "troubleshooting": { - "f1-score": 0.8, - "precision": 0.8, - "recall": 0.8, + "f1-score": 0.7586206896551724, + "precision": 0.7857142857142857, + "recall": 0.7333333333333333, "support": 15.0 }, "weighted avg": { - "f1-score": 0.8891181699377334, - "precision": 0.8953221541324111, - "recall": 0.8916967509025271, + "f1-score": 0.8822131766431675, + "precision": 0.8945403392673653, + "recall": 0.8844765342960289, "support": 277.0 } }, diff --git a/artifacts/evaluation/latest/intent_subtype_extended_cases_confusion_matrix.csv b/artifacts/evaluation/latest/intent_subtype_extended_cases_confusion_matrix.csv index 32f3cc3b353284a7c9ed993adff29ad28752c3d8..b2db308583b6fd5ba3f9102eb0491bfdf54f87b0 100644 --- a/artifacts/evaluation/latest/intent_subtype_extended_cases_confusion_matrix.csv +++ b/artifacts/evaluation/latest/intent_subtype_extended_cases_confusion_matrix.csv @@ -1,9 +1,9 @@ ,education,product_discovery,comparison,evaluation,deal_seeking,provider_selection,signup,purchase,booking,download,contact_sales,task_execution,onboarding_setup,troubleshooting,account_help,billing_help,follow_up,emotional_reflection education,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 product_discovery,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -comparison,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 
+comparison,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 evaluation,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 -deal_seeking,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0 +deal_seeking,1,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0 provider_selection,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0 signup,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 purchase,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 @@ -15,5 +15,5 @@ onboarding_setup,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0 troubleshooting,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0 account_help,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0 billing_help,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -follow_up,1,0,0,0,3,0,0,0,0,0,0,1,0,0,0,0,7,0 +follow_up,0,0,0,0,3,0,0,0,0,0,0,0,0,0,1,0,8,0 emotional_reflection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/artifacts/evaluation/latest/intent_subtype_extended_cases_report.json b/artifacts/evaluation/latest/intent_subtype_extended_cases_report.json index 9827fc99e6be6d8aa6c64a1d86dd23b70aa03055..5c726c13a7efc21faf7f05b5418c8d4a1f931d25 100644 --- a/artifacts/evaluation/latest/intent_subtype_extended_cases_report.json +++ b/artifacts/evaluation/latest/intent_subtype_extended_cases_report.json @@ -1,21 +1,21 @@ { - "accepted_accuracy": 0.8302, + "accepted_accuracy": 0.8113, "accepted_coverage": 1.0, - "accuracy": 0.8302, + "accuracy": 0.8113, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_extended_cases_confusion_matrix.csv", "count": 53, "dataset_path": "/content/agentic-intent-classifier/data/subtype/extended_cases.jsonl", "fallback_rate": 0.0, "head": "intent_subtype", - "macro_f1": 0.7668, + "macro_f1": 0.7517, "per_class_metrics": { "account_help": { - "f1-score": 0.8, - "precision": 1.0, + "f1-score": 0.6666666666666666, + "precision": 0.6666666666666666, "recall": 0.6666666666666666, "support": 3.0 }, - "accuracy": 0.8301886792452831, + "accuracy": 0.8113207547169812, "billing_help": { "f1-score": 0.0, "precision": 0.0, @@ -29,9 +29,9 @@ "support": 0.0 }, "comparison": { - "f1-score": 1.0, + 
"f1-score": 0.6666666666666666, "precision": 1.0, - "recall": 1.0, + "recall": 0.5, "support": 2.0 }, "contact_sales": { @@ -41,9 +41,9 @@ "support": 0.0 }, "deal_seeking": { - "f1-score": 0.8571428571428571, - "precision": 0.75, - "recall": 1.0, + "f1-score": 0.7619047619047619, + "precision": 0.6666666666666666, + "recall": 0.8888888888888888, "support": 9.0 }, "download": { @@ -71,15 +71,15 @@ "support": 3.0 }, "follow_up": { - "f1-score": 0.7368421052631579, + "f1-score": 0.8, "precision": 1.0, - "recall": 0.5833333333333334, + "recall": 0.6666666666666666, "support": 12.0 }, "macro avg": { - "f1-score": 0.46858256266151, - "precision": 0.4565696649029982, - "recall": 0.513888888888889, + "f1-score": 0.45939292189292186, + "precision": 0.4611992945326278, + "recall": 0.48456790123456783, "support": 53.0 }, "onboarding_setup": { @@ -113,8 +113,8 @@ "support": 0.0 }, "task_execution": { - "f1-score": 0.6666666666666666, - "precision": 0.5, + "f1-score": 1.0, + "precision": 1.0, "recall": 1.0, "support": 1.0 }, @@ -125,9 +125,9 @@ "support": 1.0 }, "weighted avg": { - "f1-score": 0.8018611395225099, - "precision": 0.8208295896975142, - "recall": 0.8301886792452831, + "f1-score": 0.7861520554916781, + "precision": 0.7972446840371369, + "recall": 0.8113207547169812, "support": 53.0 } }, diff --git a/artifacts/evaluation/latest/intent_subtype_hard_cases_confusion_matrix.csv b/artifacts/evaluation/latest/intent_subtype_hard_cases_confusion_matrix.csv index 5e6557f59f22cd5799cd4591e735e99566b12971..15de48438e9e1f282283839804876ebf5b7650e7 100644 --- a/artifacts/evaluation/latest/intent_subtype_hard_cases_confusion_matrix.csv +++ b/artifacts/evaluation/latest/intent_subtype_hard_cases_confusion_matrix.csv @@ -2,7 +2,7 @@ education,29,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 product_discovery,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 comparison,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -evaluation,3,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +evaluation,2,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 
deal_seeking,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0 provider_selection,0,0,0,1,0,9,0,0,0,0,0,0,0,0,0,0,0,0 signup,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0 diff --git a/artifacts/evaluation/latest/intent_subtype_hard_cases_report.json b/artifacts/evaluation/latest/intent_subtype_hard_cases_report.json index ae7f0904cfd23d66cb645f7662fa628497025d30..bb2a077d9aa6aa53f91864f064b671568953f33b 100644 --- a/artifacts/evaluation/latest/intent_subtype_hard_cases_report.json +++ b/artifacts/evaluation/latest/intent_subtype_hard_cases_report.json @@ -7,7 +7,7 @@ "dataset_path": "/content/agentic-intent-classifier/data/subtype/hard_cases.jsonl", "fallback_rate": 0.0, "head": "intent_subtype", - "macro_f1": 0.8447, + "macro_f1": 0.8426, "per_class_metrics": { "account_help": { "f1-score": 0.8, @@ -53,8 +53,8 @@ "support": 0.0 }, "education": { - "f1-score": 0.9508196721311475, - "precision": 0.90625, + "f1-score": 0.9666666666666667, + "precision": 0.9354838709677419, "recall": 1.0, "support": 29.0 }, @@ -77,8 +77,8 @@ "support": 12.0 }, "macro avg": { - "f1-score": 0.7038911013311109, - "precision": 0.7234953703703704, + "f1-score": 0.7021723995980289, + "precision": 0.7210790702726187, "recall": 0.7212962962962962, "support": 94.0 }, @@ -89,8 +89,8 @@ "support": 6.0 }, "product_discovery": { - "f1-score": 0.8888888888888888, - "precision": 0.8, + "f1-score": 0.8421052631578947, + "precision": 0.7272727272727273, "recall": 1.0, "support": 8.0 }, @@ -125,8 +125,8 @@ "support": 3.0 }, "weighted avg": { - "f1-score": 0.8798004011763282, - "precision": 0.8911125886524823, + "f1-score": 0.8807077824069889, + "precision": 0.8939419937189327, "recall": 0.8936170212765957, "support": 94.0 } diff --git a/artifacts/evaluation/latest/intent_subtype_test_confusion_matrix.csv b/artifacts/evaluation/latest/intent_subtype_test_confusion_matrix.csv index 64a5f1bbdf4a4a58655b34163d11851128aa6e0e..e118be1e0635b58dab599e9cc24d9077b75eb2db 100644 --- 
a/artifacts/evaluation/latest/intent_subtype_test_confusion_matrix.csv +++ b/artifacts/evaluation/latest/intent_subtype_test_confusion_matrix.csv @@ -1,9 +1,9 @@ ,education,product_discovery,comparison,evaluation,deal_seeking,provider_selection,signup,purchase,booking,download,contact_sales,task_execution,onboarding_setup,troubleshooting,account_help,billing_help,follow_up,emotional_reflection education,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -product_discovery,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +product_discovery,0,7,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 comparison,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -evaluation,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 -deal_seeking,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0 +evaluation,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 +deal_seeking,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 provider_selection,0,0,0,1,0,4,0,0,0,0,0,0,1,0,0,0,0,0 signup,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0 purchase,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 @@ -12,8 +12,8 @@ download,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 contact_sales,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 task_execution,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0 onboarding_setup,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0 -troubleshooting,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0 +troubleshooting,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0 account_help,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0 billing_help,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -follow_up,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,8,0 +follow_up,0,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0,8,0 emotional_reflection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5 diff --git a/artifacts/evaluation/latest/intent_subtype_test_report.json b/artifacts/evaluation/latest/intent_subtype_test_report.json index 124bd1866a7fc31b6e804433ae076df68ddca2ad..120cbf10fc566e55ada2da80fc7a088f28800006 100644 --- a/artifacts/evaluation/latest/intent_subtype_test_report.json +++ b/artifacts/evaluation/latest/intent_subtype_test_report.json @@ -1,13 +1,13 @@ { - "accepted_accuracy": 0.9, + "accepted_accuracy": 0.8714, "accepted_coverage": 1.0, - "accuracy": 0.9, 
+ "accuracy": 0.8714, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_test_confusion_matrix.csv", "count": 70, "dataset_path": "/content/agentic-intent-classifier/data/subtype/test.jsonl", "fallback_rate": 0.0, "head": "intent_subtype", - "macro_f1": 0.8531, + "macro_f1": 0.8317, "per_class_metrics": { "account_help": { "f1-score": 1.0, @@ -15,7 +15,7 @@ "recall": 1.0, "support": 2.0 }, - "accuracy": 0.9, + "accuracy": 0.8714285714285714, "billing_help": { "f1-score": 0.0, "precision": 0.0, @@ -41,9 +41,9 @@ "support": 0.0 }, "deal_seeking": { - "f1-score": 0.6666666666666666, - "precision": 0.5, - "recall": 1.0, + "f1-score": 0.3333333333333333, + "precision": 0.25, + "recall": 0.5, "support": 2.0 }, "download": { @@ -65,9 +65,9 @@ "support": 5.0 }, "evaluation": { - "f1-score": 0.0, - "precision": 0.0, - "recall": 0.0, + "f1-score": 0.5, + "precision": 0.5, + "recall": 0.5, "support": 2.0 }, "follow_up": { @@ -77,9 +77,9 @@ "support": 11.0 }, "macro avg": { - "f1-score": 0.6635221022395855, - "precision": 0.6578042328042328, - "recall": 0.6885521885521885, + "f1-score": 0.646896135613619, + "precision": 0.6657407407407407, + "recall": 0.6538299663299663, "support": 70.0 }, "onboarding_setup": { @@ -89,9 +89,9 @@ "support": 4.0 }, "product_discovery": { - "f1-score": 1.0, + "f1-score": 0.9333333333333333, "precision": 1.0, - "recall": 1.0, + "recall": 0.875, "support": 8.0 }, "provider_selection": { @@ -113,21 +113,21 @@ "support": 2.0 }, "task_execution": { - "f1-score": 0.9230769230769231, - "precision": 0.8571428571428571, + "f1-score": 0.8571428571428571, + "precision": 0.75, "recall": 1.0, "support": 6.0 }, "troubleshooting": { - "f1-score": 1.0, + "f1-score": 0.6666666666666666, "precision": 1.0, - "recall": 1.0, + "recall": 0.5, "support": 2.0 }, "weighted avg": { - "f1-score": 0.8939882610403741, - "precision": 0.9094217687074829, - "recall": 0.9, + "f1-score": 0.8759558172936446, + "precision": 
0.9073809523809524, + "recall": 0.8714285714285714, "support": 70.0 } }, diff --git a/artifacts/evaluation/latest/intent_subtype_train_confusion_matrix.csv b/artifacts/evaluation/latest/intent_subtype_train_confusion_matrix.csv index cec2aad62af2ad3c09e31dc591f77602bc7467e0..c1467c69fdefabb0c451845a88e5e12f4d2325ab 100644 --- a/artifacts/evaluation/latest/intent_subtype_train_confusion_matrix.csv +++ b/artifacts/evaluation/latest/intent_subtype_train_confusion_matrix.csv @@ -1,19 +1,19 @@ ,education,product_discovery,comparison,evaluation,deal_seeking,provider_selection,signup,purchase,booking,download,contact_sales,task_execution,onboarding_setup,troubleshooting,account_help,billing_help,follow_up,emotional_reflection education,52,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -product_discovery,0,29,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0 -comparison,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -evaluation,4,4,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +product_discovery,0,30,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0 +comparison,0,0,14,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +evaluation,3,5,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0 deal_seeking,0,0,0,0,10,1,0,0,0,0,0,0,0,0,0,0,0,0 provider_selection,0,0,0,0,0,23,0,0,0,0,0,0,0,0,0,0,2,0 signup,0,0,0,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0 purchase,0,0,0,0,0,0,2,4,0,0,0,0,0,0,0,0,0,0 booking,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0 download,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0 -contact_sales,0,0,0,0,0,0,2,0,1,0,6,0,0,0,0,0,0,0 -task_execution,0,0,0,0,0,0,1,0,0,1,0,17,0,0,0,0,0,0 +contact_sales,0,0,0,0,0,0,2,0,1,0,5,1,0,0,0,0,0,0 +task_execution,0,0,0,0,0,0,0,1,0,0,0,18,0,0,0,0,0,0 onboarding_setup,0,0,0,0,0,0,0,0,1,0,0,0,16,0,0,0,0,0 -troubleshooting,0,0,0,0,0,0,0,0,0,0,0,0,0,10,2,0,1,0 -account_help,0,0,0,0,0,0,0,0,0,0,0,0,0,2,5,0,0,0 +troubleshooting,0,0,0,0,0,0,0,0,0,0,0,0,0,9,3,0,1,0 +account_help,0,0,0,0,0,0,0,0,0,0,0,0,0,1,6,0,0,0 billing_help,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0 -follow_up,0,0,0,0,1,0,0,0,0,0,0,5,0,0,0,0,30,0 +follow_up,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,32,0 
emotional_reflection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20 diff --git a/artifacts/evaluation/latest/intent_subtype_train_report.json b/artifacts/evaluation/latest/intent_subtype_train_report.json index 96bb5763fdceb3d304d5baafb3cb725f5674bda9..8e4ed55f68a12ff748f66a2c1d102c84dae18381 100644 --- a/artifacts/evaluation/latest/intent_subtype_train_report.json +++ b/artifacts/evaluation/latest/intent_subtype_train_report.json @@ -1,21 +1,21 @@ { - "accepted_accuracy": 0.8978, - "accepted_coverage": 1.0, - "accuracy": 0.8978, + "accepted_accuracy": 0.9068, + "accepted_coverage": 0.9936, + "accuracy": 0.9042, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_train_confusion_matrix.csv", "count": 313, "dataset_path": "/content/agentic-intent-classifier/data/subtype/train.jsonl", - "fallback_rate": 0.0, + "fallback_rate": 0.0064, "head": "intent_subtype", - "macro_f1": 0.877, + "macro_f1": 0.8787, "per_class_metrics": { "account_help": { - "f1-score": 0.7142857142857143, - "precision": 0.7142857142857143, - "recall": 0.7142857142857143, + "f1-score": 0.75, + "precision": 0.6666666666666666, + "recall": 0.8571428571428571, "support": 7.0 }, - "accuracy": 0.8977635782747604, + "accuracy": 0.9041533546325878, "billing_help": { "f1-score": 1.0, "precision": 1.0, @@ -29,32 +29,32 @@ "support": 5.0 }, "comparison": { - "f1-score": 0.967741935483871, - "precision": 0.9375, - "recall": 1.0, + "f1-score": 0.9655172413793104, + "precision": 1.0, + "recall": 0.9333333333333333, "support": 15.0 }, "contact_sales": { - "f1-score": 0.8, + "f1-score": 0.7142857142857143, "precision": 1.0, - "recall": 0.6666666666666666, + "recall": 0.5555555555555556, "support": 9.0 }, "deal_seeking": { - "f1-score": 0.9090909090909091, - "precision": 0.9090909090909091, + "f1-score": 0.9523809523809523, + "precision": 1.0, "recall": 0.9090909090909091, "support": 11.0 }, "download": { - "f1-score": 0.9411764705882353, - "precision": 
0.8888888888888888, + "f1-score": 1.0, + "precision": 1.0, "recall": 1.0, "support": 8.0 }, "education": { - "f1-score": 0.9629629629629629, - "precision": 0.9285714285714286, + "f1-score": 0.9719626168224299, + "precision": 0.9454545454545454, "recall": 1.0, "support": 52.0 }, @@ -65,21 +65,21 @@ "support": 20.0 }, "evaluation": { - "f1-score": 0.6923076923076923, - "precision": 1.0, + "f1-score": 0.6666666666666666, + "precision": 0.9, "recall": 0.5294117647058824, "support": 17.0 }, "follow_up": { - "f1-score": 0.8571428571428571, - "precision": 0.8823529411764706, - "recall": 0.8333333333333334, + "f1-score": 0.8888888888888888, + "precision": 0.8888888888888888, + "recall": 0.8888888888888888, "support": 36.0 }, "macro avg": { - "f1-score": 0.8770498618135788, - "precision": 0.8988923431325393, - "recall": 0.876671278202288, + "f1-score": 0.8786951095392168, + "precision": 0.9007433723013433, + "recall": 0.8782602497120292, "support": 313.0 }, "onboarding_setup": { @@ -89,9 +89,9 @@ "support": 17.0 }, "product_discovery": { - "f1-score": 0.90625, - "precision": 0.8787878787878788, - "recall": 0.9354838709677419, + "f1-score": 0.9090909090909091, + "precision": 0.8571428571428571, + "recall": 0.967741935483871, "support": 31.0 }, "provider_selection": { @@ -101,33 +101,33 @@ "support": 25.0 }, "purchase": { - "f1-score": 0.8, - "precision": 1.0, + "f1-score": 0.7272727272727273, + "precision": 0.8, "recall": 0.6666666666666666, "support": 6.0 }, "signup": { - "f1-score": 0.8648648648648649, - "precision": 0.7619047619047619, + "f1-score": 0.8888888888888888, + "precision": 0.8, "recall": 1.0, "support": 16.0 }, "task_execution": { - "f1-score": 0.8292682926829268, - "precision": 0.7727272727272727, - "recall": 0.8947368421052632, + "f1-score": 0.8571428571428571, + "precision": 0.782608695652174, + "recall": 0.9473684210526315, "support": 19.0 }, "troubleshooting": { - "f1-score": 0.8, - "precision": 0.8333333333333334, - "recall": 0.7692307692307693, + 
"f1-score": 0.782608695652174, + "precision": 0.9, + "recall": 0.6923076923076923, "support": 13.0 }, "weighted avg": { - "f1-score": 0.894423568060199, - "precision": 0.9063956713482179, - "recall": 0.8977635782747604, + "f1-score": 0.9005147505975646, + "precision": 0.9118244687664052, + "recall": 0.9041533546325878, "support": 313.0 } }, diff --git a/artifacts/evaluation/latest/intent_subtype_val_confusion_matrix.csv b/artifacts/evaluation/latest/intent_subtype_val_confusion_matrix.csv index 833eb8abc8d04eb31f19748c94edb8012823b2a6..27f1d7777abb47fc69ba3374373ec65f8b8ee72a 100644 --- a/artifacts/evaluation/latest/intent_subtype_val_confusion_matrix.csv +++ b/artifacts/evaluation/latest/intent_subtype_val_confusion_matrix.csv @@ -1,12 +1,12 @@ ,education,product_discovery,comparison,evaluation,deal_seeking,provider_selection,signup,purchase,booking,download,contact_sales,task_execution,onboarding_setup,troubleshooting,account_help,billing_help,follow_up,emotional_reflection education,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -product_discovery,0,9,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -comparison,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +product_discovery,0,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +comparison,0,0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0 evaluation,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0 deal_seeking,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0 provider_selection,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0 signup,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0 -purchase,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0 +purchase,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0 booking,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0 download,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 contact_sales,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 @@ -14,6 +14,6 @@ task_execution,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0 onboarding_setup,0,1,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0 troubleshooting,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0 account_help,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0 -billing_help,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 +billing_help,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 
follow_up,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,9,0 emotional_reflection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5 diff --git a/artifacts/evaluation/latest/intent_subtype_val_report.json b/artifacts/evaluation/latest/intent_subtype_val_report.json index 30a623834fc4e573b7c309f7df3df57eb565d43d..96ad039e77173ac9c1c7f75f25768d1c1ec2923a 100644 --- a/artifacts/evaluation/latest/intent_subtype_val_report.json +++ b/artifacts/evaluation/latest/intent_subtype_val_report.json @@ -1,13 +1,13 @@ { - "accepted_accuracy": 0.8608, - "accepted_coverage": 0.9875, - "accuracy": 0.85, + "accepted_accuracy": 0.8625, + "accepted_coverage": 1.0, + "accuracy": 0.8625, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_val_confusion_matrix.csv", "count": 80, "dataset_path": "/content/agentic-intent-classifier/data/subtype/val.jsonl", - "fallback_rate": 0.0125, + "fallback_rate": 0.0, "head": "intent_subtype", - "macro_f1": 0.6722, + "macro_f1": 0.6561, "per_class_metrics": { "account_help": { "f1-score": 0.5, @@ -15,7 +15,7 @@ "recall": 0.5, "support": 2.0 }, - "accuracy": 0.85, + "accuracy": 0.8625, "billing_help": { "f1-score": 0.0, "precision": 0.0, @@ -29,9 +29,9 @@ "support": 3.0 }, "comparison": { - "f1-score": 0.5, - "precision": 0.5, - "recall": 0.5, + "f1-score": 0.4, + "precision": 1.0, + "recall": 0.25, "support": 4.0 }, "contact_sales": { @@ -41,8 +41,8 @@ "support": 0.0 }, "deal_seeking": { - "f1-score": 0.8, - "precision": 0.6666666666666666, + "f1-score": 0.6666666666666666, + "precision": 0.5, "recall": 1.0, "support": 2.0 }, @@ -77,21 +77,21 @@ "support": 11.0 }, "macro avg": { - "f1-score": 0.5974890931031281, - "precision": 0.5811447811447812, - "recall": 0.6353535353535353, + "f1-score": 0.5832054560954817, + "precision": 0.5891975308641975, + "recall": 0.6315656565656567, "support": 80.0 }, "onboarding_setup": { - "f1-score": 0.8888888888888888, - "precision": 1.0, + "f1-score": 0.8, + "precision": 0.8, "recall": 0.8, 
"support": 5.0 }, "product_discovery": { - "f1-score": 0.8571428571428571, - "precision": 0.9, - "recall": 0.8181818181818182, + "f1-score": 0.9565217391304348, + "precision": 0.9166666666666666, + "recall": 1.0, "support": 11.0 }, "provider_selection": { @@ -107,14 +107,14 @@ "support": 2.0 }, "signup": { - "f1-score": 0.8, - "precision": 0.6666666666666666, + "f1-score": 0.6666666666666666, + "precision": 0.5, "recall": 1.0, "support": 2.0 }, "task_execution": { - "f1-score": 0.8421052631578947, - "precision": 0.7272727272727273, + "f1-score": 0.9411764705882353, + "precision": 0.8888888888888888, "recall": 1.0, "support": 8.0 }, @@ -125,9 +125,9 @@ "support": 1.0 }, "weighted avg": { - "f1-score": 0.8380398913951546, - "precision": 0.8423106060606059, - "recall": 0.85, + "f1-score": 0.8443893861892583, + "precision": 0.8649305555555555, + "recall": 0.8625, "support": 80.0 } }, diff --git a/artifacts/evaluation/latest/intent_type_hard_cases_report.json b/artifacts/evaluation/latest/intent_type_hard_cases_report.json index d4434a92551f3a1dc00712b9dff4fb3a342f3af7..775a5de8fea1b2d20db0781e665a289184e6cc22 100644 --- a/artifacts/evaluation/latest/intent_type_hard_cases_report.json +++ b/artifacts/evaluation/latest/intent_type_hard_cases_report.json @@ -1,11 +1,11 @@ { "accepted_accuracy": 1.0, - "accepted_coverage": 0.9836, + "accepted_coverage": 1.0, "accuracy": 1.0, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_hard_cases_confusion_matrix.csv", "count": 61, "dataset_path": "/content/agentic-intent-classifier/data/hard_cases.jsonl", - "fallback_rate": 0.0164, + "fallback_rate": 0.0, "head": "intent_type", "macro_f1": 1.0, "per_class_metrics": { diff --git a/artifacts/evaluation/latest/intent_type_test_confusion_matrix.csv b/artifacts/evaluation/latest/intent_type_test_confusion_matrix.csv index 8ed8bdaf8c350860a22261b3cfd68c081e6415ae..e0fba415dd7603e364d3843fdcafaaf0437f5158 100644 --- 
a/artifacts/evaluation/latest/intent_type_test_confusion_matrix.csv +++ b/artifacts/evaluation/latest/intent_type_test_confusion_matrix.csv @@ -1,11 +1,11 @@ ,informational,exploratory,commercial,transactional,support,personal_reflection,creative_generation,chit_chat,ambiguous,prohibited informational,8,0,0,0,0,0,0,0,0,0 exploratory,0,1,0,0,0,0,0,0,0,0 -commercial,0,0,10,0,0,0,0,0,0,0 -transactional,0,0,0,8,0,0,0,0,0,0 +commercial,1,0,9,0,0,0,0,0,0,0 +transactional,0,0,0,7,0,0,1,0,0,0 support,0,0,0,0,2,0,0,0,0,1 personal_reflection,0,0,0,0,0,5,0,0,0,0 -creative_generation,0,0,0,0,0,0,1,0,0,0 +creative_generation,0,0,0,1,0,0,0,0,0,0 chit_chat,0,0,0,0,0,0,0,1,0,0 ambiguous,1,0,1,0,0,0,0,0,7,0 prohibited,0,0,0,0,0,0,0,0,0,1 diff --git a/artifacts/evaluation/latest/intent_type_test_report.json b/artifacts/evaluation/latest/intent_type_test_report.json index 03178f8f35b3c34944ada9a61a79ab034359731a..2065e59475ab2ac96d486ccbbb15046371eb3203 100644 --- a/artifacts/evaluation/latest/intent_type_test_report.json +++ b/artifacts/evaluation/latest/intent_type_test_report.json @@ -1,15 +1,15 @@ { - "accepted_accuracy": 0.9362, - "accepted_coverage": 1.0, - "accuracy": 0.9362, + "accepted_accuracy": 0.8889, + "accepted_coverage": 0.9574, + "accuracy": 0.8723, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_test_confusion_matrix.csv", "count": 47, "dataset_path": "/content/agentic-intent-classifier/data/test.jsonl", - "fallback_rate": 0.0, + "fallback_rate": 0.0426, "head": "intent_type", - "macro_f1": 0.9235, + "macro_f1": 0.8006, "per_class_metrics": { - "accuracy": 0.9361702127659575, + "accuracy": 0.8723404255319149, "ambiguous": { "f1-score": 0.875, "precision": 1.0, @@ -23,15 +23,15 @@ "support": 1.0 }, "commercial": { - "f1-score": 0.9523809523809523, - "precision": 0.9090909090909091, - "recall": 1.0, + "f1-score": 0.9, + "precision": 0.9, + "recall": 0.9, "support": 10.0 }, "creative_generation": { - "f1-score": 1.0, 
- "precision": 1.0, - "recall": 1.0, + "f1-score": 0.0, + "precision": 0.0, + "recall": 0.0, "support": 1.0 }, "exploratory": { @@ -41,15 +41,15 @@ "support": 1.0 }, "informational": { - "f1-score": 0.9411764705882353, - "precision": 0.8888888888888888, + "f1-score": 0.8888888888888888, + "precision": 0.8, "recall": 1.0, "support": 8.0 }, "macro avg": { - "f1-score": 0.9235224089635853, - "precision": 0.9297979797979797, - "recall": 0.9444444444444444, + "f1-score": 0.8005555555555555, + "precision": 0.8074999999999999, + "recall": 0.8219444444444445, "support": 47.0 }, "personal_reflection": { @@ -71,15 +71,15 @@ "support": 3.0 }, "transactional": { - "f1-score": 1.0, - "precision": 1.0, - "recall": 1.0, + "f1-score": 0.875, + "precision": 0.875, + "recall": 0.875, "support": 8.0 }, "weighted avg": { - "f1-score": 0.9360614458549377, - "precision": 0.9511068128089405, - "recall": 0.9361702127659575, + "f1-score": 0.8734633569739952, + "precision": 0.8914893617021277, + "recall": 0.8723404255319149, "support": 47.0 } }, diff --git a/artifacts/evaluation/latest/intent_type_third_wave_cases_confusion_matrix.csv b/artifacts/evaluation/latest/intent_type_third_wave_cases_confusion_matrix.csv index dbbd11b91a1bf127e1eb34fd2392aca759cfb272..fffa2bbab6f4d5807c46fb306e41cc880aafc2c5 100644 --- a/artifacts/evaluation/latest/intent_type_third_wave_cases_confusion_matrix.csv +++ b/artifacts/evaluation/latest/intent_type_third_wave_cases_confusion_matrix.csv @@ -1,7 +1,7 @@ ,informational,exploratory,commercial,transactional,support,personal_reflection,creative_generation,chit_chat,ambiguous,prohibited informational,0,0,0,0,0,0,0,0,0,0 exploratory,0,1,0,0,0,0,0,0,0,0 -commercial,0,0,12,0,0,0,0,0,0,0 +commercial,1,0,11,0,0,0,0,0,0,0 transactional,0,0,0,0,0,0,0,0,0,0 support,0,0,0,0,0,0,0,0,0,0 personal_reflection,0,0,0,0,0,0,0,0,0,0 diff --git a/artifacts/evaluation/latest/intent_type_third_wave_cases_report.json 
b/artifacts/evaluation/latest/intent_type_third_wave_cases_report.json index 6265d844145b5f42455cb87b9657901a58fea714..af143bc00eed6d5535eb597e699c2bc3fdb3be6f 100644 --- a/artifacts/evaluation/latest/intent_type_third_wave_cases_report.json +++ b/artifacts/evaluation/latest/intent_type_third_wave_cases_report.json @@ -1,15 +1,15 @@ { - "accepted_accuracy": 0.8846, + "accepted_accuracy": 0.8462, "accepted_coverage": 1.0, - "accuracy": 0.8846, + "accuracy": 0.8462, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_third_wave_cases_confusion_matrix.csv", "count": 26, "dataset_path": "/content/agentic-intent-classifier/data/third_wave_cases.jsonl", "fallback_rate": 0.0, "head": "intent_type", - "macro_f1": 0.8209, + "macro_f1": 0.8148, "per_class_metrics": { - "accuracy": 0.8846153846153846, + "accuracy": 0.8461538461538461, "ambiguous": { "f1-score": 0.8235294117647058, "precision": 1.0, @@ -23,9 +23,9 @@ "support": 1.0 }, "commercial": { - "f1-score": 0.9230769230769231, - "precision": 0.8571428571428571, - "recall": 1.0, + "f1-score": 0.88, + "precision": 0.8461538461538461, + "recall": 0.9166666666666666, "support": 12.0 }, "creative_generation": { @@ -47,9 +47,9 @@ "support": 0.0 }, "macro avg": { - "f1-score": 0.5746606334841629, - "precision": 0.5857142857142857, - "recall": 0.5700000000000001, + "f1-score": 0.5703529411764705, + "precision": 0.5846153846153846, + "recall": 0.5616666666666666, "support": 26.0 }, "personal_reflection": { @@ -77,9 +77,9 @@ "support": 0.0 }, "weighted avg": { - "f1-score": 0.8966237382526975, - "precision": 0.9340659340659341, - "recall": 0.8846153846153846, + "f1-score": 0.8767420814479638, + "precision": 0.9289940828402367, + "recall": 0.8461538461538461, "support": 26.0 } }, diff --git a/artifacts/evaluation/latest/intent_type_train_confusion_matrix.csv b/artifacts/evaluation/latest/intent_type_train_confusion_matrix.csv index 
0bccb99598127decfb008208f7a31553bf527feb..8dc8d14c82ea9c51e9cfd3c11514c27bc8fe0968 100644 --- a/artifacts/evaluation/latest/intent_type_train_confusion_matrix.csv +++ b/artifacts/evaluation/latest/intent_type_train_confusion_matrix.csv @@ -7,5 +7,5 @@ support,0,0,0,0,10,0,0,0,0,0 personal_reflection,0,0,0,0,0,20,0,0,0,0 creative_generation,0,0,0,0,0,0,5,0,0,0 chit_chat,0,0,0,0,0,0,0,5,0,0 -ambiguous,0,0,0,0,0,0,0,0,31,0 +ambiguous,0,0,0,0,1,0,0,0,30,0 prohibited,0,0,0,0,0,0,0,0,0,5 diff --git a/artifacts/evaluation/latest/intent_type_train_report.json b/artifacts/evaluation/latest/intent_type_train_report.json index ac713faea97a231464bb9f073a89fc3fde69f1be..95149ca22fc9759b238bf770ddab843399d52f5c 100644 --- a/artifacts/evaluation/latest/intent_type_train_report.json +++ b/artifacts/evaluation/latest/intent_type_train_report.json @@ -1,19 +1,19 @@ { - "accepted_accuracy": 1.0, - "accepted_coverage": 0.9945, - "accuracy": 1.0, + "accepted_accuracy": 0.9945, + "accepted_coverage": 1.0, + "accuracy": 0.9945, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_train_confusion_matrix.csv", "count": 183, "dataset_path": "/content/agentic-intent-classifier/data/train.jsonl", - "fallback_rate": 0.0055, + "fallback_rate": 0.0, "head": "intent_type", - "macro_f1": 1.0, + "macro_f1": 0.9936, "per_class_metrics": { - "accuracy": 1.0, + "accuracy": 0.994535519125683, "ambiguous": { - "f1-score": 1.0, + "f1-score": 0.9836065573770492, "precision": 1.0, - "recall": 1.0, + "recall": 0.967741935483871, "support": 31.0 }, "chit_chat": { @@ -47,9 +47,9 @@ "support": 38.0 }, "macro avg": { - "f1-score": 1.0, - "precision": 1.0, - "recall": 1.0, + "f1-score": 0.9935987509758002, + "precision": 0.990909090909091, + "recall": 0.9967741935483871, "support": 183.0 }, "personal_reflection": { @@ -65,8 +65,8 @@ "support": 5.0 }, "support": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.9523809523809523, + "precision": 
0.9090909090909091, "recall": 1.0, "support": 10.0 }, @@ -77,9 +77,9 @@ "support": 28.0 }, "weighted avg": { - "f1-score": 1.0, - "precision": 1.0, - "recall": 1.0, + "f1-score": 0.9946208349863281, + "precision": 0.9950322901142573, + "recall": 0.994535519125683, "support": 183.0 } }, diff --git a/artifacts/evaluation/latest/intent_type_val_confusion_matrix.csv b/artifacts/evaluation/latest/intent_type_val_confusion_matrix.csv index 15483dd9c56df7dc4959e423e64aea34c7bf95a8..e729131ddbc7ee65939f5606aae08f28a8bb16aa 100644 --- a/artifacts/evaluation/latest/intent_type_val_confusion_matrix.csv +++ b/artifacts/evaluation/latest/intent_type_val_confusion_matrix.csv @@ -2,8 +2,8 @@ informational,8,0,0,0,0,0,0,0,0,0 exploratory,0,1,0,0,0,0,0,0,0,0 commercial,0,1,9,0,0,0,0,0,0,0 -transactional,0,0,0,7,0,0,1,0,0,0 -support,0,0,0,0,3,0,0,0,0,0 +transactional,0,0,0,5,0,0,3,0,0,0 +support,0,0,0,0,2,0,0,0,0,1 personal_reflection,0,0,0,0,0,5,0,0,0,0 creative_generation,0,0,0,0,0,0,1,0,0,0 chit_chat,0,0,0,0,0,0,0,1,0,0 diff --git a/artifacts/evaluation/latest/intent_type_val_report.json b/artifacts/evaluation/latest/intent_type_val_report.json index a24406a8d18fb7daea6251a5ad22a8b1507743cf..9015d7ddf61a7c57db7f8f54178ed8b15f9a6ee0 100644 --- a/artifacts/evaluation/latest/intent_type_val_report.json +++ b/artifacts/evaluation/latest/intent_type_val_report.json @@ -1,15 +1,15 @@ { - "accepted_accuracy": 0.9362, - "accepted_coverage": 1.0, - "accuracy": 0.9362, + "accepted_accuracy": 0.8913, + "accepted_coverage": 0.9787, + "accuracy": 0.8723, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_val_confusion_matrix.csv", "count": 47, "dataset_path": "/content/agentic-intent-classifier/data/val.jsonl", - "fallback_rate": 0.0, + "fallback_rate": 0.0213, "head": "intent_type", - "macro_f1": 0.9108, + "macro_f1": 0.8144, "per_class_metrics": { - "accuracy": 0.9361702127659575, + "accuracy": 0.8723404255319149, "ambiguous": { "f1-score": 
0.9411764705882353, "precision": 1.0, @@ -29,8 +29,8 @@ "support": 10.0 }, "creative_generation": { - "f1-score": 0.6666666666666666, - "precision": 0.5, + "f1-score": 0.4, + "precision": 0.25, "recall": 1.0, "support": 1.0 }, @@ -47,9 +47,9 @@ "support": 8.0 }, "macro avg": { - "f1-score": 0.9107843137254902, - "precision": 0.89, - "recall": 0.966388888888889, + "f1-score": 0.8143740573152337, + "precision": 0.8150000000000001, + "recall": 0.9080555555555556, "support": 47.0 }, "personal_reflection": { @@ -59,27 +59,27 @@ "support": 5.0 }, "prohibited": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.6666666666666666, + "precision": 0.5, "recall": 1.0, "support": 1.0 }, "support": { - "f1-score": 1.0, + "f1-score": 0.8, "precision": 1.0, - "recall": 1.0, + "recall": 0.6666666666666666, "support": 3.0 }, "transactional": { - "f1-score": 0.9333333333333333, + "f1-score": 0.7692307692307693, "precision": 1.0, - "recall": 0.875, + "recall": 0.625, "support": 8.0 }, "weighted avg": { - "f1-score": 0.9419274092615769, - "precision": 0.9574468085106383, - "recall": 0.9361702127659575, + "f1-score": 0.8884631430313531, + "precision": 0.9414893617021277, + "recall": 0.8723404255319149, "support": 47.0 } }, diff --git a/artifacts/evaluation/latest/summary.json b/artifacts/evaluation/latest/summary.json index fb6fb4007d7953cf009ecfa3909939ce5231639e..fc3bdef671cdb58f12083bb6ed562eb2544f5a51 100644 --- a/artifacts/evaluation/latest/summary.json +++ b/artifacts/evaluation/latest/summary.json @@ -21,7 +21,7 @@ "results": [ { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Automotive", "model_output.classification.iab_content.tier2.label": null }, @@ -32,11 +32,6 @@ }, "id": "car-buying-maps-to-automotive-buying", "mismatches": [ - { - "actual": "exact", - "expected": "nearest_equivalent", - "path": 
"model_output.classification.iab_content.mapping_mode" - }, { "actual": null, "expected": "Auto Type", @@ -114,8 +109,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -129,6 +124,16 @@ "actual": "exact", "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "CRM education should resolve to the closest business/sales path, not generic software.", @@ -138,9 +143,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Robotics", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -152,12 +157,7 @@ "id": "crm-comparison-maps-to-sales", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Robotics", + "actual": null, "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -174,9 +174,9 @@ }, { "actual": { 
- "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Careers", - "model_output.classification.iab_content.tier2.label": "Job Search", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -193,12 +193,7 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Job Search", + "actual": null, "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -216,7 +211,7 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Science" + "model_output.classification.iab_content.tier1.label": "Real Estate" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -225,7 +220,7 @@ "id": "ml-explanation-maps-to-ai", "mismatches": [ { - "actual": "Science", + "actual": "Real Estate", "expected": "Technology & Computing", "path": "model_output.classification.iab_content.tier1.label" } @@ -237,10 +232,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Information and Network Security" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Personal Finance", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { 
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -251,12 +246,17 @@ "id": "support-credential-help-maps-to-business-it", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": "Personal Finance", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Information and Network Security", + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, "expected": "Internet", "path": "model_output.classification.iab_content.tier3.label" } @@ -292,9 +292,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Sensitive Topics", - "model_output.classification.iab_content.tier2.label": "Crime & Harmful Acts to Individuals, Society & Human Right Violations", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Sports", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -306,17 +306,12 @@ "id": "trial-signup-maps-to-software", "mismatches": [ { - "actual": "Sensitive Topics", + "actual": "Sports", "expected": "Hobbies & Interests", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Crime & Harmful Acts to Individuals, Society & Human Right Violations", + "actual": null, "expected": "Content Production", "path": "model_output.classification.iab_content.tier2.label" }, @@ -406,21 +401,21 @@ "iab_cross_vertical_behavior_lock_regression": { "by_status": { "must_fix": { - "failed": 90, - "passed": 
0, + "failed": 88, + "passed": 2, "total": 90 } }, "cases_path": "/content/agentic-intent-classifier/examples/iab_cross_vertical_behavior_lock_cases.json", "count": 90, - "failed": 90, - "passed": 0, + "failed": 88, + "passed": 2, "results": [ { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Personal Finance", - "model_output.classification.iab_content.tier2.label": "Insurance" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Travel", + "model_output.classification.iab_content.tier2.label": "Travel Type" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -428,31 +423,15 @@ "model_output.classification.iab_content.tier2.label": "Travel Type" }, "id": "auto-buying-easy", - "mismatches": [ - { - "actual": "Personal Finance", - "expected": "Travel", - "path": "model_output.classification.iab_content.tier1.label" - }, - { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Insurance", - "expected": "Travel Type", - "path": "model_output.classification.iab_content.tier2.label" - } - ], + "mismatches": [], "notes": "Cross-vertical easy IAB mapping case for Automotive > Auto Buying and Selling.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Which car should I buy for commuting?" 
}, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Automotive", "model_output.classification.iab_content.tier2.label": "Auto Body Styles" }, @@ -462,23 +441,17 @@ "model_output.classification.iab_content.tier2.label": "Auto Body Styles" }, "id": "auto-buying-medium", - "mismatches": [ - { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical medium IAB mapping case for Automotive > Auto Buying and Selling.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Best used SUV for a family of four" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Automotive", - "model_output.classification.iab_content.tier2.label": "Car Culture" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -488,12 +461,7 @@ "id": "auto-buying-hard", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Car Culture", + "actual": null, "expected": "Auto Type", "path": "model_output.classification.iab_content.tier2.label" } @@ -507,8 +475,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier2.label": null, + 
"model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -522,6 +490,16 @@ "actual": "exact", "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical easy IAB mapping case for Business and Finance > Business > Sales.", @@ -531,9 +509,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Robotics", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -545,12 +523,7 @@ "id": "sales-crm-medium", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Robotics", + "actual": null, "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -570,7 +543,7 @@ "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Business and Finance", "model_output.classification.iab_content.tier2.label": "Business", - "model_output.classification.iab_content.tier3.label": "Sales" + "model_output.classification.iab_content.tier3.label": "Startups" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -584,6 +557,11 @@ "actual": "exact", "expected": "nearest_equivalent", "path": 
"model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Startups", + "expected": "Sales", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical hard IAB mapping case for Business and Finance > Business > Sales.", @@ -593,9 +571,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Careers", - "model_output.classification.iab_content.tier2.label": "Job Search", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Technology & Computing", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -607,17 +585,7 @@ "id": "marketing-tools-easy", "mismatches": [ { - "actual": "Careers", - "expected": "Technology & Computing", - "path": "model_output.classification.iab_content.tier1.label" - }, - { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Job Search", + "actual": null, "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -634,9 +602,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Sensitive Topics", - "model_output.classification.iab_content.tier2.label": "Terrorism" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Technology & Computing", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -646,17 +614,12 @@ "id": "marketing-tools-medium", "mismatches": [ { - "actual": "Sensitive Topics", + "actual": "Technology & 
Computing", "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Terrorism", + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" } @@ -668,9 +631,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Personal Finance", - "model_output.classification.iab_content.tier2.label": "Home Utilities", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -682,17 +645,12 @@ "id": "marketing-tools-hard", "mismatches": [ { - "actual": "Personal Finance", + "actual": "Careers", "expected": "Technology & Computing", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Home Utilities", + "actual": null, "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -709,10 +667,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Information and Network Security" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": null, + 
"model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -723,12 +681,17 @@ "id": "business-it-easy", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Information and Network Security", + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, "expected": "Internet", "path": "model_output.classification.iab_content.tier3.label" } @@ -740,9 +703,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Careers", - "model_output.classification.iab_content.tier2.label": "Job Search" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Personal Finance", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -752,9 +715,14 @@ "id": "business-it-medium", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": "Personal Finance", + "expected": "Careers", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": "Job Search", + "path": "model_output.classification.iab_content.tier2.label" } ], "notes": "Cross-vertical medium IAB mapping case for Business and Finance > Business > Business I.T..", @@ -820,8 +788,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": 
"Attractions", - "model_output.classification.iab_content.tier2.label": "Bars & Restaurants" + "model_output.classification.iab_content.tier1.label": "Food & Drink", + "model_output.classification.iab_content.tier2.label": "Dining Out" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -830,20 +798,10 @@ }, "id": "dining-out-medium", "mismatches": [ - { - "actual": "Attractions", - "expected": "Food & Drink", - "path": "model_output.classification.iab_content.tier1.label" - }, { "actual": "exact", "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Bars & Restaurants", - "expected": "Dining Out", - "path": "model_output.classification.iab_content.tier2.label" } ], "notes": "Cross-vertical medium IAB mapping case for Food & Drink > Dining Out.", @@ -854,8 +812,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Attractions", - "model_output.classification.iab_content.tier2.label": "Bars & Restaurants" + "model_output.classification.iab_content.tier1.label": "Food & Drink", + "model_output.classification.iab_content.tier2.label": "Dining Out" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -864,20 +822,10 @@ }, "id": "dining-out-hard", "mismatches": [ - { - "actual": "Attractions", - "expected": "Food & Drink", - "path": "model_output.classification.iab_content.tier1.label" - }, { "actual": "exact", "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Bars & Restaurants", - "expected": "Dining Out", - "path": "model_output.classification.iab_content.tier2.label" } ], "notes": "Cross-vertical hard IAB mapping case for Food & Drink > Dining Out.", @@ -954,7 +902,7 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": 
"nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Science" + "model_output.classification.iab_content.tier1.label": "Real Estate" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -963,7 +911,7 @@ "id": "artificial-intelligence-easy", "mismatches": [ { - "actual": "Science", + "actual": "Real Estate", "expected": "Technology & Computing", "path": "model_output.classification.iab_content.tier1.label" } @@ -975,9 +923,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Education", - "model_output.classification.iab_content.tier2.label": "Language Learning" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -987,11 +935,11 @@ "id": "artificial-intelligence-medium", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "actual": null, + "expected": "Language Learning", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Artificial Intelligence.", "pass": false, "status": "must_fix", @@ -1024,8 +972,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing" + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": "Job Search" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1035,7 +983,7 @@ "id": "software-apps-easy", "mismatches": [ { 
- "actual": "Technology & Computing", + "actual": "Careers", "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, @@ -1045,7 +993,7 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Computing", + "actual": "Job Search", "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" } @@ -1059,8 +1007,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null, "model_output.classification.iab_content.tier4.label": null }, "expected": { @@ -1078,7 +1026,12 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Software and Applications", + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, "expected": "Internet", "path": "model_output.classification.iab_content.tier3.label" }, @@ -1096,8 +1049,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Virtual Reality", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": "Job Search", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -1108,13 +1061,18 @@ }, "id": "software-apps-hard", "mismatches": [ + { + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + }, { "actual": "exact", 
"expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Virtual Reality", + "actual": "Job Search", "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -1179,10 +1137,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Information and Network Security", + "model_output.classification.iab_content.tier3.label": null, "model_output.classification.iab_content.tier4.label": null }, "expected": { @@ -1195,12 +1153,7 @@ "id": "communication-software-medium", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Information and Network Security", + "actual": null, "expected": "Software and Applications", "path": "model_output.classification.iab_content.tier3.label" }, @@ -1217,9 +1170,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Virtual Reality", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null, "model_output.classification.iab_content.tier4.label": null }, @@ -1233,12 +1186,12 @@ "id": "communication-software-hard", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": 
"model_output.classification.iab_content.mapping_mode" + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Virtual Reality", + "actual": null, "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -1263,8 +1216,8 @@ "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet", - "model_output.classification.iab_content.tier4.label": "Web Hosting" + "model_output.classification.iab_content.tier3.label": "Data Storage and Warehousing", + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1279,6 +1232,16 @@ "actual": "exact", "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Data Storage and Warehousing", + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Web Hosting", + "path": "model_output.classification.iab_content.tier4.label" } ], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Computing > Internet > Web Hosting.", @@ -1288,11 +1251,11 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet", - "model_output.classification.iab_content.tier4.label": "Web Hosting" + 
"model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1304,9 +1267,19 @@ "id": "web-hosting-medium", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Web Hosting", + "path": "model_output.classification.iab_content.tier4.label" } ], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Internet > Web Hosting.", @@ -1316,11 +1289,11 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet", - "model_output.classification.iab_content.tier4.label": "Web Hosting" + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1332,9 +1305,14 @@ "id": "web-hosting-hard", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": null, + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Web Hosting", + "path": 
"model_output.classification.iab_content.tier4.label" } ], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Internet > Web Hosting.", @@ -1398,8 +1376,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Laptops" + "model_output.classification.iab_content.tier2.label": "Consumer Electronics", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1415,12 +1393,7 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Computing", - "expected": "Consumer Electronics", - "path": "model_output.classification.iab_content.tier2.label" - }, - { - "actual": "Laptops", + "actual": null, "expected": "Smartphones", "path": "model_output.classification.iab_content.tier3.label" } @@ -1432,11 +1405,11 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications", - "model_output.classification.iab_content.tier4.label": "Computer Animation" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1448,12 +1421,17 @@ "id": "desktops-easy", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - 
"path": "model_output.classification.iab_content.mapping_mode" + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" }, { - "actual": "Computer Animation", + "actual": null, "expected": "Photo Editing Software", "path": "model_output.classification.iab_content.tier4.label" } @@ -1465,10 +1443,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Desktops" + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1479,9 +1457,9 @@ "id": "desktops-medium", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": null, + "expected": "Desktops", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Desktops.", @@ -1491,10 +1469,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Desktops" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { 
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1505,9 +1483,14 @@ "id": "desktops-hard", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Desktops", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Desktops.", @@ -1593,10 +1576,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": "Men's Fashion", - "model_output.classification.iab_content.tier3.label": "Men's Shoes and Footwear" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Shopping", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1607,17 +1590,17 @@ "id": "style-fashion-parent-easy", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": "Shopping", + "expected": "Style & Fashion", + "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Men's Fashion", + "actual": null, "expected": "Women's Fashion", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Men's Shoes and Footwear", + "actual": null, "expected": "Women's Shoes and Footwear", "path": "model_output.classification.iab_content.tier3.label" } @@ -1691,9 +1674,9 @@ }, { "actual": { - 
"model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Sports", - "model_output.classification.iab_content.tier2.label": "Bodybuilding", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Style & Fashion", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -1705,17 +1688,7 @@ "id": "womens-shoes-easy", "mismatches": [ { - "actual": "Sports", - "expected": "Style & Fashion", - "path": "model_output.classification.iab_content.tier1.label" - }, - { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Bodybuilding", + "actual": null, "expected": "Women's Fashion", "path": "model_output.classification.iab_content.tier2.label" }, @@ -1732,9 +1705,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Sports", - "model_output.classification.iab_content.tier2.label": "Walking", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Style & Fashion", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -1746,17 +1719,7 @@ "id": "womens-shoes-medium", "mismatches": [ { - "actual": "Sports", - "expected": "Style & Fashion", - "path": "model_output.classification.iab_content.tier1.label" - }, - { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Walking", + "actual": null, "expected": "Women's Fashion", "path": "model_output.classification.iab_content.tier2.label" }, @@ -1801,8 +1764,8 @@ 
"actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": "Children's Clothing", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier2.label": "Women's Fashion", + "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1818,12 +1781,12 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Children's Clothing", + "actual": "Women's Fashion", "expected": "Men's Fashion", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": null, + "actual": "Women's Shoes and Footwear", "expected": "Men's Clothing", "path": "model_output.classification.iab_content.tier3.label" } @@ -1837,8 +1800,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": "Men's Fashion", - "model_output.classification.iab_content.tier3.label": "Men's Shoes and Footwear" + "model_output.classification.iab_content.tier2.label": "Women's Fashion", + "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1854,7 +1817,12 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Men's Shoes and Footwear", + "actual": "Women's Fashion", + "expected": "Men's Fashion", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": "Women's Shoes and Footwear", "expected": "Men's Clothing", "path": "model_output.classification.iab_content.tier3.label" } @@ -1868,8 +1836,8 @@ "actual": { 
"model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": "Children's Clothing", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier2.label": "Women's Fashion", + "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1885,12 +1853,12 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Children's Clothing", + "actual": "Women's Fashion", "expected": "Men's Fashion", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": null, + "actual": "Women's Shoes and Footwear", "expected": "Men's Shoes and Footwear", "path": "model_output.classification.iab_content.tier3.label" } @@ -1936,10 +1904,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Travel", - "model_output.classification.iab_content.tier2.label": "Travel Type", - "model_output.classification.iab_content.tier3.label": "Hotels and Motels" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1950,9 +1918,14 @@ "id": "hotels-medium", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": null, + "expected": "Travel Type", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Hotels and Motels", + "path": 
"model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical medium IAB mapping case for Travel > Travel Type > Hotels and Motels.", @@ -1964,7 +1937,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Travel", - "model_output.classification.iab_content.tier2.label": "Travel Type" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1977,6 +1950,11 @@ "actual": "exact", "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Travel Type", + "path": "model_output.classification.iab_content.tier2.label" } ], "notes": "Cross-vertical hard IAB mapping case for Travel > Travel Type > Hotels and Motels.", @@ -2077,8 +2055,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Healthy Living", - "model_output.classification.iab_content.tier2.label": "Fitness and Exercise", - "model_output.classification.iab_content.tier3.label": "Running and Jogging" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2099,12 +2077,12 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Fitness and Exercise", + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Running and Jogging", + "actual": null, "expected": "Green Solutions", "path": "model_output.classification.iab_content.tier3.label" } @@ -2118,7 +2096,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": 
"Sports", - "model_output.classification.iab_content.tier2.label": "Walking", + "model_output.classification.iab_content.tier2.label": "Bodybuilding", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -2140,7 +2118,7 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Walking", + "actual": "Bodybuilding", "expected": "Fitness and Exercise", "path": "model_output.classification.iab_content.tier2.label" }, @@ -2302,9 +2280,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Travel", - "model_output.classification.iab_content.tier2.label": "Travel Type" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2319,12 +2297,7 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Travel Type", + "actual": null, "expected": "Fiction", "path": "model_output.classification.iab_content.tier2.label" } @@ -2337,7 +2310,7 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Books and Literature" + "model_output.classification.iab_content.tier1.label": "Genres" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2345,6 +2318,11 @@ }, "id": "fiction-hard", "mismatches": [ + { + "actual": "Genres", + "expected": "Books and Literature", + "path": "model_output.classification.iab_content.tier1.label" + }, { "actual": "exact", "expected": "nearest_equivalent", @@ -2360,7 +2338,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", 
"model_output.classification.iab_content.tier1.label": "Home & Garden", - "model_output.classification.iab_content.tier2.label": "Interior Decorating" + "model_output.classification.iab_content.tier2.label": "Remodeling & Construction" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2373,11 +2351,6 @@ "actual": "exact", "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Interior Decorating", - "expected": "Remodeling & Construction", - "path": "model_output.classification.iab_content.tier2.label" } ], "notes": "Cross-vertical easy IAB mapping case for Home & Garden > Home Improvement.", @@ -2388,9 +2361,9 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Home & Garden", - "model_output.classification.iab_content.tier2.label": "Interior Decorating", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier1.label": "Style & Fashion", + "model_output.classification.iab_content.tier2.label": "Personal Care", + "model_output.classification.iab_content.tier3.label": "Bath and Shower" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2400,25 +2373,10 @@ }, "id": "home-improvement-medium", "mismatches": [ - { - "actual": "Home & Garden", - "expected": "Style & Fashion", - "path": "model_output.classification.iab_content.tier1.label" - }, { "actual": "exact", "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Interior Decorating", - "expected": "Personal Care", - "path": "model_output.classification.iab_content.tier2.label" - }, - { - "actual": null, - "expected": "Bath and Shower", - "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical medium IAB mapping case for 
Home & Garden > Home Improvement.", @@ -2462,9 +2420,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Augmented Reality" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2474,17 +2432,12 @@ "id": "online-education-easy", "mismatches": [ { - "actual": "Technology & Computing", + "actual": "Careers", "expected": "Education", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Augmented Reality", + "actual": null, "expected": "Language Learning", "path": "model_output.classification.iab_content.tier2.label" } @@ -2623,10 +2576,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Medical Health", - "model_output.classification.iab_content.tier2.label": "Diseases and Conditions", - "model_output.classification.iab_content.tier3.label": "Allergies" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Food & Drink", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2637,9 +2590,19 @@ "id": "medical-health-easy", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": 
"model_output.classification.iab_content.mapping_mode" + "actual": "Food & Drink", + "expected": "Medical Health", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": "Diseases and Conditions", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Allergies", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical easy IAB mapping case for Medical Health.", @@ -2649,10 +2612,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Medical Health", "model_output.classification.iab_content.tier2.label": "Diseases and Conditions", - "model_output.classification.iab_content.tier3.label": "Bone and Joint Conditions", + "model_output.classification.iab_content.tier3.label": null, "model_output.classification.iab_content.tier4.label": null }, "expected": { @@ -2665,12 +2628,7 @@ "id": "medical-health-medium", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Bone and Joint Conditions", + "actual": null, "expected": "Injuries", "path": "model_output.classification.iab_content.tier3.label" }, @@ -2689,7 +2647,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Medical Health", - "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier2.label": "Surgery", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -2711,7 +2669,7 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": null, + "actual": "Surgery", "expected": "Wellness", "path": 
"model_output.classification.iab_content.tier2.label" }, @@ -2812,9 +2770,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Holidays", - "model_output.classification.iab_content.tier2.label": "National & Civic Holidays" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2829,12 +2787,7 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "National & Civic Holidays", + "actual": null, "expected": "Food Movements", "path": "model_output.classification.iab_content.tier2.label" } @@ -2935,9 +2888,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Genres", - "model_output.classification.iab_content.tier2.label": "Family/Children" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Shopping", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2947,17 +2900,12 @@ "id": "parenting-medium", "mismatches": [ { - "actual": "Genres", + "actual": "Shopping", "expected": "Family and Relationships", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Family/Children", + "actual": null, "expected": "Parenting", "path": "model_output.classification.iab_content.tier2.label" } @@ -2972,7 +2920,7 @@ 
"model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Family and Relationships", "model_output.classification.iab_content.tier2.label": "Parenting", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier3.label": "Parenting Babies and Toddlers" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2988,7 +2936,7 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": null, + "actual": "Parenting Babies and Toddlers", "expected": "Special Needs Kids", "path": "model_output.classification.iab_content.tier3.label" } @@ -3070,7 +3018,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Entertainment", - "model_output.classification.iab_content.tier2.label": "Movies" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -3083,6 +3031,11 @@ "actual": "exact", "expected": "nearest_equivalent", "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Movies", + "path": "model_output.classification.iab_content.tier2.label" } ], "notes": "Cross-vertical easy IAB mapping case for Entertainment > Movies.", @@ -3093,8 +3046,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Entertainment", - "model_output.classification.iab_content.tier2.label": "Movies", + "model_output.classification.iab_content.tier1.label": "Genres", + "model_output.classification.iab_content.tier2.label": "Horror", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -3106,7 +3059,7 @@ "id": "movies-medium", "mismatches": [ { - "actual": "Entertainment", + "actual": 
"Genres", "expected": "Video Gaming", "path": "model_output.classification.iab_content.tier1.label" }, @@ -3116,7 +3069,7 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Movies", + "actual": "Horror", "expected": "Video Game Genres", "path": "model_output.classification.iab_content.tier2.label" }, @@ -3172,21 +3125,21 @@ "iab_cross_vertical_quality_target_eval": { "by_status": { "must_fix": { - "failed": 49, - "passed": 41, + "failed": 64, + "passed": 26, "total": 90 } }, "cases_path": "/content/agentic-intent-classifier/examples/iab_cross_vertical_mapping_cases.json", "count": 90, - "failed": 49, - "passed": 41, + "failed": 64, + "passed": 26, "results": [ { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Personal Finance", - "model_output.classification.iab_content.tier2.label": "Insurance" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Travel", + "model_output.classification.iab_content.tier2.label": "Travel Type" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -3196,17 +3149,12 @@ "id": "auto-buying-easy", "mismatches": [ { - "actual": "Personal Finance", + "actual": "Travel", "expected": "Automotive", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Insurance", + "actual": "Travel Type", "expected": "Auto Buying and Selling", "path": "model_output.classification.iab_content.tier2.label" } @@ -3218,7 +3166,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Automotive", 
"model_output.classification.iab_content.tier2.label": "Auto Body Styles" }, @@ -3229,11 +3177,6 @@ }, "id": "auto-buying-medium", "mismatches": [ - { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, { "actual": "Auto Body Styles", "expected": "Auto Buying and Selling", @@ -3247,9 +3190,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Automotive", - "model_output.classification.iab_content.tier2.label": "Car Culture" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -3259,12 +3202,7 @@ "id": "auto-buying-hard", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Car Culture", + "actual": null, "expected": "Auto Buying and Selling", "path": "model_output.classification.iab_content.tier2.label" } @@ -3278,8 +3216,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3295,12 +3233,12 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Computing", + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Software and Applications", + "actual": 
null, "expected": "Sales", "path": "model_output.classification.iab_content.tier3.label" } @@ -3312,9 +3250,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Robotics", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -3331,7 +3269,12 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Robotics", + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, @@ -3351,7 +3294,7 @@ "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Business and Finance", "model_output.classification.iab_content.tier2.label": "Business", - "model_output.classification.iab_content.tier3.label": "Sales" + "model_output.classification.iab_content.tier3.label": "Startups" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3360,17 +3303,23 @@ "model_output.classification.iab_content.tier3.label": "Sales" }, "id": "sales-crm-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "Startups", + "expected": "Sales", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Business and Finance > Business > Sales.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need software to manage leads and pipeline for a startup sales team" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - 
"model_output.classification.iab_content.tier1.label": "Careers", - "model_output.classification.iab_content.tier2.label": "Job Search", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Technology & Computing", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -3382,12 +3331,17 @@ "id": "marketing-tools-easy", "mismatches": [ { - "actual": "Careers", + "actual": "Technology & Computing", "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Job Search", + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, @@ -3404,9 +3358,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Sensitive Topics", - "model_output.classification.iab_content.tier2.label": "Terrorism", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Technology & Computing", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -3418,12 +3372,17 @@ "id": "marketing-tools-medium", "mismatches": [ { - "actual": "Sensitive Topics", + "actual": "Technology & Computing", "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Terrorism", + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, "expected": "Business", "path": 
"model_output.classification.iab_content.tier2.label" }, @@ -3440,9 +3399,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Personal Finance", - "model_output.classification.iab_content.tier2.label": "Home Utilities", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -3454,12 +3413,17 @@ "id": "marketing-tools-hard", "mismatches": [ { - "actual": "Personal Finance", + "actual": "Careers", "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Home Utilities", + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, @@ -3476,10 +3440,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Information and Network Security" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3490,17 +3454,22 @@ "id": "business-it-easy", "mismatches": [ { - "actual": "Technology & Computing", + "actual": "Careers", "expected": "Business and Finance", "path": 
"model_output.classification.iab_content.tier1.label" }, { - "actual": "Computing", + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Information and Network Security", + "actual": null, "expected": "Business I.T.", "path": "model_output.classification.iab_content.tier3.label" } @@ -3512,9 +3481,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Careers", - "model_output.classification.iab_content.tier2.label": "Job Search", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Personal Finance", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -3526,12 +3495,17 @@ "id": "business-it-medium", "mismatches": [ { - "actual": "Careers", + "actual": "Personal Finance", "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Job Search", + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, @@ -3603,8 +3577,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Attractions", - "model_output.classification.iab_content.tier2.label": "Bars & Restaurants" + "model_output.classification.iab_content.tier1.label": "Food & Drink", + "model_output.classification.iab_content.tier2.label": "Dining Out" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ 
-3612,28 +3586,17 @@ "model_output.classification.iab_content.tier2.label": "Dining Out" }, "id": "dining-out-medium", - "mismatches": [ - { - "actual": "Attractions", - "expected": "Food & Drink", - "path": "model_output.classification.iab_content.tier1.label" - }, - { - "actual": "Bars & Restaurants", - "expected": "Dining Out", - "path": "model_output.classification.iab_content.tier2.label" - } - ], + "mismatches": [], "notes": "Cross-vertical medium IAB mapping case for Food & Drink > Dining Out.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Good restaurants for a client dinner downtown" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Attractions", - "model_output.classification.iab_content.tier2.label": "Bars & Restaurants" + "model_output.classification.iab_content.tier1.label": "Food & Drink", + "model_output.classification.iab_content.tier2.label": "Dining Out" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3641,20 +3604,9 @@ "model_output.classification.iab_content.tier2.label": "Dining Out" }, "id": "dining-out-hard", - "mismatches": [ - { - "actual": "Attractions", - "expected": "Food & Drink", - "path": "model_output.classification.iab_content.tier1.label" - }, - { - "actual": "Bars & Restaurants", - "expected": "Dining Out", - "path": "model_output.classification.iab_content.tier2.label" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Food & Drink > Dining Out.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Need a place to eat tonight where I can make a reservation online" }, @@ -3721,7 +3673,7 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Science", + "model_output.classification.iab_content.tier1.label": "Real Estate", 
"model_output.classification.iab_content.tier2.label": null }, "expected": { @@ -3732,7 +3684,7 @@ "id": "artificial-intelligence-easy", "mismatches": [ { - "actual": "Science", + "actual": "Real Estate", "expected": "Technology & Computing", "path": "model_output.classification.iab_content.tier1.label" }, @@ -3754,9 +3706,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Education", - "model_output.classification.iab_content.tier2.label": "Language Learning" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3771,7 +3723,12 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Language Learning", + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, "expected": "Artificial Intelligence", "path": "model_output.classification.iab_content.tier2.label" } @@ -3813,9 +3770,9 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": "Job Search", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3824,9 +3781,25 @@ "model_output.classification.iab_content.tier3.label": "Software and Applications" }, "id": "software-apps-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "Careers", + "expected": 
"Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "Job Search", + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Computing > Computer Software and Applications.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best workflow software for a small operations team" }, @@ -3834,8 +3807,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3844,17 +3817,28 @@ "model_output.classification.iab_content.tier3.label": "Software and Applications" }, "id": "software-apps-medium", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Computer Software and Applications.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need project management software for a distributed team" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - 
"model_output.classification.iab_content.tier2.label": "Virtual Reality", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": "Job Search", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -3866,7 +3850,12 @@ "id": "software-apps-hard", "mismatches": [ { - "actual": "Virtual Reality", + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "Job Search", "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -3926,10 +3915,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Information and Network Security", + "model_output.classification.iab_content.tier3.label": null, "model_output.classification.iab_content.tier4.label": null }, "expected": { @@ -3942,7 +3931,12 @@ "id": "communication-software-medium", "mismatches": [ { - "actual": "Information and Network Security", + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, "expected": "Software and Applications", "path": "model_output.classification.iab_content.tier3.label" }, @@ -3959,9 +3953,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Virtual Reality", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": 
"Careers", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null, "model_output.classification.iab_content.tier4.label": null }, @@ -3975,7 +3969,17 @@ "id": "communication-software-hard", "mismatches": [ { - "actual": "Virtual Reality", + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -4000,8 +4004,8 @@ "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet", - "model_output.classification.iab_content.tier4.label": "Web Hosting" + "model_output.classification.iab_content.tier3.label": "Data Storage and Warehousing", + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4011,19 +4015,30 @@ "model_output.classification.iab_content.tier4.label": "Web Hosting" }, "id": "web-hosting-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "Data Storage and Warehousing", + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Web Hosting", + "path": "model_output.classification.iab_content.tier4.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Computing > Internet > Web Hosting.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Vercel vs Netlify for website hosting" }, { "actual": { - 
"model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet", - "model_output.classification.iab_content.tier4.label": "Web Hosting" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4033,19 +4048,40 @@ "model_output.classification.iab_content.tier4.label": "Web Hosting" }, "id": "web-hosting-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Web Hosting", + "path": "model_output.classification.iab_content.tier4.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Internet > Web Hosting.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best hosting platform for a startup website" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet", - 
"model_output.classification.iab_content.tier4.label": "Web Hosting" + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4055,9 +4091,25 @@ "model_output.classification.iab_content.tier4.label": "Web Hosting" }, "id": "web-hosting-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Web Hosting", + "path": "model_output.classification.iab_content.tier4.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Internet > Web Hosting.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need a managed hosting provider to deploy and run our marketing site" }, @@ -4105,8 +4157,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Laptops" + "model_output.classification.iab_content.tier2.label": "Consumer Electronics", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4115,18 +4167,29 @@ "model_output.classification.iab_content.tier3.label": "Laptops" }, "id": "laptops-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "Consumer Electronics", + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Laptops", + "path": "model_output.classification.iab_content.tier3.label" 
+ } + ], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Laptops.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need a portable computer with good battery life for everyday work" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4137,7 +4200,17 @@ "id": "desktops-easy", "mismatches": [ { - "actual": "Software and Applications", + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, "expected": "Desktops", "path": "model_output.classification.iab_content.tier3.label" } @@ -4149,10 +4222,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Desktops" + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4161,18 +4234,29 @@ "model_output.classification.iab_content.tier3.label": "Desktops" }, "id": "desktops-medium", - "mismatches": [], + 
"mismatches": [ + { + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Desktops", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Desktops.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Which desktop computer should I buy for a home office?" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Desktops" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4181,9 +4265,25 @@ "model_output.classification.iab_content.tier3.label": "Desktops" }, "id": "desktops-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Desktops", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Desktops.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need a desktop PC with strong performance for creative work" }, @@ -4249,8 +4349,8 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": 
"Style & Fashion" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Shopping" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -4259,9 +4359,9 @@ "id": "style-fashion-parent-easy", "mismatches": [ { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": "Shopping", + "expected": "Style & Fashion", + "path": "model_output.classification.iab_content.tier1.label" } ], "notes": "Cross-vertical easy IAB mapping case for Style & Fashion.", @@ -4315,9 +4415,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Sports", - "model_output.classification.iab_content.tier2.label": "Bodybuilding", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Style & Fashion", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -4329,12 +4429,12 @@ "id": "womens-shoes-easy", "mismatches": [ { - "actual": "Sports", - "expected": "Style & Fashion", - "path": "model_output.classification.iab_content.tier1.label" + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Bodybuilding", + "actual": null, "expected": "Women's Fashion", "path": "model_output.classification.iab_content.tier2.label" }, @@ -4351,9 +4451,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Sports", - "model_output.classification.iab_content.tier2.label": "Walking", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + 
"model_output.classification.iab_content.tier1.label": "Style & Fashion", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -4365,12 +4465,12 @@ "id": "womens-shoes-medium", "mismatches": [ { - "actual": "Sports", - "expected": "Style & Fashion", - "path": "model_output.classification.iab_content.tier1.label" + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Walking", + "actual": null, "expected": "Women's Fashion", "path": "model_output.classification.iab_content.tier2.label" }, @@ -4409,8 +4509,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": "Children's Clothing", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier2.label": "Women's Fashion", + "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4421,12 +4521,12 @@ "id": "mens-shoes-easy", "mismatches": [ { - "actual": "Children's Clothing", + "actual": "Women's Fashion", "expected": "Men's Fashion", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": null, + "actual": "Women's Shoes and Footwear", "expected": "Men's Shoes and Footwear", "path": "model_output.classification.iab_content.tier3.label" } @@ -4440,8 +4540,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": "Men's Fashion", - "model_output.classification.iab_content.tier3.label": "Men's Shoes and Footwear" + 
"model_output.classification.iab_content.tier2.label": "Women's Fashion", + "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4450,9 +4550,20 @@ "model_output.classification.iab_content.tier3.label": "Men's Shoes and Footwear" }, "id": "mens-shoes-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "Women's Fashion", + "expected": "Men's Fashion", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": "Women's Shoes and Footwear", + "expected": "Men's Shoes and Footwear", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Style & Fashion > Men's Fashion > Men's Shoes and Footwear.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Good men's dress shoes for office use" }, @@ -4460,8 +4571,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": "Children's Clothing", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier2.label": "Women's Fashion", + "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4472,12 +4583,12 @@ "id": "mens-shoes-hard", "mismatches": [ { - "actual": "Children's Clothing", + "actual": "Women's Fashion", "expected": "Men's Fashion", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": null, + "actual": "Women's Shoes and Footwear", "expected": "Men's Shoes and Footwear", "path": "model_output.classification.iab_content.tier3.label" } @@ -4509,10 +4620,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + 
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Travel", - "model_output.classification.iab_content.tier2.label": "Travel Type", - "model_output.classification.iab_content.tier3.label": "Hotels and Motels" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4521,9 +4632,25 @@ "model_output.classification.iab_content.tier3.label": "Hotels and Motels" }, "id": "hotels-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Travel Type", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Hotels and Motels", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Travel > Travel Type > Hotels and Motels.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best hotels near Times Square for a weekend trip" }, @@ -4531,7 +4658,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Travel", - "model_output.classification.iab_content.tier2.label": "Travel Type", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -4542,6 +4669,11 @@ }, "id": "hotels-hard", "mismatches": [ + { + "actual": null, + "expected": "Travel Type", + "path": "model_output.classification.iab_content.tier2.label" + }, { "actual": null, "expected": "Hotels and Motels", @@ -4628,8 +4760,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", 
"model_output.classification.iab_content.tier1.label": "Healthy Living", - "model_output.classification.iab_content.tier2.label": "Fitness and Exercise", - "model_output.classification.iab_content.tier3.label": "Running and Jogging" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4638,9 +4770,20 @@ "model_output.classification.iab_content.tier3.label": "Running and Jogging" }, "id": "running-and-jogging-easy", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Fitness and Exercise", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Running and Jogging", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Healthy Living > Fitness and Exercise > Running and Jogging.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best running plan for a first 10k" }, @@ -4648,7 +4791,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Sports", - "model_output.classification.iab_content.tier2.label": "Walking", + "model_output.classification.iab_content.tier2.label": "Bodybuilding", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -4665,7 +4808,7 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Walking", + "actual": "Bodybuilding", "expected": "Fitness and Exercise", "path": "model_output.classification.iab_content.tier2.label" }, @@ -4801,9 +4944,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Travel", - 
"model_output.classification.iab_content.tier2.label": "Travel Type" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -4818,12 +4961,7 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Travel Type", + "actual": null, "expected": "Fiction", "path": "model_output.classification.iab_content.tier2.label" } @@ -4836,8 +4974,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Books and Literature", - "model_output.classification.iab_content.tier2.label": "Fiction" + "model_output.classification.iab_content.tier1.label": "Genres", + "model_output.classification.iab_content.tier2.label": "Romance" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4845,9 +4983,20 @@ "model_output.classification.iab_content.tier2.label": "Fiction" }, "id": "fiction-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "Genres", + "expected": "Books and Literature", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "Romance", + "expected": "Fiction", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Books and Literature > Fiction.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Looking for a character-driven novel, not comics or poetry" }, @@ -4855,7 +5004,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Home & Garden", - "model_output.classification.iab_content.tier2.label": "Interior Decorating" + "model_output.classification.iab_content.tier2.label": "Remodeling & Construction" 
}, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4865,7 +5014,7 @@ "id": "home-improvement-easy", "mismatches": [ { - "actual": "Interior Decorating", + "actual": "Remodeling & Construction", "expected": "Home Improvement", "path": "model_output.classification.iab_content.tier2.label" } @@ -4878,8 +5027,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Home & Garden", - "model_output.classification.iab_content.tier2.label": "Interior Decorating" + "model_output.classification.iab_content.tier1.label": "Style & Fashion", + "model_output.classification.iab_content.tier2.label": "Personal Care" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4889,7 +5038,12 @@ "id": "home-improvement-medium", "mismatches": [ { - "actual": "Interior Decorating", + "actual": "Style & Fashion", + "expected": "Home & Garden", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "Personal Care", "expected": "Home Improvement", "path": "model_output.classification.iab_content.tier2.label" } @@ -4930,9 +5084,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Augmented Reality" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4942,12 +5096,17 @@ "id": "online-education-easy", "mismatches": [ { - "actual": "Technology & Computing", + "actual": "Careers", "expected": "Education", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Augmented Reality", + 
"actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, "expected": "Online Education", "path": "model_output.classification.iab_content.tier2.label" } @@ -5077,23 +5236,34 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Medical Health" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Food & Drink" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Medical Health" }, "id": "medical-health-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "Food & Drink", + "expected": "Medical Health", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical easy IAB mapping case for Medical Health.", - "pass": true, + "pass": false, "status": "must_fix", "text": "what do these allergy symptoms mean" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Medical Health" }, "expected": { @@ -5101,9 +5271,15 @@ "model_output.classification.iab_content.tier1.label": "Medical Health" }, "id": "medical-health-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping case for Medical Health.", - "pass": true, + "pass": false, "status": "must_fix", "text": "when should i see a doctor for persistent knee pain" }, @@ -5207,9 
+5383,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Holidays", - "model_output.classification.iab_content.tier2.label": "National & Civic Holidays" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -5224,7 +5400,12 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "National & Civic Holidays", + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, "expected": "Financial Planning", "path": "model_output.classification.iab_content.tier2.label" } @@ -5302,9 +5483,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Genres", - "model_output.classification.iab_content.tier2.label": "Family/Children" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Shopping", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -5314,12 +5495,17 @@ "id": "parenting-medium", "mismatches": [ { - "actual": "Genres", + "actual": "Shopping", "expected": "Family and Relationships", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Family/Children", + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, "expected": "Parenting", "path": "model_output.classification.iab_content.tier2.label" } @@ -5405,7 +5591,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", 
"model_output.classification.iab_content.tier1.label": "Entertainment", - "model_output.classification.iab_content.tier2.label": "Movies" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -5413,17 +5599,23 @@ "model_output.classification.iab_content.tier2.label": "Movies" }, "id": "movies-easy", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Movies", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Entertainment > Movies.", - "pass": true, + "pass": false, "status": "must_fix", "text": "What movie should we watch tonight?" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Entertainment", - "model_output.classification.iab_content.tier2.label": "Movies" + "model_output.classification.iab_content.tier1.label": "Genres", + "model_output.classification.iab_content.tier2.label": "Horror" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -5431,9 +5623,20 @@ "model_output.classification.iab_content.tier2.label": "Movies" }, "id": "movies-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "Genres", + "expected": "Entertainment", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "Horror", + "expected": "Movies", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Entertainment > Movies.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best thriller movies from the last few years" }, @@ -5472,7 +5675,7 @@ "results": [ { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", 
"model_output.classification.iab_content.tier1.label": "Automotive", "model_output.classification.iab_content.tier2.label": null }, @@ -5487,6 +5690,11 @@ "actual": null, "expected": "Auto Buying and Selling", "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "Vehicle shopping queries should map into the automotive buying branch, not business sales.", @@ -5549,8 +5757,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -5566,12 +5774,12 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Computing", + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Software and Applications", + "actual": null, "expected": "Sales", "path": "model_output.classification.iab_content.tier3.label" }, @@ -5588,9 +5796,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Robotics", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -5607,7 +5815,7 @@ "path": "model_output.classification.iab_content.tier1.label" }, 
{ - "actual": "Robotics", + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, @@ -5615,6 +5823,11 @@ "actual": null, "expected": "Sales", "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "Direct CRM vendor comparison should map cleanly into the sales domain.", @@ -5624,9 +5837,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Careers", - "model_output.classification.iab_content.tier2.label": "Job Search", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -5643,7 +5856,7 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Job Search", + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, @@ -5651,6 +5864,11 @@ "actual": null, "expected": "Marketing and Advertising", "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": "nearest_equivalent", + "expected": "exact", + "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "Marketing tool discovery should map to the marketing and advertising branch.", @@ -5661,7 +5879,7 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Science", + "model_output.classification.iab_content.tier1.label": "Real Estate", "model_output.classification.iab_content.tier2.label": null }, "expected": { @@ -5672,7 +5890,7 @@ "id": "ml-explanation-maps-to-ai", "mismatches": [ { - "actual": "Science", + "actual": "Real Estate", "expected": 
"Technology & Computing", "path": "model_output.classification.iab_content.tier1.label" }, @@ -5694,10 +5912,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Information and Network Security" + "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Personal Finance", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -5708,24 +5926,19 @@ "id": "support-credential-help-maps-to-business-it", "mismatches": [ { - "actual": "Technology & Computing", + "actual": "Personal Finance", "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Computing", + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Information and Network Security", + "actual": null, "expected": "Business I.T.", "path": "model_output.classification.iab_content.tier3.label" - }, - { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "Credential and account help should map to business IT rather than generic business.", @@ -5753,9 +5966,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "exact", - "model_output.classification.iab_content.tier1.label": "Sensitive Topics", - "model_output.classification.iab_content.tier2.label": "Crime & Harmful Acts to Individuals, Society & Human Right Violations", + "model_output.classification.iab_content.mapping_mode": 
"nearest_equivalent", + "model_output.classification.iab_content.tier1.label": "Sports", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -5767,12 +5980,12 @@ "id": "trial-signup-maps-to-software", "mismatches": [ { - "actual": "Sensitive Topics", + "actual": "Sports", "expected": "Technology & Computing", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Crime & Harmful Acts to Individuals, Society & Human Right Violations", + "actual": null, "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -5780,11 +5993,6 @@ "actual": null, "expected": "Software and Applications", "path": "model_output.classification.iab_content.tier3.label" - }, - { - "actual": "exact", - "expected": "nearest_equivalent", - "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "Software action queries should map to the software/application branch.", @@ -6261,9 +6469,9 @@ "heads": { "decision_phase": { "difficulty_benchmark": { - "accepted_accuracy": 0.9524, + "accepted_accuracy": 0.9619, "accepted_coverage": 1.0, - "accuracy": 0.9524, + "accuracy": 0.9619, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_difficulty_benchmark_confusion_matrix.csv", "count": 105, "dataset_path": "/content/agentic-intent-classifier/data/decision_phase_benchmark.jsonl", @@ -6277,12 +6485,12 @@ "macro_f1": 0.9711 }, "hard": { - "accepted_accuracy": 0.8857, + "accepted_accuracy": 0.9143, "accepted_coverage": 1.0, - "accuracy": 0.8857, + "accuracy": 0.9143, "count": 35, "fallback_rate": 0.0, - "macro_f1": 0.8908 + "macro_f1": 0.9194 }, "medium": { "accepted_accuracy": 1.0, @@ -6295,9 +6503,9 @@ }, "fallback_rate": 0.0, "head": "decision_phase", - "macro_f1": 0.9536, + "macro_f1": 0.9635, "per_class_metrics": { - "accuracy": 0.9523809523809523, + "accuracy": 0.9619047619047619, 
"action": { "f1-score": 0.9655172413793104, "precision": 1.0, @@ -6311,21 +6519,21 @@ "support": 15.0 }, "consideration": { - "f1-score": 0.9285714285714286, + "f1-score": 0.9655172413793104, "precision": 1.0, - "recall": 0.8666666666666667, + "recall": 0.9333333333333333, "support": 15.0 }, "decision": { - "f1-score": 0.9333333333333333, - "precision": 0.9333333333333333, + "f1-score": 0.9655172413793104, + "precision": 1.0, "recall": 0.9333333333333333, "support": 15.0 }, "macro avg": { - "f1-score": 0.9536131694056934, - "precision": 0.9604010025062657, - "recall": 0.9523809523809524, + "f1-score": 0.9634888438133874, + "precision": 0.9699248120300752, + "recall": 0.9619047619047619, "support": 105.0 }, "post_purchase": { @@ -6347,26 +6555,26 @@ "support": 15.0 }, "weighted avg": { - "f1-score": 0.9536131694056934, - "precision": 0.9604010025062656, - "recall": 0.9523809523809523, + "f1-score": 0.9634888438133875, + "precision": 0.9699248120300752, + "recall": 0.9619047619047619, "support": 105.0 } }, "suite": "difficulty_benchmark" }, "final_wave_cases": { - "accepted_accuracy": 0.8889, + "accepted_accuracy": 0.8519, "accepted_coverage": 1.0, - "accuracy": 0.8889, + "accuracy": 0.8519, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_final_wave_cases_confusion_matrix.csv", "count": 27, "dataset_path": "/content/agentic-intent-classifier/data/decision_phase/final_wave_cases.jsonl", "fallback_rate": 0.0, "head": "decision_phase", - "macro_f1": 0.8876, + "macro_f1": 0.8319, "per_class_metrics": { - "accuracy": 0.8888888888888888, + "accuracy": 0.8518518518518519, "action": { "f1-score": 0.0, "precision": 0.0, @@ -6374,15 +6582,15 @@ "support": 0.0 }, "awareness": { - "f1-score": 0.9090909090909091, - "precision": 0.8333333333333334, + "f1-score": 0.7692307692307693, + "precision": 0.625, "recall": 1.0, "support": 5.0 }, "consideration": { - "f1-score": 0.75, + "f1-score": 0.8888888888888888, "precision": 1.0, 
- "recall": 0.6, + "recall": 0.8, "support": 5.0 }, "decision": { @@ -6392,9 +6600,9 @@ "support": 5.0 }, "macro avg": { - "f1-score": 0.7608225108225108, - "precision": 0.7761904761904762, - "recall": 0.7642857142857142, + "f1-score": 0.7130647130647131, + "precision": 0.7321428571428571, + "recall": 0.7214285714285714, "support": 27.0 }, "post_purchase": { @@ -6404,9 +6612,9 @@ "support": 4.0 }, "research": { - "f1-score": 0.6666666666666666, - "precision": 0.6, - "recall": 0.75, + "f1-score": 0.3333333333333333, + "precision": 0.5, + "recall": 0.25, "support": 4.0 }, "support": { @@ -6416,9 +6624,9 @@ "support": 4.0 }, "weighted avg": { - "f1-score": 0.8874859708193041, - "precision": 0.9098765432098765, - "recall": 0.8888888888888888, + "f1-score": 0.8379233934789491, + "precision": 0.8564814814814815, + "recall": 0.8518518518518519, "support": 27.0 } }, @@ -6502,7 +6710,7 @@ "dataset_path": "/content/agentic-intent-classifier/data/decision_phase/test.jsonl", "fallback_rate": 0.0, "head": "decision_phase", - "macro_f1": 0.7724, + "macro_f1": 0.7637, "per_class_metrics": { "accuracy": 0.7586206896551724, "action": { @@ -6512,8 +6720,8 @@ "support": 3.0 }, "awareness": { - "f1-score": 0.6666666666666666, - "precision": 0.5, + "f1-score": 0.6, + "precision": 0.42857142857142855, "recall": 1.0, "support": 3.0 }, @@ -6524,27 +6732,27 @@ "support": 5.0 }, "decision": { - "f1-score": 0.8888888888888888, + "f1-score": 1.0, "precision": 1.0, - "recall": 0.8, + "recall": 1.0, "support": 5.0 }, "macro avg": { - "f1-score": 0.7724489795918367, - "precision": 0.8095238095238095, + "f1-score": 0.763718820861678, + "precision": 0.7945578231292517, "recall": 0.7928571428571428, "support": 29.0 }, "post_purchase": { - "f1-score": 0.8, - "precision": 0.6666666666666666, + "f1-score": 0.8888888888888888, + "precision": 0.8, "recall": 1.0, "support": 4.0 }, "research": { - "f1-score": 0.4444444444444444, - "precision": 0.5, - "recall": 0.4, + "f1-score": 0.25, + "precision": 
0.3333333333333333, + "recall": 0.2, "support": 5.0 }, "support": { @@ -6554,8 +6762,8 @@ "support": 4.0 }, "weighted avg": { - "f1-score": 0.7601806239737274, - "precision": 0.8160919540229885, + "f1-score": 0.7511767925561028, + "precision": 0.7983579638752052, "recall": 0.7586206896551724, "support": 29.0 } @@ -6563,17 +6771,17 @@ "suite": "test" }, "train": { - "accepted_accuracy": 0.9412, + "accepted_accuracy": 0.9706, "accepted_coverage": 1.0, - "accuracy": 0.9412, + "accuracy": 0.9706, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_train_confusion_matrix.csv", "count": 102, "dataset_path": "/content/agentic-intent-classifier/data/decision_phase/train.jsonl", "fallback_rate": 0.0, "head": "decision_phase", - "macro_f1": 0.9464, + "macro_f1": 0.9729, "per_class_metrics": { - "accuracy": 0.9411764705882353, + "accuracy": 0.9705882352941176, "action": { "f1-score": 1.0, "precision": 1.0, @@ -6587,21 +6795,21 @@ "support": 16.0 }, "consideration": { - "f1-score": 0.8387096774193549, - "precision": 0.9285714285714286, - "recall": 0.7647058823529411, + "f1-score": 0.9375, + "precision": 1.0, + "recall": 0.8823529411764706, "support": 17.0 }, "decision": { - "f1-score": 0.967741935483871, + "f1-score": 1.0, "precision": 1.0, - "recall": 0.9375, + "recall": 1.0, "support": 16.0 }, "macro avg": { - "f1-score": 0.9463762044407206, - "precision": 0.9496465252767774, - "recall": 0.9479341736694679, + "f1-score": 0.9729175394497975, + "precision": 0.9737394957983193, + "recall": 0.9736694677871148, "support": 102.0 }, "post_purchase": { @@ -6611,8 +6819,8 @@ "support": 14.0 }, "research": { - "f1-score": 0.8484848484848485, - "precision": 0.7777777777777778, + "f1-score": 0.9032258064516129, + "precision": 0.875, "recall": 0.9333333333333333, "support": 15.0 }, @@ -6623,35 +6831,35 @@ "support": 14.0 }, "weighted avg": { - "f1-score": 0.9410231345715216, - "precision": 0.946188279233262, - "recall": 
0.9411764705882353, + "f1-score": 0.9705984177639775, + "precision": 0.9723904267589389, + "recall": 0.9705882352941176, "support": 102.0 } }, "suite": "train" }, "val": { - "accepted_accuracy": 0.8621, + "accepted_accuracy": 0.8276, "accepted_coverage": 1.0, - "accuracy": 0.8621, + "accuracy": 0.8276, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_val_confusion_matrix.csv", "count": 29, "dataset_path": "/content/agentic-intent-classifier/data/decision_phase/val.jsonl", "fallback_rate": 0.0, "head": "decision_phase", - "macro_f1": 0.8567, + "macro_f1": 0.8254, "per_class_metrics": { - "accuracy": 0.8620689655172413, + "accuracy": 0.8275862068965517, "action": { - "f1-score": 0.8571428571428571, - "precision": 0.75, + "f1-score": 1.0, + "precision": 1.0, "recall": 1.0, "support": 3.0 }, "awareness": { - "f1-score": 0.9090909090909091, - "precision": 0.8333333333333334, + "f1-score": 0.8333333333333334, + "precision": 0.7142857142857143, "recall": 1.0, "support": 5.0 }, @@ -6668,21 +6876,21 @@ "support": 4.0 }, "macro avg": { - "f1-score": 0.8566790352504637, - "precision": 0.880952380952381, - "recall": 0.8571428571428571, + "f1-score": 0.8254483611626469, + "precision": 0.8520408163265306, + "recall": 0.8214285714285714, "support": 29.0 }, "post_purchase": { - "f1-score": 0.8571428571428571, - "precision": 1.0, + "f1-score": 0.75, + "precision": 0.75, "recall": 0.75, "support": 4.0 }, "research": { - "f1-score": 0.75, - "precision": 0.75, - "recall": 0.75, + "f1-score": 0.5714285714285714, + "precision": 0.6666666666666666, + "recall": 0.5, "support": 4.0 }, "support": { @@ -6692,9 +6900,9 @@ "support": 4.0 }, "weighted avg": { - "f1-score": 0.8602776533811015, - "precision": 0.8821839080459771, - "recall": 0.8620689655172413, + "f1-score": 0.822585460516495, + "precision": 0.8415435139573071, + "recall": 0.8275862068965517, "support": 29.0 } }, @@ -6703,92 +6911,92 @@ }, "iab_content": { 
"cross_vertical_benchmark": { - "accepted_accuracy": 0.427, - "accepted_coverage": 0.9889, - "accuracy": 0.4222, + "accepted_accuracy": 0.3108, + "accepted_coverage": 0.8222, + "accuracy": 0.2556, "count": 90, "dataset_path": "/content/agentic-intent-classifier/data/iab_cross_vertical_benchmark.jsonl", "difficulty_breakdown": { "easy": { - "accepted_accuracy": 0.4138, - "accepted_coverage": 0.9667, - "accuracy": 0.4, + "accepted_accuracy": 0.3636, + "accepted_coverage": 0.7333, + "accuracy": 0.2667, "count": 30, - "fallback_rate": 0.0333, - "macro_f1": 0.2727 + "fallback_rate": 0.2667, + "macro_f1": 0.1778 }, "hard": { - "accepted_accuracy": 0.4667, - "accepted_coverage": 1.0, - "accuracy": 0.4667, + "accepted_accuracy": 0.3077, + "accepted_coverage": 0.8667, + "accuracy": 0.2667, "count": 30, - "fallback_rate": 0.0, - "macro_f1": 0.3106 + "fallback_rate": 0.1333, + "macro_f1": 0.1562 }, "medium": { - "accepted_accuracy": 0.4, - "accepted_coverage": 1.0, - "accuracy": 0.4, + "accepted_accuracy": 0.2692, + "accepted_coverage": 0.8667, + "accuracy": 0.2333, "count": 30, - "fallback_rate": 0.0, - "macro_f1": 0.2667 + "fallback_rate": 0.1333, + "macro_f1": 0.1591 } }, - "fallback_rate": 0.0111, + "fallback_rate": 0.1778, "head": "iab_content", - "macro_f1": 0.227, + "macro_f1": 0.1228, "primary_source": "supervised_classifier", "suite": "cross_vertical_benchmark", "tier_metrics": { - "average_prediction_depth": 2.4, + "average_prediction_depth": 1.9222, "error_buckets": { - "exact_match": 38, - "parent_safe_stop": 1, - "right_tier1_wrong_tier2": 14, + "exact_match": 23, + "parent_safe_stop": 3, + "right_tier1_wrong_tier2": 23, "wrong_deep_leaf": 8, - "wrong_tier1": 29 + "wrong_tier1": 33 }, - "exact_path_accuracy": 0.4222, - "parent_safe_accuracy": 0.4444, - "tier1_accuracy": 0.6778, - "tier2_accuracy": 0.4881, - "tier3_accuracy": 0.5238, - "tier4_accuracy": 0.5 + "exact_path_accuracy": 0.2556, + "parent_safe_accuracy": 0.4222, + "tier1_accuracy": 0.6333, + 
"tier2_accuracy": 0.3571, + "tier3_accuracy": 0.2381, + "tier4_accuracy": 0.0 }, "view_metrics": { "classifier": { - "average_prediction_depth": 2.4, + "average_prediction_depth": 1.9222, "error_buckets": { - "exact_match": 37, - "parent_safe_stop": 1, - "right_tier1_wrong_tier2": 14, - "wrong_deep_leaf": 9, - "wrong_tier1": 29 - }, - "exact_path_accuracy": 0.4111, - "parent_safe_accuracy": 0.4333, - "tier1_accuracy": 0.6778, - "tier2_accuracy": 0.4881, - "tier3_accuracy": 0.4762, - "tier4_accuracy": 0.5 + "exact_match": 23, + "parent_safe_stop": 3, + "right_tier1_wrong_tier2": 23, + "wrong_deep_leaf": 8, + "wrong_tier1": 33 + }, + "exact_path_accuracy": 0.2556, + "parent_safe_accuracy": 0.4222, + "tier1_accuracy": 0.6333, + "tier2_accuracy": 0.3571, + "tier3_accuracy": 0.2381, + "tier4_accuracy": 0.0 }, "combined_path": { - "average_prediction_depth": 2.4, + "average_prediction_depth": 1.9222, "error_buckets": { - "exact_match": 37, - "parent_safe_stop": 1, - "right_tier1_wrong_tier2": 14, - "wrong_deep_leaf": 9, - "wrong_tier1": 29 - }, - "exact_path_accuracy": 0.4111, - "fallback_overuse_count": 25, - "fallback_rate": 0.2778, - "parent_safe_accuracy": 0.4333, - "tier1_accuracy": 0.6778, - "tier2_accuracy": 0.4881, - "tier3_accuracy": 0.4762, - "tier4_accuracy": 0.5 + "exact_match": 23, + "parent_safe_stop": 3, + "right_tier1_wrong_tier2": 23, + "wrong_deep_leaf": 8, + "wrong_tier1": 33 + }, + "exact_path_accuracy": 0.2556, + "fallback_overuse_count": 19, + "fallback_rate": 0.2111, + "parent_safe_accuracy": 0.4222, + "tier1_accuracy": 0.6333, + "tier2_accuracy": 0.3571, + "tier3_accuracy": 0.2381, + "tier4_accuracy": 0.0 }, "disagreements": { "classifier_vs_combined": 0 @@ -6801,92 +7009,92 @@ } }, "difficulty_benchmark": { - "accepted_accuracy": 0.4231, - "accepted_coverage": 1.0, - "accuracy": 0.4231, + "accepted_accuracy": 0.32, + "accepted_coverage": 0.8013, + "accuracy": 0.2564, "count": 156, "dataset_path": 
"/content/agentic-intent-classifier/data/iab_benchmark.jsonl", "difficulty_breakdown": { "easy": { - "accepted_accuracy": 0.4615, - "accepted_coverage": 1.0, - "accuracy": 0.4615, + "accepted_accuracy": 0.35, + "accepted_coverage": 0.7692, + "accuracy": 0.2692, "count": 52, - "fallback_rate": 0.0, - "macro_f1": 0.2359 + "fallback_rate": 0.2308, + "macro_f1": 0.153 }, "hard": { - "accepted_accuracy": 0.3654, - "accepted_coverage": 1.0, - "accuracy": 0.3654, + "accepted_accuracy": 0.275, + "accepted_coverage": 0.7692, + "accuracy": 0.2115, "count": 52, - "fallback_rate": 0.0, - "macro_f1": 0.1892 + "fallback_rate": 0.2308, + "macro_f1": 0.1108 }, "medium": { - "accepted_accuracy": 0.4423, - "accepted_coverage": 1.0, - "accuracy": 0.4423, + "accepted_accuracy": 0.3333, + "accepted_coverage": 0.8654, + "accuracy": 0.2885, "count": 52, - "fallback_rate": 0.0, - "macro_f1": 0.2338 + "fallback_rate": 0.1346, + "macro_f1": 0.1491 } }, - "fallback_rate": 0.0, + "fallback_rate": 0.1987, "head": "iab_content", - "macro_f1": 0.1524, + "macro_f1": 0.105, "primary_source": "supervised_classifier", "suite": "difficulty_benchmark", "tier_metrics": { - "average_prediction_depth": 2.4103, + "average_prediction_depth": 1.7564, "error_buckets": { - "exact_match": 66, - "parent_safe_stop": 1, - "right_tier1_wrong_tier2": 42, - "wrong_deep_leaf": 8, - "wrong_tier1": 39 + "exact_match": 40, + "parent_safe_stop": 11, + "right_tier1_wrong_tier2": 58, + "wrong_deep_leaf": 1, + "wrong_tier1": 46 }, - "exact_path_accuracy": 0.4231, - "parent_safe_accuracy": 0.5385, - "tier1_accuracy": 0.75, - "tier2_accuracy": 0.4808, - "tier3_accuracy": 0.5093, - "tier4_accuracy": 0.4583 + "exact_path_accuracy": 0.2564, + "parent_safe_accuracy": 0.6218, + "tier1_accuracy": 0.7051, + "tier2_accuracy": 0.3333, + "tier3_accuracy": 0.2315, + "tier4_accuracy": 0.0 }, "view_metrics": { "classifier": { - "average_prediction_depth": 2.4103, + "average_prediction_depth": 1.7564, "error_buckets": { - "exact_match": 
59, - "parent_safe_stop": 1, - "right_tier1_wrong_tier2": 42, - "wrong_deep_leaf": 15, - "wrong_tier1": 39 - }, - "exact_path_accuracy": 0.3782, - "parent_safe_accuracy": 0.4936, - "tier1_accuracy": 0.75, - "tier2_accuracy": 0.4808, - "tier3_accuracy": 0.4259, - "tier4_accuracy": 0.1667 + "exact_match": 40, + "parent_safe_stop": 11, + "right_tier1_wrong_tier2": 58, + "wrong_deep_leaf": 1, + "wrong_tier1": 46 + }, + "exact_path_accuracy": 0.2564, + "parent_safe_accuracy": 0.6218, + "tier1_accuracy": 0.7051, + "tier2_accuracy": 0.3333, + "tier3_accuracy": 0.2315, + "tier4_accuracy": 0.0 }, "combined_path": { - "average_prediction_depth": 2.4103, + "average_prediction_depth": 1.7564, "error_buckets": { - "exact_match": 59, - "parent_safe_stop": 1, - "right_tier1_wrong_tier2": 42, - "wrong_deep_leaf": 15, - "wrong_tier1": 39 - }, - "exact_path_accuracy": 0.3782, - "fallback_overuse_count": 15, - "fallback_rate": 0.0962, - "parent_safe_accuracy": 0.4936, - "tier1_accuracy": 0.75, - "tier2_accuracy": 0.4808, - "tier3_accuracy": 0.4259, - "tier4_accuracy": 0.1667 + "exact_match": 40, + "parent_safe_stop": 11, + "right_tier1_wrong_tier2": 58, + "wrong_deep_leaf": 1, + "wrong_tier1": 46 + }, + "exact_path_accuracy": 0.2564, + "fallback_overuse_count": 13, + "fallback_rate": 0.0833, + "parent_safe_accuracy": 0.6218, + "tier1_accuracy": 0.7051, + "tier2_accuracy": 0.3333, + "tier3_accuracy": 0.2315, + "tier4_accuracy": 0.0 }, "disagreements": { "classifier_vs_combined": 0 @@ -6899,60 +7107,60 @@ } }, "extended_cases": { - "accepted_accuracy": 0.5, - "accepted_coverage": 1.0, - "accuracy": 0.5, + "accepted_accuracy": 0.6, + "accepted_coverage": 0.625, + "accuracy": 0.375, "count": 8, "dataset_path": "/content/agentic-intent-classifier/data/iab/extended_cases.jsonl", - "fallback_rate": 0.0, + "fallback_rate": 0.375, "head": "iab_content", - "macro_f1": 0.3333, + "macro_f1": 0.2308, "primary_source": "supervised_classifier", "suite": "extended_cases", "tier_metrics": { - 
"average_prediction_depth": 2.125, + "average_prediction_depth": 1.75, "error_buckets": { - "exact_match": 4, - "right_tier1_wrong_tier2": 2, + "exact_match": 3, + "right_tier1_wrong_tier2": 1, "wrong_deep_leaf": 1, - "wrong_tier1": 1 + "wrong_tier1": 3 }, - "exact_path_accuracy": 0.5, - "parent_safe_accuracy": 0.5, - "tier1_accuracy": 0.875, + "exact_path_accuracy": 0.375, + "parent_safe_accuracy": 0.375, + "tier1_accuracy": 0.625, "tier2_accuracy": 0.5714, "tier3_accuracy": 0.0, "tier4_accuracy": 0.0 }, "view_metrics": { "classifier": { - "average_prediction_depth": 2.125, + "average_prediction_depth": 1.75, "error_buckets": { - "exact_match": 4, - "right_tier1_wrong_tier2": 2, + "exact_match": 3, + "right_tier1_wrong_tier2": 1, "wrong_deep_leaf": 1, - "wrong_tier1": 1 + "wrong_tier1": 3 }, - "exact_path_accuracy": 0.5, - "parent_safe_accuracy": 0.5, - "tier1_accuracy": 0.875, + "exact_path_accuracy": 0.375, + "parent_safe_accuracy": 0.375, + "tier1_accuracy": 0.625, "tier2_accuracy": 0.5714, "tier3_accuracy": 0.0, "tier4_accuracy": 0.0 }, "combined_path": { - "average_prediction_depth": 2.125, + "average_prediction_depth": 1.75, "error_buckets": { - "exact_match": 4, - "right_tier1_wrong_tier2": 2, + "exact_match": 3, + "right_tier1_wrong_tier2": 1, "wrong_deep_leaf": 1, - "wrong_tier1": 1 + "wrong_tier1": 3 }, - "exact_path_accuracy": 0.5, + "exact_path_accuracy": 0.375, "fallback_overuse_count": 2, "fallback_rate": 0.25, - "parent_safe_accuracy": 0.5, - "tier1_accuracy": 0.875, + "parent_safe_accuracy": 0.375, + "tier1_accuracy": 0.625, "tier2_accuracy": 0.5714, "tier3_accuracy": 0.0, "tier4_accuracy": 0.0 @@ -6968,18 +7176,18 @@ } }, "hard_cases": { - "accepted_accuracy": 0.4286, - "accepted_coverage": 0.875, + "accepted_accuracy": 0.6, + "accepted_coverage": 0.625, "accuracy": 0.375, "count": 8, "dataset_path": "/content/agentic-intent-classifier/data/iab/hard_cases.jsonl", - "fallback_rate": 0.125, + "fallback_rate": 0.375, "head": "iab_content", 
"macro_f1": 0.2308, "primary_source": "supervised_classifier", "suite": "hard_cases", "tier_metrics": { - "average_prediction_depth": 2.25, + "average_prediction_depth": 1.75, "error_buckets": { "exact_match": 3, "right_tier1_wrong_tier2": 1, @@ -6994,7 +7202,7 @@ }, "view_metrics": { "classifier": { - "average_prediction_depth": 2.25, + "average_prediction_depth": 1.75, "error_buckets": { "exact_match": 3, "right_tier1_wrong_tier2": 1, @@ -7008,7 +7216,7 @@ "tier4_accuracy": 0.0 }, "combined_path": { - "average_prediction_depth": 2.25, + "average_prediction_depth": 1.75, "error_buckets": { "exact_match": 3, "right_tier1_wrong_tier2": 1, @@ -7034,48 +7242,48 @@ } }, "test": { - "accepted_accuracy": 0.943, - "accepted_coverage": 1.0, - "accuracy": 0.943, + "accepted_accuracy": 0.9278, + "accepted_coverage": 0.996, + "accuracy": 0.9247, "count": 3282, "dataset_path": "/content/agentic-intent-classifier/data/iab/test.jsonl", - "fallback_rate": 0.0, + "fallback_rate": 0.004, "head": "iab_content", - "macro_f1": 0.911, + "macro_f1": 0.8814, "primary_source": "supervised_classifier", "suite": "test", "tier_metrics": { - "average_prediction_depth": 2.213, + "average_prediction_depth": 2.1706, "error_buckets": { - "exact_match": 3095, - "parent_safe_stop": 45, - "right_tier1_wrong_tier2": 41, - "wrong_deep_leaf": 72, - "wrong_tier1": 29 - }, - "exact_path_accuracy": 0.943, - "parent_safe_accuracy": 0.958, - "tier1_accuracy": 0.9912, - "tier2_accuracy": 0.9776, - "tier3_accuracy": 0.9078, - "tier4_accuracy": 0.7 + "exact_match": 3035, + "parent_safe_stop": 87, + "right_tier1_wrong_tier2": 56, + "wrong_deep_leaf": 69, + "wrong_tier1": 35 + }, + "exact_path_accuracy": 0.9247, + "parent_safe_accuracy": 0.961, + "tier1_accuracy": 0.9893, + "tier2_accuracy": 0.9707, + "tier3_accuracy": 0.8487, + "tier4_accuracy": 0.5714 }, "view_metrics": { "classifier": { - "average_prediction_depth": 2.213, + "average_prediction_depth": 2.1706, "error_buckets": { - "exact_match": 3052, - 
"parent_safe_stop": 44, - "right_tier1_wrong_tier2": 53, - "wrong_deep_leaf": 104, - "wrong_tier1": 29 - }, - "exact_path_accuracy": 0.9299, - "parent_safe_accuracy": 0.9445, - "tier1_accuracy": 0.9912, - "tier2_accuracy": 0.9734, - "tier3_accuracy": 0.8725, - "tier4_accuracy": 0.5 + "exact_match": 3004, + "parent_safe_stop": 84, + "right_tier1_wrong_tier2": 68, + "wrong_deep_leaf": 91, + "wrong_tier1": 35 + }, + "exact_path_accuracy": 0.9153, + "parent_safe_accuracy": 0.9506, + "tier1_accuracy": 0.9893, + "tier2_accuracy": 0.9665, + "tier3_accuracy": 0.8259, + "tier4_accuracy": 0.4429 }, "combined_path": { "count": 3282, @@ -7097,48 +7305,48 @@ } }, "train": { - "accepted_accuracy": 0.9459, - "accepted_coverage": 1.0, - "accuracy": 0.9459, + "accepted_accuracy": 0.9314, + "accepted_coverage": 0.9972, + "accuracy": 0.9295, "count": 13211, "dataset_path": "/content/agentic-intent-classifier/data/iab/train.jsonl", - "fallback_rate": 0.0, + "fallback_rate": 0.0028, "head": "iab_content", - "macro_f1": 0.9194, + "macro_f1": 0.8927, "primary_source": "supervised_classifier", "suite": "train", "tier_metrics": { - "average_prediction_depth": 2.2105, + "average_prediction_depth": 2.1683, "error_buckets": { - "exact_match": 12496, - "parent_safe_stop": 162, - "right_tier1_wrong_tier2": 144, - "wrong_deep_leaf": 284, - "wrong_tier1": 125 - }, - "exact_path_accuracy": 0.9459, - "parent_safe_accuracy": 0.9585, - "tier1_accuracy": 0.9905, - "tier2_accuracy": 0.9805, - "tier3_accuracy": 0.9135, - "tier4_accuracy": 0.7268 + "exact_match": 12280, + "parent_safe_stop": 312, + "right_tier1_wrong_tier2": 215, + "wrong_deep_leaf": 288, + "wrong_tier1": 116 + }, + "exact_path_accuracy": 0.9295, + "parent_safe_accuracy": 0.9618, + "tier1_accuracy": 0.9912, + "tier2_accuracy": 0.9737, + "tier3_accuracy": 0.8557, + "tier4_accuracy": 0.6107 }, "view_metrics": { "classifier": { - "average_prediction_depth": 2.2105, + "average_prediction_depth": 2.1683, "error_buckets": { - "exact_match": 
12323, - "parent_safe_stop": 157, - "right_tier1_wrong_tier2": 192, - "wrong_deep_leaf": 414, - "wrong_tier1": 125 - }, - "exact_path_accuracy": 0.9328, - "parent_safe_accuracy": 0.945, - "tier1_accuracy": 0.9905, - "tier2_accuracy": 0.9764, - "tier3_accuracy": 0.8777, - "tier4_accuracy": 0.525 + "exact_match": 12145, + "parent_safe_stop": 300, + "right_tier1_wrong_tier2": 263, + "wrong_deep_leaf": 387, + "wrong_tier1": 116 + }, + "exact_path_accuracy": 0.9193, + "parent_safe_accuracy": 0.9507, + "tier1_accuracy": 0.9912, + "tier2_accuracy": 0.9695, + "tier3_accuracy": 0.8301, + "tier4_accuracy": 0.475 }, "combined_path": { "count": 13211, @@ -7160,48 +7368,48 @@ } }, "val": { - "accepted_accuracy": 0.9442, - "accepted_coverage": 1.0, - "accuracy": 0.9442, + "accepted_accuracy": 0.9273, + "accepted_coverage": 0.9973, + "accuracy": 0.9254, "count": 3282, "dataset_path": "/content/agentic-intent-classifier/data/iab/val.jsonl", - "fallback_rate": 0.0, + "fallback_rate": 0.0027, "head": "iab_content", - "macro_f1": 0.9166, + "macro_f1": 0.8864, "primary_source": "supervised_classifier", "suite": "val", "tier_metrics": { - "average_prediction_depth": 2.2151, + "average_prediction_depth": 2.1709, "error_buckets": { - "exact_match": 3099, - "parent_safe_stop": 35, - "right_tier1_wrong_tier2": 45, - "wrong_deep_leaf": 72, - "wrong_tier1": 31 - }, - "exact_path_accuracy": 0.9442, - "parent_safe_accuracy": 0.9576, - "tier1_accuracy": 0.9906, - "tier2_accuracy": 0.9769, - "tier3_accuracy": 0.9088, - "tier4_accuracy": 0.7286 + "exact_match": 3037, + "parent_safe_stop": 80, + "right_tier1_wrong_tier2": 55, + "wrong_deep_leaf": 74, + "wrong_tier1": 36 + }, + "exact_path_accuracy": 0.9254, + "parent_safe_accuracy": 0.9613, + "tier1_accuracy": 0.989, + "tier2_accuracy": 0.9713, + "tier3_accuracy": 0.8549, + "tier4_accuracy": 0.6071 }, "view_metrics": { "classifier": { - "average_prediction_depth": 2.2151, + "average_prediction_depth": 2.1709, "error_buckets": { - "exact_match": 
3056, - "parent_safe_stop": 34, - "right_tier1_wrong_tier2": 57, - "wrong_deep_leaf": 104, - "wrong_tier1": 31 - }, - "exact_path_accuracy": 0.9311, - "parent_safe_accuracy": 0.9442, - "tier1_accuracy": 0.9906, - "tier2_accuracy": 0.9727, - "tier3_accuracy": 0.8736, - "tier4_accuracy": 0.5286 + "exact_match": 3002, + "parent_safe_stop": 78, + "right_tier1_wrong_tier2": 67, + "wrong_deep_leaf": 99, + "wrong_tier1": 36 + }, + "exact_path_accuracy": 0.9147, + "parent_safe_accuracy": 0.95, + "tier1_accuracy": 0.989, + "tier2_accuracy": 0.9672, + "tier3_accuracy": 0.829, + "tier4_accuracy": 0.4643 }, "combined_path": { "count": 3282, @@ -7225,9 +7433,9 @@ }, "intent_subtype": { "difficulty_benchmark": { - "accepted_accuracy": 0.9104, - "accepted_coverage": 0.9675, - "accuracy": 0.8917, + "accepted_accuracy": 0.8901, + "accepted_coverage": 0.9856, + "accuracy": 0.8845, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_difficulty_benchmark_confusion_matrix.csv", "count": 277, "dataset_path": "/content/agentic-intent-classifier/data/subtype_benchmark.jsonl", @@ -7238,52 +7446,52 @@ "accuracy": 0.913, "count": 92, "fallback_rate": 0.0, - "macro_f1": 0.9109 + "macro_f1": 0.9124 }, "hard": { - "accepted_accuracy": 0.8554, - "accepted_coverage": 0.9121, - "accuracy": 0.8132, + "accepted_accuracy": 0.8295, + "accepted_coverage": 0.967, + "accuracy": 0.8242, "count": 91, - "fallback_rate": 0.0879, - "macro_f1": 0.8025 + "fallback_rate": 0.033, + "macro_f1": 0.8183 }, "medium": { - "accepted_accuracy": 0.957, + "accepted_accuracy": 0.9247, "accepted_coverage": 0.9894, - "accuracy": 0.9468, + "accuracy": 0.9149, "count": 94, "fallback_rate": 0.0106, - "macro_f1": 0.9469 + "macro_f1": 0.9117 } }, - "fallback_rate": 0.0325, + "fallback_rate": 0.0144, "head": "intent_subtype", - "macro_f1": 0.8886, + "macro_f1": 0.8824, "per_class_metrics": { "account_help": { - "f1-score": 0.64, - "precision": 0.8, - "recall": 0.5333333333333333, 
+ "f1-score": 0.7142857142857143, + "precision": 0.7692307692307693, + "recall": 0.6666666666666666, "support": 15.0 }, - "accuracy": 0.8916967509025271, + "accuracy": 0.8844765342960289, "billing_help": { - "f1-score": 0.8387096774193549, - "precision": 0.8125, + "f1-score": 0.8666666666666667, + "precision": 0.8666666666666667, "recall": 0.8666666666666667, "support": 15.0 }, "booking": { - "f1-score": 0.9285714285714286, + "f1-score": 0.75, "precision": 1.0, - "recall": 0.8666666666666667, + "recall": 0.6, "support": 15.0 }, "comparison": { - "f1-score": 0.8148148148148148, - "precision": 0.9166666666666666, - "recall": 0.7333333333333333, + "f1-score": 0.8888888888888888, + "precision": 1.0, + "recall": 0.8, "support": 15.0 }, "contact_sales": { @@ -7293,15 +7501,15 @@ "support": 15.0 }, "deal_seeking": { - "f1-score": 0.896551724137931, - "precision": 0.9285714285714286, - "recall": 0.8666666666666667, + "f1-score": 0.8888888888888888, + "precision": 1.0, + "recall": 0.8, "support": 15.0 }, "download": { - "f1-score": 0.9285714285714286, + "f1-score": 0.9655172413793104, "precision": 1.0, - "recall": 0.8666666666666667, + "recall": 0.9333333333333333, "support": 15.0 }, "education": { @@ -7311,15 +7519,15 @@ "support": 15.0 }, "emotional_reflection": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.9375, + "precision": 0.8823529411764706, "recall": 1.0, "support": 15.0 }, "evaluation": { - "f1-score": 0.896551724137931, - "precision": 0.9285714285714286, - "recall": 0.8666666666666667, + "f1-score": 0.9333333333333333, + "precision": 0.9333333333333333, + "recall": 0.9333333333333333, "support": 15.0 }, "follow_up": { @@ -7329,21 +7537,21 @@ "support": 15.0 }, "macro avg": { - "f1-score": 0.8886471209737711, - "precision": 0.8965122159975102, - "recall": 0.8895561002178651, + "f1-score": 0.8824228919733669, + "precision": 0.8968567719420234, + "recall": 0.8825617283950618, "support": 277.0 }, "onboarding_setup": { - "f1-score": 0.8648648648648649, - 
"precision": 0.8, - "recall": 0.9411764705882353, + "f1-score": 0.918918918918919, + "precision": 0.85, + "recall": 1.0, "support": 17.0 }, "product_discovery": { - "f1-score": 0.9032258064516129, - "precision": 0.875, - "recall": 0.9333333333333333, + "f1-score": 0.9375, + "precision": 0.8823529411764706, + "recall": 1.0, "support": 15.0 }, "provider_selection": { @@ -7353,56 +7561,56 @@ "support": 16.0 }, "purchase": { - "f1-score": 0.896551724137931, - "precision": 0.9285714285714286, - "recall": 0.8666666666666667, + "f1-score": 0.8888888888888888, + "precision": 1.0, + "recall": 0.8, "support": 15.0 }, "signup": { - "f1-score": 0.8823529411764706, - "precision": 0.8333333333333334, - "recall": 0.9375, + "f1-score": 0.8235294117647058, + "precision": 0.7777777777777778, + "recall": 0.875, "support": 16.0 }, "task_execution": { - "f1-score": 0.9230769230769231, - "precision": 0.8571428571428571, - "recall": 1.0, + "f1-score": 0.8292682926829268, + "precision": 0.7391304347826086, + "recall": 0.9444444444444444, "support": 18.0 }, "troubleshooting": { - "f1-score": 0.8, - "precision": 0.8, - "recall": 0.8, + "f1-score": 0.7586206896551724, + "precision": 0.7857142857142857, + "recall": 0.7333333333333333, "support": 15.0 }, "weighted avg": { - "f1-score": 0.8891181699377334, - "precision": 0.8953221541324111, - "recall": 0.8916967509025271, + "f1-score": 0.8822131766431675, + "precision": 0.8945403392673653, + "recall": 0.8844765342960289, "support": 277.0 } }, "suite": "difficulty_benchmark" }, "extended_cases": { - "accepted_accuracy": 0.8302, + "accepted_accuracy": 0.8113, "accepted_coverage": 1.0, - "accuracy": 0.8302, + "accuracy": 0.8113, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_extended_cases_confusion_matrix.csv", "count": 53, "dataset_path": "/content/agentic-intent-classifier/data/subtype/extended_cases.jsonl", "fallback_rate": 0.0, "head": "intent_subtype", - "macro_f1": 0.7668, + 
"macro_f1": 0.7517, "per_class_metrics": { "account_help": { - "f1-score": 0.8, - "precision": 1.0, + "f1-score": 0.6666666666666666, + "precision": 0.6666666666666666, "recall": 0.6666666666666666, "support": 3.0 }, - "accuracy": 0.8301886792452831, + "accuracy": 0.8113207547169812, "billing_help": { "f1-score": 0.0, "precision": 0.0, @@ -7416,9 +7624,9 @@ "support": 0.0 }, "comparison": { - "f1-score": 1.0, + "f1-score": 0.6666666666666666, "precision": 1.0, - "recall": 1.0, + "recall": 0.5, "support": 2.0 }, "contact_sales": { @@ -7428,9 +7636,9 @@ "support": 0.0 }, "deal_seeking": { - "f1-score": 0.8571428571428571, - "precision": 0.75, - "recall": 1.0, + "f1-score": 0.7619047619047619, + "precision": 0.6666666666666666, + "recall": 0.8888888888888888, "support": 9.0 }, "download": { @@ -7458,15 +7666,15 @@ "support": 3.0 }, "follow_up": { - "f1-score": 0.7368421052631579, + "f1-score": 0.8, "precision": 1.0, - "recall": 0.5833333333333334, + "recall": 0.6666666666666666, "support": 12.0 }, "macro avg": { - "f1-score": 0.46858256266151, - "precision": 0.4565696649029982, - "recall": 0.513888888888889, + "f1-score": 0.45939292189292186, + "precision": 0.4611992945326278, + "recall": 0.48456790123456783, "support": 53.0 }, "onboarding_setup": { @@ -7500,8 +7708,8 @@ "support": 0.0 }, "task_execution": { - "f1-score": 0.6666666666666666, - "precision": 0.5, + "f1-score": 1.0, + "precision": 1.0, "recall": 1.0, "support": 1.0 }, @@ -7512,9 +7720,9 @@ "support": 1.0 }, "weighted avg": { - "f1-score": 0.8018611395225099, - "precision": 0.8208295896975142, - "recall": 0.8301886792452831, + "f1-score": 0.7861520554916781, + "precision": 0.7972446840371369, + "recall": 0.8113207547169812, "support": 53.0 } }, @@ -7529,7 +7737,7 @@ "dataset_path": "/content/agentic-intent-classifier/data/subtype/hard_cases.jsonl", "fallback_rate": 0.0, "head": "intent_subtype", - "macro_f1": 0.8447, + "macro_f1": 0.8426, "per_class_metrics": { "account_help": { "f1-score": 0.8, @@ 
-7575,8 +7783,8 @@ "support": 0.0 }, "education": { - "f1-score": 0.9508196721311475, - "precision": 0.90625, + "f1-score": 0.9666666666666667, + "precision": 0.9354838709677419, "recall": 1.0, "support": 29.0 }, @@ -7599,8 +7807,8 @@ "support": 12.0 }, "macro avg": { - "f1-score": 0.7038911013311109, - "precision": 0.7234953703703704, + "f1-score": 0.7021723995980289, + "precision": 0.7210790702726187, "recall": 0.7212962962962962, "support": 94.0 }, @@ -7611,8 +7819,8 @@ "support": 6.0 }, "product_discovery": { - "f1-score": 0.8888888888888888, - "precision": 0.8, + "f1-score": 0.8421052631578947, + "precision": 0.7272727272727273, "recall": 1.0, "support": 8.0 }, @@ -7647,8 +7855,8 @@ "support": 3.0 }, "weighted avg": { - "f1-score": 0.8798004011763282, - "precision": 0.8911125886524823, + "f1-score": 0.8807077824069889, + "precision": 0.8939419937189327, "recall": 0.8936170212765957, "support": 94.0 } @@ -7656,15 +7864,15 @@ "suite": "hard_cases" }, "test": { - "accepted_accuracy": 0.9, + "accepted_accuracy": 0.8714, "accepted_coverage": 1.0, - "accuracy": 0.9, + "accuracy": 0.8714, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_test_confusion_matrix.csv", "count": 70, "dataset_path": "/content/agentic-intent-classifier/data/subtype/test.jsonl", "fallback_rate": 0.0, "head": "intent_subtype", - "macro_f1": 0.8531, + "macro_f1": 0.8317, "per_class_metrics": { "account_help": { "f1-score": 1.0, @@ -7672,7 +7880,7 @@ "recall": 1.0, "support": 2.0 }, - "accuracy": 0.9, + "accuracy": 0.8714285714285714, "billing_help": { "f1-score": 0.0, "precision": 0.0, @@ -7698,9 +7906,9 @@ "support": 0.0 }, "deal_seeking": { - "f1-score": 0.6666666666666666, - "precision": 0.5, - "recall": 1.0, + "f1-score": 0.3333333333333333, + "precision": 0.25, + "recall": 0.5, "support": 2.0 }, "download": { @@ -7722,9 +7930,9 @@ "support": 5.0 }, "evaluation": { - "f1-score": 0.0, - "precision": 0.0, - "recall": 0.0, + "f1-score": 
0.5, + "precision": 0.5, + "recall": 0.5, "support": 2.0 }, "follow_up": { @@ -7734,9 +7942,9 @@ "support": 11.0 }, "macro avg": { - "f1-score": 0.6635221022395855, - "precision": 0.6578042328042328, - "recall": 0.6885521885521885, + "f1-score": 0.646896135613619, + "precision": 0.6657407407407407, + "recall": 0.6538299663299663, "support": 70.0 }, "onboarding_setup": { @@ -7746,9 +7954,9 @@ "support": 4.0 }, "product_discovery": { - "f1-score": 1.0, + "f1-score": 0.9333333333333333, "precision": 1.0, - "recall": 1.0, + "recall": 0.875, "support": 8.0 }, "provider_selection": { @@ -7770,44 +7978,44 @@ "support": 2.0 }, "task_execution": { - "f1-score": 0.9230769230769231, - "precision": 0.8571428571428571, + "f1-score": 0.8571428571428571, + "precision": 0.75, "recall": 1.0, "support": 6.0 }, "troubleshooting": { - "f1-score": 1.0, + "f1-score": 0.6666666666666666, "precision": 1.0, - "recall": 1.0, + "recall": 0.5, "support": 2.0 }, "weighted avg": { - "f1-score": 0.8939882610403741, - "precision": 0.9094217687074829, - "recall": 0.9, + "f1-score": 0.8759558172936446, + "precision": 0.9073809523809524, + "recall": 0.8714285714285714, "support": 70.0 } }, "suite": "test" }, "train": { - "accepted_accuracy": 0.8978, - "accepted_coverage": 1.0, - "accuracy": 0.8978, + "accepted_accuracy": 0.9068, + "accepted_coverage": 0.9936, + "accuracy": 0.9042, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_train_confusion_matrix.csv", "count": 313, "dataset_path": "/content/agentic-intent-classifier/data/subtype/train.jsonl", - "fallback_rate": 0.0, + "fallback_rate": 0.0064, "head": "intent_subtype", - "macro_f1": 0.877, + "macro_f1": 0.8787, "per_class_metrics": { "account_help": { - "f1-score": 0.7142857142857143, - "precision": 0.7142857142857143, - "recall": 0.7142857142857143, + "f1-score": 0.75, + "precision": 0.6666666666666666, + "recall": 0.8571428571428571, "support": 7.0 }, - "accuracy": 0.8977635782747604, + 
"accuracy": 0.9041533546325878, "billing_help": { "f1-score": 1.0, "precision": 1.0, @@ -7821,32 +8029,32 @@ "support": 5.0 }, "comparison": { - "f1-score": 0.967741935483871, - "precision": 0.9375, - "recall": 1.0, + "f1-score": 0.9655172413793104, + "precision": 1.0, + "recall": 0.9333333333333333, "support": 15.0 }, "contact_sales": { - "f1-score": 0.8, + "f1-score": 0.7142857142857143, "precision": 1.0, - "recall": 0.6666666666666666, + "recall": 0.5555555555555556, "support": 9.0 }, "deal_seeking": { - "f1-score": 0.9090909090909091, - "precision": 0.9090909090909091, + "f1-score": 0.9523809523809523, + "precision": 1.0, "recall": 0.9090909090909091, "support": 11.0 }, "download": { - "f1-score": 0.9411764705882353, - "precision": 0.8888888888888888, + "f1-score": 1.0, + "precision": 1.0, "recall": 1.0, "support": 8.0 }, "education": { - "f1-score": 0.9629629629629629, - "precision": 0.9285714285714286, + "f1-score": 0.9719626168224299, + "precision": 0.9454545454545454, "recall": 1.0, "support": 52.0 }, @@ -7857,21 +8065,21 @@ "support": 20.0 }, "evaluation": { - "f1-score": 0.6923076923076923, - "precision": 1.0, + "f1-score": 0.6666666666666666, + "precision": 0.9, "recall": 0.5294117647058824, "support": 17.0 }, "follow_up": { - "f1-score": 0.8571428571428571, - "precision": 0.8823529411764706, - "recall": 0.8333333333333334, + "f1-score": 0.8888888888888888, + "precision": 0.8888888888888888, + "recall": 0.8888888888888888, "support": 36.0 }, "macro avg": { - "f1-score": 0.8770498618135788, - "precision": 0.8988923431325393, - "recall": 0.876671278202288, + "f1-score": 0.8786951095392168, + "precision": 0.9007433723013433, + "recall": 0.8782602497120292, "support": 313.0 }, "onboarding_setup": { @@ -7881,9 +8089,9 @@ "support": 17.0 }, "product_discovery": { - "f1-score": 0.90625, - "precision": 0.8787878787878788, - "recall": 0.9354838709677419, + "f1-score": 0.9090909090909091, + "precision": 0.8571428571428571, + "recall": 0.967741935483871, "support": 
31.0 }, "provider_selection": { @@ -7893,48 +8101,48 @@ "support": 25.0 }, "purchase": { - "f1-score": 0.8, - "precision": 1.0, + "f1-score": 0.7272727272727273, + "precision": 0.8, "recall": 0.6666666666666666, "support": 6.0 }, "signup": { - "f1-score": 0.8648648648648649, - "precision": 0.7619047619047619, + "f1-score": 0.8888888888888888, + "precision": 0.8, "recall": 1.0, "support": 16.0 }, "task_execution": { - "f1-score": 0.8292682926829268, - "precision": 0.7727272727272727, - "recall": 0.8947368421052632, + "f1-score": 0.8571428571428571, + "precision": 0.782608695652174, + "recall": 0.9473684210526315, "support": 19.0 }, "troubleshooting": { - "f1-score": 0.8, - "precision": 0.8333333333333334, - "recall": 0.7692307692307693, + "f1-score": 0.782608695652174, + "precision": 0.9, + "recall": 0.6923076923076923, "support": 13.0 }, "weighted avg": { - "f1-score": 0.894423568060199, - "precision": 0.9063956713482179, - "recall": 0.8977635782747604, + "f1-score": 0.9005147505975646, + "precision": 0.9118244687664052, + "recall": 0.9041533546325878, "support": 313.0 } }, "suite": "train" }, "val": { - "accepted_accuracy": 0.8608, - "accepted_coverage": 0.9875, - "accuracy": 0.85, + "accepted_accuracy": 0.8625, + "accepted_coverage": 1.0, + "accuracy": 0.8625, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_val_confusion_matrix.csv", "count": 80, "dataset_path": "/content/agentic-intent-classifier/data/subtype/val.jsonl", - "fallback_rate": 0.0125, + "fallback_rate": 0.0, "head": "intent_subtype", - "macro_f1": 0.6722, + "macro_f1": 0.6561, "per_class_metrics": { "account_help": { "f1-score": 0.5, @@ -7942,7 +8150,7 @@ "recall": 0.5, "support": 2.0 }, - "accuracy": 0.85, + "accuracy": 0.8625, "billing_help": { "f1-score": 0.0, "precision": 0.0, @@ -7956,9 +8164,9 @@ "support": 3.0 }, "comparison": { - "f1-score": 0.5, - "precision": 0.5, - "recall": 0.5, + "f1-score": 0.4, + "precision": 1.0, + "recall": 
0.25, "support": 4.0 }, "contact_sales": { @@ -7968,8 +8176,8 @@ "support": 0.0 }, "deal_seeking": { - "f1-score": 0.8, - "precision": 0.6666666666666666, + "f1-score": 0.6666666666666666, + "precision": 0.5, "recall": 1.0, "support": 2.0 }, @@ -8004,21 +8212,21 @@ "support": 11.0 }, "macro avg": { - "f1-score": 0.5974890931031281, - "precision": 0.5811447811447812, - "recall": 0.6353535353535353, + "f1-score": 0.5832054560954817, + "precision": 0.5891975308641975, + "recall": 0.6315656565656567, "support": 80.0 }, "onboarding_setup": { - "f1-score": 0.8888888888888888, - "precision": 1.0, + "f1-score": 0.8, + "precision": 0.8, "recall": 0.8, "support": 5.0 }, "product_discovery": { - "f1-score": 0.8571428571428571, - "precision": 0.9, - "recall": 0.8181818181818182, + "f1-score": 0.9565217391304348, + "precision": 0.9166666666666666, + "recall": 1.0, "support": 11.0 }, "provider_selection": { @@ -8034,14 +8242,14 @@ "support": 2.0 }, "signup": { - "f1-score": 0.8, - "precision": 0.6666666666666666, + "f1-score": 0.6666666666666666, + "precision": 0.5, "recall": 1.0, "support": 2.0 }, "task_execution": { - "f1-score": 0.8421052631578947, - "precision": 0.7272727272727273, + "f1-score": 0.9411764705882353, + "precision": 0.8888888888888888, "recall": 1.0, "support": 8.0 }, @@ -8052,9 +8260,9 @@ "support": 1.0 }, "weighted avg": { - "f1-score": 0.8380398913951546, - "precision": 0.8423106060606059, - "recall": 0.85, + "f1-score": 0.8443893861892583, + "precision": 0.8649305555555555, + "recall": 0.8625, "support": 80.0 } }, @@ -8177,12 +8385,12 @@ }, "hard_cases": { "accepted_accuracy": 1.0, - "accepted_coverage": 0.9836, + "accepted_coverage": 1.0, "accuracy": 1.0, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_hard_cases_confusion_matrix.csv", "count": 61, "dataset_path": "/content/agentic-intent-classifier/data/hard_cases.jsonl", - "fallback_rate": 0.0164, + "fallback_rate": 0.0, "head": "intent_type", 
"macro_f1": 1.0, "per_class_metrics": { @@ -8263,17 +8471,17 @@ "suite": "hard_cases" }, "test": { - "accepted_accuracy": 0.9362, - "accepted_coverage": 1.0, - "accuracy": 0.9362, + "accepted_accuracy": 0.8889, + "accepted_coverage": 0.9574, + "accuracy": 0.8723, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_test_confusion_matrix.csv", "count": 47, "dataset_path": "/content/agentic-intent-classifier/data/test.jsonl", - "fallback_rate": 0.0, + "fallback_rate": 0.0426, "head": "intent_type", - "macro_f1": 0.9235, + "macro_f1": 0.8006, "per_class_metrics": { - "accuracy": 0.9361702127659575, + "accuracy": 0.8723404255319149, "ambiguous": { "f1-score": 0.875, "precision": 1.0, @@ -8287,15 +8495,15 @@ "support": 1.0 }, "commercial": { - "f1-score": 0.9523809523809523, - "precision": 0.9090909090909091, - "recall": 1.0, + "f1-score": 0.9, + "precision": 0.9, + "recall": 0.9, "support": 10.0 }, "creative_generation": { - "f1-score": 1.0, - "precision": 1.0, - "recall": 1.0, + "f1-score": 0.0, + "precision": 0.0, + "recall": 0.0, "support": 1.0 }, "exploratory": { @@ -8305,15 +8513,15 @@ "support": 1.0 }, "informational": { - "f1-score": 0.9411764705882353, - "precision": 0.8888888888888888, + "f1-score": 0.8888888888888888, + "precision": 0.8, "recall": 1.0, "support": 8.0 }, "macro avg": { - "f1-score": 0.9235224089635853, - "precision": 0.9297979797979797, - "recall": 0.9444444444444444, + "f1-score": 0.8005555555555555, + "precision": 0.8074999999999999, + "recall": 0.8219444444444445, "support": 47.0 }, "personal_reflection": { @@ -8335,32 +8543,32 @@ "support": 3.0 }, "transactional": { - "f1-score": 1.0, - "precision": 1.0, - "recall": 1.0, + "f1-score": 0.875, + "precision": 0.875, + "recall": 0.875, "support": 8.0 }, "weighted avg": { - "f1-score": 0.9360614458549377, - "precision": 0.9511068128089405, - "recall": 0.9361702127659575, + "f1-score": 0.8734633569739952, + "precision": 0.8914893617021277, + 
"recall": 0.8723404255319149, "support": 47.0 } }, "suite": "test" }, "third_wave_cases": { - "accepted_accuracy": 0.8846, + "accepted_accuracy": 0.8462, "accepted_coverage": 1.0, - "accuracy": 0.8846, + "accuracy": 0.8462, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_third_wave_cases_confusion_matrix.csv", "count": 26, "dataset_path": "/content/agentic-intent-classifier/data/third_wave_cases.jsonl", "fallback_rate": 0.0, "head": "intent_type", - "macro_f1": 0.8209, + "macro_f1": 0.8148, "per_class_metrics": { - "accuracy": 0.8846153846153846, + "accuracy": 0.8461538461538461, "ambiguous": { "f1-score": 0.8235294117647058, "precision": 1.0, @@ -8374,9 +8582,9 @@ "support": 1.0 }, "commercial": { - "f1-score": 0.9230769230769231, - "precision": 0.8571428571428571, - "recall": 1.0, + "f1-score": 0.88, + "precision": 0.8461538461538461, + "recall": 0.9166666666666666, "support": 12.0 }, "creative_generation": { @@ -8398,9 +8606,9 @@ "support": 0.0 }, "macro avg": { - "f1-score": 0.5746606334841629, - "precision": 0.5857142857142857, - "recall": 0.5700000000000001, + "f1-score": 0.5703529411764705, + "precision": 0.5846153846153846, + "recall": 0.5616666666666666, "support": 26.0 }, "personal_reflection": { @@ -8428,30 +8636,30 @@ "support": 0.0 }, "weighted avg": { - "f1-score": 0.8966237382526975, - "precision": 0.9340659340659341, - "recall": 0.8846153846153846, + "f1-score": 0.8767420814479638, + "precision": 0.9289940828402367, + "recall": 0.8461538461538461, "support": 26.0 } }, "suite": "third_wave_cases" }, "train": { - "accepted_accuracy": 1.0, - "accepted_coverage": 0.9945, - "accuracy": 1.0, + "accepted_accuracy": 0.9945, + "accepted_coverage": 1.0, + "accuracy": 0.9945, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_train_confusion_matrix.csv", "count": 183, "dataset_path": "/content/agentic-intent-classifier/data/train.jsonl", - "fallback_rate": 
0.0055, + "fallback_rate": 0.0, "head": "intent_type", - "macro_f1": 1.0, + "macro_f1": 0.9936, "per_class_metrics": { - "accuracy": 1.0, + "accuracy": 0.994535519125683, "ambiguous": { - "f1-score": 1.0, + "f1-score": 0.9836065573770492, "precision": 1.0, - "recall": 1.0, + "recall": 0.967741935483871, "support": 31.0 }, "chit_chat": { @@ -8485,9 +8693,9 @@ "support": 38.0 }, "macro avg": { - "f1-score": 1.0, - "precision": 1.0, - "recall": 1.0, + "f1-score": 0.9935987509758002, + "precision": 0.990909090909091, + "recall": 0.9967741935483871, "support": 183.0 }, "personal_reflection": { @@ -8503,8 +8711,8 @@ "support": 5.0 }, "support": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.9523809523809523, + "precision": 0.9090909090909091, "recall": 1.0, "support": 10.0 }, @@ -8515,26 +8723,26 @@ "support": 28.0 }, "weighted avg": { - "f1-score": 1.0, - "precision": 1.0, - "recall": 1.0, + "f1-score": 0.9946208349863281, + "precision": 0.9950322901142573, + "recall": 0.994535519125683, "support": 183.0 } }, "suite": "train" }, "val": { - "accepted_accuracy": 0.9362, - "accepted_coverage": 1.0, - "accuracy": 0.9362, + "accepted_accuracy": 0.8913, + "accepted_coverage": 0.9787, + "accuracy": 0.8723, "confusion_matrix_path": "/content/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_val_confusion_matrix.csv", "count": 47, "dataset_path": "/content/agentic-intent-classifier/data/val.jsonl", - "fallback_rate": 0.0, + "fallback_rate": 0.0213, "head": "intent_type", - "macro_f1": 0.9108, + "macro_f1": 0.8144, "per_class_metrics": { - "accuracy": 0.9361702127659575, + "accuracy": 0.8723404255319149, "ambiguous": { "f1-score": 0.9411764705882353, "precision": 1.0, @@ -8554,8 +8762,8 @@ "support": 10.0 }, "creative_generation": { - "f1-score": 0.6666666666666666, - "precision": 0.5, + "f1-score": 0.4, + "precision": 0.25, "recall": 1.0, "support": 1.0 }, @@ -8572,9 +8780,9 @@ "support": 8.0 }, "macro avg": { - "f1-score": 0.9107843137254902, - 
"precision": 0.89, - "recall": 0.966388888888889, + "f1-score": 0.8143740573152337, + "precision": 0.8150000000000001, + "recall": 0.9080555555555556, "support": 47.0 }, "personal_reflection": { @@ -8584,27 +8792,27 @@ "support": 5.0 }, "prohibited": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.6666666666666666, + "precision": 0.5, "recall": 1.0, "support": 1.0 }, "support": { - "f1-score": 1.0, + "f1-score": 0.8, "precision": 1.0, - "recall": 1.0, + "recall": 0.6666666666666666, "support": 3.0 }, "transactional": { - "f1-score": 0.9333333333333333, + "f1-score": 0.7692307692307693, "precision": 1.0, - "recall": 0.875, + "recall": 0.625, "support": 8.0 }, "weighted avg": { - "f1-score": 0.9419274092615769, - "precision": 0.9574468085106383, - "recall": 0.9361702127659575, + "f1-score": 0.8884631430313531, + "precision": 0.9414893617021277, + "recall": 0.8723404255319149, "support": 47.0 } }, diff --git a/iab_classifier_model_output/train_metrics.json b/iab_classifier_model_output/train_metrics.json index cc536ed82a216975b70bc5163e96c341f2b3eb0f..e48d14dce1881226e11549a53e5473f9bb437521 100644 --- a/iab_classifier_model_output/train_metrics.json +++ b/iab_classifier_model_output/train_metrics.json @@ -4,22 +4,22 @@ "test_count": 3282, "test_metrics": { "epoch": 3.0, - "test_accuracy": 0.9439366240097502, - "test_loss": 1.8114978075027466, - "test_macro_f1": 0.9150587607646185, - "test_runtime": 9.356, - "test_samples_per_second": 350.793, - "test_steps_per_second": 22.018 + "test_accuracy": 0.9320536258379037, + "test_loss": 2.1225109100341797, + "test_macro_f1": 0.8928275998599801, + "test_runtime": 11.2269, + "test_samples_per_second": 292.333, + "test_steps_per_second": 18.349 }, "train_count": 13211, "val_count": 3282, "val_metrics": { "epoch": 3.0, - "val_accuracy": 0.9485070079219988, - "val_loss": 1.8056248426437378, - "val_macro_f1": 0.9206922853424929, - "val_runtime": 9.3831, - "val_samples_per_second": 349.776, - "val_steps_per_second": 21.954 
+ "val_accuracy": 0.9320536258379037, + "val_loss": 2.119333505630493, + "val_macro_f1": 0.8965787473982275, + "val_runtime": 11.1915, + "val_samples_per_second": 293.257, + "val_steps_per_second": 18.407 } } diff --git a/multitask_intent_model_output/multitask_intent.onnx b/multitask_intent_model_output/multitask_intent.onnx index 3677f5e5ac4a347bed2444d824241494775befb5..28b80ff95ef7944f5a03e24159faab14908f4bf7 100644 --- a/multitask_intent_model_output/multitask_intent.onnx +++ b/multitask_intent_model_output/multitask_intent.onnx @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:62cd7051bfe36d521d9059be8df5f782e46c5c821d9674f424c0530633da23dd +oid sha256:d536f910f1068119861d06d698a8c4ba100173abc4b192f42beeedf14c274707 size 61398 diff --git a/multitask_intent_model_output/multitask_intent.onnx.data b/multitask_intent_model_output/multitask_intent.onnx.data index b17ed51470ca36cce47139b7f7c990b0464b30fe..3c6c7fc6b66497998e8e479b4504c33c94f17e3f 100644 --- a/multitask_intent_model_output/multitask_intent.onnx.data +++ b/multitask_intent_model_output/multitask_intent.onnx.data @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e8bd2ab2f34a2f8add41671f3686866b66db8583964ea41559255e055a45ffa +oid sha256:ed0f4674d1b56d1cd6e42129cf6d7f0542559c24c7bbcf93fc361f87a1d89488 size 265598976 diff --git a/multitask_intent_model_output/multitask_model.pt b/multitask_intent_model_output/multitask_model.pt index 05282eb1cbc6b84c146a354eebfeabb697cdd5f4..eddf246b410e9f0e0ac40e7f59def8e7b932e3bc 100644 --- a/multitask_intent_model_output/multitask_model.pt +++ b/multitask_intent_model_output/multitask_model.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40505a22ecb1bfce5491f1728716cdbdf4aa99001a74b0d95af1255277f36a59 +oid sha256:110aa9638912221fc67dc16d1452192a280c0651d6701726abedd9a1fa60fc9f size 265602027 diff --git a/multitask_intent_model_output/train_metrics.json 
b/multitask_intent_model_output/train_metrics.json index 8aacd10b4efd9d3458906e1bb85d2856428441ec..37f548826f2deb50a7c1ff22ade8d1adec8d9ba1 100644 --- a/multitask_intent_model_output/train_metrics.json +++ b/multitask_intent_model_output/train_metrics.json @@ -8,30 +8,30 @@ "test_count": 70, "test_metrics": { "epoch": 4.0, - "test_decision_phase_accuracy": 0.7931034482758621, - "test_decision_phase_macro_f1": 0.8010204081632653, + "test_decision_phase_accuracy": 0.7586206896551724, + "test_decision_phase_macro_f1": 0.763718820861678, "test_intent_subtype_accuracy": 0.8714285714285714, - "test_intent_subtype_macro_f1": 0.7758580399233755, - "test_intent_type_accuracy": 0.8936170212765957, - "test_intent_type_macro_f1": 0.8110224089635854, - "test_loss": 1.376851201057434, - "test_runtime": 0.1497, - "test_samples_per_second": 467.54, - "test_steps_per_second": 33.396 + "test_intent_subtype_macro_f1": 0.8317236029317956, + "test_intent_type_accuracy": 0.8723404255319149, + "test_intent_type_macro_f1": 0.8005555555555555, + "test_loss": 1.4344456195831299, + "test_runtime": 0.1682, + "test_samples_per_second": 416.197, + "test_steps_per_second": 29.728 }, "train_count": 1590, "val_count": 473, "val_metrics": { "epoch": 4.0, - "val_decision_phase_accuracy": 0.9512195121951219, - "val_decision_phase_macro_f1": 0.9431640512510706, - "val_intent_subtype_accuracy": 0.8888888888888888, - "val_intent_subtype_macro_f1": 0.876672226881405, - "val_intent_type_accuracy": 0.9792387543252595, - "val_intent_type_macro_f1": 0.9727773461133127, - "val_loss": 0.5223116278648376, - "val_runtime": 0.9733, - "val_samples_per_second": 485.974, - "val_steps_per_second": 30.823 + "val_decision_phase_accuracy": 0.9560975609756097, + "val_decision_phase_macro_f1": 0.9496568779026763, + "val_intent_subtype_accuracy": 0.8950617283950617, + "val_intent_subtype_macro_f1": 0.8822267346328656, + "val_intent_type_accuracy": 0.9757785467128027, + "val_intent_type_macro_f1": 0.970010435450997, + 
"val_loss": 0.5456064343452454, + "val_runtime": 1.1168, + "val_samples_per_second": 423.544, + "val_steps_per_second": 26.863 } } diff --git a/training/run_full_training_pipeline.py b/training/run_full_training_pipeline.py index 29faa08b4472bfd19b830593db641bfcf9c0b6e6..7a47ca0b3977a565324614bcb295b2256d03a00c 100644 --- a/training/run_full_training_pipeline.py +++ b/training/run_full_training_pipeline.py @@ -21,6 +21,51 @@ def run_step(args: list[str]) -> None: print(f" end: {ended_wall}\n took: {elapsed_s:.2f}s") +def verify_local_artifacts() -> None: + """Ensure complete local training artifacts were generated. + + This check intentionally validates *local repo paths* so a complete run + cannot silently rely on hub cache/snapshot paths. + """ + required_dirs = { + "multitask model dir": BASE_DIR / "multitask_intent_model_output", + "iab classifier dir": BASE_DIR / "iab_classifier_model_output", + "calibration dir": BASE_DIR / "artifacts" / "calibration", + } + required_files = { + "multitask weights": BASE_DIR / "multitask_intent_model_output" / "multitask_model.pt", + "multitask metadata": BASE_DIR / "multitask_intent_model_output" / "metadata.json", + "iab model config": BASE_DIR / "iab_classifier_model_output" / "config.json", + "iab model weights": BASE_DIR / "iab_classifier_model_output" / "model.safetensors", + "calibration intent_type": BASE_DIR / "artifacts" / "calibration" / "intent_type.json", + "calibration intent_subtype": BASE_DIR / "artifacts" / "calibration" / "intent_subtype.json", + "calibration decision_phase": BASE_DIR / "artifacts" / "calibration" / "decision_phase.json", + "calibration iab_content": BASE_DIR / "artifacts" / "calibration" / "iab_content.json", + } + + missing: list[str] = [] + for name, path in required_dirs.items(): + if not path.is_dir(): + missing.append(f"[MISS] {name}: {path}") + else: + print(f"[OK ] {name}: {path}") + for name, path in required_files.items(): + if not path.is_file(): + missing.append(f"[MISS] {name}: 
{path}") + else: + print(f"[OK ] {name}: {path}") + + if missing: + print("\nLocal artifact verification failed:") + for line in missing: + print(line) + raise RuntimeError( + "Expected local training artifacts are missing. " + "Training finished, but outputs were not generated in the repo paths." + ) + print("\nLocal artifact verification passed.") + + def main() -> None: pipeline_start = time.perf_counter() pipeline_start_wall = datetime.now(timezone.utc).isoformat() @@ -116,6 +161,9 @@ def main() -> None: if args.smoke_test: run_step([python, "combined_inference.py", args.smoke_test_query]) + # Always enforce local artifact presence for complete training reliability. + verify_local_artifacts() + pipeline_elapsed_s = time.perf_counter() - pipeline_start pipeline_end_wall = datetime.now(timezone.utc).isoformat() print( diff --git a/training/upload_to_hf.py b/training/upload_to_hf.py index cb1d465cf12bc8acae118f6356d1b4d8df6eff5d..26a40ca6f6a56f4f445f4df8aaee70d948cb648f 100644 --- a/training/upload_to_hf.py +++ b/training/upload_to_hf.py @@ -10,11 +10,15 @@ from __future__ import annotations import argparse import os +import shutil import sys +import tempfile import time from datetime import datetime, timezone from pathlib import Path +LARGE_FILE_UPLOAD_THRESHOLD_BYTES = 100 * 1024 * 1024 + def _parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Upload trained intent/IAB artifacts to Hugging Face Hub.") @@ -53,6 +57,24 @@ def _parse_args() -> argparse.Namespace: action="store_true", help="Upload a Hugging Face model card file as README.md in the Hub repo root.", ) + parser.add_argument( + "--include-serving-code", + action="store_true", + help="Upload core runtime Python/code files required for Hub trust_remote_code inference.", + ) + parser.add_argument( + "--include-root-checkpoint", + action="store_true", + help="Upload root-level compatibility checkpoint/tokenizer files used by transformers.pipeline loader.", + ) + 
parser.add_argument( + "--include-all", + action="store_true", + help=( + "Upload everything needed for end-to-end Hub usage: multitask + iab + calibration + " + "HF README + serving code + root checkpoint/tokenizer files." + ), + ) parser.add_argument( "--hf-readme-path", default="HF_MODEL_CARD.md", @@ -81,6 +103,64 @@ def _parse_args() -> argparse.Namespace: return parser.parse_args() +def _iter_local_files(path: Path) -> list[Path]: + if path.is_file(): + return [path] + return sorted(p for p in path.rglob("*") if p.is_file()) + + +def _remote_file_paths(path_in_repo: str, local_path: Path) -> list[str]: + if local_path.is_file(): + return [path_in_repo] + return [ + f"{path_in_repo}/{file_path.relative_to(local_path).as_posix()}" + for file_path in _iter_local_files(local_path) + ] + + +def _requires_large_upload(local_path: Path) -> bool: + return any(file_path.stat().st_size >= LARGE_FILE_UPLOAD_THRESHOLD_BYTES for file_path in _iter_local_files(local_path)) + + +def _upload_via_large_folder(api, repo_id: str, repo_path: str, local_path: Path) -> None: + with tempfile.TemporaryDirectory(prefix="hf_large_upload_") as tmp_dir: + staging_root = Path(tmp_dir) + staged_target = staging_root / repo_path + staged_target.parent.mkdir(parents=True, exist_ok=True) + if local_path.is_file(): + shutil.copy2(local_path, staged_target) + else: + shutil.copytree( + local_path, + staged_target, + ignore=shutil.ignore_patterns(".cache", "__pycache__"), + ) + # Ensure resumable-upload metadata from previous local attempts does not + # get carried into the fresh staging directory. 
+ shutil.rmtree(staged_target / ".cache", ignore_errors=True) + api.upload_large_folder( + repo_id=repo_id, + repo_type="model", + folder_path=str(staging_root), + print_report=False, + ) + + +def _verify_remote_upload(api, repo_id: str, repo_path: str, local_path: Path) -> None: + expected = set(_remote_file_paths(repo_path, local_path)) + for attempt in range(4): + files = set(api.list_repo_files(repo_id=repo_id, repo_type="model")) + missing = sorted(expected - files) + if not missing: + return + if attempt == 3: + raise RuntimeError( + "Upload completed but the following remote files are still missing: " + + ", ".join(missing[:20]) + ) + time.sleep(2 * (attempt + 1)) + + def main() -> int: started_at = time.perf_counter() started_wall = datetime.now(timezone.utc).isoformat() @@ -96,6 +176,14 @@ def main() -> int: calibration_dir = (repo_root / args.calibration_dir).resolve() hf_readme_path = (repo_root / args.hf_readme_path).resolve() + if args.include_all: + args.include_multitask = True + args.include_iab = True + args.include_calibration = True + args.include_hf_readme = True + args.include_serving_code = True + args.include_root_checkpoint = True + to_upload: list[tuple[str, Path]] = [] if args.include_multitask: to_upload.append(("multitask_intent_model_output", multitask_dir)) @@ -106,8 +194,43 @@ def main() -> int: if args.include_hf_readme: to_upload.append(("README.md", hf_readme_path)) + if args.include_serving_code: + # Files needed by trust_remote_code execution path. 
+ for rel in [ + "pipeline.py", + "config.py", + "config.json", + "combined_inference.py", + "model_runtime.py", + "multitask_runtime.py", + "multitask_model.py", + "schemas.py", + "inference_intent_type.py", + "inference_subtype.py", + "inference_decision_phase.py", + "inference_iab_classifier.py", + "iab_classifier.py", + "iab_taxonomy.py", + ]: + to_upload.append((rel, (repo_root / rel).resolve())) + + if args.include_root_checkpoint: + for rel in [ + "model.safetensors", + "tokenizer.json", + "tokenizer_config.json", + "special_tokens_map.json", + "vocab.txt", + ]: + to_upload.append((rel, (repo_root / rel).resolve())) + if not to_upload: - print("Nothing to upload. Pass --include-multitask, --include-iab, and/or --include-calibration.", file=sys.stderr) + print( + "Nothing to upload. Pass include flags (e.g. --include-all), or one/more of: " + "--include-multitask --include-iab --include-calibration --include-hf-readme " + "--include-serving-code --include-root-checkpoint.", + file=sys.stderr, + ) return 2 # Import lazily so `--dry-run` works without extra deps. 
@@ -127,27 +250,26 @@ def main() -> int: if args.dry_run: print(f"[DRY] Would upload {local_dir} -> {args.repo_id}:{repo_path}") continue - # Upload single README.md file (Hub model card) vs directories - if repo_path == "README.md": - step_start = time.perf_counter() - print(f"[UPLOAD] {local_dir} -> {args.repo_id}:README.md") + step_start = time.perf_counter() + mode = "large-folder" if _requires_large_upload(local_dir) else "standard" + print(f"[UPLOAD] {local_dir} -> {args.repo_id}:{repo_path} ({mode})") + if mode == "large-folder": + _upload_via_large_folder(api, args.repo_id, repo_path, local_dir) + elif local_dir.is_file(): api.upload_file( repo_id=args.repo_id, repo_type="model", path_or_fileobj=str(local_dir), - path_in_repo="README.md", + path_in_repo=repo_path, ) - print(f"[DONE ] README.md took {(time.perf_counter() - step_start):.2f}s") - continue - - step_start = time.perf_counter() - print(f"[UPLOAD] {local_dir} -> {args.repo_id}:{repo_path}") - api.upload_folder( - repo_id=args.repo_id, - repo_type="model", - folder_path=str(local_dir), - path_in_repo=repo_path, - ) + else: + api.upload_folder( + repo_id=args.repo_id, + repo_type="model", + folder_path=str(local_dir), + path_in_repo=repo_path, + ) + _verify_remote_upload(api, args.repo_id, repo_path, local_dir) print(f"[DONE ] {repo_path} took {(time.perf_counter() - step_start):.2f}s") ended_wall = datetime.now(timezone.utc).isoformat()