diff --git a/HF_MODEL_CARD.md b/HF_MODEL_CARD.md new file mode 100644 index 0000000000000000000000000000000000000000..b2dda974c4052450156b03d139588793e5ff37b1 --- /dev/null +++ b/HF_MODEL_CARD.md @@ -0,0 +1,208 @@ +--- +language: +- en +library_name: transformers +pipeline_tag: text-classification +base_model: distilbert-base-uncased +metrics: +- accuracy +- f1 +tags: +- intent-classification +- multitask +- iab +- conversational-ai +- adtech +- calibrated-confidence +license: apache-2.0 +--- + +# admesh/agentic-intent-classifier + +Production-ready intent + IAB classifier bundle for conversational traffic. + +Combines multitask intent modeling, supervised IAB content classification, and per-head confidence calibration to support safe monetization decisions in real time. + +## Links + +- Hugging Face: https://huggingface.co/admesh/agentic-intent-classifier +- GitHub: https://github.com/GouniManikumar12/agentic-intent-classifier + +## What It Predicts + +| Field | Description | +|---|---| +| `intent.type` | `commercial`, `informational`, `navigational`, `transactional`, … | +| `intent.subtype` | `product_discovery`, `comparison`, `how_to`, … | +| `intent.decision_phase` | `awareness`, `consideration`, `decision`, … | +| `iab_content` | IAB Content Taxonomy 3.0 tier1 / tier2 / tier3 labels | +| `component_confidence` | Per-head calibrated confidence with threshold flags | +| `system_decision` | Monetization eligibility, opportunity type, policy | + +--- + +## Deployment Options + +### 1. 
`transformers.pipeline()` — one line anywhere + +```python +from transformers import pipeline + +clf = pipeline( + "admesh-intent", + model="admesh/agentic-intent-classifier", + trust_remote_code=True, +) + +result = clf("Which laptop should I buy for college?") +``` + +Batch and custom thresholds: + +```python +# batch +results = clf([ + "Best running shoes under $100", + "How does TCP work?", + "Buy noise-cancelling headphones", +]) + +# custom confidence thresholds +result = clf( + "Buy headphones", + threshold_overrides={"intent_type": 0.6, "intent_subtype": 0.35}, +) +``` + +--- + +### 2. HF Inference Endpoints (managed, deploy to AWS / Azure / GCP) + +1. Go to https://ui.endpoints.huggingface.co +2. **New Endpoint** → select `admesh/agentic-intent-classifier` +3. Framework: **PyTorch** — Task: **Text Classification** +4. Enable **"Load with trust_remote_code"** +5. Deploy + +The endpoint serves the same `pipeline()` interface above via REST: + +```bash +curl https://YOUR-ENDPOINT.endpoints.huggingface.cloud \ + -H "Authorization: Bearer $HF_TOKEN" \ + -H "Content-Type: application/json" \ + -d '{"inputs": "Which laptop should I buy for college?"}' +``` + +--- + +### 3. HF Spaces (Gradio / Streamlit demo) + +```python +# app.py for a Gradio Space +import gradio as gr +from transformers import pipeline + +clf = pipeline( + "admesh-intent", + model="admesh/agentic-intent-classifier", + trust_remote_code=True, +) + +def classify(text): + return clf(text) + +gr.Interface(fn=classify, inputs="text", outputs="json").launch() +``` + +--- + +### 4. 
Local / notebook via `snapshot_download` + +```python +import sys +from huggingface_hub import snapshot_download + +local_dir = snapshot_download( + repo_id="admesh/agentic-intent-classifier", + repo_type="model", +) +sys.path.insert(0, local_dir) + +from pipeline import AdmeshIntentPipeline +clf = AdmeshIntentPipeline() +result = clf("I need a CRM for a 5-person startup") +``` + +Or the one-liner factory: + +```python +from pipeline import AdmeshIntentPipeline +clf = AdmeshIntentPipeline.from_pretrained("admesh/agentic-intent-classifier") +``` + +--- + +## Example Output + +```json +{ + "model_output": { + "classification": { + "iab_content": { + "taxonomy": "IAB Content Taxonomy", + "taxonomy_version": "3.0", + "tier1": {"id": "552", "label": "Style & Fashion"}, + "tier2": {"id": "579", "label": "Men's Fashion"}, + "mapping_mode": "exact", + "mapping_confidence": 0.73 + }, + "intent": { + "type": "commercial", + "subtype": "product_discovery", + "decision_phase": "consideration", + "confidence": 0.9549, + "commercial_score": 0.656 + } + } + }, + "system_decision": { + "policy": { + "monetization_eligibility": "allowed_with_caution", + "eligibility_reason": "commercial_discovery_signal_present" + }, + "opportunity": {"type": "soft_recommendation", "strength": "medium"} + }, + "meta": { + "system_version": "0.6.0-phase4", + "calibration_enabled": true, + "iab_mapping_is_placeholder": false + } +} +``` + +## Reproducible Revision + +```python +from huggingface_hub import snapshot_download +local_dir = snapshot_download( + repo_id="admesh/agentic-intent-classifier", + repo_type="model", + revision="0584798f8efee6beccd778b0afa06782ab5add60", +) +``` + +## Included Artifacts + +| Path | Contents | +|---|---| +| `multitask_intent_model_output/` | DistilBERT multitask weights + tokenizer | +| `iab_classifier_model_output/` | IAB content classifier weights + tokenizer | +| `artifacts/calibration/` | Per-head temperature + threshold JSONs | +| `pipeline.py` | 
`AdmeshIntentPipeline` (transformers.Pipeline subclass) | +| `combined_inference.py` | Core inference logic | + +## Notes + +- `trust_remote_code=True` is required because this model uses a custom multi-head architecture that does not map to a single standard `AutoModel` checkpoint. +- `meta.iab_mapping_is_placeholder: true` means IAB artifacts were missing or skipped; train and calibrate IAB for full production accuracy. +- For long-running servers, instantiate once and reuse — models are cached in memory after the first call. diff --git a/README.md b/README.md index ceeca66f3ded2f679319ad0c808bc78fb12ea5e2..0db9f92289d361494b182604e0e62ff12222719f 100644 --- a/README.md +++ b/README.md @@ -1,81 +1,188 @@ ---- -language: -- en -library_name: transformers -pipeline_tag: text-classification -base_model: distilbert-base-uncased -metrics: -- accuracy -- f1 -tags: -- intent-classification -- multitask -- iab -- conversational-ai -- adtech - - calibrated-confidence -license: apache-2.0 ---- - -# admesh/agentic-intent-classifier +# Agentic Intent Classifier -Production-ready intent + IAB classifier bundle for conversational traffic. +`agentic-intent-classifier` is a multi-head query classification stack for conversational traffic. -This package combines multitask intent modeling, supervised IAB classification, and confidence calibration to support safe monetization decisions in real time. - -## What It Predicts +It currently produces: - `intent.type` - `intent.subtype` - `intent.decision_phase` - `iab_content` -- per-head calibrated confidence -- fallback/policy/opportunity decision envelope +- calibrated confidence per head +- combined fallback / policy / opportunity decisions + +The repo is beyond the original v0.1 baseline. 
It now includes: + +- shared config and label ownership +- reusable model runtime +- calibrated confidence and threshold gating +- combined inference with fallback/policy logic +- request/response validation in the demo API +- repeatable evaluation and regression suites +- full-TSV IAB taxonomy retrieval support through tier4 +- a local embedding index for taxonomy-node retrieval over IAB content paths +- a separate synthetic full-intent-taxonomy augmentation dataset for non-IAB heads +- a dedicated intent-type difficulty dataset and held-out benchmark with `easy`, `medium`, and `hard` cases +- a dedicated decision-phase difficulty dataset and held-out benchmark with `easy`, `medium`, and `hard` cases + +Generated model weights are intentionally not committed. + +## Current Taxonomy + +### `intent.type` + +- `informational` +- `exploratory` +- `commercial` +- `transactional` +- `support` +- `personal_reflection` +- `creative_generation` +- `chit_chat` +- `ambiguous` +- `prohibited` + +### `intent.decision_phase` + +- `awareness` +- `research` +- `consideration` +- `decision` +- `action` +- `post_purchase` +- `support` -## Why It Is Useful +### `intent.subtype` -- Single package for intent, phase, subtype, and IAB routing -- Calibrated thresholds for safer downstream decisions -- Works out of the box with `combined_inference.py` and `demo_api.py` -- Easy local run, Colab run, or server integration +- `education` +- `product_discovery` +- `comparison` +- `evaluation` +- `deal_seeking` +- `provider_selection` +- `signup` +- `purchase` +- `booking` +- `download` +- `contact_sales` +- `task_execution` +- `onboarding_setup` +- `troubleshooting` +- `account_help` +- `billing_help` +- `follow_up` +- `emotional_reflection` -## Links +### `iab_content` -- Hugging Face model: https://huggingface.co/admesh/agentic-intent-classifier -- GitHub source: https://github.com/GouniManikumar12/agentic-intent-classifier +- candidates are derived from every row in 
[data/iab-content/Content Taxonomy 3.0.tsv](data/iab-content/Content%20Taxonomy%203.0.tsv) +- retrieval output supports `tier1`, `tier2`, `tier3`, and optional `tier4` -## Quick Start +## What The System Does + +- runs three classifier heads: + - `intent_type` + - `intent_subtype` + - `decision_phase` +- resolves `iab_content` through a local embedding index over taxonomy nodes plus generic label/path reranking +- applies calibration artifacts when present +- computes `commercial_score` +- applies fallback when confidence is too weak or policy-safe blocking is required +- emits a schema-validated combined envelope + +## What The System Does Not Do + +- it is not a multi-turn memory system +- it is not a production-optimized low-latency serving path +- it is not yet trained on large real-traffic human-labeled intent data +- combined decision logic is still heuristic, even though it is materially stronger than the original baseline + +## Project Layout + +- [config.py](config.py): labels, thresholds, artifact paths, model paths +- [model_runtime.py](model_runtime.py): shared calibrated inference runtime +- [combined_inference.py](combined_inference.py): composed system response +- [inference_intent_type.py](inference_intent_type.py): direct `intent_type` inference entrypoint +- [inference_iab_classifier.py](inference_iab_classifier.py): direct supervised `iab_content` inference entrypoint +- [schemas.py](schemas.py): request/response validation +- 
[demo_api.py](demo_api.py): local validated API +- [iab_taxonomy.py](iab_taxonomy.py): full taxonomy parser/index +- [iab_classifier.py](iab_classifier.py): supervised IAB runtime with taxonomy-aware parent fallback +- [iab_retrieval.py](iab_retrieval.py): optional shadow retrieval baseline +- [training/build_full_intent_taxonomy_dataset.py](training/build_full_intent_taxonomy_dataset.py): separate synthetic intent augmentation dataset +- [training/build_intent_type_difficulty_dataset.py](training/build_intent_type_difficulty_dataset.py): extra `intent_type` augmentation plus held-out difficulty benchmark +- [training/build_decision_phase_difficulty_dataset.py](training/build_decision_phase_difficulty_dataset.py): extra `decision_phase` augmentation plus held-out difficulty benchmark +- [training/build_subtype_difficulty_dataset.py](training/build_subtype_difficulty_dataset.py): extra `intent_subtype` augmentation plus held-out difficulty benchmark +- [training/build_subtype_dataset.py](training/build_subtype_dataset.py): subtype dataset generation from existing corpora +- [training/train_iab.py](training/train_iab.py): train the supervised IAB classifier head +- 
[training/build_iab_taxonomy_embeddings.py](training/build_iab_taxonomy_embeddings.py): build local IAB node embedding artifacts +- [training/run_full_training_pipeline.py](training/run_full_training_pipeline.py): full multi-head training/calibration/eval pipeline +- [evaluation/run_evaluation.py](evaluation/run_evaluation.py): repeatable benchmark runner +- [evaluation/run_regression_suite.py](evaluation/run_regression_suite.py): known-failure regression runner +- [evaluation/run_iab_mapping_suite.py](evaluation/run_iab_mapping_suite.py): IAB behavior-lock regression runner +- [evaluation/run_iab_quality_suite.py](evaluation/run_iab_quality_suite.py): curated IAB quality-target runner +- [known_limitations.md](known_limitations.md): current gaps and caveats + +## Quickstart: Run From Hugging Face + +Download the trained bundle and run inference in three lines — no local training required. 
```python +import sys from huggingface_hub import snapshot_download +# Download the full bundle (models + calibration + code) local_dir = snapshot_download( repo_id="admesh/agentic-intent-classifier", repo_type="model", ) -print(local_dir) +sys.path.insert(0, local_dir) + +# Import and instantiate +from pipeline import AdmeshIntentPipeline +clf = AdmeshIntentPipeline() + +# Classify +import json +result = clf("Which laptop should I buy for college?") +print(json.dumps(result, indent=2)) ``` -```bash -cd "" -python3 training/pipeline_verify.py -python3 combined_inference.py "Which laptop should I buy for college?" +Or use the one-liner factory method: + +```python +from pipeline import AdmeshIntentPipeline # after sys.path.insert above + +clf = AdmeshIntentPipeline.from_pretrained("admesh/agentic-intent-classifier") +result = clf("I need a CRM for a 5-person startup") ``` -## API Mode +Batch mode and custom thresholds are also supported: -```bash -cd "" -python3 demo_api.py +```python +# Batch +results = clf([ + "Best running shoes under $100", + "How does gradient descent work?", + "Buy noise-cancelling headphones", +]) + +# Custom confidence thresholds +result = clf( + "Buy noise-cancelling headphones", + threshold_overrides={"intent_type": 0.6, "intent_subtype": 0.35}, +) ``` +Verify artifacts and run a smoke test from the CLI: + ```bash -curl -sS -X POST http://127.0.0.1:8008/classify \ - -H 'Content-Type: application/json' \ - -d '{"text":"I need CRM for a 5 person startup"}' +cd "/path/to/local_dir" +python3 training/pipeline_verify.py +python3 combined_inference.py "Which CRM should I buy for a 3-person startup?" 
``` -## Reproducible Revision +Pin a specific revision for reproducibility: ```python local_dir = snapshot_download( @@ -85,13 +192,301 @@ local_dir = snapshot_download( ) ``` -## Included Folders +--- + +## Setup (for local training) + +```bash +python3 -m venv .venv +source .venv/bin/activate +pip install -r agentic-intent-classifier/requirements.txt +``` + +## Inference (local training path) + +Run one query locally: + +```bash +cd agentic-intent-classifier +python3 training/train_iab.py +python3 training/calibrate_confidence.py --head iab_content +python3 combined_inference.py "Which CRM should I buy for a 3-person startup?" +``` + +Run only the `intent_type` head: + +```bash +cd agentic-intent-classifier +python3 inference_intent_type.py "best shoes under 100" +``` + +Run the demo API: + +```bash +cd agentic-intent-classifier +python3 demo_api.py +``` + +Example request: + +```bash +curl -sS -X POST http://127.0.0.1:8008/classify \ + -H 'Content-Type: application/json' \ + -d '{"text":"I cannot log into my account"}' +``` + +Infra endpoints: + +```bash +curl -sS http://127.0.0.1:8008/health +curl -sS http://127.0.0.1:8008/version +``` + +Train only the IAB classifier head: + +```bash +cd agentic-intent-classifier +python3 training/train_iab.py +python3 training/calibrate_confidence.py --head iab_content +``` + +The online `iab_content` path now uses the compact supervised classifier. Retrieval is still available as an optional shadow baseline. + +Build the optional retrieval shadow index: + +```bash +cd agentic-intent-classifier +python3 training/build_iab_taxonomy_embeddings.py +``` + +By default the shadow retrieval path uses `Alibaba-NLP/gte-Qwen2-1.5B-instruct`. The retrieval runtime applies the model's query-side instruction format and last-token pooling, matching the Hugging Face usage guidance. If you want to point retrieval at a different embedding model, set `IAB_RETRIEVAL_MODEL_NAME_OVERRIDE` before building the index. 
+ +Open-source users can swap in their own embedding model, but the contract is: + +- query embeddings and taxonomy-node embeddings must be produced by the same model and model revision +- after changing models, you must rebuild `artifacts/iab/taxonomy_embeddings.pt` +- the repository only tests and supports the default model path out of the box +- not every Hugging Face embedding model is drop-in compatible with this runtime; some require custom pooling, query instructions, or `trust_remote_code` + +Example override: + +```bash +cd agentic-intent-classifier +export IAB_RETRIEVAL_MODEL_NAME_OVERRIDE=mixedbread-ai/mxbai-embed-large-v1 +python3 training/build_iab_taxonomy_embeddings.py +``` + +This writes: + +- `artifacts/iab/taxonomy_nodes.json` +- `artifacts/iab/taxonomy_embeddings.pt` + +## Training + +### Full local pipeline + +```bash +cd agentic-intent-classifier +python3 training/run_full_training_pipeline.py +``` + +This pipeline now does: + +1. build separate full-intent-taxonomy augmentation data +2. build separate `intent_type` difficulty augmentation + benchmark +3. train `intent_type` +4. build subtype corpus +5. build separate `intent_subtype` difficulty augmentation + benchmark +6. train `intent_subtype` +7. build separate `decision_phase` difficulty augmentation + benchmark +8. train `decision_phase` +9. train `iab_content` +10. calibrate all classifier heads, including `iab_content` +11. 
run regression/evaluation unless `--skip-full-eval` is used + +### Build datasets individually + +Separate full-intent augmentation: + +```bash +cd agentic-intent-classifier +python3 training/build_full_intent_taxonomy_dataset.py +``` + +Intent-type difficulty augmentation and benchmark: + +```bash +cd agentic-intent-classifier +python3 training/build_intent_type_difficulty_dataset.py +``` + +Decision-phase difficulty augmentation and benchmark: + +```bash +cd agentic-intent-classifier +python3 training/build_decision_phase_difficulty_dataset.py +``` + +Subtype difficulty augmentation and benchmark: + +```bash +cd agentic-intent-classifier +python3 training/build_subtype_difficulty_dataset.py +``` + +Subtype dataset: + +```bash +cd agentic-intent-classifier +python3 training/build_subtype_dataset.py +``` + +IAB embedding index: + +```bash +cd agentic-intent-classifier +python3 training/build_iab_taxonomy_embeddings.py +``` + +### Train heads individually + +```bash +cd agentic-intent-classifier +python3 training/train.py +python3 training/train_subtype.py +python3 training/train_decision_phase.py +``` + +### Calibration + +```bash +cd agentic-intent-classifier +python3 training/calibrate_confidence.py --head intent_type +python3 training/calibrate_confidence.py --head intent_subtype +python3 training/calibrate_confidence.py --head decision_phase +``` + +## Evaluation + +Full evaluation: + +```bash +cd agentic-intent-classifier +python3 evaluation/run_evaluation.py +``` + +Known-failure regression: + +```bash +cd agentic-intent-classifier +python3 evaluation/run_regression_suite.py +``` + +IAB behavior-lock regression: + +```bash +cd agentic-intent-classifier +python3 evaluation/run_iab_mapping_suite.py +``` + +IAB quality-target evaluation: + +```bash +cd agentic-intent-classifier +python3 evaluation/run_iab_quality_suite.py +``` + +Threshold sweeps: + +```bash +cd agentic-intent-classifier +python3 evaluation/sweep_intent_threshold.py +``` + +Artifacts are written 
to: -- `multitask_intent_model_output/` -- `iab_classifier_model_output/` - `artifacts/calibration/` +- `artifacts/evaluation/latest/` + +## Google Colab + +Use Colab for the full retraining pass if local memory is limited. + +Clone once: + +```bash +%cd /content +!git clone https://github.com/GouniManikumar12/agentic-intent-classifier.git +%cd /content/agentic-intent-classifier +``` + +If the repo is already cloned and you want the latest code, pull manually: + +```bash +!git pull origin main +``` + +Full pipeline: + +```bash +!python training/run_full_training_pipeline.py +``` + +If full evaluation is too heavy for the current Colab runtime: + +```bash +!python training/run_full_training_pipeline.py \ + --iab-embedding-batch-size 32 \ + --skip-full-eval +``` + +Then run eval separately after training: + +```bash +!python evaluation/run_regression_suite.py +!python evaluation/run_iab_mapping_suite.py +!python evaluation/run_iab_quality_suite.py +!python evaluation/run_evaluation.py +``` + +## Current Saved Metrics + +Generate fresh metrics with: + +```bash +cd agentic-intent-classifier +python3 evaluation/run_evaluation.py +``` + +Do not treat any checked-in summary as canonical unless it was regenerated after the current code and artifacts were built. The online IAB path now uses the supervised classifier (retrieval remains only as an optional shadow baseline), so older saved reports from the deleted hierarchy stack are not meaningful. + +## Latency Note + +`combined_inference.py` is a debugging/offline path, not a production latency path. 
+ +Current production truth: + +- per-request CLI execution is not a sub-50ms architecture +- production serving should use a long-lived API process with preloaded models +- if sub-50ms becomes a hard requirement, the serving path will need: + - persistent loaded models + - runtime optimization + - likely fewer model passes or a shared multi-head model + +## Current Status + +Current repo status: + +- full 10-class `intent.type` taxonomy is wired +- subtype and phase heads are present +- difficulty benchmarks are wired for `intent_type`, `intent_subtype`, and `decision_phase` +- full-TSV IAB taxonomy retrieval is wired through tier4 +- separate full-intent augmentation dataset is in place +- evaluation/runtime memory handling is improved for large IAB splits -## Notes +The main remaining gap is not basic infrastructure anymore. It is improving real-world robustness, especially for: -- Use the three folders above together for expected behavior. -- If integrating in production, prefer long-lived API processes with preloaded models. 
+- `decision_phase` +- `intent_subtype` +- confidence quality on borderline commercial queries +- real-traffic supervision beyond synthetic data diff --git a/artifacts/calibration/decision_phase.json b/artifacts/calibration/decision_phase.json index e417d60c8ae9b82e48679e3da006f16bf629557f..afd98bfbbe14ac529cf412d1d5a23aa299806ce4 100644 --- a/artifacts/calibration/decision_phase.json +++ b/artifacts/calibration/decision_phase.json @@ -1,20 +1,20 @@ { "calibrated": true, "confidence_threshold": 0.22, - "generated_at": "2026-03-25T05:10:14.092098+00:00", + "generated_at": "2026-03-25T07:02:39.749873+00:00", "head": "decision_phase", "metrics": { "calibrated_accuracy": 0.8621, - "calibrated_expected_calibration_error": 0.0877, - "calibrated_negative_log_likelihood": 0.5315, - "mean_calibrated_confidence": 0.866, - "mean_raw_confidence": 0.8684, + "calibrated_expected_calibration_error": 0.0915, + "calibrated_negative_log_likelihood": 0.5003, + "mean_calibrated_confidence": 0.8724, + "mean_raw_confidence": 0.8716, "raw_accuracy": 0.8621, - "raw_expected_calibration_error": 0.087, - "raw_negative_log_likelihood": 0.5317 + "raw_expected_calibration_error": 0.0911, + "raw_negative_log_likelihood": 0.5003 }, "minimum_threshold_floor": 0.22, - "optimized_temperature_candidate": 1.008347, + "optimized_temperature_candidate": 0.997346, "selected_threshold_before_floor": { "accepted_accuracy": 0.8621, "coverage": 1.0, @@ -22,7 +22,7 @@ }, "selection_split": "val", "selection_target_precision": 0.75, - "temperature": 1.008347, + "temperature": 0.997346, "temperature_scaling_applied": true, "threshold_summary": { "accepted_accuracy": 0.8621, diff --git a/artifacts/calibration/iab_content.json b/artifacts/calibration/iab_content.json index d196427d97791dbb018cd372ec05aa369f7ea0c8..af58e666db83258528e23a6a4786091e66097bb2 100644 --- a/artifacts/calibration/iab_content.json +++ b/artifacts/calibration/iab_content.json @@ -1,32 +1,32 @@ { "calibrated": true, "confidence_threshold": 
0.12, - "generated_at": "2026-03-25T05:12:02.550364+00:00", + "generated_at": "2026-03-25T07:04:35.676097+00:00", "head": "iab_content", "metrics": { - "calibrated_accuracy": 0.9442, - "calibrated_expected_calibration_error": 0.2773, - "calibrated_negative_log_likelihood": 0.5519, - "mean_calibrated_confidence": 0.6669, - "mean_raw_confidence": 0.2286, - "raw_accuracy": 0.9442, - "raw_expected_calibration_error": 0.7157, - "raw_negative_log_likelihood": 1.6567 + "calibrated_accuracy": 0.9159, + "calibrated_expected_calibration_error": 0.2475, + "calibrated_negative_log_likelihood": 0.5736, + "mean_calibrated_confidence": 0.6684, + "mean_raw_confidence": 0.1932, + "raw_accuracy": 0.9159, + "raw_expected_calibration_error": 0.7227, + "raw_negative_log_likelihood": 1.8448 }, "minimum_threshold_floor": 0.12, - "optimized_temperature_candidate": 0.607335, + "optimized_temperature_candidate": 0.562804, "selected_threshold_before_floor": { - "accepted_accuracy": 0.9442, + "accepted_accuracy": 0.9159, "coverage": 1.0, "threshold": 0.0 }, "selection_split": "val", "selection_target_precision": 0.7, - "temperature": 0.607335, + "temperature": 0.562804, "temperature_scaling_applied": true, "threshold_summary": { - "accepted_accuracy": 0.9478, - "coverage": 0.9915, + "accepted_accuracy": 0.921, + "coverage": 0.9878, "threshold": 0.12 } } diff --git a/artifacts/calibration/intent_subtype.json b/artifacts/calibration/intent_subtype.json index f722cd75ec2ad537e06abdd88eb1601e8708e501..d2c67a0f431c191c7f7e98984cabbc1272c1a4a4 100644 --- a/artifacts/calibration/intent_subtype.json +++ b/artifacts/calibration/intent_subtype.json @@ -1,31 +1,31 @@ { "calibrated": true, "confidence_threshold": 0.25, - "generated_at": "2026-03-25T05:09:58.809351+00:00", + "generated_at": "2026-03-25T07:02:24.141670+00:00", "head": "intent_subtype", "metrics": { - "calibrated_accuracy": 0.875, - "calibrated_expected_calibration_error": 0.0778, - "calibrated_negative_log_likelihood": 0.4165, - 
"mean_calibrated_confidence": 0.8225, - "mean_raw_confidence": 0.7521, - "raw_accuracy": 0.875, - "raw_expected_calibration_error": 0.1475, - "raw_negative_log_likelihood": 0.4843 + "calibrated_accuracy": 0.9, + "calibrated_expected_calibration_error": 0.1181, + "calibrated_negative_log_likelihood": 0.4376, + "mean_calibrated_confidence": 0.8188, + "mean_raw_confidence": 0.7458, + "raw_accuracy": 0.9, + "raw_expected_calibration_error": 0.1548, + "raw_negative_log_likelihood": 0.5071 }, "minimum_threshold_floor": 0.25, - "optimized_temperature_candidate": 0.834211, + "optimized_temperature_candidate": 0.831169, "selected_threshold_before_floor": { - "accepted_accuracy": 0.875, + "accepted_accuracy": 0.9, "coverage": 1.0, "threshold": 0.0 }, "selection_split": "val", "selection_target_precision": 0.75, - "temperature": 0.834211, + "temperature": 0.831169, "temperature_scaling_applied": true, "threshold_summary": { - "accepted_accuracy": 0.875, + "accepted_accuracy": 0.9, "coverage": 1.0, "threshold": 0.25 } diff --git a/artifacts/calibration/intent_type.json b/artifacts/calibration/intent_type.json index 62f26a25bc9d33c89e825104b9bb9c237554cec3..d05879704eac21800f0eddaa3f871c0f40acd245 100644 --- a/artifacts/calibration/intent_type.json +++ b/artifacts/calibration/intent_type.json @@ -1,31 +1,31 @@ { "calibrated": true, "confidence_threshold": 0.4, - "generated_at": "2026-03-25T05:09:42.900721+00:00", + "generated_at": "2026-03-25T07:02:07.798259+00:00", "head": "intent_type", "metrics": { - "calibrated_accuracy": 0.9362, - "calibrated_expected_calibration_error": 0.0424, - "calibrated_negative_log_likelihood": 0.3117, - "mean_calibrated_confidence": 0.8993, - "mean_raw_confidence": 0.8741, - "raw_accuracy": 0.9362, - "raw_expected_calibration_error": 0.0788, - "raw_negative_log_likelihood": 0.3262 + "calibrated_accuracy": 0.9149, + "calibrated_expected_calibration_error": 0.061, + "calibrated_negative_log_likelihood": 0.3056, + "mean_calibrated_confidence": 0.9173, 
+ "mean_raw_confidence": 0.8989, + "raw_accuracy": 0.9149, + "raw_expected_calibration_error": 0.0532, + "raw_negative_log_likelihood": 0.314 }, "minimum_threshold_floor": 0.4, - "optimized_temperature_candidate": 0.916196, + "optimized_temperature_candidate": 0.93857, "selected_threshold_before_floor": { - "accepted_accuracy": 0.9362, + "accepted_accuracy": 0.9149, "coverage": 1.0, "threshold": 0.0 }, "selection_split": "val", "selection_target_precision": 0.8, - "temperature": 0.916196, + "temperature": 0.93857, "temperature_scaling_applied": true, "threshold_summary": { - "accepted_accuracy": 0.9362, + "accepted_accuracy": 0.9149, "coverage": 1.0, "threshold": 0.4 } diff --git a/artifacts/evaluation/latest/combined_demo_benchmark.json b/artifacts/evaluation/latest/combined_demo_benchmark.json index cb002124eddc26b745b7656de406c8ca0420c227..85a8a69208e1af91ef10a7afb077b3d6cada4294 100644 --- a/artifacts/evaluation/latest/combined_demo_benchmark.json +++ b/artifacts/evaluation/latest/combined_demo_benchmark.json @@ -5,26 +5,19 @@ "response": { "meta": { "calibration_enabled": true, + "iab_mapping_is_placeholder": false, "system_version": "0.6.0-phase4" }, "model_output": { "classification": { "iab_content": { - "mapping_confidence": 0.9035, - "mapping_mode": "nearest_equivalent", + "mapping_confidence": 0.2243, + "mapping_mode": "exact", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", "tier1": { "id": "596", "label": "Technology & Computing" - }, - "tier2": { - "id": "599", - "label": "Computing" - }, - "tier3": { - "id": "602", - "label": "Software and Applications" } }, "intent": { @@ -32,31 +25,31 @@ "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.9947, + "confidence": 0.9632, "confidence_threshold": 0.22, "label": "awareness", "meets_threshold": true, - "raw_confidence": 0.9788 + "raw_confidence": 0.9627 }, "intent_subtype": { "calibrated": true, - "confidence": 0.9547, + "confidence": 0.9866, 
"confidence_threshold": 0.25, "label": "education", "meets_threshold": true, - "raw_confidence": 0.9547 + "raw_confidence": 0.9572 }, "intent_type": { "calibrated": true, - "confidence": 0.9972, + "confidence": 0.9737, "confidence_threshold": 0.4, "label": "informational", "meets_threshold": true, - "raw_confidence": 0.9662 + "raw_confidence": 0.9629 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.9947, + "confidence": 0.9632, "decision_phase": "awareness", "subtype": "education", "summary": "Classified as informational intent with subtype education in the awareness phase.", @@ -95,18 +88,19 @@ "response": { "meta": { "calibration_enabled": true, + "iab_mapping_is_placeholder": false, "system_version": "0.6.0-phase4" }, "model_output": { "classification": { "iab_content": { - "mapping_confidence": 0.8427, - "mapping_mode": "nearest_equivalent", + "mapping_confidence": 0.1254, + "mapping_mode": "exact", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", "tier1": { - "id": "1", - "label": "Automotive" + "id": "596", + "label": "Technology & Computing" } }, "intent": { @@ -114,31 +108,31 @@ "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.9944, + "confidence": 0.9477, "confidence_threshold": 0.22, "label": "awareness", "meets_threshold": true, - "raw_confidence": 0.9779 + "raw_confidence": 0.9471 }, "intent_subtype": { "calibrated": true, - "confidence": 0.955, + "confidence": 0.9851, "confidence_threshold": 0.25, "label": "education", "meets_threshold": true, - "raw_confidence": 0.955 + "raw_confidence": 0.9541 }, "intent_type": { "calibrated": true, - "confidence": 0.9969, + "confidence": 0.973, "confidence_threshold": 0.4, "label": "informational", "meets_threshold": true, - "raw_confidence": 0.9637 + "raw_confidence": 0.9621 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.9944, + "confidence": 0.9477, "decision_phase": "awareness", "subtype": "education", 
"summary": "Classified as informational intent with subtype education in the awareness phase.", @@ -177,53 +171,54 @@ "response": { "meta": { "calibration_enabled": true, + "iab_mapping_is_placeholder": false, "system_version": "0.6.0-phase4" }, "model_output": { "classification": { "iab_content": { - "mapping_confidence": 0.7798, + "mapping_confidence": 0.1886, "mapping_mode": "nearest_equivalent", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", "tier1": { - "id": "483", - "label": "Sports" + "id": "1", + "label": "Automotive" } }, "intent": { - "commercial_score": 0.656, + "commercial_score": 0.728, "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.9965, + "confidence": 0.9402, "confidence_threshold": 0.22, "label": "consideration", "meets_threshold": true, - "raw_confidence": 0.9846 + "raw_confidence": 0.9395 }, "intent_subtype": { "calibrated": true, - "confidence": 0.4682, + "confidence": 0.518, "confidence_threshold": 0.25, - "label": "product_discovery", + "label": "comparison", "meets_threshold": true, - "raw_confidence": 0.4682 + "raw_confidence": 0.4557 }, "intent_type": { "calibrated": true, - "confidence": 0.9995, + "confidence": 0.9808, "confidence_threshold": 0.4, "label": "commercial", "meets_threshold": true, - "raw_confidence": 0.9895 + "raw_confidence": 0.9724 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.4682, + "confidence": 0.518, "decision_phase": "consideration", - "subtype": "product_discovery", - "summary": "Classified as commercial intent with subtype product_discovery in the consideration phase.", + "subtype": "comparison", + "summary": "Classified as commercial intent with subtype comparison in the consideration phase.", "type": "commercial" } }, @@ -234,8 +229,8 @@ "consideration" ], "opportunity": { - "strength": "medium", - "type": "soft_recommendation" + "strength": "high", + "type": "comparison_slot" }, "policy": { "applied_thresholds": { @@ -245,7 
+240,7 @@ "intent_type_confidence_min": 0.4 }, "decision_basis": "score_threshold", - "eligibility_reason": "commercial_discovery_signal_present", + "eligibility_reason": "commercial_comparison_signal_present", "monetization_eligibility": "allowed_with_caution", "regulated_vertical": false, "sensitivity": "low" @@ -259,26 +254,19 @@ "response": { "meta": { "calibration_enabled": true, + "iab_mapping_is_placeholder": false, "system_version": "0.6.0-phase4" }, "model_output": { "classification": { "iab_content": { - "mapping_confidence": 0.8606, + "mapping_confidence": 0.0941, "mapping_mode": "nearest_equivalent", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", "tier1": { - "id": "596", - "label": "Technology & Computing" - }, - "tier2": { - "id": "599", - "label": "Computing" - }, - "tier3": { - "id": "619", - "label": "Internet" + "id": "123", + "label": "Careers" } }, "intent": { @@ -286,31 +274,31 @@ "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.9964, + "confidence": 0.9117, "confidence_threshold": 0.22, "label": "consideration", "meets_threshold": true, - "raw_confidence": 0.9842 + "raw_confidence": 0.9108 }, "intent_subtype": { "calibrated": true, - "confidence": 0.9449, + "confidence": 0.9762, "confidence_threshold": 0.25, "label": "comparison", "meets_threshold": true, - "raw_confidence": 0.9449 + "raw_confidence": 0.9343 }, "intent_type": { "calibrated": true, - "confidence": 0.9995, + "confidence": 0.9639, "confidence_threshold": 0.4, "label": "commercial", "meets_threshold": true, - "raw_confidence": 0.9892 + "raw_confidence": 0.9503 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.9449, + "confidence": 0.9117, "decision_phase": "consideration", "subtype": "comparison", "summary": "Classified as commercial intent with subtype comparison in the consideration phase.", @@ -349,22 +337,19 @@ "response": { "meta": { "calibration_enabled": true, + "iab_mapping_is_placeholder": 
false, "system_version": "0.6.0-phase4" }, "model_output": { "classification": { "iab_content": { - "mapping_confidence": 0.8737, + "mapping_confidence": 0.411, "mapping_mode": "nearest_equivalent", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", "tier1": { - "id": "52", - "label": "Business and Finance" - }, - "tier2": { - "id": "53", - "label": "Business" + "id": "596", + "label": "Technology & Computing" } }, "intent": { @@ -372,31 +357,31 @@ "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.963, + "confidence": 0.6147, "confidence_threshold": 0.22, "label": "decision", "meets_threshold": true, - "raw_confidence": 0.9122 + "raw_confidence": 0.614 }, "intent_subtype": { "calibrated": true, - "confidence": 0.9119, + "confidence": 0.758, "confidence_threshold": 0.25, "label": "provider_selection", "meets_threshold": true, - "raw_confidence": 0.9119 + "raw_confidence": 0.6571 }, "intent_type": { "calibrated": true, - "confidence": 0.9994, + "confidence": 0.9801, "confidence_threshold": 0.4, "label": "commercial", "meets_threshold": true, - "raw_confidence": 0.9874 + "raw_confidence": 0.9714 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.9119, + "confidence": 0.6147, "decision_phase": "decision", "subtype": "provider_selection", "summary": "Classified as commercial intent with subtype provider_selection in the decision phase.", @@ -435,26 +420,19 @@ "response": { "meta": { "calibration_enabled": true, + "iab_mapping_is_placeholder": false, "system_version": "0.6.0-phase4" }, "model_output": { "classification": { "iab_content": { - "mapping_confidence": 0.7133, + "mapping_confidence": 0.1002, "mapping_mode": "nearest_equivalent", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", "tier1": { - "id": "239", - "label": "Hobbies & Interests" - }, - "tier2": { - "id": "264", - "label": "Content Production" - }, - "tier3": { - "id": "266", - "label": "Freelance Writing" + "id": 
"v9i3On", + "label": "Sensitive Topics" } }, "intent": { @@ -462,31 +440,31 @@ "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.9991, + "confidence": 0.9175, "confidence_threshold": 0.22, "label": "action", "meets_threshold": true, - "raw_confidence": 0.9947 + "raw_confidence": 0.9167 }, "intent_subtype": { "calibrated": true, - "confidence": 0.9382, + "confidence": 0.9477, "confidence_threshold": 0.25, "label": "signup", "meets_threshold": true, - "raw_confidence": 0.9382 + "raw_confidence": 0.8793 }, "intent_type": { "calibrated": true, - "confidence": 0.9996, + "confidence": 0.9518, "confidence_threshold": 0.4, "label": "transactional", "meets_threshold": true, - "raw_confidence": 0.9902 + "raw_confidence": 0.9354 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.9382, + "confidence": 0.9175, "decision_phase": "action", "subtype": "signup", "summary": "Classified as transactional intent with subtype signup in the action phase.", @@ -525,13 +503,14 @@ "response": { "meta": { "calibration_enabled": true, + "iab_mapping_is_placeholder": false, "system_version": "0.6.0-phase4" }, "model_output": { "classification": { "iab_content": { - "mapping_confidence": 0.7997, - "mapping_mode": "nearest_equivalent", + "mapping_confidence": 0.3828, + "mapping_mode": "exact", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", "tier1": { @@ -548,31 +527,31 @@ "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.999, + "confidence": 0.9432, "confidence_threshold": 0.22, "label": "action", "meets_threshold": true, - "raw_confidence": 0.9945 + "raw_confidence": 0.9425 }, "intent_subtype": { "calibrated": true, - "confidence": 0.8724, + "confidence": 0.7947, "confidence_threshold": 0.25, "label": "booking", "meets_threshold": true, - "raw_confidence": 0.8724 + "raw_confidence": 0.6973 }, "intent_type": { "calibrated": true, - "confidence": 0.9996, + "confidence": 0.9554, 
"confidence_threshold": 0.4, "label": "transactional", "meets_threshold": true, - "raw_confidence": 0.9901 + "raw_confidence": 0.9398 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.8724, + "confidence": 0.7947, "decision_phase": "action", "subtype": "booking", "summary": "Classified as transactional intent with subtype booking in the action phase.", @@ -611,30 +590,19 @@ "response": { "meta": { "calibration_enabled": true, + "iab_mapping_is_placeholder": false, "system_version": "0.6.0-phase4" }, "model_output": { "classification": { "iab_content": { - "mapping_confidence": 0.8423, + "mapping_confidence": 0.5835, "mapping_mode": "nearest_equivalent", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", "tier1": { "id": "596", "label": "Technology & Computing" - }, - "tier2": { - "id": "599", - "label": "Computing" - }, - "tier3": { - "id": "619", - "label": "Internet" - }, - "tier4": { - "id": "620", - "label": "Cloud Computing" } }, "intent": { @@ -642,31 +610,31 @@ "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.9736, + "confidence": 0.9641, "confidence_threshold": 0.22, "label": "post_purchase", "meets_threshold": true, - "raw_confidence": 0.9264 + "raw_confidence": 0.9637 }, "intent_subtype": { "calibrated": true, - "confidence": 0.921, + "confidence": 0.9717, "confidence_threshold": 0.25, "label": "onboarding_setup", "meets_threshold": true, - "raw_confidence": 0.921 + "raw_confidence": 0.9232 }, "intent_type": { "calibrated": true, - "confidence": 0.9935, + "confidence": 0.4496, "confidence_threshold": 0.4, "label": "transactional", "meets_threshold": true, - "raw_confidence": 0.9448 + "raw_confidence": 0.4228 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.9736, + "confidence": 0.4496, "decision_phase": "post_purchase", "subtype": "onboarding_setup", "summary": "Classified as transactional intent with subtype onboarding_setup in the post_purchase 
phase.", @@ -705,26 +673,19 @@ "response": { "meta": { "calibration_enabled": true, + "iab_mapping_is_placeholder": false, "system_version": "0.6.0-phase4" }, "model_output": { "classification": { "iab_content": { - "mapping_confidence": 0.8039, + "mapping_confidence": 0.3535, "mapping_mode": "nearest_equivalent", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", "tier1": { - "id": "596", - "label": "Technology & Computing" - }, - "tier2": { - "id": "599", - "label": "Computing" - }, - "tier3": { - "id": "619", - "label": "Internet" + "id": "391", + "label": "Personal Finance" } }, "intent": { @@ -732,31 +693,31 @@ "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.9969, + "confidence": 0.953, "confidence_threshold": 0.22, "label": "support", "meets_threshold": true, - "raw_confidence": 0.9863 + "raw_confidence": 0.9525 }, "intent_subtype": { "calibrated": true, - "confidence": 0.923, + "confidence": 0.9154, "confidence_threshold": 0.25, "label": "account_help", "meets_threshold": true, - "raw_confidence": 0.923 + "raw_confidence": 0.8312 }, "intent_type": { "calibrated": true, - "confidence": 0.9988, + "confidence": 0.9602, "confidence_threshold": 0.4, "label": "support", "meets_threshold": true, - "raw_confidence": 0.9811 + "raw_confidence": 0.946 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.923, + "confidence": 0.9154, "decision_phase": "support", "subtype": "account_help", "summary": "Classified as support intent with subtype account_help in the support phase.", @@ -801,22 +762,19 @@ "response": { "meta": { "calibration_enabled": true, + "iab_mapping_is_placeholder": false, "system_version": "0.6.0-phase4" }, "model_output": { "classification": { "iab_content": { - "mapping_confidence": 0.7854, - "mapping_mode": "nearest_equivalent", + "mapping_confidence": 0.1373, + "mapping_mode": "exact", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", "tier1": { - "id": "286", - 
"label": "Medical Health" - }, - "tier2": { - "id": "287", - "label": "Diseases and Conditions" + "id": "186", + "label": "Family and Relationships" } }, "intent": { @@ -824,31 +782,31 @@ "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.9699, + "confidence": 0.9173, "confidence_threshold": 0.22, "label": "awareness", "meets_threshold": true, - "raw_confidence": 0.9258 + "raw_confidence": 0.9165 }, "intent_subtype": { "calibrated": true, - "confidence": 0.9435, + "confidence": 0.9644, "confidence_threshold": 0.25, "label": "emotional_reflection", "meets_threshold": true, - "raw_confidence": 0.9435 + "raw_confidence": 0.9072 }, "intent_type": { "calibrated": true, - "confidence": 0.9916, + "confidence": 0.96, "confidence_threshold": 0.4, "label": "personal_reflection", "meets_threshold": true, - "raw_confidence": 0.9406 + "raw_confidence": 0.9459 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.9435, + "confidence": 0.9173, "decision_phase": "awareness", "subtype": "emotional_reflection", "summary": "Classified as personal_reflection intent with subtype emotional_reflection in the awareness phase.", @@ -893,18 +851,19 @@ "response": { "meta": { "calibration_enabled": true, + "iab_mapping_is_placeholder": false, "system_version": "0.6.0-phase4" }, "model_output": { "classification": { "iab_content": { - "mapping_confidence": 0.7304, + "mapping_confidence": 0.0961, "mapping_mode": "nearest_equivalent", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", "tier1": { - "id": "SPSHQ5", - "label": "Genres" + "id": "v9i3On", + "label": "Sensitive Topics" } }, "intent": { @@ -912,31 +871,31 @@ "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.9934, + "confidence": 0.8376, "confidence_threshold": 0.22, "label": "research", "meets_threshold": true, - "raw_confidence": 0.9746 + "raw_confidence": 0.8363 }, "intent_subtype": { "calibrated": true, - "confidence": 0.9631, + 
"confidence": 0.9649, "confidence_threshold": 0.25, "label": "follow_up", "meets_threshold": true, - "raw_confidence": 0.9631 + "raw_confidence": 0.9077 }, "intent_type": { "calibrated": true, - "confidence": 0.9934, + "confidence": 0.9456, "confidence_threshold": 0.4, "label": "ambiguous", "meets_threshold": true, - "raw_confidence": 0.9405 + "raw_confidence": 0.9278 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.9934, + "confidence": 0.8376, "decision_phase": "research", "subtype": "follow_up", "summary": "Classified as ambiguous intent with subtype follow_up in the research phase.", @@ -981,22 +940,19 @@ "response": { "meta": { "calibration_enabled": true, + "iab_mapping_is_placeholder": false, "system_version": "0.6.0-phase4" }, "model_output": { "classification": { "iab_content": { - "mapping_confidence": 0.7779, + "mapping_confidence": 0.1013, "mapping_mode": "nearest_equivalent", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", "tier1": { - "id": "52", - "label": "Business and Finance" - }, - "tier2": { - "id": "53", - "label": "Business" + "id": "473", + "label": "Shopping" } }, "intent": { @@ -1004,31 +960,31 @@ "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.9888, + "confidence": 0.9155, "confidence_threshold": 0.22, "label": "research", "meets_threshold": true, - "raw_confidence": 0.9639 + "raw_confidence": 0.9146 }, "intent_subtype": { "calibrated": true, - "confidence": 0.9487, + "confidence": 0.9201, "confidence_threshold": 0.25, "label": "follow_up", "meets_threshold": true, - "raw_confidence": 0.9487 + "raw_confidence": 0.8294 }, "intent_type": { "calibrated": true, - "confidence": 0.9916, + "confidence": 0.8933, "confidence_threshold": 0.4, "label": "ambiguous", "meets_threshold": true, - "raw_confidence": 0.9321 + "raw_confidence": 0.8671 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.9888, + "confidence": 0.8933, "decision_phase": 
"research", "subtype": "follow_up", "summary": "Classified as ambiguous intent with subtype follow_up in the research phase.", @@ -1073,30 +1029,19 @@ "response": { "meta": { "calibration_enabled": true, + "iab_mapping_is_placeholder": false, "system_version": "0.6.0-phase4" }, "model_output": { "classification": { "iab_content": { - "mapping_confidence": 0.7753, + "mapping_confidence": 0.0593, "mapping_mode": "nearest_equivalent", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", "tier1": { - "id": "596", - "label": "Technology & Computing" - }, - "tier2": { - "id": "599", - "label": "Computing" - }, - "tier3": { - "id": "619", - "label": "Internet" - }, - "tier4": { - "id": "623", - "label": "Email" + "id": "v9i3On", + "label": "Sensitive Topics" } }, "intent": { @@ -1104,31 +1049,31 @@ "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.9991, + "confidence": 0.9532, "confidence_threshold": 0.22, "label": "action", "meets_threshold": true, - "raw_confidence": 0.9948 + "raw_confidence": 0.9527 }, "intent_subtype": { "calibrated": true, - "confidence": 0.8874, + "confidence": 0.8947, "confidence_threshold": 0.25, "label": "signup", "meets_threshold": true, - "raw_confidence": 0.8874 + "raw_confidence": 0.8015 }, "intent_type": { "calibrated": true, - "confidence": 0.9996, + "confidence": 0.9685, "confidence_threshold": 0.4, "label": "transactional", "meets_threshold": true, - "raw_confidence": 0.9908 + "raw_confidence": 0.9565 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.8874, + "confidence": 0.8947, "decision_phase": "action", "subtype": "signup", "summary": "Classified as transactional intent with subtype signup in the action phase.", @@ -1167,30 +1112,19 @@ "response": { "meta": { "calibration_enabled": true, + "iab_mapping_is_placeholder": false, "system_version": "0.6.0-phase4" }, "model_output": { "classification": { "iab_content": { - "mapping_confidence": 0.8626, - 
"mapping_mode": "nearest_equivalent", + "mapping_confidence": 0.1316, + "mapping_mode": "exact", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", "tier1": { "id": "596", "label": "Technology & Computing" - }, - "tier2": { - "id": "599", - "label": "Computing" - }, - "tier3": { - "id": "619", - "label": "Internet" - }, - "tier4": { - "id": "627", - "label": "Search" } }, "intent": { @@ -1198,31 +1132,31 @@ "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.9966, + "confidence": 0.9582, "confidence_threshold": 0.22, "label": "consideration", "meets_threshold": true, - "raw_confidence": 0.9852 + "raw_confidence": 0.9576 }, "intent_subtype": { "calibrated": true, - "confidence": 0.9415, + "confidence": 0.9612, "confidence_threshold": 0.25, "label": "comparison", "meets_threshold": true, - "raw_confidence": 0.9415 + "raw_confidence": 0.9052 }, "intent_type": { "calibrated": true, - "confidence": 0.9994, + "confidence": 0.9594, "confidence_threshold": 0.4, "label": "commercial", "meets_threshold": true, - "raw_confidence": 0.9884 + "raw_confidence": 0.9447 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.9415, + "confidence": 0.9582, "decision_phase": "consideration", "subtype": "comparison", "summary": "Classified as commercial intent with subtype comparison in the consideration phase.", @@ -1261,30 +1195,19 @@ "response": { "meta": { "calibration_enabled": true, + "iab_mapping_is_placeholder": false, "system_version": "0.6.0-phase4" }, "model_output": { "classification": { "iab_content": { - "mapping_confidence": 0.8741, - "mapping_mode": "nearest_equivalent", + "mapping_confidence": 0.1245, + "mapping_mode": "exact", "taxonomy": "IAB Content Taxonomy", "taxonomy_version": "3.0", "tier1": { "id": "596", "label": "Technology & Computing" - }, - "tier2": { - "id": "599", - "label": "Computing" - }, - "tier3": { - "id": "619", - "label": "Internet" - }, - "tier4": { - "id": "620", - "label": 
"Cloud Computing" } }, "intent": { @@ -1292,31 +1215,31 @@ "component_confidence": { "decision_phase": { "calibrated": true, - "confidence": 0.9939, + "confidence": 0.9531, "confidence_threshold": 0.22, "label": "awareness", "meets_threshold": true, - "raw_confidence": 0.9764 + "raw_confidence": 0.9526 }, "intent_subtype": { "calibrated": true, - "confidence": 0.9545, + "confidence": 0.9844, "confidence_threshold": 0.25, "label": "education", "meets_threshold": true, - "raw_confidence": 0.9545 + "raw_confidence": 0.9518 }, "intent_type": { "calibrated": true, - "confidence": 0.9964, + "confidence": 0.9738, "confidence_threshold": 0.4, "label": "informational", "meets_threshold": true, - "raw_confidence": 0.961 + "raw_confidence": 0.9632 }, "overall_strategy": "min_required_component_confidence" }, - "confidence": 0.9939, + "confidence": 0.9531, "decision_phase": "awareness", "subtype": "education", "summary": "Classified as informational intent with subtype education in the awareness phase.", diff --git a/artifacts/evaluation/latest/decision_phase_difficulty_benchmark_confusion_matrix.csv b/artifacts/evaluation/latest/decision_phase_difficulty_benchmark_confusion_matrix.csv index c8827e9780329754ce9f759db095bf156ff49349..9b8fd9c944cdc6aa3c2a312e82480d4c7f25943c 100644 --- a/artifacts/evaluation/latest/decision_phase_difficulty_benchmark_confusion_matrix.csv +++ b/artifacts/evaluation/latest/decision_phase_difficulty_benchmark_confusion_matrix.csv @@ -1,8 +1,8 @@ ,awareness,research,consideration,decision,action,post_purchase,support -awareness,14,1,0,0,0,0,0 -research,0,14,0,0,0,1,0 +awareness,15,0,0,0,0,0,0 +research,0,15,0,0,0,0,0 consideration,0,1,14,0,0,0,0 decision,0,0,0,15,0,0,0 -action,0,0,0,1,13,1,0 +action,0,1,0,0,14,0,0 post_purchase,0,0,0,0,0,15,0 support,0,0,0,0,0,0,15 diff --git a/artifacts/evaluation/latest/decision_phase_difficulty_benchmark_report.json b/artifacts/evaluation/latest/decision_phase_difficulty_benchmark_report.json index 
c93729d076c04cbc93fc8410157d5f178753b06a..4a057f5a36db3f9163c2feb6aa1bcd2aa4802742 100644 --- a/artifacts/evaluation/latest/decision_phase_difficulty_benchmark_report.json +++ b/artifacts/evaluation/latest/decision_phase_difficulty_benchmark_report.json @@ -1,10 +1,10 @@ { - "accepted_accuracy": 0.9524, + "accepted_accuracy": 0.981, "accepted_coverage": 1.0, - "accuracy": 0.9524, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_difficulty_benchmark_confusion_matrix.csv", + "accuracy": 0.981, + "confusion_matrix_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_difficulty_benchmark_confusion_matrix.csv", "count": 105, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/decision_phase_benchmark.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/decision_phase_benchmark.jsonl", "difficulty_breakdown": { "easy": { "accepted_accuracy": 0.9714, @@ -15,12 +15,12 @@ "macro_f1": 0.9711 }, "hard": { - "accepted_accuracy": 0.8857, + "accepted_accuracy": 0.9714, "accepted_coverage": 1.0, - "accuracy": 0.8857, + "accuracy": 0.9714, "count": 35, "fallback_rate": 0.0, - "macro_f1": 0.883 + "macro_f1": 0.9711 }, "medium": { "accepted_accuracy": 1.0, @@ -33,19 +33,19 @@ }, "fallback_rate": 0.0, "head": "decision_phase", - "macro_f1": 0.9526, + "macro_f1": 0.9812, "per_class_metrics": { - "accuracy": 0.9523809523809523, + "accuracy": 0.9809523809523809, "action": { - "f1-score": 0.9285714285714286, + "f1-score": 0.9655172413793104, "precision": 1.0, - "recall": 0.8666666666666667, + "recall": 0.9333333333333333, "support": 
15.0 }, "awareness": { - "f1-score": 0.9655172413793104, + "f1-score": 1.0, "precision": 1.0, - "recall": 0.9333333333333333, + "recall": 1.0, "support": 15.0 }, "consideration": { @@ -55,27 +55,27 @@ "support": 15.0 }, "decision": { - "f1-score": 0.967741935483871, - "precision": 0.9375, + "f1-score": 1.0, + "precision": 1.0, "recall": 1.0, "support": 15.0 }, "macro avg": { - "f1-score": 0.9525819504665047, - "precision": 0.9564075630252101, - "recall": 0.9523809523809523, + "f1-score": 0.9812192118226601, + "precision": 0.9831932773109244, + "recall": 0.980952380952381, "support": 105.0 }, "post_purchase": { - "f1-score": 0.9375, - "precision": 0.8823529411764706, + "f1-score": 1.0, + "precision": 1.0, "recall": 1.0, "support": 15.0 }, "research": { - "f1-score": 0.9032258064516129, - "precision": 0.875, - "recall": 0.9333333333333333, + "f1-score": 0.9375, + "precision": 0.8823529411764706, + "recall": 1.0, "support": 15.0 }, "support": { @@ -85,9 +85,9 @@ "support": 15.0 }, "weighted avg": { - "f1-score": 0.9525819504665048, - "precision": 0.9564075630252101, - "recall": 0.9523809523809523, + "f1-score": 0.9812192118226601, + "precision": 0.9831932773109243, + "recall": 0.9809523809523809, "support": 105.0 } }, diff --git a/artifacts/evaluation/latest/decision_phase_final_wave_cases_report.json b/artifacts/evaluation/latest/decision_phase_final_wave_cases_report.json index cb57e964bde69b8af52a59922b1fa9d646c81b7d..27d9f7931a1c7696cc0dfee7463e22c2c58313a5 100644 --- a/artifacts/evaluation/latest/decision_phase_final_wave_cases_report.json +++ b/artifacts/evaluation/latest/decision_phase_final_wave_cases_report.json @@ -2,9 +2,9 @@ "accepted_accuracy": 0.963, "accepted_coverage": 1.0, "accuracy": 0.963, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_final_wave_cases_confusion_matrix.csv", + "confusion_matrix_path": 
"/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_final_wave_cases_confusion_matrix.csv", "count": 27, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/decision_phase/final_wave_cases.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/decision_phase/final_wave_cases.jsonl", "fallback_rate": 0.0, "head": "decision_phase", "macro_f1": 0.961, diff --git a/artifacts/evaluation/latest/decision_phase_hard_cases_confusion_matrix.csv b/artifacts/evaluation/latest/decision_phase_hard_cases_confusion_matrix.csv index b6a2b622d8019e7a6d1b4a5f88a3a70cdee65df4..442fc9eddeeddf94ff47971f4f90dfc69bdd1839 100644 --- a/artifacts/evaluation/latest/decision_phase_hard_cases_confusion_matrix.csv +++ b/artifacts/evaluation/latest/decision_phase_hard_cases_confusion_matrix.csv @@ -1,7 +1,7 @@ ,awareness,research,consideration,decision,action,post_purchase,support awareness,6,0,0,0,0,0,0 research,2,5,0,0,0,0,0 -consideration,0,1,6,0,0,0,0 +consideration,0,2,5,0,0,0,0 decision,0,0,0,7,0,0,0 action,0,0,0,0,0,0,0 post_purchase,0,0,0,0,0,6,0 diff --git a/artifacts/evaluation/latest/decision_phase_hard_cases_report.json b/artifacts/evaluation/latest/decision_phase_hard_cases_report.json index 5bae5f62a6e3570ac203ec04230b93786b0e0cde..b45aae51d2de88dec1d07a0a18c6e6fd1afc2b8c 100644 --- a/artifacts/evaluation/latest/decision_phase_hard_cases_report.json +++ b/artifacts/evaluation/latest/decision_phase_hard_cases_report.json @@ -1,15 +1,15 @@ { - "accepted_accuracy": 0.9231, + "accepted_accuracy": 0.8974, "accepted_coverage": 1.0, - "accuracy": 0.9231, - "confusion_matrix_path": 
"/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_hard_cases_confusion_matrix.csv", + "accuracy": 0.8974, + "confusion_matrix_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_hard_cases_confusion_matrix.csv", "count": 39, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/decision_phase/hard_cases.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/decision_phase/hard_cases.jsonl", "fallback_rate": 0.0, "head": "decision_phase", - "macro_f1": 0.9249, + "macro_f1": 0.9008, "per_class_metrics": { - "accuracy": 0.9230769230769231, + "accuracy": 0.8974358974358975, "action": { "f1-score": 0.0, "precision": 0.0, @@ -23,9 +23,9 @@ "support": 6.0 }, "consideration": { - "f1-score": 0.9230769230769231, + "f1-score": 0.8333333333333334, "precision": 1.0, - "recall": 0.8571428571428571, + "recall": 0.7142857142857143, "support": 7.0 }, "decision": { @@ -35,9 +35,9 @@ "support": 7.0 }, "macro avg": { - "f1-score": 0.792778649921507, - "precision": 0.7976190476190477, - "recall": 0.7959183673469388, + "f1-score": 0.772108843537415, + "precision": 0.7806122448979592, + "recall": 0.7755102040816327, "support": 39.0 }, "post_purchase": { @@ -47,8 +47,8 @@ "support": 6.0 }, "research": { - "f1-score": 0.7692307692307693, - "precision": 0.8333333333333334, + "f1-score": 0.7142857142857143, + "precision": 0.7142857142857143, "recall": 0.7142857142857143, "support": 7.0 }, @@ -59,9 +59,9 @@ "support": 6.0 }, "weighted avg": { - "f1-score": 0.9227951535643845, - "precision": 0.9316239316239316, - "recall": 0.9230769230769231, + "f1-score": 
0.8968253968253967, + "precision": 0.9102564102564102, + "recall": 0.8974358974358975, "support": 39.0 } }, diff --git a/artifacts/evaluation/latest/decision_phase_test_confusion_matrix.csv b/artifacts/evaluation/latest/decision_phase_test_confusion_matrix.csv index 1e023e835f3edaa09c791cadc5a5f4d34f7902a7..9c86b7926328819b939559cd74742406f3f9cab9 100644 --- a/artifacts/evaluation/latest/decision_phase_test_confusion_matrix.csv +++ b/artifacts/evaluation/latest/decision_phase_test_confusion_matrix.csv @@ -1,8 +1,8 @@ ,awareness,research,consideration,decision,action,post_purchase,support awareness,3,0,0,0,0,0,0 research,3,2,0,0,0,0,0 -consideration,0,1,4,0,0,0,0 +consideration,0,2,3,0,0,0,0 decision,0,0,0,5,0,0,0 action,0,0,0,0,3,0,0 post_purchase,0,0,0,0,0,4,0 -support,0,0,0,0,0,0,4 +support,0,0,0,0,0,1,3 diff --git a/artifacts/evaluation/latest/decision_phase_test_report.json b/artifacts/evaluation/latest/decision_phase_test_report.json index bdee435efc0a3b290ed656729099f91785a878ac..6d2262875780ef77d6eb79b7553656ae4f144a79 100644 --- a/artifacts/evaluation/latest/decision_phase_test_report.json +++ b/artifacts/evaluation/latest/decision_phase_test_report.json @@ -1,15 +1,15 @@ { - "accepted_accuracy": 0.8621, + "accepted_accuracy": 0.7931, "accepted_coverage": 1.0, - "accuracy": 0.8621, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_test_confusion_matrix.csv", + "accuracy": 0.7931, + "confusion_matrix_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_test_confusion_matrix.csv", "count": 29, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/decision_phase/test.jsonl", + "dataset_path": 
"/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/decision_phase/test.jsonl", "fallback_rate": 0.0, "head": "decision_phase", - "macro_f1": 0.8651, + "macro_f1": 0.801, "per_class_metrics": { - "accuracy": 0.8620689655172413, + "accuracy": 0.7931034482758621, "action": { "f1-score": 1.0, "precision": 1.0, @@ -23,9 +23,9 @@ "support": 3.0 }, "consideration": { - "f1-score": 0.8888888888888888, + "f1-score": 0.75, "precision": 1.0, - "recall": 0.8, + "recall": 0.6, "support": 5.0 }, "decision": { @@ -35,33 +35,33 @@ "support": 5.0 }, "macro avg": { - "f1-score": 0.865079365079365, - "precision": 0.8809523809523808, - "recall": 0.8857142857142858, + "f1-score": 0.8010204081632653, + "precision": 0.8285714285714285, + "recall": 0.8214285714285714, "support": 29.0 }, "post_purchase": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.8888888888888888, + "precision": 0.8, "recall": 1.0, "support": 4.0 }, "research": { - "f1-score": 0.5, - "precision": 0.6666666666666666, + "f1-score": 0.4444444444444444, + "precision": 0.5, "recall": 0.4, "support": 5.0 }, "support": { - "f1-score": 1.0, + "f1-score": 0.8571428571428571, "precision": 1.0, - "recall": 1.0, + "recall": 0.75, "support": 4.0 }, "weighted avg": { - "f1-score": 0.8601532567049808, - "precision": 0.8908045977011494, - "recall": 0.8620689655172413, + "f1-score": 0.7915982484948002, + "precision": 0.8344827586206897, + "recall": 0.7931034482758621, "support": 29.0 } }, diff --git a/artifacts/evaluation/latest/decision_phase_train_confusion_matrix.csv b/artifacts/evaluation/latest/decision_phase_train_confusion_matrix.csv index 43f500fa605280ea54e82f7a647ec0a86be353c6..e5f9d4fde279029f572e7589aa7dfb7bbd92e130 100644 --- a/artifacts/evaluation/latest/decision_phase_train_confusion_matrix.csv +++ b/artifacts/evaluation/latest/decision_phase_train_confusion_matrix.csv @@ -1,8 +1,8 @@ 
,awareness,research,consideration,decision,action,post_purchase,support awareness,16,0,0,0,0,0,0 -research,1,14,0,0,0,0,0 -consideration,0,0,17,0,0,0,0 -decision,0,0,0,16,0,0,0 +research,2,13,0,0,0,0,0 +consideration,0,1,16,0,0,0,0 +decision,0,0,1,15,0,0,0 action,0,0,0,0,10,0,0 post_purchase,0,0,0,0,0,14,0 support,0,0,0,0,0,0,14 diff --git a/artifacts/evaluation/latest/decision_phase_train_report.json b/artifacts/evaluation/latest/decision_phase_train_report.json index 027e36871a50d40370fe7ae7dac30585c010adbf..594e9ace53a60e9f28dd60208f27f3bac79ab0db 100644 --- a/artifacts/evaluation/latest/decision_phase_train_report.json +++ b/artifacts/evaluation/latest/decision_phase_train_report.json @@ -1,15 +1,15 @@ { - "accepted_accuracy": 0.9902, + "accepted_accuracy": 0.9608, "accepted_coverage": 1.0, - "accuracy": 0.9902, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_train_confusion_matrix.csv", + "accuracy": 0.9608, + "confusion_matrix_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_train_confusion_matrix.csv", "count": 102, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/decision_phase/train.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/decision_phase/train.jsonl", "fallback_rate": 0.0, "head": "decision_phase", - "macro_f1": 0.9907, + "macro_f1": 0.9638, "per_class_metrics": { - "accuracy": 0.9901960784313726, + "accuracy": 0.9607843137254902, "action": { "f1-score": 1.0, "precision": 1.0, @@ -17,27 +17,27 @@ "support": 10.0 }, "awareness": { - "f1-score": 0.9696969696969697, - "precision": 
0.9411764705882353, + "f1-score": 0.9411764705882353, + "precision": 0.8888888888888888, "recall": 1.0, "support": 16.0 }, "consideration": { - "f1-score": 1.0, - "precision": 1.0, - "recall": 1.0, + "f1-score": 0.9411764705882353, + "precision": 0.9411764705882353, + "recall": 0.9411764705882353, "support": 17.0 }, "decision": { - "f1-score": 1.0, + "f1-score": 0.967741935483871, "precision": 1.0, - "recall": 1.0, + "recall": 0.9375, "support": 16.0 }, "macro avg": { - "f1-score": 0.9907448872966115, - "precision": 0.9915966386554622, - "recall": 0.9904761904761905, + "f1-score": 0.9638066572568961, + "precision": 0.9655195411497932, + "recall": 0.9636204481792717, "support": 102.0 }, "post_purchase": { @@ -47,9 +47,9 @@ "support": 14.0 }, "research": { - "f1-score": 0.9655172413793104, - "precision": 1.0, - "recall": 0.9333333333333333, + "f1-score": 0.896551724137931, + "precision": 0.9285714285714286, + "recall": 0.8666666666666667, "support": 15.0 }, "support": { @@ -59,9 +59,9 @@ "support": 14.0 }, "weighted avg": { - "f1-score": 0.9901755895670704, - "precision": 0.9907727797001153, - "recall": 0.9901960784313726, + "f1-score": 0.9606957878355163, + "precision": 0.9622626828509181, + "recall": 0.9607843137254902, "support": 102.0 } }, diff --git a/artifacts/evaluation/latest/decision_phase_val_confusion_matrix.csv b/artifacts/evaluation/latest/decision_phase_val_confusion_matrix.csv index ef673ac7607122b6d8f5bf45eaa814fdc3d49bb1..38ac31a8390225b7f744f683af81ecd62786e238 100644 --- a/artifacts/evaluation/latest/decision_phase_val_confusion_matrix.csv +++ b/artifacts/evaluation/latest/decision_phase_val_confusion_matrix.csv @@ -5,4 +5,4 @@ consideration,0,0,5,0,0,0,0 decision,0,0,1,3,0,0,0 action,0,0,0,0,3,0,0 post_purchase,0,1,0,0,0,3,0 -support,0,0,0,0,0,0,4 +support,0,0,0,0,0,1,3 diff --git a/artifacts/evaluation/latest/decision_phase_val_report.json b/artifacts/evaluation/latest/decision_phase_val_report.json index 
68978875f544d03dbc1cd5d780ee2d0b9c712b96..47374f760432f0622f7c32dcb8428b604afdc2cf 100644 --- a/artifacts/evaluation/latest/decision_phase_val_report.json +++ b/artifacts/evaluation/latest/decision_phase_val_report.json @@ -1,15 +1,15 @@ { - "accepted_accuracy": 0.8966, + "accepted_accuracy": 0.8621, "accepted_coverage": 1.0, - "accuracy": 0.8966, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_val_confusion_matrix.csv", + "accuracy": 0.8621, + "confusion_matrix_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_val_confusion_matrix.csv", "count": 29, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/decision_phase/val.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/decision_phase/val.jsonl", "fallback_rate": 0.0, "head": "decision_phase", - "macro_f1": 0.8975, + "macro_f1": 0.8618, "per_class_metrics": { - "accuracy": 0.896551724137931, + "accuracy": 0.8620689655172413, "action": { "f1-score": 1.0, "precision": 1.0, @@ -35,14 +35,14 @@ "support": 4.0 }, "macro avg": { - "f1-score": 0.8974953617810761, - "precision": 0.9166666666666667, - "recall": 0.8928571428571429, + "f1-score": 0.8617810760667904, + "precision": 0.880952380952381, + "recall": 0.8571428571428571, "support": 29.0 }, "post_purchase": { - "f1-score": 0.8571428571428571, - "precision": 1.0, + "f1-score": 0.75, + "precision": 0.75, "recall": 0.75, "support": 4.0 }, @@ -53,15 +53,15 @@ "support": 4.0 }, "support": { - "f1-score": 1.0, + "f1-score": 0.8571428571428571, "precision": 1.0, - "recall": 1.0, + "recall": 0.75, "support": 4.0 }, 
"weighted avg": { - "f1-score": 0.8947604120017911, - "precision": 0.9080459770114944, - "recall": 0.896551724137931, + "f1-score": 0.8602776533811015, + "precision": 0.8735632183908046, + "recall": 0.8620689655172413, "support": 29.0 } }, diff --git a/artifacts/evaluation/latest/iab_behavior_lock_regression.json b/artifacts/evaluation/latest/iab_behavior_lock_regression.json index 73e85a68428098c300d2875117a918e8ab00813b..16e7f5ccc3b268f3327cbc14ee503d0e09f33a28 100644 --- a/artifacts/evaluation/latest/iab_behavior_lock_regression.json +++ b/artifacts/evaluation/latest/iab_behavior_lock_regression.json @@ -1,21 +1,21 @@ { "by_status": { "must_fix": { - "failed": 0, - "passed": 12, + "failed": 12, + "passed": 0, "total": 12 } }, - "cases_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/examples/iab_behavior_lock_cases.json", + "cases_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/examples/iab_behavior_lock_cases.json", "count": 12, - "failed": 0, - "passed": 12, + "failed": 12, + "passed": 0, "results": [ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Automotive", - "model_output.classification.iab_content.tier2.label": "Auto Type" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -23,15 +23,21 @@ "model_output.classification.iab_content.tier2.label": "Auto Type" }, "id": "car-buying-maps-to-automotive-buying", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Auto Type", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Vehicle shopping queries should map into the automotive buying branch, not business sales.", - "pass": true, + "pass": 
false, "status": "must_fix", "text": "Which car to buy in 2026" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Laptops" @@ -43,9 +49,15 @@ "model_output.classification.iab_content.tier3.label": "Laptops" }, "id": "laptop-buying-maps-to-laptops", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Laptop shopping should resolve into the laptops branch, not business sales.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Which laptop to buy in 2026" }, @@ -53,8 +65,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Laptops" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -63,18 +75,29 @@ "model_output.classification.iab_content.tier3.label": "Laptops" }, "id": "labtop-buying-maps-to-laptops", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Laptops", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Common typo handling should still land in the laptops branch.", - "pass": true, + "pass": false, "status": 
"must_fix", "text": "Which labtop to buy in 2026" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -83,18 +106,34 @@ "model_output.classification.iab_content.tier3.label": "Software and Applications" }, "id": "crm-awareness-maps-to-sales", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "CRM education should resolve to the closest business/sales path, not generic software.", - "pass": true, + "pass": false, "status": "must_fix", "text": "What is CRM software?" 
}, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet" + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -103,9 +142,25 @@ "model_output.classification.iab_content.tier3.label": "Internet" }, "id": "crm-comparison-maps-to-sales", - "mismatches": [], + "mismatches": [ + { + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Direct CRM vendor comparison should map cleanly into the sales domain.", - "pass": true, + "pass": false, "status": "must_fix", "text": "HubSpot vs Zoho for a small team" }, @@ -113,8 +168,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -123,34 +178,51 @@ "model_output.classification.iab_content.tier3.label": "Internet" }, "id": 
"marketing-tools-map-to-marketing", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Marketing tool discovery should map to the marketing and advertising branch.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best AI SEO tools for content teams" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing" + "model_output.classification.iab_content.tier1.label": "Careers" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing" }, "id": "ml-explanation-maps-to-ai", - "mismatches": [], + "mismatches": [ + { + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + } + ], "notes": "ML and NLP educational prompts should land in the AI branch.", - "pass": true, + "pass": false, "status": "must_fix", "text": "What is intent classification in NLP?" 
}, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet" + "model_output.classification.iab_content.tier1.label": "Personal Finance", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -159,15 +231,31 @@ "model_output.classification.iab_content.tier3.label": "Internet" }, "id": "support-credential-help-maps-to-business-it", - "mismatches": [], + "mismatches": [ + { + "actual": "Personal Finance", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Credential and account help should map to business IT rather than generic business.", - "pass": true, + "pass": false, "status": "must_fix", "text": "How do I reset my password?" 
}, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink", "model_output.classification.iab_content.tier2.label": "Dining Out" }, @@ -177,18 +265,24 @@ "model_output.classification.iab_content.tier2.label": "Dining Out" }, "id": "restaurant-booking-maps-to-dining-out", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Generic dining requests should not inherit the repo's business default.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Book a table for 2 tonight" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Hobbies & Interests", - "model_output.classification.iab_content.tier2.label": "Content Production", - "model_output.classification.iab_content.tier3.label": "Freelance Writing" + "model_output.classification.iab_content.tier1.label": "Sensitive Topics", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -197,19 +291,35 @@ "model_output.classification.iab_content.tier3.label": "Freelance Writing" }, "id": "trial-signup-maps-to-software", - "mismatches": [], + "mismatches": [ + { + "actual": "Sensitive Topics", + "expected": "Hobbies & Interests", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": "Content Production", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Freelance Writing", + "path": "model_output.classification.iab_content.tier3.label" + } + ], 
"notes": "Software action queries should map to the software/application branch.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Start my free trial" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications", - "model_output.classification.iab_content.tier4.label": "Communication" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": "Remote Working", + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -219,15 +329,41 @@ "model_output.classification.iab_content.tier4.label": "Communication" }, "id": "communication-software-maps-to-tier4", - "mismatches": [], + "mismatches": [ + { + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Remote Working", + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Communication", + "path": "model_output.classification.iab_content.tier4.label" + } + ], "notes": "Full taxonomy support should preserve the tier4 communication branch.", - "pass": true, + "pass": false, "status": "must_fix", "text": "best 
communication software for remote teams" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink" }, "expected": { @@ -235,9 +371,15 @@ "model_output.classification.iab_content.tier1.label": "Food & Drink" }, "id": "vodka-query-maps-to-alcoholic-beverages", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Food and beverage prompts should not fall through to the business default.", - "pass": true, + "pass": false, "status": "must_fix", "text": "what is best vodka drink should i try" } diff --git a/artifacts/evaluation/latest/iab_content_cross_vertical_benchmark_report.json b/artifacts/evaluation/latest/iab_content_cross_vertical_benchmark_report.json index 8d9987374ac6aaef210472cf94d7485ef867985d..54ca383fbd42f73f6d9d51989f5435b396b819c2 100644 --- a/artifacts/evaluation/latest/iab_content_cross_vertical_benchmark_report.json +++ b/artifacts/evaluation/latest/iab_content_cross_vertical_benchmark_report.json @@ -1,93 +1,98 @@ { - "accepted_accuracy": 0.3444, - "accepted_coverage": 1.0, - "accuracy": 0.3444, + "accepted_accuracy": 0.4103, + "accepted_coverage": 0.8667, + "accuracy": 0.3667, "count": 90, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/iab_cross_vertical_benchmark.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/iab_cross_vertical_benchmark.jsonl", "difficulty_breakdown": { "easy": { - "accepted_accuracy": 0.2667, - "accepted_coverage": 1.0, - "accuracy": 0.2667, + "accepted_accuracy": 0.3846, + "accepted_coverage": 0.8667, + "accuracy": 0.3667, 
"count": 30, - "fallback_rate": 0.0, - "macro_f1": 0.1633 + "fallback_rate": 0.1333, + "macro_f1": 0.2619 }, "hard": { - "accepted_accuracy": 0.3667, - "accepted_coverage": 1.0, - "accuracy": 0.3667, + "accepted_accuracy": 0.5385, + "accepted_coverage": 0.8667, + "accuracy": 0.4667, "count": 30, - "fallback_rate": 0.0, - "macro_f1": 0.2174 + "fallback_rate": 0.1333, + "macro_f1": 0.3182 }, "medium": { - "accepted_accuracy": 0.4, - "accepted_coverage": 1.0, - "accuracy": 0.4, + "accepted_accuracy": 0.3077, + "accepted_coverage": 0.8667, + "accuracy": 0.2667, "count": 30, - "fallback_rate": 0.0, - "macro_f1": 0.2667 + "fallback_rate": 0.1333, + "macro_f1": 0.1633 } }, - "fallback_rate": 0.0, + "fallback_rate": 0.1333, "head": "iab_content", - "macro_f1": 0.1808, - "primary_source": "embedding_retrieval", + "macro_f1": 0.2081, + "primary_source": "supervised_classifier", "suite": "cross_vertical_benchmark", "tier_metrics": { - "average_prediction_depth": 2.5333, + "average_prediction_depth": 1.9889, "error_buckets": { - "exact_match": 31, - "parent_safe_stop": 5, - "right_tier1_wrong_tier2": 19, - "wrong_deep_leaf": 13, - "wrong_tier1": 22 + "exact_match": 33, + "parent_safe_stop": 3, + "right_tier1_wrong_tier2": 20, + "wrong_deep_leaf": 6, + "wrong_tier1": 28 }, - "exact_path_accuracy": 0.3444, - "parent_safe_accuracy": 0.4889, - "tier1_accuracy": 0.7556, - "tier2_accuracy": 0.5238, - "tier3_accuracy": 0.4762, - "tier4_accuracy": 1.0 + "exact_path_accuracy": 0.3667, + "parent_safe_accuracy": 0.5333, + "tier1_accuracy": 0.6889, + "tier2_accuracy": 0.4286, + "tier3_accuracy": 0.381, + "tier4_accuracy": 0.3333 }, "view_metrics": { - "combined_path": { - "average_prediction_depth": 2.5333, + "classifier": { + "average_prediction_depth": 1.9889, "error_buckets": { - "exact_match": 27, - "parent_safe_stop": 5, - "right_tier1_wrong_tier2": 19, - "wrong_deep_leaf": 17, - "wrong_tier1": 22 + "exact_match": 33, + "parent_safe_stop": 3, + "right_tier1_wrong_tier2": 20, + 
"wrong_deep_leaf": 6, + "wrong_tier1": 28 }, - "exact_path_accuracy": 0.3, - "fallback_overuse_count": 12, - "fallback_rate": 0.1333, - "parent_safe_accuracy": 0.4444, - "tier1_accuracy": 0.7556, - "tier2_accuracy": 0.5238, + "exact_path_accuracy": 0.3667, + "parent_safe_accuracy": 0.5333, + "tier1_accuracy": 0.6889, + "tier2_accuracy": 0.4286, "tier3_accuracy": 0.381, - "tier4_accuracy": 0.5 - }, - "disagreements": { - "retrieval_vs_combined": 0 + "tier4_accuracy": 0.3333 }, - "embedding_retrieval": { - "average_prediction_depth": 2.5333, + "combined_path": { + "average_prediction_depth": 1.9889, "error_buckets": { - "exact_match": 27, - "parent_safe_stop": 5, - "right_tier1_wrong_tier2": 19, - "wrong_deep_leaf": 17, - "wrong_tier1": 22 + "exact_match": 33, + "parent_safe_stop": 3, + "right_tier1_wrong_tier2": 20, + "wrong_deep_leaf": 6, + "wrong_tier1": 28 }, - "exact_path_accuracy": 0.3, - "parent_safe_accuracy": 0.4444, - "tier1_accuracy": 0.7556, - "tier2_accuracy": 0.5238, + "exact_path_accuracy": 0.3667, + "fallback_overuse_count": 18, + "fallback_rate": 0.2, + "parent_safe_accuracy": 0.5333, + "tier1_accuracy": 0.6889, + "tier2_accuracy": 0.4286, "tier3_accuracy": 0.381, - "tier4_accuracy": 0.5 + "tier4_accuracy": 0.3333 + }, + "disagreements": { + "classifier_vs_combined": 0 + }, + "shadow_embedding_retrieval": { + "hint": "Set IAB_EVAL_INCLUDE_SHADOW_RETRIEVAL=1 to run shadow embedding retrieval (downloads/loads gte-Qwen2 when index is present).", + "reason": "disabled_by_default", + "skipped": true } } } diff --git a/artifacts/evaluation/latest/iab_content_difficulty_benchmark_report.json b/artifacts/evaluation/latest/iab_content_difficulty_benchmark_report.json index d3b5d207fb3fa86c57663572a73345410b260236..0d1f13e3bea711764b4b9729bb60ad7cc0f6f1a3 100644 --- a/artifacts/evaluation/latest/iab_content_difficulty_benchmark_report.json +++ b/artifacts/evaluation/latest/iab_content_difficulty_benchmark_report.json @@ -1,93 +1,97 @@ { - "accepted_accuracy": 
0.3782, - "accepted_coverage": 1.0, - "accuracy": 0.3782, + "accepted_accuracy": 0.4959, + "accepted_coverage": 0.7885, + "accuracy": 0.391, "count": 156, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/iab_benchmark.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/iab_benchmark.jsonl", "difficulty_breakdown": { "easy": { - "accepted_accuracy": 0.4038, - "accepted_coverage": 1.0, - "accuracy": 0.4038, + "accepted_accuracy": 0.5778, + "accepted_coverage": 0.8654, + "accuracy": 0.5, "count": 52, - "fallback_rate": 0.0, - "macro_f1": 0.2171 + "fallback_rate": 0.1346, + "macro_f1": 0.3025 }, "hard": { - "accepted_accuracy": 0.3077, - "accepted_coverage": 1.0, - "accuracy": 0.3077, + "accepted_accuracy": 0.35, + "accepted_coverage": 0.7692, + "accuracy": 0.2692, "count": 52, - "fallback_rate": 0.0, - "macro_f1": 0.1626 + "fallback_rate": 0.2308, + "macro_f1": 0.1505 }, "medium": { - "accepted_accuracy": 0.4231, - "accepted_coverage": 1.0, - "accuracy": 0.4231, + "accepted_accuracy": 0.5526, + "accepted_coverage": 0.7308, + "accuracy": 0.4038, "count": 52, - "fallback_rate": 0.0, - "macro_f1": 0.2265 + "fallback_rate": 0.2692, + "macro_f1": 0.2184 } }, - "fallback_rate": 0.0, + "fallback_rate": 0.2115, "head": "iab_content", - "macro_f1": 0.1593, - "primary_source": "embedding_retrieval", + "macro_f1": 0.1715, + "primary_source": "supervised_classifier", "suite": "difficulty_benchmark", "tier_metrics": { - "average_prediction_depth": 2.5833, + "average_prediction_depth": 1.9936, "error_buckets": { - "exact_match": 59, - "parent_safe_stop": 17, - "right_tier1_wrong_tier2": 42, - "wrong_deep_leaf": 13, - "wrong_tier1": 25 + "exact_match": 61, + "parent_safe_stop": 4, + "right_tier1_wrong_tier2": 41, + "wrong_tier1": 50 }, - "exact_path_accuracy": 0.3782, - 
"parent_safe_accuracy": 0.6154, - "tier1_accuracy": 0.8397, - "tier2_accuracy": 0.5705, - "tier3_accuracy": 0.5648, - "tier4_accuracy": 0.5833 + "exact_path_accuracy": 0.391, + "parent_safe_accuracy": 0.6218, + "tier1_accuracy": 0.6795, + "tier2_accuracy": 0.4167, + "tier3_accuracy": 0.4259, + "tier4_accuracy": 0.4167 }, "view_metrics": { + "classifier": { + "average_prediction_depth": 1.9936, + "error_buckets": { + "exact_match": 56, + "parent_safe_stop": 4, + "right_tier1_wrong_tier2": 41, + "wrong_deep_leaf": 5, + "wrong_tier1": 50 + }, + "exact_path_accuracy": 0.359, + "parent_safe_accuracy": 0.5897, + "tier1_accuracy": 0.6795, + "tier2_accuracy": 0.4167, + "tier3_accuracy": 0.3796, + "tier4_accuracy": 0.25 + }, "combined_path": { - "average_prediction_depth": 2.5833, + "average_prediction_depth": 1.9936, "error_buckets": { - "exact_match": 48, - "parent_safe_stop": 17, - "right_tier1_wrong_tier2": 42, - "wrong_deep_leaf": 24, - "wrong_tier1": 25 + "exact_match": 56, + "parent_safe_stop": 4, + "right_tier1_wrong_tier2": 41, + "wrong_deep_leaf": 5, + "wrong_tier1": 50 }, - "exact_path_accuracy": 0.3077, + "exact_path_accuracy": 0.359, "fallback_overuse_count": 11, "fallback_rate": 0.0705, - "parent_safe_accuracy": 0.5449, - "tier1_accuracy": 0.8397, - "tier2_accuracy": 0.5705, - "tier3_accuracy": 0.4352, + "parent_safe_accuracy": 0.5897, + "tier1_accuracy": 0.6795, + "tier2_accuracy": 0.4167, + "tier3_accuracy": 0.3796, "tier4_accuracy": 0.25 }, "disagreements": { - "retrieval_vs_combined": 0 + "classifier_vs_combined": 0 }, - "embedding_retrieval": { - "average_prediction_depth": 2.5833, - "error_buckets": { - "exact_match": 48, - "parent_safe_stop": 17, - "right_tier1_wrong_tier2": 42, - "wrong_deep_leaf": 24, - "wrong_tier1": 25 - }, - "exact_path_accuracy": 0.3077, - "parent_safe_accuracy": 0.5449, - "tier1_accuracy": 0.8397, - "tier2_accuracy": 0.5705, - "tier3_accuracy": 0.4352, - "tier4_accuracy": 0.25 + "shadow_embedding_retrieval": { + "hint": "Set 
IAB_EVAL_INCLUDE_SHADOW_RETRIEVAL=1 to run shadow embedding retrieval (downloads/loads gte-Qwen2 when index is present).", + "reason": "disabled_by_default", + "skipped": true } } } diff --git a/artifacts/evaluation/latest/iab_content_extended_cases_report.json b/artifacts/evaluation/latest/iab_content_extended_cases_report.json index b8ab0b46884b6ef25b21f7464a5c39d760156279..23b707f29282c1fdc1df0081c233a4f256a52ffa 100644 --- a/artifacts/evaluation/latest/iab_content_extended_cases_report.json +++ b/artifacts/evaluation/latest/iab_content_extended_cases_report.json @@ -1,64 +1,69 @@ { - "accepted_accuracy": 0.25, - "accepted_coverage": 1.0, - "accuracy": 0.25, + "accepted_accuracy": 0.6, + "accepted_coverage": 0.625, + "accuracy": 0.5, "count": 8, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/iab/extended_cases.jsonl", - "fallback_rate": 0.0, + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/iab/extended_cases.jsonl", + "fallback_rate": 0.375, "head": "iab_content", - "macro_f1": 0.1429, - "primary_source": "embedding_retrieval", + "macro_f1": 0.3333, + "primary_source": "supervised_classifier", "suite": "extended_cases", "tier_metrics": { - "average_prediction_depth": 2.375, + "average_prediction_depth": 1.75, "error_buckets": { - "exact_match": 2, - "right_tier1_wrong_tier2": 3, - "wrong_deep_leaf": 2, + "exact_match": 4, + "right_tier1_wrong_tier2": 2, + "wrong_deep_leaf": 1, "wrong_tier1": 1 }, - "exact_path_accuracy": 0.25, - "parent_safe_accuracy": 0.375, + "exact_path_accuracy": 0.5, + "parent_safe_accuracy": 0.625, "tier1_accuracy": 0.875, - "tier2_accuracy": 0.4286, - "tier3_accuracy": 1.0, + "tier2_accuracy": 0.5714, + "tier3_accuracy": 0.0, "tier4_accuracy": 0.0 }, "view_metrics": { - "combined_path": { - "average_prediction_depth": 2.375, + "classifier": { + 
"average_prediction_depth": 1.75, "error_buckets": { - "exact_match": 2, - "right_tier1_wrong_tier2": 3, - "wrong_deep_leaf": 2, + "exact_match": 4, + "right_tier1_wrong_tier2": 2, + "wrong_deep_leaf": 1, "wrong_tier1": 1 }, - "exact_path_accuracy": 0.25, - "fallback_overuse_count": 1, - "fallback_rate": 0.125, - "parent_safe_accuracy": 0.375, + "exact_path_accuracy": 0.5, + "parent_safe_accuracy": 0.625, "tier1_accuracy": 0.875, - "tier2_accuracy": 0.4286, + "tier2_accuracy": 0.5714, "tier3_accuracy": 0.0, "tier4_accuracy": 0.0 }, - "disagreements": { - "retrieval_vs_combined": 0 - }, - "embedding_retrieval": { - "average_prediction_depth": 2.375, + "combined_path": { + "average_prediction_depth": 1.75, "error_buckets": { - "exact_match": 2, - "right_tier1_wrong_tier2": 3, - "wrong_deep_leaf": 2, + "exact_match": 4, + "right_tier1_wrong_tier2": 2, + "wrong_deep_leaf": 1, "wrong_tier1": 1 }, - "exact_path_accuracy": 0.25, - "parent_safe_accuracy": 0.375, + "exact_path_accuracy": 0.5, + "fallback_overuse_count": 2, + "fallback_rate": 0.25, + "parent_safe_accuracy": 0.625, "tier1_accuracy": 0.875, - "tier2_accuracy": 0.4286, + "tier2_accuracy": 0.5714, "tier3_accuracy": 0.0, "tier4_accuracy": 0.0 + }, + "disagreements": { + "classifier_vs_combined": 0 + }, + "shadow_embedding_retrieval": { + "hint": "Set IAB_EVAL_INCLUDE_SHADOW_RETRIEVAL=1 to run shadow embedding retrieval (downloads/loads gte-Qwen2 when index is present).", + "reason": "disabled_by_default", + "skipped": true } } } diff --git a/artifacts/evaluation/latest/iab_content_hard_cases_report.json b/artifacts/evaluation/latest/iab_content_hard_cases_report.json index 4870a56d065e7ff08c1c160cc6f5c6b0a457a3c8..d57a21d9108bee79bc14fcc9bd05f8c330e8cffb 100644 --- a/artifacts/evaluation/latest/iab_content_hard_cases_report.json +++ b/artifacts/evaluation/latest/iab_content_hard_cases_report.json @@ -1,66 +1,66 @@ { - "accepted_accuracy": 0.25, - "accepted_coverage": 1.0, - "accuracy": 0.25, + 
"accepted_accuracy": 0.5, + "accepted_coverage": 0.75, + "accuracy": 0.375, "count": 8, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/iab/hard_cases.jsonl", - "fallback_rate": 0.0, + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/iab/hard_cases.jsonl", + "fallback_rate": 0.25, "head": "iab_content", - "macro_f1": 0.1429, - "primary_source": "embedding_retrieval", + "macro_f1": 0.2308, + "primary_source": "supervised_classifier", "suite": "hard_cases", "tier_metrics": { - "average_prediction_depth": 2.375, + "average_prediction_depth": 1.75, "error_buckets": { - "exact_match": 2, - "parent_safe_stop": 1, - "right_tier1_wrong_tier2": 2, - "wrong_tier1": 3 + "exact_match": 3, + "right_tier1_wrong_tier2": 1, + "wrong_tier1": 4 }, - "exact_path_accuracy": 0.25, + "exact_path_accuracy": 0.375, "parent_safe_accuracy": 0.5, - "tier1_accuracy": 0.625, + "tier1_accuracy": 0.5, "tier2_accuracy": 0.375, - "tier3_accuracy": 0.2, - "tier4_accuracy": 1.0 + "tier3_accuracy": 0.4, + "tier4_accuracy": 0.0 }, "view_metrics": { + "classifier": { + "average_prediction_depth": 1.75, + "error_buckets": { + "exact_match": 3, + "right_tier1_wrong_tier2": 1, + "wrong_tier1": 4 + }, + "exact_path_accuracy": 0.375, + "parent_safe_accuracy": 0.5, + "tier1_accuracy": 0.5, + "tier2_accuracy": 0.375, + "tier3_accuracy": 0.4, + "tier4_accuracy": 0.0 + }, "combined_path": { - "average_prediction_depth": 2.375, + "average_prediction_depth": 1.75, "error_buckets": { - "exact_match": 1, - "parent_safe_stop": 1, - "right_tier1_wrong_tier2": 2, - "wrong_deep_leaf": 1, - "wrong_tier1": 3 + "exact_match": 3, + "right_tier1_wrong_tier2": 1, + "wrong_tier1": 4 }, - "exact_path_accuracy": 0.125, + "exact_path_accuracy": 0.375, "fallback_overuse_count": 1, "fallback_rate": 0.125, - "parent_safe_accuracy": 0.375, - 
"tier1_accuracy": 0.625, + "parent_safe_accuracy": 0.5, + "tier1_accuracy": 0.5, "tier2_accuracy": 0.375, - "tier3_accuracy": 0.0, + "tier3_accuracy": 0.4, "tier4_accuracy": 0.0 }, "disagreements": { - "retrieval_vs_combined": 0 + "classifier_vs_combined": 0 }, - "embedding_retrieval": { - "average_prediction_depth": 2.375, - "error_buckets": { - "exact_match": 1, - "parent_safe_stop": 1, - "right_tier1_wrong_tier2": 2, - "wrong_deep_leaf": 1, - "wrong_tier1": 3 - }, - "exact_path_accuracy": 0.125, - "parent_safe_accuracy": 0.375, - "tier1_accuracy": 0.625, - "tier2_accuracy": 0.375, - "tier3_accuracy": 0.0, - "tier4_accuracy": 0.0 + "shadow_embedding_retrieval": { + "hint": "Set IAB_EVAL_INCLUDE_SHADOW_RETRIEVAL=1 to run shadow embedding retrieval (downloads/loads gte-Qwen2 when index is present).", + "reason": "disabled_by_default", + "skipped": true } } } diff --git a/artifacts/evaluation/latest/iab_content_test_report.json b/artifacts/evaluation/latest/iab_content_test_report.json index e9683ebdb2de7834b92081a4d4b4f07ccb43ca1a..427579c72843938fd18374b6e687835687f733f5 100644 --- a/artifacts/evaluation/latest/iab_content_test_report.json +++ b/artifacts/evaluation/latest/iab_content_test_report.json @@ -1,31 +1,47 @@ { - "accepted_accuracy": 0.6527, - "accepted_coverage": 1.0, - "accuracy": 0.6527, + "accepted_accuracy": 0.916, + "accepted_coverage": 0.9973, + "accuracy": 0.915, "count": 3282, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/iab/test.jsonl", - "fallback_rate": 0.0, + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/iab/test.jsonl", + "fallback_rate": 0.0027, "head": "iab_content", - "macro_f1": 0.6922, - "primary_source": "embedding_retrieval", + "macro_f1": 0.8686, + "primary_source": "supervised_classifier", "suite": "test", "tier_metrics": { - 
"average_prediction_depth": 2.1889, + "average_prediction_depth": 2.1804, "error_buckets": { - "exact_match": 2142, - "parent_safe_stop": 115, - "right_tier1_wrong_tier2": 674, - "wrong_deep_leaf": 236, - "wrong_tier1": 115 + "exact_match": 3003, + "parent_safe_stop": 65, + "right_tier1_wrong_tier2": 73, + "wrong_deep_leaf": 90, + "wrong_tier1": 51 }, - "exact_path_accuracy": 0.6527, - "parent_safe_accuracy": 0.7721, - "tier1_accuracy": 0.965, - "tier2_accuracy": 0.7587, - "tier3_accuracy": 0.8041, - "tier4_accuracy": 0.7929 + "exact_path_accuracy": 0.915, + "parent_safe_accuracy": 0.9442, + "tier1_accuracy": 0.9845, + "tier2_accuracy": 0.9606, + "tier3_accuracy": 0.8528, + "tier4_accuracy": 0.5286 }, "view_metrics": { + "classifier": { + "average_prediction_depth": 2.1804, + "error_buckets": { + "exact_match": 2965, + "parent_safe_stop": 63, + "right_tier1_wrong_tier2": 85, + "wrong_deep_leaf": 118, + "wrong_tier1": 51 + }, + "exact_path_accuracy": 0.9034, + "parent_safe_accuracy": 0.9321, + "tier1_accuracy": 0.9845, + "tier2_accuracy": 0.9565, + "tier3_accuracy": 0.8218, + "tier4_accuracy": 0.3429 + }, "combined_path": { "count": 3282, "max_combined_rows": 500, @@ -38,21 +54,10 @@ "reason": "dataset_too_large_for_combined_view", "skipped": true }, - "embedding_retrieval": { - "average_prediction_depth": 2.1889, - "error_buckets": { - "exact_match": 2107, - "parent_safe_stop": 109, - "right_tier1_wrong_tier2": 680, - "wrong_deep_leaf": 271, - "wrong_tier1": 115 - }, - "exact_path_accuracy": 0.642, - "parent_safe_accuracy": 0.7596, - "tier1_accuracy": 0.965, - "tier2_accuracy": 0.7566, - "tier3_accuracy": 0.7679, - "tier4_accuracy": 0.6071 + "shadow_embedding_retrieval": { + "hint": "Set IAB_EVAL_INCLUDE_SHADOW_RETRIEVAL=1 to run shadow embedding retrieval (downloads/loads gte-Qwen2 when index is present).", + "reason": "disabled_by_default", + "skipped": true } } } diff --git a/artifacts/evaluation/latest/iab_content_train_report.json 
b/artifacts/evaluation/latest/iab_content_train_report.json index 17a12e558d864fb6de25ce516aa4d27fd93b6fa6..9b8208b51eb92e4eedfb39d4fafcc1cba8aa3503 100644 --- a/artifacts/evaluation/latest/iab_content_train_report.json +++ b/artifacts/evaluation/latest/iab_content_train_report.json @@ -1,67 +1,63 @@ { - "accepted_accuracy": 0.8115, - "accepted_coverage": 1.0, - "accuracy": 0.8115, + "accepted_accuracy": 0.9221, + "accepted_coverage": 0.998, + "accuracy": 0.9212, "count": 13211, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/iab/train.jsonl", - "fallback_rate": 0.0, + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/iab/train.jsonl", + "fallback_rate": 0.002, "head": "iab_content", - "macro_f1": 0.8293, - "primary_source": "embedding_retrieval", + "macro_f1": 0.8805, + "primary_source": "supervised_classifier", "suite": "train", "tier_metrics": { - "average_prediction_depth": 2.2368, + "average_prediction_depth": 2.1738, "error_buckets": { - "exact_match": 10721, - "parent_safe_stop": 346, - "right_tier1_wrong_tier2": 812, - "wrong_deep_leaf": 809, - "wrong_tier1": 523 + "exact_match": 12170, + "parent_safe_stop": 238, + "right_tier1_wrong_tier2": 294, + "wrong_deep_leaf": 337, + "wrong_tier1": 172 }, - "exact_path_accuracy": 0.8115, - "parent_safe_accuracy": 0.8753, - "tier1_accuracy": 0.9604, - "tier2_accuracy": 0.9208, - "tier3_accuracy": 0.8788, - "tier4_accuracy": 0.8732 + "exact_path_accuracy": 0.9212, + "parent_safe_accuracy": 0.9492, + "tier1_accuracy": 0.987, + "tier2_accuracy": 0.9629, + "tier3_accuracy": 0.8617, + "tier4_accuracy": 0.5554 }, "view_metrics": { - "combined_path": { - "average_prediction_depth": 2.2368, + "classifier": { + "average_prediction_depth": 2.1738, "error_buckets": { - "exact_match": 10569, - "parent_safe_stop": 338, - 
"right_tier1_wrong_tier2": 834, - "wrong_deep_leaf": 947, - "wrong_tier1": 523 + "exact_match": 12011, + "parent_safe_stop": 232, + "right_tier1_wrong_tier2": 342, + "wrong_deep_leaf": 454, + "wrong_tier1": 172 }, - "exact_path_accuracy": 0.8, - "fallback_overuse_count": 1123, - "fallback_rate": 0.085, - "parent_safe_accuracy": 0.8631, - "tier1_accuracy": 0.9604, - "tier2_accuracy": 0.9189, - "tier3_accuracy": 0.843, - "tier4_accuracy": 0.6589 + "exact_path_accuracy": 0.9092, + "parent_safe_accuracy": 0.9367, + "tier1_accuracy": 0.987, + "tier2_accuracy": 0.9588, + "tier3_accuracy": 0.8293, + "tier4_accuracy": 0.3607 + }, + "combined_path": { + "count": 13211, + "max_combined_rows": 500, + "reason": "dataset_too_large_for_combined_view", + "skipped": true }, "disagreements": { - "retrieval_vs_combined": 0 + "count": 13211, + "max_combined_rows": 500, + "reason": "dataset_too_large_for_combined_view", + "skipped": true }, - "embedding_retrieval": { - "average_prediction_depth": 2.2368, - "error_buckets": { - "exact_match": 10569, - "parent_safe_stop": 338, - "right_tier1_wrong_tier2": 834, - "wrong_deep_leaf": 947, - "wrong_tier1": 523 - }, - "exact_path_accuracy": 0.8, - "parent_safe_accuracy": 0.8631, - "tier1_accuracy": 0.9604, - "tier2_accuracy": 0.9189, - "tier3_accuracy": 0.843, - "tier4_accuracy": 0.6589 + "shadow_embedding_retrieval": { + "hint": "Set IAB_EVAL_INCLUDE_SHADOW_RETRIEVAL=1 to run shadow embedding retrieval (downloads/loads gte-Qwen2 when index is present).", + "reason": "disabled_by_default", + "skipped": true } } } diff --git a/artifacts/evaluation/latest/iab_content_val_report.json b/artifacts/evaluation/latest/iab_content_val_report.json index 3c15b89dfde84d4abb48c287151b7575d2e4908a..87b0e02889dc87b8908cf4574000709b074796cf 100644 --- a/artifacts/evaluation/latest/iab_content_val_report.json +++ b/artifacts/evaluation/latest/iab_content_val_report.json @@ -1,67 +1,63 @@ { - "accepted_accuracy": 0.6545, - "accepted_coverage": 1.0, - 
"accuracy": 0.6545, + "accepted_accuracy": 0.9138, + "accepted_coverage": 0.9963, + "accuracy": 0.9126, "count": 3282, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/iab/val.jsonl", - "fallback_rate": 0.0, + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/iab/val.jsonl", + "fallback_rate": 0.0037, "head": "iab_content", - "macro_f1": 0.6957, - "primary_source": "embedding_retrieval", + "macro_f1": 0.8708, + "primary_source": "supervised_classifier", "suite": "val", "tier_metrics": { - "average_prediction_depth": 2.1813, + "average_prediction_depth": 2.1795, "error_buckets": { - "exact_match": 2148, - "parent_safe_stop": 105, - "right_tier1_wrong_tier2": 684, - "wrong_deep_leaf": 234, - "wrong_tier1": 111 + "exact_match": 2995, + "parent_safe_stop": 63, + "right_tier1_wrong_tier2": 81, + "wrong_deep_leaf": 90, + "wrong_tier1": 53 }, - "exact_path_accuracy": 0.6545, - "parent_safe_accuracy": 0.7821, - "tier1_accuracy": 0.9662, - "tier2_accuracy": 0.7577, - "tier3_accuracy": 0.8352, - "tier4_accuracy": 0.7214 + "exact_path_accuracy": 0.9126, + "parent_safe_accuracy": 0.9427, + "tier1_accuracy": 0.9839, + "tier2_accuracy": 0.9565, + "tier3_accuracy": 0.8549, + "tier4_accuracy": 0.5429 }, "view_metrics": { - "combined_path": { - "average_prediction_depth": 2.1813, + "classifier": { + "average_prediction_depth": 2.1795, "error_buckets": { - "exact_match": 2116, - "parent_safe_stop": 100, - "right_tier1_wrong_tier2": 689, - "wrong_deep_leaf": 266, - "wrong_tier1": 111 + "exact_match": 2958, + "parent_safe_stop": 60, + "right_tier1_wrong_tier2": 93, + "wrong_deep_leaf": 118, + "wrong_tier1": 53 }, - "exact_path_accuracy": 0.6447, - "fallback_overuse_count": 413, - "fallback_rate": 0.1258, - "parent_safe_accuracy": 0.7709, - "tier1_accuracy": 0.9662, - "tier2_accuracy": 0.756, - 
"tier3_accuracy": 0.799, - "tier4_accuracy": 0.55 + "exact_path_accuracy": 0.9013, + "parent_safe_accuracy": 0.9305, + "tier1_accuracy": 0.9839, + "tier2_accuracy": 0.9524, + "tier3_accuracy": 0.8238, + "tier4_accuracy": 0.3643 + }, + "combined_path": { + "count": 3282, + "max_combined_rows": 500, + "reason": "dataset_too_large_for_combined_view", + "skipped": true }, "disagreements": { - "retrieval_vs_combined": 0 + "count": 3282, + "max_combined_rows": 500, + "reason": "dataset_too_large_for_combined_view", + "skipped": true }, - "embedding_retrieval": { - "average_prediction_depth": 2.1813, - "error_buckets": { - "exact_match": 2116, - "parent_safe_stop": 100, - "right_tier1_wrong_tier2": 689, - "wrong_deep_leaf": 266, - "wrong_tier1": 111 - }, - "exact_path_accuracy": 0.6447, - "parent_safe_accuracy": 0.7709, - "tier1_accuracy": 0.9662, - "tier2_accuracy": 0.756, - "tier3_accuracy": 0.799, - "tier4_accuracy": 0.55 + "shadow_embedding_retrieval": { + "hint": "Set IAB_EVAL_INCLUDE_SHADOW_RETRIEVAL=1 to run shadow embedding retrieval (downloads/loads gte-Qwen2 when index is present).", + "reason": "disabled_by_default", + "skipped": true } } } diff --git a/artifacts/evaluation/latest/iab_cross_vertical_behavior_lock_regression.json b/artifacts/evaluation/latest/iab_cross_vertical_behavior_lock_regression.json index 41461a7cc899448c556bedc7342ec85bbea3fa12..cc7b70a3ab2cb0fcf62100422a5b16f1dc8c70cc 100644 --- a/artifacts/evaluation/latest/iab_cross_vertical_behavior_lock_regression.json +++ b/artifacts/evaluation/latest/iab_cross_vertical_behavior_lock_regression.json @@ -1,21 +1,21 @@ { "by_status": { "must_fix": { - "failed": 0, - "passed": 90, + "failed": 89, + "passed": 1, "total": 90 } }, - "cases_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/examples/iab_cross_vertical_behavior_lock_cases.json", + "cases_path": 
"/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/examples/iab_cross_vertical_behavior_lock_cases.json", "count": 90, - "failed": 0, - "passed": 90, + "failed": 89, + "passed": 1, "results": [ { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Travel", - "model_output.classification.iab_content.tier2.label": "Travel Type" + "model_output.classification.iab_content.tier2.label": "Travel Accessories" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -23,15 +23,26 @@ "model_output.classification.iab_content.tier2.label": "Travel Type" }, "id": "auto-buying-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Travel Accessories", + "expected": "Travel Type", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Automotive > Auto Buying and Selling.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Which car should I buy for commuting?" 
}, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Automotive", "model_output.classification.iab_content.tier2.label": "Auto Body Styles" }, @@ -41,17 +52,23 @@ "model_output.classification.iab_content.tier2.label": "Auto Body Styles" }, "id": "auto-buying-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping case for Automotive > Auto Buying and Selling.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best used SUV for a family of four" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Automotive", - "model_output.classification.iab_content.tier2.label": "Auto Type" + "model_output.classification.iab_content.tier2.label": "Auto Shows" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -59,18 +76,29 @@ "model_output.classification.iab_content.tier2.label": "Auto Type" }, "id": "auto-buying-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Auto Shows", + "expected": "Auto Type", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Automotive > Auto Buying and Selling.", - "pass": true, + "pass": false, "status": "must_fix", "text": "I need a shortlist of practical cars before making a purchase this month" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": 
"nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -79,18 +107,34 @@ "model_output.classification.iab_content.tier3.label": "Software and Applications" }, "id": "sales-crm-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Business and Finance > Business > Sales.", - "pass": true, + "pass": false, "status": "must_fix", "text": "What is CRM software?" 
}, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet" + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -99,9 +143,25 @@ "model_output.classification.iab_content.tier3.label": "Internet" }, "id": "sales-crm-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Business and Finance > Business > Sales.", - "pass": true, + "pass": false, "status": "must_fix", "text": "HubSpot vs Zoho for a small team" }, @@ -109,8 +169,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Business and Finance", - "model_output.classification.iab_content.tier2.label": "Business", - "model_output.classification.iab_content.tier3.label": "Sales" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -119,9 +179,20 @@ "model_output.classification.iab_content.tier3.label": "Sales" }, "id": "sales-crm-hard", - "mismatches": 
[], + "mismatches": [ + { + "actual": null, + "expected": "Business", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Sales", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Business and Finance > Business > Sales.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need software to manage leads and pipeline for a startup sales team" }, @@ -129,8 +200,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -139,9 +210,20 @@ "model_output.classification.iab_content.tier3.label": "Internet" }, "id": "marketing-tools-easy", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Business and Finance > Business > Marketing and Advertising.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best SEO tools for content teams" }, @@ -149,7 +231,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Business and Finance", - "model_output.classification.iab_content.tier2.label": "Business" + "model_output.classification.iab_content.tier2.label": null }, "expected": { 
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -157,18 +239,24 @@ "model_output.classification.iab_content.tier2.label": "Business" }, "id": "marketing-tools-medium", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Business", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Business and Finance > Business > Marketing and Advertising.", - "pass": true, + "pass": false, "status": "must_fix", "text": "How should I compare ad attribution platforms?" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier1.label": "Business and Finance", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -177,9 +265,25 @@ "model_output.classification.iab_content.tier3.label": "Software and Applications" }, "id": "marketing-tools-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "Business and Finance", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Business and Finance > Business > Marketing and Advertising.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need software to 
measure channel performance across paid and organic campaigns" }, @@ -188,7 +292,7 @@ "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet" + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -197,17 +301,23 @@ "model_output.classification.iab_content.tier3.label": "Internet" }, "id": "business-it-easy", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Business and Finance > Business > Business I.T..", - "pass": true, + "pass": false, "status": "must_fix", "text": "How do I reset my work password?" 
}, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Careers", - "model_output.classification.iab_content.tier2.label": "Job Search" + "model_output.classification.iab_content.tier1.label": "Business and Finance", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -215,9 +325,20 @@ "model_output.classification.iab_content.tier2.label": "Job Search" }, "id": "business-it-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "Business and Finance", + "expected": "Careers", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": "Job Search", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Business and Finance > Business > Business I.T..", - "pass": true, + "pass": false, "status": "must_fix", "text": "My employees keep getting locked out of their accounts" }, @@ -226,7 +347,7 @@ "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet" + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -235,15 +356,21 @@ "model_output.classification.iab_content.tier3.label": "Internet" }, "id": "business-it-hard", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Business and Finance > Business > Business I.T..", - "pass": true, + "pass": false, 
"status": "must_fix", "text": "Need identity and access software for login, permissions, and account security" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink", "model_output.classification.iab_content.tier2.label": "Dining Out" }, @@ -253,15 +380,21 @@ "model_output.classification.iab_content.tier2.label": "Dining Out" }, "id": "dining-out-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical easy IAB mapping case for Food & Drink > Dining Out.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Book a table for six tonight" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink", "model_output.classification.iab_content.tier2.label": "Dining Out" }, @@ -271,15 +404,21 @@ "model_output.classification.iab_content.tier2.label": "Dining Out" }, "id": "dining-out-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping case for Food & Drink > Dining Out.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Good restaurants for a client dinner downtown" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink", "model_output.classification.iab_content.tier2.label": "Dining Out" }, @@ -289,15 +428,21 
@@ "model_output.classification.iab_content.tier2.label": "Dining Out" }, "id": "dining-out-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical hard IAB mapping case for Food & Drink > Dining Out.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need a place to eat tonight where I can make a reservation online" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink" }, "expected": { @@ -305,15 +450,21 @@ "model_output.classification.iab_content.tier1.label": "Food & Drink" }, "id": "alcoholic-beverages-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical easy IAB mapping case for Food & Drink > Alcoholic Beverages.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Which whiskey cocktail should I order?" 
}, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink" }, "expected": { @@ -321,15 +472,21 @@ "model_output.classification.iab_content.tier1.label": "Food & Drink" }, "id": "alcoholic-beverages-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping case for Food & Drink > Alcoholic Beverages.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best vodka drinks for beginners" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink" }, "expected": { @@ -337,31 +494,43 @@ "model_output.classification.iab_content.tier1.label": "Food & Drink" }, "id": "alcoholic-beverages-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical hard IAB mapping case for Food & Drink > Alcoholic Beverages.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Want a spirit-forward drink recommendation, not a restaurant suggestion" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing" + "model_output.classification.iab_content.tier1.label": "Careers" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing" }, "id": "artificial-intelligence-easy", - "mismatches": 
[], + "mismatches": [ + { + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Artificial Intelligence.", - "pass": true, + "pass": false, "status": "must_fix", "text": "What is intent classification in NLP?" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Education", "model_output.classification.iab_content.tier2.label": "Language Learning" }, @@ -371,15 +540,21 @@ "model_output.classification.iab_content.tier2.label": "Language Learning" }, "id": "artificial-intelligence-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Artificial Intelligence.", - "pass": true, + "pass": false, "status": "must_fix", "text": "How do large language models handle text classification?" 
}, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Education", "model_output.classification.iab_content.tier2.label": "Language Learning" }, @@ -389,17 +564,23 @@ "model_output.classification.iab_content.tier2.label": "Language Learning" }, "id": "artificial-intelligence-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Artificial Intelligence.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need the machine learning concept behind language understanding, not software to buy" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Business and Finance", - "model_output.classification.iab_content.tier2.label": "Business" + "model_output.classification.iab_content.tier1.label": "Technology & Computing", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -407,9 +588,20 @@ "model_output.classification.iab_content.tier2.label": "Business" }, "id": "software-apps-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "Technology & Computing", + "expected": "Business and Finance", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": "Business", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Computing > Computer Software and Applications.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best workflow software 
for a small operations team" }, @@ -417,9 +609,9 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet", - "model_output.classification.iab_content.tier4.label": "Cloud Computing" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -429,9 +621,25 @@ "model_output.classification.iab_content.tier4.label": "Cloud Computing" }, "id": "software-apps-medium", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Cloud Computing", + "path": "model_output.classification.iab_content.tier4.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Computer Software and Applications.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need project management software for a distributed team" }, @@ -439,8 +647,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { 
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -449,19 +657,30 @@ "model_output.classification.iab_content.tier3.label": "Software and Applications" }, "id": "software-apps-hard", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Computer Software and Applications.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Looking for a business software platform to organize internal workflows" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications", - "model_output.classification.iab_content.tier4.label": "Communication" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": "Remote Working", + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -471,9 +690,35 @@ "model_output.classification.iab_content.tier4.label": "Communication" }, "id": "communication-software-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": 
"model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Remote Working", + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Communication", + "path": "model_output.classification.iab_content.tier4.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Computing > Computer Software and Applications > Communication.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best communication software for remote teams" }, @@ -482,8 +727,8 @@ "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications", - "model_output.classification.iab_content.tier4.label": "Communication" + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -493,19 +738,30 @@ "model_output.classification.iab_content.tier4.label": "Communication" }, "id": "communication-software-medium", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Communication", + "path": "model_output.classification.iab_content.tier4.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Computer Software and Applications > Communication.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Slack vs Teams for internal 
messaging" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications", - "model_output.classification.iab_content.tier4.label": "Communication" + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -515,9 +771,30 @@ "model_output.classification.iab_content.tier4.label": "Communication" }, "id": "communication-software-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Communication", + "path": "model_output.classification.iab_content.tier4.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Computer Software and Applications > Communication.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need a workplace chat tool for cross-functional collaboration" }, @@ -526,8 +803,8 @@ "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - 
"model_output.classification.iab_content.tier3.label": "Internet", - "model_output.classification.iab_content.tier4.label": "Web Hosting" + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -537,15 +814,26 @@ "model_output.classification.iab_content.tier4.label": "Web Hosting" }, "id": "web-hosting-easy", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Web Hosting", + "path": "model_output.classification.iab_content.tier4.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Computing > Internet > Web Hosting.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Vercel vs Netlify for website hosting" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Internet", @@ -559,15 +847,21 @@ "model_output.classification.iab_content.tier4.label": "Web Hosting" }, "id": "web-hosting-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Internet > Web Hosting.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best hosting platform for a startup website" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + 
"model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Internet", @@ -581,15 +875,21 @@ "model_output.classification.iab_content.tier4.label": "Web Hosting" }, "id": "web-hosting-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Internet > Web Hosting.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need a managed hosting provider to deploy and run our marketing site" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Laptops" @@ -601,15 +901,21 @@ "model_output.classification.iab_content.tier3.label": "Laptops" }, "id": "laptops-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Computing > Laptops.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Which laptop should I buy for college?" 
}, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Laptops" @@ -621,18 +927,24 @@ "model_output.classification.iab_content.tier3.label": "Laptops" }, "id": "laptops-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Laptops.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best laptop for work and study under 1200" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Consumer Electronics", - "model_output.classification.iab_content.tier3.label": "Smartphones" + "model_output.classification.iab_content.tier2.label": "Computing", + "model_output.classification.iab_content.tier3.label": "Laptops" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -641,9 +953,25 @@ "model_output.classification.iab_content.tier3.label": "Smartphones" }, "id": "laptops-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Computing", + "expected": "Consumer Electronics", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": "Laptops", + "expected": "Smartphones", + "path": 
"model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Laptops.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need a portable computer with good battery life for everyday work" }, @@ -653,7 +981,7 @@ "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Software and Applications", - "model_output.classification.iab_content.tier4.label": "Photo Editing Software" + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -663,15 +991,21 @@ "model_output.classification.iab_content.tier4.label": "Photo Editing Software" }, "id": "desktops-easy", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Photo Editing Software", + "path": "model_output.classification.iab_content.tier4.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Computing > Desktops.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best desktop for video editing" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Desktops" @@ -683,15 +1017,21 @@ "model_output.classification.iab_content.tier3.label": "Desktops" }, "id": "desktops-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping 
case for Technology & Computing > Computing > Desktops.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Which desktop computer should I buy for a home office?" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Desktops" @@ -703,15 +1043,21 @@ "model_output.classification.iab_content.tier3.label": "Desktops" }, "id": "desktops-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Desktops.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need a desktop PC with strong performance for creative work" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Consumer Electronics" }, @@ -721,15 +1067,21 @@ "model_output.classification.iab_content.tier2.label": "Consumer Electronics" }, "id": "smartphones-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Consumer Electronics > Smartphones.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best phone with a good camera under 700" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": 
"nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Consumer Electronics", "model_output.classification.iab_content.tier3.label": "Smartphones" @@ -741,15 +1093,21 @@ "model_output.classification.iab_content.tier3.label": "Smartphones" }, "id": "smartphones-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Consumer Electronics > Smartphones.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Should I buy an iPhone or Pixel this year?" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Consumer Electronics", "model_output.classification.iab_content.tier3.label": "Smartphones" @@ -761,9 +1119,15 @@ "model_output.classification.iab_content.tier3.label": "Smartphones" }, "id": "smartphones-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Consumer Electronics > Smartphones.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need a new smartphone with strong battery life and a clean software experience" }, @@ -771,8 +1135,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Style & Fashion", - 
"model_output.classification.iab_content.tier2.label": "Women's Fashion", - "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -781,18 +1145,29 @@ "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" }, "id": "style-fashion-parent-easy", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Women's Fashion", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Women's Shoes and Footwear", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Style & Fashion.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best shoes under 100 dollars" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": "Women's Fashion", - "model_output.classification.iab_content.tier3.label": "Women's Clothing" + "model_output.classification.iab_content.tier2.label": "High Fashion", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -801,18 +1176,34 @@ "model_output.classification.iab_content.tier3.label": "Women's Clothing" }, "id": "style-fashion-parent-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "High Fashion", + "expected": "Women's Fashion", + "path": 
"model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Women's Clothing", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Style & Fashion.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Affordable fashion accessories for everyday wear" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": "Women's Fashion", - "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" + "model_output.classification.iab_content.tier2.label": "Children's Clothing", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -821,18 +1212,34 @@ "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" }, "id": "style-fashion-parent-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Children's Clothing", + "expected": "Women's Fashion", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Women's Shoes and Footwear", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Style & Fashion.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need style recommendations for clothing and footwear without a specific brand in mind" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Style & Fashion", 
- "model_output.classification.iab_content.tier2.label": "Women's Fashion", - "model_output.classification.iab_content.tier3.label": "Women's Clothing" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Sports", + "model_output.classification.iab_content.tier2.label": "Walking", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -841,18 +1248,39 @@ "model_output.classification.iab_content.tier3.label": "Women's Clothing" }, "id": "womens-shoes-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "Sports", + "expected": "Style & Fashion", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Walking", + "expected": "Women's Fashion", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Women's Clothing", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Style & Fashion > Women's Fashion > Women's Shoes and Footwear.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best women's running shoes under 100 dollars" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": "Women's Fashion", - "model_output.classification.iab_content.tier3.label": "Women's Clothing" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Sports", + "model_output.classification.iab_content.tier2.label": "Walking", + 
"model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -861,15 +1289,36 @@ "model_output.classification.iab_content.tier3.label": "Women's Clothing" }, "id": "womens-shoes-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "Sports", + "expected": "Style & Fashion", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Walking", + "expected": "Women's Fashion", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Women's Clothing", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Style & Fashion > Women's Fashion > Women's Shoes and Footwear.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Comfortable women's sneakers for walking all day" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", "model_output.classification.iab_content.tier2.label": "Women's Fashion", "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" @@ -881,9 +1330,15 @@ "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" }, "id": "womens-shoes-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical hard IAB mapping case for Style & Fashion > Women's Fashion > Women's Shoes and Footwear.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need women's footwear for commuting that 
looks polished but feels comfortable" }, @@ -892,7 +1347,7 @@ "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Style & Fashion", "model_output.classification.iab_content.tier2.label": "Men's Fashion", - "model_output.classification.iab_content.tier3.label": "Men's Clothing" + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -901,15 +1356,21 @@ "model_output.classification.iab_content.tier3.label": "Men's Clothing" }, "id": "mens-shoes-easy", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Men's Clothing", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Style & Fashion > Men's Fashion > Men's Shoes and Footwear.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best men's sneakers for daily wear" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", "model_output.classification.iab_content.tier2.label": "Men's Fashion", "model_output.classification.iab_content.tier3.label": "Men's Clothing" @@ -921,15 +1382,21 @@ "model_output.classification.iab_content.tier3.label": "Men's Clothing" }, "id": "mens-shoes-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping case for Style & Fashion > Men's Fashion > Men's Shoes and Footwear.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Good men's dress shoes for office use" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": 
"nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", "model_output.classification.iab_content.tier2.label": "Men's Fashion", "model_output.classification.iab_content.tier3.label": "Men's Shoes and Footwear" @@ -941,17 +1408,23 @@ "model_output.classification.iab_content.tier3.label": "Men's Shoes and Footwear" }, "id": "mens-shoes-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical hard IAB mapping case for Style & Fashion > Men's Fashion > Men's Shoes and Footwear.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need men's footwear that works for workdays and weekend walking" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Real Estate", - "model_output.classification.iab_content.tier2.label": "Hotel Properties" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Travel", + "model_output.classification.iab_content.tier2.label": "Travel Type" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -959,15 +1432,31 @@ "model_output.classification.iab_content.tier2.label": "Hotel Properties" }, "id": "hotels-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "Travel", + "expected": "Real Estate", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Travel Type", + "expected": "Hotel Properties", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical easy IAB 
mapping case for Travel > Travel Type > Hotels and Motels.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need a hotel in Chicago for two nights" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Travel", "model_output.classification.iab_content.tier2.label": "Travel Type", "model_output.classification.iab_content.tier3.label": "Hotels and Motels" @@ -979,17 +1468,23 @@ "model_output.classification.iab_content.tier3.label": "Hotels and Motels" }, "id": "hotels-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping case for Travel > Travel Type > Hotels and Motels.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best hotels near Times Square for a weekend trip" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Travel", - "model_output.classification.iab_content.tier2.label": "Travel Type" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -997,15 +1492,26 @@ "model_output.classification.iab_content.tier2.label": "Travel Type" }, "id": "hotels-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Travel Type", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Travel > Travel Type > Hotels and 
Motels.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Looking for a place to stay during a work trip, not general travel advice" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Real Estate", "model_output.classification.iab_content.tier2.label": "Apartments" }, @@ -1015,15 +1521,21 @@ "model_output.classification.iab_content.tier2.label": "Apartments" }, "id": "real-estate-rentals-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical easy IAB mapping case for Real Estate > Real Estate Renting and Leasing.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Apartments for rent near downtown Austin" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Real Estate", "model_output.classification.iab_content.tier2.label": "Apartments" }, @@ -1033,18 +1545,24 @@ "model_output.classification.iab_content.tier2.label": "Apartments" }, "id": "real-estate-rentals-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping case for Real Estate > Real Estate Renting and Leasing.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best neighborhoods to lease a two-bedroom apartment in Seattle" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Personal Finance", - 
"model_output.classification.iab_content.tier2.label": "Personal Debt", - "model_output.classification.iab_content.tier3.label": "Home Financing" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Real Estate", + "model_output.classification.iab_content.tier2.label": "Real Estate Renting and Leasing", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1053,18 +1571,39 @@ "model_output.classification.iab_content.tier3.label": "Home Financing" }, "id": "real-estate-rentals-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "Real Estate", + "expected": "Personal Finance", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Real Estate Renting and Leasing", + "expected": "Personal Debt", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Home Financing", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Real Estate > Real Estate Renting and Leasing.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need rental listings for a short move, not home-buying advice" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Business and Finance", - "model_output.classification.iab_content.tier2.label": "Business", - "model_output.classification.iab_content.tier3.label": "Green Solutions" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Sports", + "model_output.classification.iab_content.tier2.label": "Walking", 
+ "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1073,18 +1612,39 @@ "model_output.classification.iab_content.tier3.label": "Green Solutions" }, "id": "running-and-jogging-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "Sports", + "expected": "Business and Finance", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Walking", + "expected": "Business", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Green Solutions", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Healthy Living > Fitness and Exercise > Running and Jogging.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best running plan for a first 10k" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Healthy Living", - "model_output.classification.iab_content.tier2.label": "Fitness and Exercise", - "model_output.classification.iab_content.tier3.label": "Running and Jogging" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Sports", + "model_output.classification.iab_content.tier2.label": "Walking", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1093,18 +1653,39 @@ "model_output.classification.iab_content.tier3.label": "Running and Jogging" }, "id": "running-and-jogging-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "Sports", + "expected": "Healthy Living", + "path": 
"model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Walking", + "expected": "Fitness and Exercise", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Running and Jogging", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Healthy Living > Fitness and Exercise > Running and Jogging.", - "pass": true, + "pass": false, "status": "must_fix", "text": "How should I train for a half marathon as a beginner?" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Healthy Living", - "model_output.classification.iab_content.tier2.label": "Fitness and Exercise", - "model_output.classification.iab_content.tier3.label": "Running and Jogging" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Sports", + "model_output.classification.iab_content.tier2.label": "Walking", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1113,15 +1694,36 @@ "model_output.classification.iab_content.tier3.label": "Running and Jogging" }, "id": "running-and-jogging-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "Sports", + "expected": "Healthy Living", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Walking", + "expected": "Fitness and Exercise", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Running and 
Jogging", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Healthy Living > Fitness and Exercise > Running and Jogging.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need guidance on building a weekly jogging routine without getting injured" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Sports", "model_output.classification.iab_content.tier2.label": "Soccer" }, @@ -1131,15 +1733,21 @@ "model_output.classification.iab_content.tier2.label": "Soccer" }, "id": "soccer-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical easy IAB mapping case for Sports > Soccer.", - "pass": true, + "pass": false, "status": "must_fix", "text": "How do offside rules work in soccer?" 
}, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Sports", "model_output.classification.iab_content.tier2.label": "Soccer" }, @@ -1149,15 +1757,21 @@ "model_output.classification.iab_content.tier2.label": "Soccer" }, "id": "soccer-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping case for Sports > Soccer.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best soccer drills for beginner players" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Sports" }, "expected": { @@ -1165,17 +1779,23 @@ "model_output.classification.iab_content.tier1.label": "Sports" }, "id": "soccer-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical hard IAB mapping case for Sports > Soccer.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need help understanding football tactics for the Premier League, not fantasy sports" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Books and Literature", - "model_output.classification.iab_content.tier2.label": "Fiction" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Genres", + "model_output.classification.iab_content.tier2.label": "Fantasy" }, "expected": { 
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1183,17 +1803,33 @@ "model_output.classification.iab_content.tier2.label": "Fiction" }, "id": "fiction-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "Genres", + "expected": "Books and Literature", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Fantasy", + "expected": "Fiction", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Books and Literature > Fiction.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Recommend a good fantasy novel to read" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Books and Literature", - "model_output.classification.iab_content.tier2.label": "Fiction" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Travel", + "model_output.classification.iab_content.tier2.label": "Travel Type" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1201,31 +1837,58 @@ "model_output.classification.iab_content.tier2.label": "Fiction" }, "id": "fiction-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "Travel", + "expected": "Books and Literature", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Travel Type", + "expected": "Fiction", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Books and Literature > Fiction.", - "pass": 
true, + "pass": false, "status": "must_fix", "text": "Best fiction books for a long flight" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Books and Literature" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Genres" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Books and Literature" }, "id": "fiction-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "Genres", + "expected": "Books and Literature", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical hard IAB mapping case for Books and Literature > Fiction.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Looking for a character-driven novel, not comics or poetry" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Home & Garden", "model_output.classification.iab_content.tier2.label": "Remodeling & Construction" }, @@ -1235,18 +1898,24 @@ "model_output.classification.iab_content.tier2.label": "Remodeling & Construction" }, "id": "home-improvement-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical easy IAB mapping case for Home & Garden > Home Improvement.", - "pass": true, + "pass": false, "status": "must_fix", "text": "How much does a kitchen remodel usually cost?" 
}, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": "Personal Care", - "model_output.classification.iab_content.tier3.label": "Bath and Shower" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Home & Garden", + "model_output.classification.iab_content.tier2.label": "Indoor Environmental Quality", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1255,17 +1924,38 @@ "model_output.classification.iab_content.tier3.label": "Bath and Shower" }, "id": "home-improvement-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "Home & Garden", + "expected": "Style & Fashion", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Indoor Environmental Quality", + "expected": "Personal Care", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Bath and Shower", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Home & Garden > Home Improvement.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best tools for a DIY bathroom renovation" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Home & Garden", - "model_output.classification.iab_content.tier2.label": "Interior Decorating" + "model_output.classification.iab_content.tier2.label": null }, 
"expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1273,17 +1963,28 @@ "model_output.classification.iab_content.tier2.label": "Interior Decorating" }, "id": "home-improvement-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Interior Decorating", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Home & Garden > Home Improvement.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need practical advice for upgrading an older house, not interior decor inspiration" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Education", - "model_output.classification.iab_content.tier2.label": "Language Learning" + "model_output.classification.iab_content.tier1.label": "Technology & Computing", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1291,15 +1992,26 @@ "model_output.classification.iab_content.tier2.label": "Language Learning" }, "id": "online-education-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "Technology & Computing", + "expected": "Education", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": "Language Learning", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Education > Online Education.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best online courses for learning Python" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + 
"model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Careers", "model_output.classification.iab_content.tier2.label": "Remote Working" }, @@ -1309,9 +2021,15 @@ "model_output.classification.iab_content.tier2.label": "Remote Working" }, "id": "online-education-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping case for Education > Online Education.", - "pass": true, + "pass": false, "status": "must_fix", "text": "What are good platforms for remote professional classes?" }, @@ -1333,7 +2051,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Education", "model_output.classification.iab_content.tier2.label": "College Education" }, @@ -1343,9 +2061,15 @@ "model_output.classification.iab_content.tier2.label": "College Education" }, "id": "postgraduate-education-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical easy IAB mapping case for Education > College Education > Postgraduate Education.", - "pass": true, + "pass": false, "status": "must_fix", "text": "best universities to study masters" }, @@ -1353,9 +2077,9 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Education", - "model_output.classification.iab_content.tier2.label": "College Education", - "model_output.classification.iab_content.tier3.label": "Postgraduate Education", - "model_output.classification.iab_content.tier4.label": "Professional School" + 
"model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1365,15 +2089,31 @@ "model_output.classification.iab_content.tier4.label": "Professional School" }, "id": "postgraduate-education-medium", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "College Education", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Postgraduate Education", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Professional School", + "path": "model_output.classification.iab_content.tier4.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Education > College Education > Postgraduate Education.", - "pass": true, + "pass": false, "status": "must_fix", "text": "which graduate schools have strong data science programs" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Education", "model_output.classification.iab_content.tier2.label": "College Education" }, @@ -1383,9 +2123,15 @@ "model_output.classification.iab_content.tier2.label": "College Education" }, "id": "postgraduate-education-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical hard IAB mapping case for Education > College Education > Postgraduate Education.", - "pass": true, + "pass": false, "status": "must_fix", "text": "need postgraduate options for a master's degree, not short online courses" }, @@ -1393,8 +2139,8 @@ "actual": { 
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Medical Health", - "model_output.classification.iab_content.tier2.label": "Diseases and Conditions", - "model_output.classification.iab_content.tier3.label": "Allergies" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1403,9 +2149,20 @@ "model_output.classification.iab_content.tier3.label": "Allergies" }, "id": "medical-health-easy", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Diseases and Conditions", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Allergies", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Medical Health.", - "pass": true, + "pass": false, "status": "must_fix", "text": "what do these allergy symptoms mean" }, @@ -1414,8 +2171,8 @@ "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Medical Health", "model_output.classification.iab_content.tier2.label": "Diseases and Conditions", - "model_output.classification.iab_content.tier3.label": "Injuries", - "model_output.classification.iab_content.tier4.label": "First Aid" + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1425,18 +2182,29 @@ "model_output.classification.iab_content.tier4.label": "First Aid" }, "id": "medical-health-medium", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Injuries", + "path": "model_output.classification.iab_content.tier3.label" 
+ }, + { + "actual": null, + "expected": "First Aid", + "path": "model_output.classification.iab_content.tier4.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Medical Health.", - "pass": true, + "pass": false, "status": "must_fix", "text": "when should i see a doctor for persistent knee pain" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Healthy Living", - "model_output.classification.iab_content.tier2.label": "Wellness", - "model_output.classification.iab_content.tier3.label": "Physical Therapy" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Medical Health", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1445,15 +2213,36 @@ "model_output.classification.iab_content.tier3.label": "Physical Therapy" }, "id": "medical-health-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "Medical Health", + "expected": "Healthy Living", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Wellness", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Physical Therapy", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Medical Health.", - "pass": true, + "pass": false, "status": "must_fix", "text": "need medical advice about symptoms, not wellness or fitness tips" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + 
"model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Careers", "model_output.classification.iab_content.tier2.label": "Remote Working" }, @@ -1463,18 +2252,24 @@ "model_output.classification.iab_content.tier2.label": "Remote Working" }, "id": "careers-job-search-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical easy IAB mapping case for Careers > Job Search.", - "pass": true, + "pass": false, "status": "must_fix", "text": "best remote jobs for data analysts" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Business and Finance", - "model_output.classification.iab_content.tier2.label": "Business", - "model_output.classification.iab_content.tier3.label": "Sales" + "model_output.classification.iab_content.tier2.label": "Industries", + "model_output.classification.iab_content.tier3.label": "Management Consulting Industry" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1483,17 +2278,33 @@ "model_output.classification.iab_content.tier3.label": "Sales" }, "id": "careers-job-search-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Industries", + "expected": "Business", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": "Management Consulting Industry", + "expected": "Sales", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Careers > Job Search.", - "pass": true, + "pass": false, 
"status": "must_fix", "text": "where should i look for product manager openings" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Careers", - "model_output.classification.iab_content.tier2.label": "Job Search" + "model_output.classification.iab_content.tier1.label": "Genres", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1501,17 +2312,28 @@ "model_output.classification.iab_content.tier2.label": "Job Search" }, "id": "careers-job-search-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "Genres", + "expected": "Careers", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": "Job Search", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Careers > Job Search.", - "pass": true, + "pass": false, "status": "must_fix", "text": "need help finding a new role and preparing for interviews" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Food & Drink", - "model_output.classification.iab_content.tier2.label": "Food Movements" + "model_output.classification.iab_content.tier1.label": "Personal Celebrations & Life Events", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1519,18 +2341,29 @@ "model_output.classification.iab_content.tier2.label": "Food Movements" }, "id": "personal-finance-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "Personal Celebrations & Life Events", + "expected": "Food & Drink", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": 
"Food Movements", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Personal Finance > Financial Planning.", - "pass": true, + "pass": false, "status": "must_fix", "text": "how much should i save each month" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Business and Finance", - "model_output.classification.iab_content.tier2.label": "Economy", - "model_output.classification.iab_content.tier3.label": "Financial Reform" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Personal Finance", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1539,17 +2372,38 @@ "model_output.classification.iab_content.tier3.label": "Financial Reform" }, "id": "personal-finance-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "Personal Finance", + "expected": "Business and Finance", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Economy", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Financial Reform", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Personal Finance > Financial Planning.", - "pass": true, + "pass": false, "status": "must_fix", "text": "best budgeting approach for a growing family" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + 
"model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Personal Finance", - "model_output.classification.iab_content.tier2.label": "Retirement Planning" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1557,15 +2411,26 @@ "model_output.classification.iab_content.tier2.label": "Retirement Planning" }, "id": "personal-finance-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Retirement Planning", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Personal Finance > Financial Planning.", - "pass": true, + "pass": false, "status": "must_fix", "text": "need help planning savings and retirement, not business finance advice" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Family and Relationships", "model_output.classification.iab_content.tier2.label": "Parenting" }, @@ -1575,17 +2440,23 @@ "model_output.classification.iab_content.tier2.label": "Parenting" }, "id": "parenting-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical easy IAB mapping case for Family and Relationships > Parenting.", - "pass": true, + "pass": false, "status": "must_fix", "text": "tips for parenting a toddler" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Family and 
Relationships", - "model_output.classification.iab_content.tier2.label": "Parenting" + "model_output.classification.iab_content.tier1.label": "Education", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1593,18 +2464,29 @@ "model_output.classification.iab_content.tier2.label": "Parenting" }, "id": "parenting-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "Education", + "expected": "Family and Relationships", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": "Parenting", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Family and Relationships > Parenting.", - "pass": true, + "pass": false, "status": "must_fix", "text": "how do i help my teenager spend less time online" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Family and Relationships", "model_output.classification.iab_content.tier2.label": "Parenting", - "model_output.classification.iab_content.tier3.label": "Special Needs Kids" + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1613,15 +2495,26 @@ "model_output.classification.iab_content.tier3.label": "Special Needs Kids" }, "id": "parenting-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Special Needs Kids", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Family and Relationships > 
Parenting.", - "pass": true, + "pass": false, "status": "must_fix", "text": "need parenting advice for a child starting preschool" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Home & Garden" }, "expected": { @@ -1629,17 +2522,23 @@ "model_output.classification.iab_content.tier1.label": "Home & Garden" }, "id": "gardening-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical easy IAB mapping case for Home & Garden > Gardening.", - "pass": true, + "pass": false, "status": "must_fix", "text": "best plants for a small balcony garden" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Home & Garden", - "model_output.classification.iab_content.tier2.label": "Gardening" + "model_output.classification.iab_content.tier1.label": "Food & Drink", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1647,15 +2546,26 @@ "model_output.classification.iab_content.tier2.label": "Gardening" }, "id": "gardening-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "Food & Drink", + "expected": "Home & Garden", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": "Gardening", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Home & Garden > Gardening.", - "pass": true, + "pass": false, "status": "must_fix", "text": "how often should i water tomato plants" }, { "actual": { - 
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Home & Garden" }, "expected": { @@ -1663,15 +2573,21 @@ "model_output.classification.iab_content.tier1.label": "Home & Garden" }, "id": "gardening-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical hard IAB mapping case for Home & Garden > Gardening.", - "pass": true, + "pass": false, "status": "must_fix", "text": "need gardening advice for a shady backyard, not interior decor ideas" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Entertainment", "model_output.classification.iab_content.tier2.label": "Movies" }, @@ -1681,18 +2597,24 @@ "model_output.classification.iab_content.tier2.label": "Movies" }, "id": "movies-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical easy IAB mapping case for Entertainment > Movies.", - "pass": true, + "pass": false, "status": "must_fix", "text": "What movie should we watch tonight?" 
}, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Video Gaming", - "model_output.classification.iab_content.tier2.label": "Video Game Genres", - "model_output.classification.iab_content.tier3.label": "Horror Video Games" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Genres", + "model_output.classification.iab_content.tier2.label": "Horror", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1701,18 +2623,39 @@ "model_output.classification.iab_content.tier3.label": "Horror Video Games" }, "id": "movies-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "Genres", + "expected": "Video Gaming", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Horror", + "expected": "Video Game Genres", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Horror Video Games", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Entertainment > Movies.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best thriller movies from the last few years" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Entertainment", - "model_output.classification.iab_content.tier2.label": "Music", - "model_output.classification.iab_content.tier3.label": "Soundtracks, TV and Showtunes" + 
"model_output.classification.iab_content.tier2.label": "Movies", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1721,9 +2664,25 @@ "model_output.classification.iab_content.tier3.label": "Soundtracks, TV and Showtunes" }, "id": "movies-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Movies", + "expected": "Music", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Soundtracks, TV and Showtunes", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Entertainment > Movies.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Looking for film recommendations, not TV shows or music" } diff --git a/artifacts/evaluation/latest/iab_cross_vertical_quality_target_eval.json b/artifacts/evaluation/latest/iab_cross_vertical_quality_target_eval.json index a44a44a29d122e8f45e8905395e3c24bc94392cb..c6ecbf915728ae060e2d2f3c6091b17726b69243 100644 --- a/artifacts/evaluation/latest/iab_cross_vertical_quality_target_eval.json +++ b/artifacts/evaluation/latest/iab_cross_vertical_quality_target_eval.json @@ -1,21 +1,21 @@ { "by_status": { "must_fix": { - "failed": 86, - "passed": 4, + "failed": 57, + "passed": 33, "total": 90 } }, - "cases_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/examples/iab_cross_vertical_mapping_cases.json", + "cases_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/examples/iab_cross_vertical_mapping_cases.json", "count": 90, - "failed": 86, - "passed": 4, + "failed": 57, + "passed": 33, "results": [ { 
"actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Automotive", - "model_output.classification.iab_content.tier2.label": "Auto Type" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Travel", + "model_output.classification.iab_content.tier2.label": "Travel Accessories" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -25,7 +25,17 @@ "id": "auto-buying-easy", "mismatches": [ { - "actual": "Auto Type", + "actual": "Travel", + "expected": "Automotive", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Travel Accessories", "expected": "Auto Buying and Selling", "path": "model_output.classification.iab_content.tier2.label" } @@ -37,7 +47,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Automotive", "model_output.classification.iab_content.tier2.label": "Auto Body Styles" }, @@ -48,6 +58,11 @@ }, "id": "auto-buying-medium", "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, { "actual": "Auto Body Styles", "expected": "Auto Buying and Selling", @@ -61,9 +76,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Automotive", - "model_output.classification.iab_content.tier2.label": "Auto Type" + "model_output.classification.iab_content.tier2.label": "Auto 
Shows" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -73,7 +88,12 @@ "id": "auto-buying-hard", "mismatches": [ { - "actual": "Auto Type", + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Auto Shows", "expected": "Auto Buying and Selling", "path": "model_output.classification.iab_content.tier2.label" } @@ -85,10 +105,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -104,17 +124,12 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Computing", + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Software and Applications", + "actual": null, "expected": "Sales", "path": "model_output.classification.iab_content.tier3.label" } @@ -127,9 +142,9 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet" + "model_output.classification.iab_content.tier1.label": "Careers", + 
"model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -140,7 +155,7 @@ "id": "sales-crm-medium", "mismatches": [ { - "actual": "Technology & Computing", + "actual": "Careers", "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, @@ -150,12 +165,12 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Computing", + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Internet", + "actual": null, "expected": "Sales", "path": "model_output.classification.iab_content.tier3.label" } @@ -169,7 +184,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Business and Finance", - "model_output.classification.iab_content.tier2.label": "Business", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -185,6 +200,11 @@ "expected": "exact", "path": "model_output.classification.iab_content.mapping_mode" }, + { + "actual": null, + "expected": "Business", + "path": "model_output.classification.iab_content.tier2.label" + }, { "actual": null, "expected": "Sales", @@ -199,8 +219,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Hobbies & Interests", - "model_output.classification.iab_content.tier2.label": "Content Production", + "model_output.classification.iab_content.tier1.label": "Technology & Computing", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -212,7 +232,7 @@ "id": "marketing-tools-easy", 
"mismatches": [ { - "actual": "Hobbies & Interests", + "actual": "Technology & Computing", "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, @@ -222,7 +242,7 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Content Production", + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, @@ -240,8 +260,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Sensitive Topics", - "model_output.classification.iab_content.tier2.label": "Online Piracy", + "model_output.classification.iab_content.tier1.label": "Business and Finance", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -252,18 +272,13 @@ }, "id": "marketing-tools-medium", "mismatches": [ - { - "actual": "Sensitive Topics", - "expected": "Business and Finance", - "path": "model_output.classification.iab_content.tier1.label" - }, { "actual": "nearest_equivalent", "expected": "exact", "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Online Piracy", + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, @@ -281,9 +296,9 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Genres", - "model_output.classification.iab_content.tier2.label": "Talk Radio", - "model_output.classification.iab_content.tier3.label": "Public Radio" + "model_output.classification.iab_content.tier1.label": "Business and Finance", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ 
-293,23 +308,18 @@ }, "id": "marketing-tools-hard", "mismatches": [ - { - "actual": "Genres", - "expected": "Business and Finance", - "path": "model_output.classification.iab_content.tier1.label" - }, { "actual": "nearest_equivalent", "expected": "exact", "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Talk Radio", + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Public Radio", + "actual": null, "expected": "Marketing and Advertising", "path": "model_output.classification.iab_content.tier3.label" } @@ -322,8 +332,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Careers", - "model_output.classification.iab_content.tier2.label": "Job Search", + "model_output.classification.iab_content.tier1.label": "Technology & Computing", + "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -335,7 +345,7 @@ "id": "business-it-easy", "mismatches": [ { - "actual": "Careers", + "actual": "Technology & Computing", "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, @@ -345,7 +355,7 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Job Search", + "actual": "Computing", "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, @@ -364,7 +374,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Business and Finance", - "model_output.classification.iab_content.tier2.label": "Business", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -380,6 +390,11 @@ "expected": "exact", "path": 
"model_output.classification.iab_content.mapping_mode" }, + { + "actual": null, + "expected": "Business", + "path": "model_output.classification.iab_content.tier2.label" + }, { "actual": null, "expected": "Business I.T.", @@ -396,7 +411,7 @@ "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -422,7 +437,7 @@ "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Software and Applications", + "actual": null, "expected": "Business I.T.", "path": "model_output.classification.iab_content.tier3.label" } @@ -434,9 +449,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Sports", - "model_output.classification.iab_content.tier2.label": "Table Tennis" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Food & Drink", + "model_output.classification.iab_content.tier2.label": "Dining Out" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -444,33 +459,17 @@ "model_output.classification.iab_content.tier2.label": "Dining Out" }, "id": "dining-out-easy", - "mismatches": [ - { - "actual": "Sports", - "expected": "Food & Drink", - "path": "model_output.classification.iab_content.tier1.label" - }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Table Tennis", - "expected": "Dining Out", - "path": 
"model_output.classification.iab_content.tier2.label" - } - ], + "mismatches": [], "notes": "Cross-vertical easy IAB mapping case for Food & Drink > Dining Out.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Book a table for six tonight" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Attractions", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Food & Drink", + "model_output.classification.iab_content.tier2.label": "Dining Out" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -478,33 +477,17 @@ "model_output.classification.iab_content.tier2.label": "Dining Out" }, "id": "dining-out-medium", - "mismatches": [ - { - "actual": "Attractions", - "expected": "Food & Drink", - "path": "model_output.classification.iab_content.tier1.label" - }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Dining Out", - "path": "model_output.classification.iab_content.tier2.label" - } - ], + "mismatches": [], "notes": "Cross-vertical medium IAB mapping case for Food & Drink > Dining Out.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Good restaurants for a client dinner downtown" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.tier2.label": "Dining Out" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -512,28 +495,17 @@ 
"model_output.classification.iab_content.tier2.label": "Dining Out" }, "id": "dining-out-hard", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Dining Out", - "path": "model_output.classification.iab_content.tier2.label" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Food & Drink > Dining Out.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Need a place to eat tonight where I can make a reservation online" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Food & Drink", + "model_output.classification.iab_content.tier2.label": "Alcoholic Beverages" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -541,33 +513,17 @@ "model_output.classification.iab_content.tier2.label": "Alcoholic Beverages" }, "id": "alcoholic-beverages-easy", - "mismatches": [ - { - "actual": "Style & Fashion", - "expected": "Food & Drink", - "path": "model_output.classification.iab_content.tier1.label" - }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Alcoholic Beverages", - "path": "model_output.classification.iab_content.tier2.label" - } - ], + "mismatches": [], "notes": "Cross-vertical easy IAB mapping case for Food & Drink > Alcoholic Beverages.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Which whiskey cocktail should I order?" 
}, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.tier2.label": "Alcoholic Beverages" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -575,28 +531,17 @@ "model_output.classification.iab_content.tier2.label": "Alcoholic Beverages" }, "id": "alcoholic-beverages-medium", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Alcoholic Beverages", - "path": "model_output.classification.iab_content.tier2.label" - } - ], + "mismatches": [], "notes": "Cross-vertical medium IAB mapping case for Food & Drink > Alcoholic Beverages.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Best vodka drinks for beginners" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.tier2.label": "Non-Alcoholic Beverages" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -606,12 +551,7 @@ "id": "alcoholic-beverages-hard", "mismatches": [ { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, + "actual": "Non-Alcoholic Beverages", "expected": "Alcoholic Beverages", "path": "model_output.classification.iab_content.tier2.label" } @@ -624,7 +564,7 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": 
"nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Sensitive Topics", + "model_output.classification.iab_content.tier1.label": "Careers", "model_output.classification.iab_content.tier2.label": null }, "expected": { @@ -635,7 +575,7 @@ "id": "artificial-intelligence-easy", "mismatches": [ { - "actual": "Sensitive Topics", + "actual": "Careers", "expected": "Technology & Computing", "path": "model_output.classification.iab_content.tier1.label" }, @@ -657,9 +597,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Sensitive Topics", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Education", + "model_output.classification.iab_content.tier2.label": "Language Learning" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -669,17 +609,12 @@ "id": "artificial-intelligence-medium", "mismatches": [ { - "actual": "Sensitive Topics", + "actual": "Education", "expected": "Technology & Computing", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, + "actual": "Language Learning", "expected": "Artificial Intelligence", "path": "model_output.classification.iab_content.tier2.label" } @@ -691,7 +626,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Education", "model_output.classification.iab_content.tier2.label": "Language Learning" }, @@ -707,11 +642,6 @@ "expected": "Technology & Computing", "path": 
"model_output.classification.iab_content.tier1.label" }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, { "actual": "Language Learning", "expected": "Artificial Intelligence", @@ -727,8 +657,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -742,6 +672,16 @@ "actual": "nearest_equivalent", "expected": "exact", "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Computing > Computer Software and Applications.", @@ -753,8 +693,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -768,6 +708,16 @@ "actual": "nearest_equivalent", "expected": "exact", "path": 
"model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Computer Software and Applications.", @@ -779,8 +729,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -794,6 +744,16 @@ "actual": "nearest_equivalent", "expected": "exact", "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Computer Software and Applications.", @@ -803,7 +763,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Careers", "model_output.classification.iab_content.tier2.label": "Remote Working", "model_output.classification.iab_content.tier3.label": null, @@ -823,11 +783,6 @@ "expected": "Technology & Computing", "path": 
"model_output.classification.iab_content.tier1.label" }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, { "actual": "Remote Working", "expected": "Computing", @@ -854,7 +809,7 @@ "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet", + "model_output.classification.iab_content.tier3.label": null, "model_output.classification.iab_content.tier4.label": null }, "expected": { @@ -872,7 +827,7 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Internet", + "actual": null, "expected": "Software and Applications", "path": "model_output.classification.iab_content.tier3.label" }, @@ -891,7 +846,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Careers", - "model_output.classification.iab_content.tier2.label": "Remote Working", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null, "model_output.classification.iab_content.tier4.label": null }, @@ -915,7 +870,7 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Remote Working", + "actual": null, "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -975,11 +930,11 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": 
"Internet", - "model_output.classification.iab_content.tier4.label": null + "model_output.classification.iab_content.tier4.label": "Web Hosting" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -989,30 +944,19 @@ "model_output.classification.iab_content.tier4.label": "Web Hosting" }, "id": "web-hosting-medium", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Web Hosting", - "path": "model_output.classification.iab_content.tier4.label" - } - ], + "mismatches": [], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Internet > Web Hosting.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Best hosting platform for a startup website" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Internet", - "model_output.classification.iab_content.tier4.label": null + "model_output.classification.iab_content.tier4.label": "Web Hosting" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1022,26 +966,15 @@ "model_output.classification.iab_content.tier4.label": "Web Hosting" }, "id": "web-hosting-hard", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Web Hosting", - "path": "model_output.classification.iab_content.tier4.label" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Internet > Web Hosting.", - "pass": 
false, + "pass": true, "status": "must_fix", "text": "Need a managed hosting provider to deploy and run our marketing site" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Laptops" @@ -1053,21 +986,15 @@ "model_output.classification.iab_content.tier3.label": "Laptops" }, "id": "laptops-easy", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Computing > Laptops.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Which laptop should I buy for college?" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Laptops" @@ -1079,21 +1006,15 @@ "model_output.classification.iab_content.tier3.label": "Laptops" }, "id": "laptops-medium", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Laptops.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Best laptop for work and study under 1200" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + 
"model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Laptops" @@ -1105,15 +1026,9 @@ "model_output.classification.iab_content.tier3.label": "Laptops" }, "id": "laptops-hard", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Laptops.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Need a portable computer with good battery life for everyday work" }, @@ -1150,10 +1065,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier3.label": "Desktops" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1162,29 +1077,18 @@ "model_output.classification.iab_content.tier3.label": "Desktops" }, "id": "desktops-medium", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Software and Applications", - "expected": "Desktops", - "path": "model_output.classification.iab_content.tier3.label" - } - ], + "mismatches": [], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Desktops.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Which desktop computer should I 
buy for a home office?" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier3.label": "Desktops" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1193,29 +1097,18 @@ "model_output.classification.iab_content.tier3.label": "Desktops" }, "id": "desktops-hard", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Desktops", - "path": "model_output.classification.iab_content.tier3.label" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Desktops.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Need a desktop PC with strong performance for creative work" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Consumer Electronics", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier3.label": "Smartphones" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1224,29 +1117,18 @@ "model_output.classification.iab_content.tier3.label": "Smartphones" }, "id": "smartphones-easy", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - 
"actual": null, - "expected": "Smartphones", - "path": "model_output.classification.iab_content.tier3.label" - } - ], + "mismatches": [], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Consumer Electronics > Smartphones.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Best phone with a good camera under 700" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Consumer Electronics", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier3.label": "Smartphones" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1255,26 +1137,15 @@ "model_output.classification.iab_content.tier3.label": "Smartphones" }, "id": "smartphones-medium", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Smartphones", - "path": "model_output.classification.iab_content.tier3.label" - } - ], + "mismatches": [], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Consumer Electronics > Smartphones.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Should I buy an iPhone or Pixel this year?" 
}, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Consumer Electronics", "model_output.classification.iab_content.tier3.label": "Smartphones" @@ -1286,15 +1157,9 @@ "model_output.classification.iab_content.tier3.label": "Smartphones" }, "id": "smartphones-hard", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Consumer Electronics > Smartphones.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Need a new smartphone with strong battery life and a clean software experience" }, @@ -1316,7 +1181,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion" }, "expected": { @@ -1324,15 +1189,21 @@ "model_output.classification.iab_content.tier1.label": "Style & Fashion" }, "id": "style-fashion-parent-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping case for Style & Fashion.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Affordable fashion accessories for everyday wear" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion" }, "expected": { @@ -1340,18 +1211,24 @@ 
"model_output.classification.iab_content.tier1.label": "Style & Fashion" }, "id": "style-fashion-parent-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical hard IAB mapping case for Style & Fashion.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need style recommendations for clothing and footwear without a specific brand in mind" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": "Women's Fashion", - "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Sports", + "model_output.classification.iab_content.tier2.label": "Walking", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1362,9 +1239,19 @@ "id": "womens-shoes-easy", "mismatches": [ { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": "Sports", + "expected": "Style & Fashion", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "Walking", + "expected": "Women's Fashion", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Women's Shoes and Footwear", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical easy IAB mapping case for Style & Fashion > Women's Fashion > Women's Shoes and Footwear.", @@ -1374,10 +1261,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": 
"nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": "Women's Fashion", - "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Sports", + "model_output.classification.iab_content.tier2.label": "Walking", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1388,9 +1275,19 @@ "id": "womens-shoes-medium", "mismatches": [ { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": "Sports", + "expected": "Style & Fashion", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "Walking", + "expected": "Women's Fashion", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Women's Shoes and Footwear", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical medium IAB mapping case for Style & Fashion > Women's Fashion > Women's Shoes and Footwear.", @@ -1400,7 +1297,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", "model_output.classification.iab_content.tier2.label": "Women's Fashion", "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" @@ -1412,15 +1309,9 @@ "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" }, "id": "womens-shoes-hard", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": 
"model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Style & Fashion > Women's Fashion > Women's Shoes and Footwear.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Need women's footwear for commuting that looks polished but feels comfortable" }, @@ -1429,7 +1320,7 @@ "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Style & Fashion", "model_output.classification.iab_content.tier2.label": "Men's Fashion", - "model_output.classification.iab_content.tier3.label": "Men's Shoes and Footwear" + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1443,6 +1334,11 @@ "actual": "nearest_equivalent", "expected": "exact", "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Men's Shoes and Footwear", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical easy IAB mapping case for Style & Fashion > Men's Fashion > Men's Shoes and Footwear.", @@ -1452,10 +1348,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", "model_output.classification.iab_content.tier2.label": "Men's Fashion", - "model_output.classification.iab_content.tier3.label": "Men's Shoes and Footwear" + "model_output.classification.iab_content.tier3.label": "Men's Clothing" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1466,9 +1362,9 @@ "id": "mens-shoes-medium", "mismatches": [ { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": "Men's 
Clothing", + "expected": "Men's Shoes and Footwear", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical medium IAB mapping case for Style & Fashion > Men's Fashion > Men's Shoes and Footwear.", @@ -1478,7 +1374,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", "model_output.classification.iab_content.tier2.label": "Men's Fashion", "model_output.classification.iab_content.tier3.label": "Men's Shoes and Footwear" @@ -1490,24 +1386,18 @@ "model_output.classification.iab_content.tier3.label": "Men's Shoes and Footwear" }, "id": "mens-shoes-hard", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Style & Fashion > Men's Fashion > Men's Shoes and Footwear.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Need men's footwear that works for workdays and weekend walking" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Attractions", - "model_output.classification.iab_content.tier2.label": "Nightclubs", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Travel", + "model_output.classification.iab_content.tier2.label": "Travel Type", + "model_output.classification.iab_content.tier3.label": "Hotels and Motels" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1516,39 +1406,18 @@ "model_output.classification.iab_content.tier3.label": "Hotels and Motels" }, "id": "hotels-easy", - 
"mismatches": [ - { - "actual": "Attractions", - "expected": "Travel", - "path": "model_output.classification.iab_content.tier1.label" - }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Nightclubs", - "expected": "Travel Type", - "path": "model_output.classification.iab_content.tier2.label" - }, - { - "actual": null, - "expected": "Hotels and Motels", - "path": "model_output.classification.iab_content.tier3.label" - } - ], + "mismatches": [], "notes": "Cross-vertical easy IAB mapping case for Travel > Travel Type > Hotels and Motels.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Need a hotel in Chicago for two nights" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Travel", "model_output.classification.iab_content.tier2.label": "Travel Type", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier3.label": "Hotels and Motels" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1557,26 +1426,15 @@ "model_output.classification.iab_content.tier3.label": "Hotels and Motels" }, "id": "hotels-medium", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Hotels and Motels", - "path": "model_output.classification.iab_content.tier3.label" - } - ], + "mismatches": [], "notes": "Cross-vertical medium IAB mapping case for Travel > Travel Type > Hotels and Motels.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Best hotels near Times Square for a weekend trip" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", 
+ "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Travel", "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null @@ -1589,11 +1447,6 @@ }, "id": "hotels-hard", "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, { "actual": null, "expected": "Travel Type", @@ -1612,7 +1465,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Real Estate", "model_output.classification.iab_content.tier2.label": "Apartments" }, @@ -1623,11 +1476,6 @@ }, "id": "real-estate-rentals-easy", "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, { "actual": "Apartments", "expected": "Real Estate Renting and Leasing", @@ -1641,7 +1489,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Real Estate", "model_output.classification.iab_content.tier2.label": "Apartments" }, @@ -1652,6 +1500,11 @@ }, "id": "real-estate-rentals-medium", "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, { "actual": "Apartments", "expected": "Real Estate Renting and Leasing", @@ -1665,9 +1518,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Real Estate", - 
"model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.tier2.label": "Real Estate Renting and Leasing" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1675,29 +1528,18 @@ "model_output.classification.iab_content.tier2.label": "Real Estate Renting and Leasing" }, "id": "real-estate-rentals-hard", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Real Estate Renting and Leasing", - "path": "model_output.classification.iab_content.tier2.label" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Real Estate > Real Estate Renting and Leasing.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Need rental listings for a short move, not home-buying advice" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Healthy Living", - "model_output.classification.iab_content.tier2.label": "Fitness and Exercise", - "model_output.classification.iab_content.tier3.label": "Running and Jogging" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Sports", + "model_output.classification.iab_content.tier2.label": "Walking", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1708,9 +1550,19 @@ "id": "running-and-jogging-easy", "mismatches": [ { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": "Sports", + "expected": "Healthy Living", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "Walking", + "expected": "Fitness and Exercise", + "path": 
"model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Running and Jogging", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical easy IAB mapping case for Healthy Living > Fitness and Exercise > Running and Jogging.", @@ -1720,10 +1572,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Healthy Living", - "model_output.classification.iab_content.tier2.label": "Fitness and Exercise", - "model_output.classification.iab_content.tier3.label": "Running and Jogging" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Sports", + "model_output.classification.iab_content.tier2.label": "Walking", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1734,9 +1586,19 @@ "id": "running-and-jogging-medium", "mismatches": [ { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": "Sports", + "expected": "Healthy Living", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "Walking", + "expected": "Fitness and Exercise", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Running and Jogging", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical medium IAB mapping case for Healthy Living > Fitness and Exercise > Running and Jogging.", @@ -1746,10 +1608,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Healthy Living", - "model_output.classification.iab_content.tier2.label": "Fitness and Exercise", - 
"model_output.classification.iab_content.tier3.label": "Running and Jogging" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Sports", + "model_output.classification.iab_content.tier2.label": "Walking", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1760,9 +1622,19 @@ "id": "running-and-jogging-hard", "mismatches": [ { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": "Sports", + "expected": "Healthy Living", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "Walking", + "expected": "Fitness and Exercise", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Running and Jogging", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical hard IAB mapping case for Healthy Living > Fitness and Exercise > Running and Jogging.", @@ -1772,9 +1644,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Sports", - "model_output.classification.iab_content.tier2.label": "Australian Rules Football" + "model_output.classification.iab_content.tier2.label": "Soccer" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1782,28 +1654,17 @@ "model_output.classification.iab_content.tier2.label": "Soccer" }, "id": "soccer-easy", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Australian Rules Football", - "expected": "Soccer", - "path": "model_output.classification.iab_content.tier2.label" - 
} - ], + "mismatches": [], "notes": "Cross-vertical easy IAB mapping case for Sports > Soccer.", - "pass": false, + "pass": true, "status": "must_fix", "text": "How do offside rules work in soccer?" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Sports", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.tier2.label": "Soccer" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1811,28 +1672,17 @@ "model_output.classification.iab_content.tier2.label": "Soccer" }, "id": "soccer-medium", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Soccer", - "path": "model_output.classification.iab_content.tier2.label" - } - ], + "mismatches": [], "notes": "Cross-vertical medium IAB mapping case for Sports > Soccer.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Best soccer drills for beginner players" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Sports", - "model_output.classification.iab_content.tier2.label": "Fantasy Sports" + "model_output.classification.iab_content.tier2.label": "Soccer" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1840,26 +1690,15 @@ "model_output.classification.iab_content.tier2.label": "Soccer" }, "id": "soccer-hard", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Fantasy Sports", - "expected": "Soccer", - "path": 
"model_output.classification.iab_content.tier2.label" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Sports > Soccer.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Need help understanding football tactics for the Premier League, not fantasy sports" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Genres", "model_output.classification.iab_content.tier2.label": "Fantasy" }, @@ -1875,11 +1714,6 @@ "expected": "Books and Literature", "path": "model_output.classification.iab_content.tier1.label" }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, { "actual": "Fantasy", "expected": "Fiction", @@ -1893,9 +1727,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Books and Literature", - "model_output.classification.iab_content.tier2.label": "Fiction" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Travel", + "model_output.classification.iab_content.tier2.label": "Travel Type" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1903,17 +1737,33 @@ "model_output.classification.iab_content.tier2.label": "Fiction" }, "id": "fiction-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "Travel", + "expected": "Books and Literature", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Travel Type", + "expected": "Fiction", + "path": 
"model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Books and Literature > Fiction.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best fiction books for a long flight" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Books and Literature", - "model_output.classification.iab_content.tier2.label": "Comics and Graphic Novels" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Genres", + "model_output.classification.iab_content.tier2.label": "Romance" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1923,12 +1773,12 @@ "id": "fiction-hard", "mismatches": [ { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": "Genres", + "expected": "Books and Literature", + "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Comics and Graphic Novels", + "actual": "Romance", "expected": "Fiction", "path": "model_output.classification.iab_content.tier2.label" } @@ -1940,7 +1790,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Home & Garden", "model_output.classification.iab_content.tier2.label": "Remodeling & Construction" }, @@ -1951,11 +1801,6 @@ }, "id": "home-improvement-easy", "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, { "actual": "Remodeling & Construction", "expected": "Home Improvement", @@ -1969,9 +1814,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": 
"nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": "Personal Care" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Home & Garden", + "model_output.classification.iab_content.tier2.label": "Indoor Environmental Quality" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -1981,17 +1826,7 @@ "id": "home-improvement-medium", "mismatches": [ { - "actual": "Style & Fashion", - "expected": "Home & Garden", - "path": "model_output.classification.iab_content.tier1.label" - }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Personal Care", + "actual": "Indoor Environmental Quality", "expected": "Home Improvement", "path": "model_output.classification.iab_content.tier2.label" } @@ -2003,9 +1838,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Home & Garden", - "model_output.classification.iab_content.tier2.label": "Interior Decorating" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -2015,12 +1850,7 @@ "id": "home-improvement-hard", "mismatches": [ { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Interior Decorating", + "actual": null, "expected": "Home Improvement", "path": "model_output.classification.iab_content.tier2.label" } @@ -2033,8 +1863,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": 
"Education", - "model_output.classification.iab_content.tier2.label": "Online Education" + "model_output.classification.iab_content.tier1.label": "Technology & Computing", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -2043,10 +1873,20 @@ }, "id": "online-education-easy", "mismatches": [ + { + "actual": "Technology & Computing", + "expected": "Education", + "path": "model_output.classification.iab_content.tier1.label" + }, { "actual": "nearest_equivalent", "expected": "exact", "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Online Education", + "path": "model_output.classification.iab_content.tier2.label" } ], "notes": "Cross-vertical easy IAB mapping case for Education > Online Education.", @@ -2056,7 +1896,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Careers", "model_output.classification.iab_content.tier2.label": "Remote Working" }, @@ -2072,11 +1912,6 @@ "expected": "Education", "path": "model_output.classification.iab_content.tier1.label" }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, { "actual": "Remote Working", "expected": "Online Education", @@ -2091,8 +1926,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing" + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -2102,7 +1937,7 @@ "id": 
"online-education-hard", "mismatches": [ { - "actual": "Technology & Computing", + "actual": "Careers", "expected": "Education", "path": "model_output.classification.iab_content.tier1.label" }, @@ -2112,7 +1947,7 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Computing", + "actual": null, "expected": "Online Education", "path": "model_output.classification.iab_content.tier2.label" } @@ -2124,7 +1959,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Education", "model_output.classification.iab_content.tier2.label": "College Education", "model_output.classification.iab_content.tier3.label": "Postgraduate Education" @@ -2136,15 +1971,9 @@ "model_output.classification.iab_content.tier3.label": "Postgraduate Education" }, "id": "postgraduate-education-easy", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical easy IAB mapping case for Education > College Education > Postgraduate Education.", - "pass": false, + "pass": true, "status": "must_fix", "text": "best universities to study masters" }, @@ -2152,8 +1981,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Education", - "model_output.classification.iab_content.tier2.label": "College Education", - "model_output.classification.iab_content.tier3.label": "Postgraduate Education" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -2167,6 +1996,16 @@ "actual": "nearest_equivalent", "expected": "exact", "path": 
"model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "College Education", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Postgraduate Education", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical medium IAB mapping case for Education > College Education > Postgraduate Education.", @@ -2176,7 +2015,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Education", "model_output.classification.iab_content.tier2.label": "College Education", "model_output.classification.iab_content.tier3.label": "Postgraduate Education" @@ -2188,15 +2027,9 @@ "model_output.classification.iab_content.tier3.label": "Postgraduate Education" }, "id": "postgraduate-education-hard", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Education > College Education > Postgraduate Education.", - "pass": false, + "pass": true, "status": "must_fix", "text": "need postgraduate options for a master's degree, not short online courses" }, @@ -2246,7 +2079,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Medical Health" }, "expected": { @@ -2254,21 +2087,15 @@ "model_output.classification.iab_content.tier1.label": "Medical Health" }, "id": "medical-health-hard", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": 
"Cross-vertical hard IAB mapping case for Medical Health.", - "pass": false, + "pass": true, "status": "must_fix", "text": "need medical advice about symptoms, not wellness or fitness tips" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Careers", "model_output.classification.iab_content.tier2.label": "Remote Working" }, @@ -2279,11 +2106,6 @@ }, "id": "careers-job-search-easy", "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, { "actual": "Remote Working", "expected": "Job Search", @@ -2297,7 +2119,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Business and Finance", "model_output.classification.iab_content.tier2.label": "Industries" }, @@ -2313,11 +2135,6 @@ "expected": "Careers", "path": "model_output.classification.iab_content.tier1.label" }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, { "actual": "Industries", "expected": "Job Search", @@ -2332,8 +2149,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Business and Finance", - "model_output.classification.iab_content.tier2.label": "Industries" + "model_output.classification.iab_content.tier1.label": "Genres", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -2343,7 +2160,7 @@ "id": "careers-job-search-hard", "mismatches": [ { - "actual": "Business and Finance", + "actual": "Genres", 
"expected": "Careers", "path": "model_output.classification.iab_content.tier1.label" }, @@ -2353,7 +2170,7 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Industries", + "actual": null, "expected": "Job Search", "path": "model_output.classification.iab_content.tier2.label" } @@ -2366,7 +2183,7 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Personal Finance", + "model_output.classification.iab_content.tier1.label": "Personal Celebrations & Life Events", "model_output.classification.iab_content.tier2.label": null }, "expected": { @@ -2376,6 +2193,11 @@ }, "id": "personal-finance-easy", "mismatches": [ + { + "actual": "Personal Celebrations & Life Events", + "expected": "Personal Finance", + "path": "model_output.classification.iab_content.tier1.label" + }, { "actual": "nearest_equivalent", "expected": "exact", @@ -2394,7 +2216,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Personal Finance", "model_output.classification.iab_content.tier2.label": null }, @@ -2405,11 +2227,6 @@ }, "id": "personal-finance-medium", "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, { "actual": null, "expected": "Financial Planning", @@ -2423,9 +2240,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Personal Finance", - "model_output.classification.iab_content.tier2.label": "Retirement Planning" + "model_output.classification.iab_content.tier2.label": null }, "expected": { 
"model_output.classification.iab_content.mapping_mode": "exact", @@ -2435,12 +2252,7 @@ "id": "personal-finance-hard", "mismatches": [ { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Retirement Planning", + "actual": null, "expected": "Financial Planning", "path": "model_output.classification.iab_content.tier2.label" } @@ -2452,7 +2264,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Family and Relationships", "model_output.classification.iab_content.tier2.label": "Parenting" }, @@ -2462,15 +2274,9 @@ "model_output.classification.iab_content.tier2.label": "Parenting" }, "id": "parenting-easy", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical easy IAB mapping case for Family and Relationships > Parenting.", - "pass": false, + "pass": true, "status": "must_fix", "text": "tips for parenting a toddler" }, @@ -2478,7 +2284,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Education", - "model_output.classification.iab_content.tier2.label": "Online Education" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -2498,7 +2304,7 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Online Education", + "actual": null, "expected": "Parenting", "path": "model_output.classification.iab_content.tier2.label" } @@ -2510,7 +2316,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + 
"model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Family and Relationships", "model_output.classification.iab_content.tier2.label": "Parenting" }, @@ -2520,21 +2326,15 @@ "model_output.classification.iab_content.tier2.label": "Parenting" }, "id": "parenting-hard", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Family and Relationships > Parenting.", - "pass": false, + "pass": true, "status": "must_fix", "text": "need parenting advice for a child starting preschool" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Home & Garden", "model_output.classification.iab_content.tier2.label": "Gardening" }, @@ -2544,22 +2344,16 @@ "model_output.classification.iab_content.tier2.label": "Gardening" }, "id": "gardening-easy", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical easy IAB mapping case for Home & Garden > Gardening.", - "pass": false, + "pass": true, "status": "must_fix", "text": "best plants for a small balcony garden" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Personal Finance", + "model_output.classification.iab_content.tier1.label": "Food & Drink", "model_output.classification.iab_content.tier2.label": null }, "expected": { @@ -2570,7 +2364,7 @@ "id": "gardening-medium", "mismatches": [ { - "actual": "Personal Finance", + "actual": "Food & Drink", "expected": "Home & Garden", "path": 
"model_output.classification.iab_content.tier1.label" }, @@ -2592,9 +2386,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Home & Garden", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.tier2.label": "Gardening" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -2602,28 +2396,17 @@ "model_output.classification.iab_content.tier2.label": "Gardening" }, "id": "gardening-hard", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Gardening", - "path": "model_output.classification.iab_content.tier2.label" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Home & Garden > Gardening.", - "pass": false, + "pass": true, "status": "must_fix", "text": "need gardening advice for a shady backyard, not interior decor ideas" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Genres", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Entertainment", + "model_output.classification.iab_content.tier2.label": "Movies" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -2631,31 +2414,15 @@ "model_output.classification.iab_content.tier2.label": "Movies" }, "id": "movies-easy", - "mismatches": [ - { - "actual": "Genres", - "expected": "Entertainment", - "path": "model_output.classification.iab_content.tier1.label" - }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": 
"model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Movies", - "path": "model_output.classification.iab_content.tier2.label" - } - ], + "mismatches": [], "notes": "Cross-vertical easy IAB mapping case for Entertainment > Movies.", - "pass": false, + "pass": true, "status": "must_fix", "text": "What movie should we watch tonight?" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Genres", "model_output.classification.iab_content.tier2.label": "Horror" }, @@ -2671,11 +2438,6 @@ "expected": "Entertainment", "path": "model_output.classification.iab_content.tier1.label" }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, { "actual": "Horror", "expected": "Movies", @@ -2689,9 +2451,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Entertainment", - "model_output.classification.iab_content.tier2.label": "Music" + "model_output.classification.iab_content.tier2.label": "Movies" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -2699,20 +2461,9 @@ "model_output.classification.iab_content.tier2.label": "Movies" }, "id": "movies-hard", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Music", - "expected": "Movies", - "path": "model_output.classification.iab_content.tier2.label" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Entertainment > Movies.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Looking for film 
recommendations, not TV shows or music" } diff --git a/artifacts/evaluation/latest/iab_quality_target_eval.json b/artifacts/evaluation/latest/iab_quality_target_eval.json index 7ecaed190f3901a692a7a506a5b975b33e49e2b0..8b120400be86c2a90bde8a37dd059bdaa09bd8ec 100644 --- a/artifacts/evaluation/latest/iab_quality_target_eval.json +++ b/artifacts/evaluation/latest/iab_quality_target_eval.json @@ -1,15 +1,15 @@ { "by_status": { "must_fix": { - "failed": 12, - "passed": 0, + "failed": 9, + "passed": 3, "total": 12 } }, - "cases_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/examples/iab_mapping_cases.json", + "cases_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/examples/iab_mapping_cases.json", "count": 12, - "failed": 12, - "passed": 0, + "failed": 9, + "passed": 3, "results": [ { "actual": { @@ -42,7 +42,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Laptops" @@ -54,15 +54,9 @@ "model_output.classification.iab_content.tier3.label": "Laptops" }, "id": "laptop-buying-maps-to-laptops", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Laptop shopping should resolve into the laptops branch, not business sales.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Which laptop to buy in 2026" }, @@ -70,7 +64,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", 
"model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -81,6 +75,11 @@ }, "id": "labtop-buying-maps-to-laptops", "mismatches": [ + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, { "actual": null, "expected": "Laptops", @@ -99,10 +98,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -118,14 +117,19 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Computing", + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Software and Applications", + "actual": null, "expected": "Sales", "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "CRM education should resolve to the closest business/sales path, not generic software.", @@ -136,9 +140,9 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - 
"model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -149,17 +153,17 @@ "id": "crm-comparison-maps-to-sales", "mismatches": [ { - "actual": "Technology & Computing", + "actual": "Careers", "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Computing", + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Software and Applications", + "actual": null, "expected": "Sales", "path": "model_output.classification.iab_content.tier3.label" }, @@ -177,8 +181,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Hobbies & Interests", - "model_output.classification.iab_content.tier2.label": "Content Production", + "model_output.classification.iab_content.tier1.label": "Technology & Computing", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -190,12 +194,12 @@ "id": "marketing-tools-map-to-marketing", "mismatches": [ { - "actual": "Hobbies & Interests", + "actual": "Technology & Computing", "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Content Production", + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, @@ -218,8 +222,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - 
"model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing" + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -229,7 +233,12 @@ "id": "ml-explanation-maps-to-ai", "mismatches": [ { - "actual": "Computing", + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, "expected": "Artificial Intelligence", "path": "model_output.classification.iab_content.tier2.label" }, @@ -247,9 +256,9 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet" + "model_output.classification.iab_content.tier1.label": "Personal Finance", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -260,17 +269,17 @@ "id": "support-credential-help-maps-to-business-it", "mismatches": [ { - "actual": "Technology & Computing", + "actual": "Personal Finance", "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Computing", + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Internet", + "actual": null, "expected": "Business I.T.", "path": "model_output.classification.iab_content.tier3.label" } @@ -282,7 +291,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": 
"nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink", "model_output.classification.iab_content.tier2.label": "Dining Out" }, @@ -292,23 +301,17 @@ "model_output.classification.iab_content.tier2.label": "Dining Out" }, "id": "restaurant-booking-maps-to-dining-out", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Generic dining requests should not inherit the repo's business default.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Book a table for 2 tonight" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Business and Finance", - "model_output.classification.iab_content.tier2.label": "Business", + "model_output.classification.iab_content.tier1.label": "Sensitive Topics", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -320,12 +323,12 @@ "id": "trial-signup-maps-to-software", "mismatches": [ { - "actual": "Business and Finance", + "actual": "Sensitive Topics", "expected": "Technology & Computing", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Business", + "actual": null, "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -342,10 +345,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications", + "model_output.classification.iab_content.mapping_mode": "exact", + 
"model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": "Remote Working", + "model_output.classification.iab_content.tier3.label": null, "model_output.classification.iab_content.tier4.label": null }, "expected": { @@ -358,7 +361,17 @@ "id": "communication-software-maps-to-tier4", "mismatches": [ { - "actual": "Software and Applications", + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "Remote Working", + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, "expected": "Computer Software and Applications", "path": "model_output.classification.iab_content.tier3.label" }, @@ -366,11 +379,6 @@ "actual": null, "expected": "Communication", "path": "model_output.classification.iab_content.tier4.label" - }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "Full taxonomy support should preserve the tier4 communication branch.", @@ -380,9 +388,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.tier2.label": "Alcoholic Beverages" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -390,20 +398,9 @@ "model_output.classification.iab_content.tier2.label": "Alcoholic Beverages" }, "id": "vodka-query-maps-to-alcoholic-beverages", - "mismatches": [ - { - "actual": null, - "expected": "Alcoholic Beverages", - "path": "model_output.classification.iab_content.tier2.label" - }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": 
"model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Food and beverage prompts should not fall through to the business default.", - "pass": false, + "pass": true, "status": "must_fix", "text": "what is best vodka drink should i try" } diff --git a/artifacts/evaluation/latest/intent_subtype_difficulty_benchmark_confusion_matrix.csv b/artifacts/evaluation/latest/intent_subtype_difficulty_benchmark_confusion_matrix.csv index 02b3cde7d588095d35524b67aa0741144feb45f8..0dc8a87026700a39cf0a6ce1886adc58d853b52b 100644 --- a/artifacts/evaluation/latest/intent_subtype_difficulty_benchmark_confusion_matrix.csv +++ b/artifacts/evaluation/latest/intent_subtype_difficulty_benchmark_confusion_matrix.csv @@ -1,19 +1,19 @@ ,education,product_discovery,comparison,evaluation,deal_seeking,provider_selection,signup,purchase,booking,download,contact_sales,task_execution,onboarding_setup,troubleshooting,account_help,billing_help,follow_up,emotional_reflection education,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -product_discovery,0,13,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0 -comparison,2,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +product_discovery,0,14,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0 +comparison,2,0,12,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 evaluation,1,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0 deal_seeking,0,0,0,0,14,1,0,0,0,0,0,0,0,0,0,0,0,0 -provider_selection,0,0,1,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0 +provider_selection,0,0,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0 signup,0,0,0,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0 -purchase,0,0,0,0,0,0,0,14,0,0,0,0,1,0,0,0,0,0 -booking,0,0,0,0,0,0,1,0,13,0,0,1,0,0,0,0,0,0 -download,0,0,0,0,0,0,0,0,0,14,0,1,0,0,0,0,0,0 +purchase,0,0,1,0,0,0,0,12,0,0,0,0,2,0,0,0,0,0 +booking,0,0,0,0,0,0,1,0,11,1,1,1,0,0,0,0,0,0 +download,0,0,0,0,0,0,0,0,0,13,1,1,0,0,0,0,0,0 contact_sales,0,0,0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0 -task_execution,0,0,0,0,0,0,1,0,0,0,0,17,0,0,0,0,0,0 -onboarding_setup,0,0,0,0,0,0,0,0,0,0,0,1,16,0,0,0,0,0 -troubleshooting,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0,0,1 
-account_help,0,0,0,0,0,0,2,0,0,0,0,0,0,0,12,1,0,0 -billing_help,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,0,0 +task_execution,0,0,0,0,0,0,0,0,0,0,0,17,1,0,0,0,0,0 +onboarding_setup,0,0,0,0,0,0,0,0,0,0,0,0,17,0,0,0,0,0 +troubleshooting,0,0,0,0,0,0,0,0,0,0,0,0,0,13,0,1,0,1 +account_help,0,0,0,0,0,0,0,0,0,0,0,1,0,3,11,0,0,0 +billing_help,0,0,0,0,0,0,0,0,0,0,0,0,1,4,3,7,0,0 follow_up,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,0 emotional_reflection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15 diff --git a/artifacts/evaluation/latest/intent_subtype_difficulty_benchmark_report.json b/artifacts/evaluation/latest/intent_subtype_difficulty_benchmark_report.json index cdb32dd678bc9119b66fd7dba7b2d82ac5c66e0a..616ee132c6b65d75f8bb56fcf5dd7a999c31ef99 100644 --- a/artifacts/evaluation/latest/intent_subtype_difficulty_benchmark_report.json +++ b/artifacts/evaluation/latest/intent_subtype_difficulty_benchmark_report.json @@ -1,81 +1,81 @@ { - "accepted_accuracy": 0.9386, - "accepted_coverage": 1.0, - "accuracy": 0.9386, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_difficulty_benchmark_confusion_matrix.csv", + "accepted_accuracy": 0.8982, + "accepted_coverage": 0.9928, + "accuracy": 0.8917, + "confusion_matrix_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_difficulty_benchmark_confusion_matrix.csv", "count": 277, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/subtype_benchmark.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/subtype_benchmark.jsonl", "difficulty_breakdown": { "easy": { - "accepted_accuracy": 0.9565, + 
"accepted_accuracy": 0.9239, "accepted_coverage": 1.0, - "accuracy": 0.9565, + "accuracy": 0.9239, "count": 92, "fallback_rate": 0.0, - "macro_f1": 0.9579 + "macro_f1": 0.924 }, "hard": { - "accepted_accuracy": 0.8901, - "accepted_coverage": 1.0, - "accuracy": 0.8901, + "accepted_accuracy": 0.8539, + "accepted_coverage": 0.978, + "accuracy": 0.8352, "count": 91, - "fallback_rate": 0.0, - "macro_f1": 0.8913 + "fallback_rate": 0.022, + "macro_f1": 0.8241 }, "medium": { - "accepted_accuracy": 0.9681, + "accepted_accuracy": 0.9149, "accepted_coverage": 1.0, - "accuracy": 0.9681, + "accuracy": 0.9149, "count": 94, "fallback_rate": 0.0, - "macro_f1": 0.9671 + "macro_f1": 0.9094 } }, - "fallback_rate": 0.0, + "fallback_rate": 0.0072, "head": "intent_subtype", - "macro_f1": 0.9401, + "macro_f1": 0.8876, "per_class_metrics": { "account_help": { - "f1-score": 0.8888888888888888, - "precision": 1.0, - "recall": 0.8, + "f1-score": 0.7586206896551724, + "precision": 0.7857142857142857, + "recall": 0.7333333333333333, "support": 15.0 }, - "accuracy": 0.9386281588447654, + "accuracy": 0.8916967509025271, "billing_help": { - "f1-score": 0.967741935483871, - "precision": 0.9375, - "recall": 1.0, + "f1-score": 0.6086956521739131, + "precision": 0.875, + "recall": 0.4666666666666667, "support": 15.0 }, "booking": { - "f1-score": 0.9285714285714286, + "f1-score": 0.8461538461538461, "precision": 1.0, - "recall": 0.8666666666666667, + "recall": 0.7333333333333333, "support": 15.0 }, "comparison": { - "f1-score": 0.896551724137931, - "precision": 0.9285714285714286, - "recall": 0.8666666666666667, + "f1-score": 0.8571428571428571, + "precision": 0.9230769230769231, + "recall": 0.8, "support": 15.0 }, "contact_sales": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.9375, + "precision": 0.8823529411764706, "recall": 1.0, "support": 15.0 }, "deal_seeking": { - "f1-score": 0.9333333333333333, - "precision": 0.9333333333333333, + "f1-score": 0.9655172413793104, + "precision": 1.0, 
"recall": 0.9333333333333333, "support": 15.0 }, "download": { - "f1-score": 0.9655172413793104, - "precision": 1.0, - "recall": 0.9333333333333333, + "f1-score": 0.8666666666666667, + "precision": 0.8666666666666667, + "recall": 0.8666666666666667, "support": 15.0 }, "education": { @@ -91,8 +91,8 @@ "support": 15.0 }, "evaluation": { - "f1-score": 0.9655172413793104, - "precision": 1.0, + "f1-score": 0.9333333333333333, + "precision": 0.9333333333333333, "recall": 0.9333333333333333, "support": 15.0 }, @@ -103,57 +103,57 @@ "support": 15.0 }, "macro avg": { - "f1-score": 0.9401067100194944, - "precision": 0.9476910208527856, - "recall": 0.9383215323166303, + "f1-score": 0.8875885571005383, + "precision": 0.9016030130000718, + "recall": 0.8895061728395063, "support": 277.0 }, "onboarding_setup": { - "f1-score": 0.9411764705882353, - "precision": 0.9411764705882353, - "recall": 0.9411764705882353, + "f1-score": 0.8947368421052632, + "precision": 0.8095238095238095, + "recall": 1.0, "support": 17.0 }, "product_discovery": { - "f1-score": 0.9285714285714286, + "f1-score": 0.9655172413793104, "precision": 1.0, - "recall": 0.8666666666666667, + "recall": 0.9333333333333333, "support": 15.0 }, "provider_selection": { - "f1-score": 0.9375, - "precision": 0.9375, - "recall": 0.9375, + "f1-score": 0.9696969696969697, + "precision": 0.9411764705882353, + "recall": 1.0, "support": 16.0 }, "purchase": { - "f1-score": 0.9655172413793104, + "f1-score": 0.8888888888888888, "precision": 1.0, - "recall": 0.9333333333333333, + "recall": 0.8, "support": 15.0 }, "signup": { - "f1-score": 0.8888888888888888, - "precision": 0.8, + "f1-score": 0.9696969696969697, + "precision": 0.9411764705882353, "recall": 1.0, "support": 16.0 }, "task_execution": { - "f1-score": 0.8717948717948718, - "precision": 0.8095238095238095, + "f1-score": 0.8947368421052632, + "precision": 0.85, "recall": 0.9444444444444444, "support": 18.0 }, "troubleshooting": { - "f1-score": 0.9655172413793104, - 
"precision": 1.0, - "recall": 0.9333333333333333, + "f1-score": 0.7428571428571429, + "precision": 0.65, + "recall": 0.8666666666666667, "support": 15.0 }, "weighted avg": { - "f1-score": 0.9391802821325396, - "precision": 0.9455776173285199, - "recall": 0.9386281588447654, + "f1-score": 0.8883104280399479, + "precision": 0.9006650327445612, + "recall": 0.8916967509025271, "support": 277.0 } }, diff --git a/artifacts/evaluation/latest/intent_subtype_extended_cases_confusion_matrix.csv b/artifacts/evaluation/latest/intent_subtype_extended_cases_confusion_matrix.csv index 26bf7c6a1e686fe3e5632e49940c9db43ac5ae4b..48afd42d6940b67225706592393568204bcc4de2 100644 --- a/artifacts/evaluation/latest/intent_subtype_extended_cases_confusion_matrix.csv +++ b/artifacts/evaluation/latest/intent_subtype_extended_cases_confusion_matrix.csv @@ -1,8 +1,8 @@ ,education,product_discovery,comparison,evaluation,deal_seeking,provider_selection,signup,purchase,booking,download,contact_sales,task_execution,onboarding_setup,troubleshooting,account_help,billing_help,follow_up,emotional_reflection education,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 product_discovery,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -comparison,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 -evaluation,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 +comparison,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +evaluation,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 deal_seeking,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0 provider_selection,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0 signup,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 @@ -15,5 +15,5 @@ onboarding_setup,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0 troubleshooting,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0 account_help,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0 billing_help,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -follow_up,0,0,0,0,3,0,0,0,0,0,0,0,0,0,1,0,8,0 +follow_up,1,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0,8,0 emotional_reflection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/artifacts/evaluation/latest/intent_subtype_extended_cases_report.json 
b/artifacts/evaluation/latest/intent_subtype_extended_cases_report.json index 656750a7fcc3123edb8ded70e44cb2a46381d3c5..fce9202c711a683f5f2c94352a47b1c987992f21 100644 --- a/artifacts/evaluation/latest/intent_subtype_extended_cases_report.json +++ b/artifacts/evaluation/latest/intent_subtype_extended_cases_report.json @@ -2,16 +2,16 @@ "accepted_accuracy": 0.8491, "accepted_coverage": 1.0, "accuracy": 0.8491, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_extended_cases_confusion_matrix.csv", + "confusion_matrix_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_extended_cases_confusion_matrix.csv", "count": 53, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/subtype/extended_cases.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/subtype/extended_cases.jsonl", "fallback_rate": 0.0, "head": "intent_subtype", - "macro_f1": 0.8146, + "macro_f1": 0.7764, "per_class_metrics": { "account_help": { - "f1-score": 0.6666666666666666, - "precision": 0.6666666666666666, + "f1-score": 0.8, + "precision": 1.0, "recall": 0.6666666666666666, "support": 3.0 }, @@ -29,9 +29,9 @@ "support": 0.0 }, "comparison": { - "f1-score": 0.6666666666666666, + "f1-score": 1.0, "precision": 1.0, - "recall": 0.5, + "recall": 1.0, "support": 2.0 }, "contact_sales": { @@ -41,8 +41,8 @@ "support": 0.0 }, "deal_seeking": { - "f1-score": 0.8181818181818182, - "precision": 0.6923076923076923, + "f1-score": 0.9, + "precision": 0.8181818181818182, "recall": 1.0, "support": 9.0 }, @@ -53,8 +53,8 @@ "support": 0.0 }, "education": { - 
"f1-score": 0.9333333333333333, - "precision": 0.875, + "f1-score": 0.875, + "precision": 0.7777777777777778, "recall": 1.0, "support": 7.0 }, @@ -65,9 +65,9 @@ "support": 0.0 }, "evaluation": { - "f1-score": 0.5, - "precision": 1.0, - "recall": 0.3333333333333333, + "f1-score": 0.0, + "precision": 0.0, + "recall": 0.0, "support": 3.0 }, "follow_up": { @@ -77,9 +77,9 @@ "support": 12.0 }, "macro avg": { - "f1-score": 0.4978114478114478, - "precision": 0.531517094017094, - "recall": 0.5092592592592592, + "f1-score": 0.474472286972287, + "precision": 0.46035754369087706, + "recall": 0.5185185185185186, "support": 53.0 }, "onboarding_setup": { @@ -89,8 +89,8 @@ "support": 4.0 }, "product_discovery": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.9230769230769231, + "precision": 0.8571428571428571, "recall": 1.0, "support": 6.0 }, @@ -113,8 +113,8 @@ "support": 0.0 }, "task_execution": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.6666666666666666, + "precision": 0.5, "recall": 1.0, "support": 1.0 }, @@ -125,8 +125,8 @@ "support": 1.0 }, "weighted avg": { - "f1-score": 0.8404230989136648, - "precision": 0.887215771649734, + "f1-score": 0.823438668249989, + "precision": 0.8324076342944268, "recall": 0.8490566037735849, "support": 53.0 } diff --git a/artifacts/evaluation/latest/intent_subtype_hard_cases_confusion_matrix.csv b/artifacts/evaluation/latest/intent_subtype_hard_cases_confusion_matrix.csv index 53ca2a289c0b5aa6fea57f711a74877861ddecca..5fd9792df29df77842748b968832f42b07788a2f 100644 --- a/artifacts/evaluation/latest/intent_subtype_hard_cases_confusion_matrix.csv +++ b/artifacts/evaluation/latest/intent_subtype_hard_cases_confusion_matrix.csv @@ -2,18 +2,18 @@ education,29,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 product_discovery,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 comparison,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -evaluation,0,2,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -deal_seeking,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0 
+evaluation,2,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +deal_seeking,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0 provider_selection,0,0,0,1,0,9,0,0,0,0,0,0,0,0,0,0,0,0 signup,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0 -purchase,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0 +purchase,0,0,0,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0 booking,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0 download,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 contact_sales,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 task_execution,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 onboarding_setup,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0 troubleshooting,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0 -account_help,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0 +account_help,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0 billing_help,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0 follow_up,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,11,0 emotional_reflection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/artifacts/evaluation/latest/intent_subtype_hard_cases_report.json b/artifacts/evaluation/latest/intent_subtype_hard_cases_report.json index 0d65dc588af71dd04662fca9fcdb98e9fdfa6eab..63781df427301e5b3bf3adc754dd534b28212a0f 100644 --- a/artifacts/evaluation/latest/intent_subtype_hard_cases_report.json +++ b/artifacts/evaluation/latest/intent_subtype_hard_cases_report.json @@ -1,21 +1,21 @@ { - "accepted_accuracy": 0.9468, + "accepted_accuracy": 0.883, "accepted_coverage": 1.0, - "accuracy": 0.9468, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_hard_cases_confusion_matrix.csv", + "accuracy": 0.883, + "confusion_matrix_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_hard_cases_confusion_matrix.csv", "count": 94, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/subtype/hard_cases.jsonl", + "dataset_path": 
"/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/subtype/hard_cases.jsonl", "fallback_rate": 0.0, "head": "intent_subtype", - "macro_f1": 0.9191, + "macro_f1": 0.8137, "per_class_metrics": { "account_help": { - "f1-score": 0.8, - "precision": 0.6666666666666666, - "recall": 1.0, + "f1-score": 0.5, + "precision": 0.5, + "recall": 0.5, "support": 2.0 }, - "accuracy": 0.9468085106382979, + "accuracy": 0.8829787234042553, "billing_help": { "f1-score": 1.0, "precision": 1.0, @@ -41,9 +41,9 @@ "support": 0.0 }, "deal_seeking": { - "f1-score": 1.0, + "f1-score": 0.8, "precision": 1.0, - "recall": 1.0, + "recall": 0.6666666666666666, "support": 3.0 }, "download": { @@ -53,8 +53,8 @@ "support": 0.0 }, "education": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.9666666666666667, + "precision": 0.9354838709677419, "recall": 1.0, "support": 29.0 }, @@ -65,9 +65,9 @@ "support": 0.0 }, "evaluation": { - "f1-score": 0.7272727272727273, - "precision": 0.8, - "recall": 0.6666666666666666, + "f1-score": 0.25, + "precision": 0.5, + "recall": 0.16666666666666666, "support": 6.0 }, "follow_up": { @@ -77,9 +77,9 @@ "support": 12.0 }, "macro avg": { - "f1-score": 0.7659288023895194, - "precision": 0.7648148148148147, - "recall": 0.786111111111111, + "f1-score": 0.6780983255239549, + "precision": 0.693301292494841, + "recall": 0.6935185185185184, "support": 94.0 }, "onboarding_setup": { @@ -89,26 +89,26 @@ "support": 6.0 }, "product_discovery": { - "f1-score": 0.8888888888888888, - "precision": 0.8, + "f1-score": 0.8421052631578947, + "precision": 0.7272727272727273, "recall": 1.0, "support": 8.0 }, "provider_selection": { - "f1-score": 0.9473684210526315, - "precision": 1.0, + "f1-score": 0.9, + "precision": 0.9, "recall": 0.9, "support": 10.0 }, "purchase": { - "f1-score": 1.0, + "f1-score": 0.8, "precision": 1.0, - "recall": 1.0, + "recall": 
0.6666666666666666, "support": 3.0 }, "signup": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.8571428571428571, + "precision": 0.75, "recall": 1.0, "support": 3.0 }, @@ -119,15 +119,15 @@ "support": 1.0 }, "troubleshooting": { - "f1-score": 0.8, - "precision": 1.0, + "f1-score": 0.6666666666666666, + "precision": 0.6666666666666666, "recall": 0.6666666666666666, "support": 3.0 }, "weighted avg": { - "f1-score": 0.9478016938458051, - "precision": 0.9578014184397163, - "recall": 0.9468085106382979, + "f1-score": 0.8700694845346483, + "precision": 0.879757596555812, + "recall": 0.8829787234042553, "support": 94.0 } }, diff --git a/artifacts/evaluation/latest/intent_subtype_test_confusion_matrix.csv b/artifacts/evaluation/latest/intent_subtype_test_confusion_matrix.csv index b52e8d3e2fd8ddfdfc51f7e7fce1b1ffe32cea47..9644424fd4bbe5b7b60779ac52488bcb2bea26d0 100644 --- a/artifacts/evaluation/latest/intent_subtype_test_confusion_matrix.csv +++ b/artifacts/evaluation/latest/intent_subtype_test_confusion_matrix.csv @@ -1,8 +1,8 @@ ,education,product_discovery,comparison,evaluation,deal_seeking,provider_selection,signup,purchase,booking,download,contact_sales,task_execution,onboarding_setup,troubleshooting,account_help,billing_help,follow_up,emotional_reflection education,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -product_discovery,0,7,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +product_discovery,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 comparison,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -evaluation,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +evaluation,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 deal_seeking,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0 provider_selection,0,0,0,1,0,5,0,0,0,0,0,0,0,0,0,0,0,0 signup,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0 @@ -10,10 +10,10 @@ purchase,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 booking,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0 download,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 contact_sales,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -task_execution,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0 
+task_execution,0,0,0,0,0,0,1,0,0,0,0,5,0,0,0,0,0,0 onboarding_setup,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0 troubleshooting,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0 -account_help,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0 +account_help,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0 billing_help,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -follow_up,0,0,0,0,2,0,0,0,0,0,0,0,0,1,0,0,8,0 +follow_up,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,8,0 emotional_reflection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5 diff --git a/artifacts/evaluation/latest/intent_subtype_test_report.json b/artifacts/evaluation/latest/intent_subtype_test_report.json index 439c10fb4aafd745a25e4f8c2ecf965b90346180..9addb41d38fd7bd90cf43ab5791e8964f32dbd7d 100644 --- a/artifacts/evaluation/latest/intent_subtype_test_report.json +++ b/artifacts/evaluation/latest/intent_subtype_test_report.json @@ -1,21 +1,21 @@ { - "accepted_accuracy": 0.9, + "accepted_accuracy": 0.8714, "accepted_coverage": 1.0, - "accuracy": 0.9, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_test_confusion_matrix.csv", + "accuracy": 0.8714, + "confusion_matrix_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_test_confusion_matrix.csv", "count": 70, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/subtype/test.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/subtype/test.jsonl", "fallback_rate": 0.0, "head": "intent_subtype", - "macro_f1": 0.863, + "macro_f1": 0.7807, "per_class_metrics": { "account_help": { - "f1-score": 1.0, + "f1-score": 0.6666666666666666, "precision": 1.0, - "recall": 1.0, + "recall": 0.5, 
"support": 2.0 }, - "accuracy": 0.9, + "accuracy": 0.8714285714285714, "billing_help": { "f1-score": 0.0, "precision": 0.0, @@ -29,8 +29,8 @@ "support": 3.0 }, "comparison": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.8571428571428571, + "precision": 0.75, "recall": 1.0, "support": 3.0 }, @@ -53,8 +53,8 @@ "support": 0.0 }, "education": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.9655172413793104, + "precision": 0.9333333333333333, "recall": 1.0, "support": 14.0 }, @@ -65,9 +65,9 @@ "support": 5.0 }, "evaluation": { - "f1-score": 0.4, - "precision": 0.3333333333333333, - "recall": 0.5, + "f1-score": 0.0, + "precision": 0.0, + "recall": 0.0, "support": 2.0 }, "follow_up": { @@ -77,9 +77,9 @@ "support": 11.0 }, "macro avg": { - "f1-score": 0.6712084293224644, - "precision": 0.6671296296296296, - "recall": 0.6908670033670034, + "f1-score": 0.6071895459070292, + "precision": 0.6101851851851853, + "recall": 0.632996632996633, "support": 70.0 }, "onboarding_setup": { @@ -89,9 +89,9 @@ "support": 4.0 }, "product_discovery": { - "f1-score": 0.875, - "precision": 0.875, - "recall": 0.875, + "f1-score": 1.0, + "precision": 1.0, + "recall": 1.0, "support": 8.0 }, "provider_selection": { @@ -107,15 +107,15 @@ "support": 0.0 }, "signup": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.8, + "precision": 0.6666666666666666, "recall": 1.0, "support": 2.0 }, "task_execution": { - "f1-score": 1.0, - "precision": 1.0, - "recall": 1.0, + "f1-score": 0.8333333333333334, + "precision": 0.8333333333333334, + "recall": 0.8333333333333334, "support": 6.0 }, "troubleshooting": { @@ -125,9 +125,9 @@ "support": 2.0 }, "weighted avg": { - "f1-score": 0.9058084605453025, - "precision": 0.9266666666666667, - "recall": 0.9, + "f1-score": 0.8661227931749063, + "precision": 0.8835714285714285, + "recall": 0.8714285714285714, "support": 70.0 } }, diff --git a/artifacts/evaluation/latest/intent_subtype_train_confusion_matrix.csv 
b/artifacts/evaluation/latest/intent_subtype_train_confusion_matrix.csv index a6e08053c992f73a143c1e3a90ed7ae2c380845a..12eeec20f88c4081f52b5935fc34fddcd4b70607 100644 --- a/artifacts/evaluation/latest/intent_subtype_train_confusion_matrix.csv +++ b/artifacts/evaluation/latest/intent_subtype_train_confusion_matrix.csv @@ -1,19 +1,19 @@ ,education,product_discovery,comparison,evaluation,deal_seeking,provider_selection,signup,purchase,booking,download,contact_sales,task_execution,onboarding_setup,troubleshooting,account_help,billing_help,follow_up,emotional_reflection education,52,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -product_discovery,0,31,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +product_discovery,0,29,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0 comparison,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -evaluation,1,2,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -deal_seeking,0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,0,0 +evaluation,3,5,0,8,1,0,0,0,0,0,0,0,0,0,0,0,0,0 +deal_seeking,0,0,0,0,10,1,0,0,0,0,0,0,0,0,0,0,0,0 provider_selection,0,0,0,0,0,24,0,0,0,0,0,0,0,0,0,0,1,0 signup,0,0,0,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0 -purchase,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0 -booking,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0 +purchase,0,0,0,0,0,0,1,5,0,0,0,0,0,0,0,0,0,0 +booking,0,0,0,0,0,0,2,0,3,0,0,0,0,0,0,0,0,0 download,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0 -contact_sales,0,0,0,0,0,0,2,0,0,0,7,0,0,0,0,0,0,0 +contact_sales,0,0,0,0,0,0,2,0,1,0,6,0,0,0,0,0,0,0 task_execution,0,0,0,0,0,0,0,0,0,0,0,19,0,0,0,0,0,0 -onboarding_setup,0,0,0,0,0,0,0,0,0,0,0,0,17,0,0,0,0,0 -troubleshooting,0,0,0,0,0,0,0,0,0,0,0,0,0,12,1,0,0,0 -account_help,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0 +onboarding_setup,0,0,0,0,0,0,0,0,1,0,0,0,16,0,0,0,0,0 +troubleshooting,0,0,0,0,0,0,0,0,0,0,0,0,0,9,2,0,2,0 +account_help,0,0,0,0,0,0,0,0,0,0,0,0,0,1,6,0,0,0 billing_help,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0 -follow_up,0,0,0,0,0,0,0,0,0,0,0,3,0,1,0,0,32,0 +follow_up,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,31,0 emotional_reflection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20 diff --git 
a/artifacts/evaluation/latest/intent_subtype_train_report.json b/artifacts/evaluation/latest/intent_subtype_train_report.json index 7df4bd721a2d0c128678f01627f3ed50a3104b84..0a7a52fff81d56807dd4dbd3a4705f7fce1e4c0c 100644 --- a/artifacts/evaluation/latest/intent_subtype_train_report.json +++ b/artifacts/evaluation/latest/intent_subtype_train_report.json @@ -1,21 +1,21 @@ { - "accepted_accuracy": 0.9649, + "accepted_accuracy": 0.9042, "accepted_coverage": 1.0, - "accuracy": 0.9649, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_train_confusion_matrix.csv", + "accuracy": 0.9042, + "confusion_matrix_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_train_confusion_matrix.csv", "count": 313, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/subtype/train.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/subtype/train.jsonl", "fallback_rate": 0.0, "head": "intent_subtype", - "macro_f1": 0.9649, + "macro_f1": 0.8791, "per_class_metrics": { "account_help": { - "f1-score": 0.9333333333333333, - "precision": 0.875, - "recall": 1.0, + "f1-score": 0.8, + "precision": 0.75, + "recall": 0.8571428571428571, "support": 7.0 }, - "accuracy": 0.9648562300319489, + "accuracy": 0.9041533546325878, "billing_help": { "f1-score": 1.0, "precision": 1.0, @@ -23,27 +23,27 @@ "support": 6.0 }, "booking": { - "f1-score": 1.0, - "precision": 1.0, - "recall": 1.0, + "f1-score": 0.6, + "precision": 0.6, + "recall": 0.6, "support": 5.0 }, "comparison": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.967741935483871, + 
"precision": 0.9375, "recall": 1.0, "support": 15.0 }, "contact_sales": { - "f1-score": 0.875, + "f1-score": 0.8, "precision": 1.0, - "recall": 0.7777777777777778, + "recall": 0.6666666666666666, "support": 9.0 }, "deal_seeking": { - "f1-score": 1.0, - "precision": 1.0, - "recall": 1.0, + "f1-score": 0.9090909090909091, + "precision": 0.9090909090909091, + "recall": 0.9090909090909091, "support": 11.0 }, "download": { @@ -53,8 +53,8 @@ "support": 8.0 }, "education": { - "f1-score": 0.9904761904761905, - "precision": 0.9811320754716981, + "f1-score": 0.9719626168224299, + "precision": 0.9454545454545454, "recall": 1.0, "support": 52.0 }, @@ -65,69 +65,69 @@ "support": 20.0 }, "evaluation": { - "f1-score": 0.9032258064516129, + "f1-score": 0.64, "precision": 1.0, - "recall": 0.8235294117647058, + "recall": 0.47058823529411764, "support": 17.0 }, "follow_up": { - "f1-score": 0.927536231884058, - "precision": 0.9696969696969697, - "recall": 0.8888888888888888, + "f1-score": 0.8732394366197183, + "precision": 0.8857142857142857, + "recall": 0.8611111111111112, "support": 36.0 }, "macro avg": { - "f1-score": 0.9649442256020961, - "precision": 0.9689347311202658, - "recall": 0.9651818334171275, + "f1-score": 0.8791291644367831, + "precision": 0.9052373525167643, + "recall": 0.8737167303612591, "support": 313.0 }, "onboarding_setup": { - "f1-score": 1.0, + "f1-score": 0.9696969696969697, "precision": 1.0, - "recall": 1.0, + "recall": 0.9411764705882353, "support": 17.0 }, "product_discovery": { - "f1-score": 0.96875, - "precision": 0.9393939393939394, - "recall": 1.0, + "f1-score": 0.8923076923076924, + "precision": 0.8529411764705882, + "recall": 0.9354838709677419, "support": 31.0 }, "provider_selection": { - "f1-score": 0.9795918367346939, - "precision": 1.0, + "f1-score": 0.96, + "precision": 0.96, "recall": 0.96, "support": 25.0 }, "purchase": { - "f1-score": 1.0, + "f1-score": 0.9090909090909091, "precision": 1.0, - "recall": 1.0, + "recall": 0.8333333333333334, 
"support": 6.0 }, "signup": { - "f1-score": 0.9411764705882353, - "precision": 0.8888888888888888, + "f1-score": 0.8648648648648649, + "precision": 0.7619047619047619, "recall": 1.0, "support": 16.0 }, "task_execution": { - "f1-score": 0.926829268292683, - "precision": 0.8636363636363636, + "f1-score": 0.8837209302325582, + "precision": 0.7916666666666666, "recall": 1.0, "support": 19.0 }, "troubleshooting": { - "f1-score": 0.9230769230769231, - "precision": 0.9230769230769231, - "recall": 0.9230769230769231, + "f1-score": 0.782608695652174, + "precision": 0.9, + "recall": 0.6923076923076923, "support": 13.0 }, "weighted avg": { - "f1-score": 0.9643733669039578, - "precision": 0.967429661617075, - "recall": 0.9648562300319489, + "f1-score": 0.8996108171948927, + "precision": 0.9128919168596861, + "recall": 0.9041533546325878, "support": 313.0 } }, diff --git a/artifacts/evaluation/latest/intent_subtype_val_confusion_matrix.csv b/artifacts/evaluation/latest/intent_subtype_val_confusion_matrix.csv index f0ef85e2e79125375ad7c3eb1268144ccf8691ac..c73f0ccc02388f8e75c47cb5f09b563f5696fd83 100644 --- a/artifacts/evaluation/latest/intent_subtype_val_confusion_matrix.csv +++ b/artifacts/evaluation/latest/intent_subtype_val_confusion_matrix.csv @@ -1,19 +1,19 @@ ,education,product_discovery,comparison,evaluation,deal_seeking,provider_selection,signup,purchase,booking,download,contact_sales,task_execution,onboarding_setup,troubleshooting,account_help,billing_help,follow_up,emotional_reflection education,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 product_discovery,0,10,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 -comparison,0,0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0 +comparison,0,0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 evaluation,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0 deal_seeking,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0 provider_selection,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0 signup,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0 -purchase,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0 +purchase,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0 
booking,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0 download,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 contact_sales,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 task_execution,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0 -onboarding_setup,0,1,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0 +onboarding_setup,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0 troubleshooting,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0 account_help,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0 -billing_help,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0 -follow_up,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,10,0 +billing_help,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 +follow_up,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,9,0 emotional_reflection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5 diff --git a/artifacts/evaluation/latest/intent_subtype_val_report.json b/artifacts/evaluation/latest/intent_subtype_val_report.json index eda86d49328c06db9b306e66ef50a45858e04508..5ff1ef348e2735d7f6a57e091c6e45bcb2f4ea10 100644 --- a/artifacts/evaluation/latest/intent_subtype_val_report.json +++ b/artifacts/evaluation/latest/intent_subtype_val_report.json @@ -1,13 +1,13 @@ { - "accepted_accuracy": 0.875, + "accepted_accuracy": 0.9, "accepted_coverage": 1.0, - "accuracy": 0.875, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_val_confusion_matrix.csv", + "accuracy": 0.9, + "confusion_matrix_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_val_confusion_matrix.csv", "count": 80, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/subtype/val.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/subtype/val.jsonl", "fallback_rate": 0.0, "head": "intent_subtype", - 
"macro_f1": 0.725, + "macro_f1": 0.7496, "per_class_metrics": { "account_help": { "f1-score": 0.5, @@ -15,11 +15,11 @@ "recall": 0.5, "support": 2.0 }, - "accuracy": 0.875, + "accuracy": 0.9, "billing_help": { - "f1-score": 1.0, - "precision": 1.0, - "recall": 1.0, + "f1-score": 0.0, + "precision": 0.0, + "recall": 0.0, "support": 1.0 }, "booking": { @@ -29,9 +29,9 @@ "support": 3.0 }, "comparison": { - "f1-score": 0.4, + "f1-score": 0.8571428571428571, "precision": 1.0, - "recall": 0.25, + "recall": 0.75, "support": 4.0 }, "contact_sales": { @@ -41,8 +41,8 @@ "support": 0.0 }, "deal_seeking": { - "f1-score": 0.5714285714285714, - "precision": 0.4, + "f1-score": 0.6666666666666666, + "precision": 0.5, "recall": 1.0, "support": 2.0 }, @@ -65,32 +65,32 @@ "support": 5.0 }, "evaluation": { - "f1-score": 0.6666666666666666, - "precision": 0.5, + "f1-score": 0.8, + "precision": 0.6666666666666666, "recall": 1.0, "support": 2.0 }, "follow_up": { - "f1-score": 0.9523809523809523, + "f1-score": 0.9, "precision": 1.0, - "recall": 0.9090909090909091, + "recall": 0.8181818181818182, "support": 11.0 }, "macro avg": { - "f1-score": 0.6444203944203944, - "precision": 0.6542087542087542, - "recall": 0.687121212121212, + "f1-score": 0.6662846956964604, + "precision": 0.6697530864197531, + "recall": 0.6931818181818182, "support": 80.0 }, "onboarding_setup": { - "f1-score": 0.8, - "precision": 0.8, - "recall": 0.8, + "f1-score": 0.9090909090909091, + "precision": 0.8333333333333334, + "recall": 1.0, "support": 5.0 }, "product_discovery": { - "f1-score": 0.9090909090909091, - "precision": 0.9090909090909091, + "f1-score": 0.9523809523809523, + "precision": 1.0, "recall": 0.9090909090909091, "support": 11.0 }, @@ -101,9 +101,9 @@ "support": 7.0 }, "purchase": { - "f1-score": 0.0, - "precision": 0.0, - "recall": 0.0, + "f1-score": 0.6666666666666666, + "precision": 1.0, + "recall": 0.5, "support": 2.0 }, "signup": { @@ -113,8 +113,8 @@ "support": 2.0 }, "task_execution": { - 
"f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.9411764705882353, + "precision": 0.8888888888888888, "recall": 1.0, "support": 8.0 }, @@ -125,9 +125,9 @@ "support": 1.0 }, "weighted avg": { - "f1-score": 0.8644047619047619, - "precision": 0.8891666666666665, - "recall": 0.875, + "f1-score": 0.8968286860198624, + "precision": 0.9118055555555555, + "recall": 0.9, "support": 80.0 } }, diff --git a/artifacts/evaluation/latest/intent_type_difficulty_benchmark_confusion_matrix.csv b/artifacts/evaluation/latest/intent_type_difficulty_benchmark_confusion_matrix.csv index 4cd7b1b18e4f1c835bcad96639d997c04d39007c..0fd2c90c7df1c2ebc420e3110a58be9ab6c8c67a 100644 --- a/artifacts/evaluation/latest/intent_type_difficulty_benchmark_confusion_matrix.csv +++ b/artifacts/evaluation/latest/intent_type_difficulty_benchmark_confusion_matrix.csv @@ -7,5 +7,5 @@ support,0,0,0,0,15,0,0,0,0,0 personal_reflection,0,0,0,0,0,15,0,0,0,0 creative_generation,0,0,0,0,0,0,15,0,0,0 chit_chat,0,0,0,0,0,1,0,14,0,0 -ambiguous,0,0,0,0,0,0,0,0,15,0 +ambiguous,1,0,0,0,0,0,0,0,14,0 prohibited,0,0,0,0,1,0,0,0,0,14 diff --git a/artifacts/evaluation/latest/intent_type_difficulty_benchmark_report.json b/artifacts/evaluation/latest/intent_type_difficulty_benchmark_report.json index c3e957a754d0a419f4b491d929e9bd8349b6a219..70596617aee3d6a9fc6b590310a77944249d5041 100644 --- a/artifacts/evaluation/latest/intent_type_difficulty_benchmark_report.json +++ b/artifacts/evaluation/latest/intent_type_difficulty_benchmark_report.json @@ -1,10 +1,10 @@ { - "accepted_accuracy": 0.9867, + "accepted_accuracy": 0.98, "accepted_coverage": 1.0, - "accuracy": 0.9867, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_difficulty_benchmark_confusion_matrix.csv", + "accuracy": 0.98, + "confusion_matrix_path": 
"/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_difficulty_benchmark_confusion_matrix.csv", "count": 150, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/intent_type_benchmark.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/intent_type_benchmark.jsonl", "difficulty_breakdown": { "easy": { "accepted_accuracy": 1.0, @@ -23,23 +23,23 @@ "macro_f1": 0.9596 }, "medium": { - "accepted_accuracy": 1.0, + "accepted_accuracy": 0.98, "accepted_coverage": 1.0, - "accuracy": 1.0, + "accuracy": 0.98, "count": 50, "fallback_rate": 0.0, - "macro_f1": 1.0 + "macro_f1": 0.9798 } }, "fallback_rate": 0.0, "head": "intent_type", - "macro_f1": 0.9867, + "macro_f1": 0.98, "per_class_metrics": { - "accuracy": 0.9866666666666667, + "accuracy": 0.98, "ambiguous": { - "f1-score": 1.0, + "f1-score": 0.9655172413793104, "precision": 1.0, - "recall": 1.0, + "recall": 0.9333333333333333, "support": 15.0 }, "chit_chat": { @@ -67,15 +67,15 @@ "support": 15.0 }, "informational": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.967741935483871, + "precision": 0.9375, "recall": 1.0, "support": 15.0 }, "macro avg": { - "f1-score": 0.9866518353726363, - "precision": 0.9875, - "recall": 0.9866666666666667, + "f1-score": 0.9799777530589543, + "precision": 0.98125, + "recall": 0.9800000000000001, "support": 150.0 }, "personal_reflection": { @@ -103,9 +103,9 @@ "support": 15.0 }, "weighted avg": { - "f1-score": 0.9866518353726362, - "precision": 0.9875, - "recall": 0.9866666666666667, + "f1-score": 0.9799777530589544, + "precision": 0.98125, + "recall": 0.98, "support": 150.0 } }, diff --git 
a/artifacts/evaluation/latest/intent_type_hard_cases_report.json b/artifacts/evaluation/latest/intent_type_hard_cases_report.json index 1b91d153977310c157ef6a0ff77084014a59d5cb..58addb5a1191d591a2c06ea406fc788f755b9d2f 100644 --- a/artifacts/evaluation/latest/intent_type_hard_cases_report.json +++ b/artifacts/evaluation/latest/intent_type_hard_cases_report.json @@ -2,9 +2,9 @@ "accepted_accuracy": 1.0, "accepted_coverage": 1.0, "accuracy": 1.0, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_hard_cases_confusion_matrix.csv", + "confusion_matrix_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_hard_cases_confusion_matrix.csv", "count": 61, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/hard_cases.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/hard_cases.jsonl", "fallback_rate": 0.0, "head": "intent_type", "macro_f1": 1.0, diff --git a/artifacts/evaluation/latest/intent_type_test_confusion_matrix.csv b/artifacts/evaluation/latest/intent_type_test_confusion_matrix.csv index ae111974327c2b500ed0e283c71736c7666b7fa6..8f5bbe095d770bf5b9d4f76916e9bc3711182fe0 100644 --- a/artifacts/evaluation/latest/intent_type_test_confusion_matrix.csv +++ b/artifacts/evaluation/latest/intent_type_test_confusion_matrix.csv @@ -1,11 +1,11 @@ ,informational,exploratory,commercial,transactional,support,personal_reflection,creative_generation,chit_chat,ambiguous,prohibited informational,8,0,0,0,0,0,0,0,0,0 exploratory,0,1,0,0,0,0,0,0,0,0 -commercial,1,0,9,0,0,0,0,0,0,0 -transactional,0,0,0,8,0,0,0,0,0,0 
+commercial,0,0,10,0,0,0,0,0,0,0 +transactional,0,0,0,7,0,0,1,0,0,0 support,0,0,0,0,2,0,0,0,0,1 personal_reflection,0,0,0,0,0,5,0,0,0,0 -creative_generation,0,0,0,0,0,0,1,0,0,0 +creative_generation,0,0,0,1,0,0,0,0,0,0 chit_chat,0,0,0,0,0,0,0,1,0,0 ambiguous,1,0,1,0,0,0,0,0,7,0 prohibited,0,0,0,0,0,0,0,0,0,1 diff --git a/artifacts/evaluation/latest/intent_type_test_report.json b/artifacts/evaluation/latest/intent_type_test_report.json index fe3094356bca65f2a6918583d93ed832073b6fe2..4362c17ef3c88a8e31e422817031a31023e840a0 100644 --- a/artifacts/evaluation/latest/intent_type_test_report.json +++ b/artifacts/evaluation/latest/intent_type_test_report.json @@ -1,15 +1,15 @@ { - "accepted_accuracy": 0.9149, + "accepted_accuracy": 0.8936, "accepted_coverage": 1.0, - "accuracy": 0.9149, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_test_confusion_matrix.csv", + "accuracy": 0.8936, + "confusion_matrix_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_test_confusion_matrix.csv", "count": 47, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/test.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/test.jsonl", "fallback_rate": 0.0, "head": "intent_type", - "macro_f1": 0.9131, + "macro_f1": 0.811, "per_class_metrics": { - "accuracy": 0.9148936170212766, + "accuracy": 0.8936170212765957, "ambiguous": { "f1-score": 0.875, "precision": 1.0, @@ -23,15 +23,15 @@ "support": 1.0 }, "commercial": { - "f1-score": 0.9, - "precision": 0.9, - "recall": 0.9, + "f1-score": 0.9523809523809523, + "precision": 0.9090909090909091, + 
"recall": 1.0, "support": 10.0 }, "creative_generation": { - "f1-score": 1.0, - "precision": 1.0, - "recall": 1.0, + "f1-score": 0.0, + "precision": 0.0, + "recall": 0.0, "support": 1.0 }, "exploratory": { @@ -41,15 +41,15 @@ "support": 1.0 }, "informational": { - "f1-score": 0.8888888888888888, - "precision": 0.8, + "f1-score": 0.9411764705882353, + "precision": 0.8888888888888888, "recall": 1.0, "support": 8.0 }, "macro avg": { - "f1-score": 0.9130555555555555, - "precision": 0.9199999999999999, - "recall": 0.9344444444444445, + "f1-score": 0.8110224089635854, + "precision": 0.8172979797979798, + "recall": 0.8319444444444443, "support": 47.0 }, "personal_reflection": { @@ -71,15 +71,15 @@ "support": 3.0 }, "transactional": { - "f1-score": 1.0, - "precision": 1.0, - "recall": 1.0, + "f1-score": 0.875, + "precision": 0.875, + "recall": 0.875, "support": 8.0 }, "weighted avg": { - "f1-score": 0.916016548463357, - "precision": 0.9340425531914893, - "recall": 0.9148936170212766, + "f1-score": 0.893508254365576, + "precision": 0.9085536213195787, + "recall": 0.8936170212765957, "support": 47.0 } }, diff --git a/artifacts/evaluation/latest/intent_type_third_wave_cases_confusion_matrix.csv b/artifacts/evaluation/latest/intent_type_third_wave_cases_confusion_matrix.csv index df9bf4f2bcd7e9729d68a10eba9ddc7610c0862b..dbbd11b91a1bf127e1eb34fd2392aca759cfb272 100644 --- a/artifacts/evaluation/latest/intent_type_third_wave_cases_confusion_matrix.csv +++ b/artifacts/evaluation/latest/intent_type_third_wave_cases_confusion_matrix.csv @@ -1,11 +1,11 @@ ,informational,exploratory,commercial,transactional,support,personal_reflection,creative_generation,chit_chat,ambiguous,prohibited informational,0,0,0,0,0,0,0,0,0,0 exploratory,0,1,0,0,0,0,0,0,0,0 -commercial,1,0,11,0,0,0,0,0,0,0 +commercial,0,0,12,0,0,0,0,0,0,0 transactional,0,0,0,0,0,0,0,0,0,0 support,0,0,0,0,0,0,0,0,0,0 personal_reflection,0,0,0,0,0,0,0,0,0,0 creative_generation,0,0,0,0,0,0,1,0,0,0 chit_chat,0,0,0,0,0,0,0,1,0,0 
-ambiguous,1,0,1,0,0,0,0,0,8,0 +ambiguous,1,0,2,0,0,0,0,0,7,0 prohibited,0,0,0,0,0,0,0,0,0,1 diff --git a/artifacts/evaluation/latest/intent_type_third_wave_cases_report.json b/artifacts/evaluation/latest/intent_type_third_wave_cases_report.json index f8a3a8d6448d80c4688505170858a87ff7061612..d6b743c3c5ed6dbce615ea44e1ca9639cfe56d39 100644 --- a/artifacts/evaluation/latest/intent_type_third_wave_cases_report.json +++ b/artifacts/evaluation/latest/intent_type_third_wave_cases_report.json @@ -2,18 +2,18 @@ "accepted_accuracy": 0.8846, "accepted_coverage": 1.0, "accuracy": 0.8846, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_third_wave_cases_confusion_matrix.csv", + "confusion_matrix_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_third_wave_cases_confusion_matrix.csv", "count": 26, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/third_wave_cases.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/third_wave_cases.jsonl", "fallback_rate": 0.0, "head": "intent_type", - "macro_f1": 0.8294, + "macro_f1": 0.8209, "per_class_metrics": { "accuracy": 0.8846153846153846, "ambiguous": { - "f1-score": 0.8888888888888888, + "f1-score": 0.8235294117647058, "precision": 1.0, - "recall": 0.8, + "recall": 0.7, "support": 10.0 }, "chit_chat": { @@ -23,9 +23,9 @@ "support": 1.0 }, "commercial": { - "f1-score": 0.9166666666666666, - "precision": 0.9166666666666666, - "recall": 0.9166666666666666, + "f1-score": 0.9230769230769231, + "precision": 0.8571428571428571, + "recall": 1.0, "support": 12.0 }, "creative_generation": { 
@@ -47,9 +47,9 @@ "support": 0.0 }, "macro avg": { - "f1-score": 0.5805555555555555, - "precision": 0.5916666666666666, - "recall": 0.5716666666666667, + "f1-score": 0.5746606334841629, + "precision": 0.5857142857142857, + "recall": 0.5700000000000001, "support": 26.0 }, "personal_reflection": { @@ -77,8 +77,8 @@ "support": 0.0 }, "weighted avg": { - "f1-score": 0.9188034188034189, - "precision": 0.9615384615384616, + "f1-score": 0.8966237382526975, + "precision": 0.9340659340659341, "recall": 0.8846153846153846, "support": 26.0 } diff --git a/artifacts/evaluation/latest/intent_type_train_confusion_matrix.csv b/artifacts/evaluation/latest/intent_type_train_confusion_matrix.csv index 0bccb99598127decfb008208f7a31553bf527feb..31124c0a5cd76ceb2cf49e69d4edb42d2328352b 100644 --- a/artifacts/evaluation/latest/intent_type_train_confusion_matrix.csv +++ b/artifacts/evaluation/latest/intent_type_train_confusion_matrix.csv @@ -2,7 +2,7 @@ informational,38,0,0,0,0,0,0,0,0,0 exploratory,0,5,0,0,0,0,0,0,0,0 commercial,0,0,36,0,0,0,0,0,0,0 -transactional,0,0,0,28,0,0,0,0,0,0 +transactional,0,0,0,27,0,0,1,0,0,0 support,0,0,0,0,10,0,0,0,0,0 personal_reflection,0,0,0,0,0,20,0,0,0,0 creative_generation,0,0,0,0,0,0,5,0,0,0 diff --git a/artifacts/evaluation/latest/intent_type_train_report.json b/artifacts/evaluation/latest/intent_type_train_report.json index c7955783ad88ff52491b4fdc11ee37d096f8d073..b5e2283292c2680159d4d3c697e12a584a950151 100644 --- a/artifacts/evaluation/latest/intent_type_train_report.json +++ b/artifacts/evaluation/latest/intent_type_train_report.json @@ -1,15 +1,15 @@ { - "accepted_accuracy": 1.0, + "accepted_accuracy": 0.9945, "accepted_coverage": 1.0, - "accuracy": 1.0, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_train_confusion_matrix.csv", + "accuracy": 0.9945, + "confusion_matrix_path": 
"/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_train_confusion_matrix.csv", "count": 183, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/train.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/train.jsonl", "fallback_rate": 0.0, "head": "intent_type", - "macro_f1": 1.0, + "macro_f1": 0.9891, "per_class_metrics": { - "accuracy": 1.0, + "accuracy": 0.994535519125683, "ambiguous": { "f1-score": 1.0, "precision": 1.0, @@ -29,8 +29,8 @@ "support": 36.0 }, "creative_generation": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.9090909090909091, + "precision": 0.8333333333333334, "recall": 1.0, "support": 5.0 }, @@ -47,9 +47,9 @@ "support": 38.0 }, "macro avg": { - "f1-score": 1.0, - "precision": 1.0, - "recall": 1.0, + "f1-score": 0.989090909090909, + "precision": 0.9833333333333334, + "recall": 0.9964285714285716, "support": 183.0 }, "personal_reflection": { @@ -71,15 +71,15 @@ "support": 10.0 }, "transactional": { - "f1-score": 1.0, + "f1-score": 0.9818181818181818, "precision": 1.0, - "recall": 1.0, + "recall": 0.9642857142857143, "support": 28.0 }, "weighted avg": { - "f1-score": 1.0, - "precision": 1.0, - "recall": 1.0, + "f1-score": 0.9947342275211128, + "precision": 0.9954462659380693, + "recall": 0.994535519125683, "support": 183.0 } }, diff --git a/artifacts/evaluation/latest/intent_type_val_confusion_matrix.csv b/artifacts/evaluation/latest/intent_type_val_confusion_matrix.csv index 9334f4e2f201ffc95c64a0ba367b67909288c200..1336483beb8b86a0e8d4a93616d2317909077bab 100644 --- a/artifacts/evaluation/latest/intent_type_val_confusion_matrix.csv +++ 
b/artifacts/evaluation/latest/intent_type_val_confusion_matrix.csv @@ -1,11 +1,11 @@ ,informational,exploratory,commercial,transactional,support,personal_reflection,creative_generation,chit_chat,ambiguous,prohibited informational,8,0,0,0,0,0,0,0,0,0 exploratory,0,1,0,0,0,0,0,0,0,0 -commercial,0,0,10,0,0,0,0,0,0,0 +commercial,0,1,9,0,0,0,0,0,0,0 transactional,0,0,0,7,0,0,1,0,0,0 support,0,0,0,0,2,0,0,0,0,1 personal_reflection,0,0,0,0,0,5,0,0,0,0 creative_generation,0,0,0,0,0,0,1,0,0,0 chit_chat,0,0,0,0,0,0,0,1,0,0 -ambiguous,0,0,0,0,0,0,0,0,9,0 +ambiguous,0,0,1,0,0,0,0,0,8,0 prohibited,0,0,0,0,0,0,0,0,0,1 diff --git a/artifacts/evaluation/latest/intent_type_val_report.json b/artifacts/evaluation/latest/intent_type_val_report.json index 50578ee1add368ed98a4a6abafdfb91a61c9f28d..408641ff3c191a16a65b5b2a64644387c0085b29 100644 --- a/artifacts/evaluation/latest/intent_type_val_report.json +++ b/artifacts/evaluation/latest/intent_type_val_report.json @@ -1,19 +1,19 @@ { - "accepted_accuracy": 0.9574, + "accepted_accuracy": 0.9149, "accepted_coverage": 1.0, - "accuracy": 0.9574, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_val_confusion_matrix.csv", + "accuracy": 0.9149, + "confusion_matrix_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_val_confusion_matrix.csv", "count": 47, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/val.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/val.jsonl", "fallback_rate": 0.0, "head": "intent_type", - "macro_f1": 0.9067, + "macro_f1": 0.8575, "per_class_metrics": { - 
"accuracy": 0.9574468085106383, + "accuracy": 0.9148936170212766, "ambiguous": { - "f1-score": 1.0, + "f1-score": 0.9411764705882353, "precision": 1.0, - "recall": 1.0, + "recall": 0.8888888888888888, "support": 9.0 }, "chit_chat": { @@ -23,9 +23,9 @@ "support": 1.0 }, "commercial": { - "f1-score": 1.0, - "precision": 1.0, - "recall": 1.0, + "f1-score": 0.9, + "precision": 0.9, + "recall": 0.9, "support": 10.0 }, "creative_generation": { @@ -35,8 +35,8 @@ "support": 1.0 }, "exploratory": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.6666666666666666, + "precision": 0.5, "recall": 1.0, "support": 1.0 }, @@ -47,9 +47,9 @@ "support": 8.0 }, "macro avg": { - "f1-score": 0.9066666666666666, - "precision": 0.9, - "recall": 0.9541666666666666, + "f1-score": 0.8574509803921568, + "precision": 0.8400000000000001, + "recall": 0.9330555555555555, "support": 47.0 }, "personal_reflection": { @@ -77,9 +77,9 @@ "support": 8.0 }, "weighted avg": { - "f1-score": 0.9617021276595744, - "precision": 0.9787234042553191, - "recall": 0.9574468085106383, + "f1-score": 0.9220692532332081, + "precision": 0.9468085106382979, + "recall": 0.9148936170212766, "support": 47.0 } }, diff --git a/artifacts/evaluation/latest/known_failure_regression.json b/artifacts/evaluation/latest/known_failure_regression.json index 1c91b1b9687f251cde5936a59af2e2ba5a43f84a..3337adb08ef6ba13a44d195c73324f67143025c6 100644 --- a/artifacts/evaluation/latest/known_failure_regression.json +++ b/artifacts/evaluation/latest/known_failure_regression.json @@ -6,20 +6,20 @@ "total": 2 }, "must_fix": { - "failed": 4, - "passed": 11, + "failed": 3, + "passed": 12, "total": 15 } }, - "cases_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/examples/known_failure_cases.json", + "cases_path": 
"/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/examples/known_failure_cases.json", "count": 17, - "failed": 6, - "passed": 11, + "failed": 5, + "passed": 12, "results": [ { "actual": { "model_output.classification.iab_content.tier1.label": "Automotive", - "model_output.classification.iab_content.tier2.label": "Auto Type", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.intent.type": "commercial", "system_decision.policy.monetization_eligibility": "allowed" }, @@ -32,7 +32,7 @@ "id": "auto-buying-query-allowed", "mismatches": [ { - "actual": "Auto Type", + "actual": null, "expected": "Auto Buying and Selling", "path": "model_output.classification.iab_content.tier2.label" } @@ -236,7 +236,7 @@ "model_output.classification.intent.decision_phase": "consideration", "model_output.classification.intent.subtype": "deal_seeking", "system_decision.opportunity.type": "soft_recommendation", - "system_decision.policy.monetization_eligibility": "restricted" + "system_decision.policy.monetization_eligibility": "allowed_with_caution" }, "expected": { "model_output.classification.intent.decision_phase": "awareness", @@ -256,6 +256,11 @@ "expected": "awareness", "path": "model_output.classification.intent.decision_phase" }, + { + "actual": "allowed_with_caution", + "expected": "restricted", + "path": "system_decision.policy.monetization_eligibility" + }, { "actual": "soft_recommendation", "expected": "none", @@ -307,7 +312,7 @@ }, { "actual": { - "model_output.classification.intent.decision_phase": "decision", + "model_output.classification.intent.decision_phase": "consideration", "model_output.classification.intent.subtype": "evaluation" }, "expected": { @@ -315,15 +320,9 @@ "model_output.classification.intent.subtype": "evaluation" }, "id": "evaluation-subtype-fit-check", - "mismatches": [ - { - "actual": 
"decision", - "expected": "consideration", - "path": "model_output.classification.intent.decision_phase" - } - ], + "mismatches": [], "notes": "Single-vendor fit checks should map to evaluation rather than broad discovery.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Would ClickUp be a good fit for a remote ops team?" }, diff --git a/artifacts/evaluation/latest/summary.json b/artifacts/evaluation/latest/summary.json index 6603dd1995c53296c6cb83100e98de686a77f523..873b0051228f16b112ade67a6d314008360b75c5 100644 --- a/artifacts/evaluation/latest/summary.json +++ b/artifacts/evaluation/latest/summary.json @@ -1,29 +1,29 @@ { "combined": { "demo_benchmark": { - "benchmark_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/examples/demo_prompt_suite.json", + "benchmark_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/examples/demo_prompt_suite.json", "count": 15, "fallback_rate": 0.2667, - "output_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/combined_demo_benchmark.json" + "output_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/combined_demo_benchmark.json" }, "iab_behavior_lock_regression": { "by_status": { "must_fix": { - "failed": 0, - "passed": 12, + "failed": 12, + "passed": 0, "total": 12 } }, - "cases_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/examples/iab_behavior_lock_cases.json", + "cases_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/examples/iab_behavior_lock_cases.json", "count": 12, - 
"failed": 0, - "passed": 12, + "failed": 12, + "passed": 0, "results": [ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Automotive", - "model_output.classification.iab_content.tier2.label": "Auto Type" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -31,15 +31,21 @@ "model_output.classification.iab_content.tier2.label": "Auto Type" }, "id": "car-buying-maps-to-automotive-buying", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Auto Type", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Vehicle shopping queries should map into the automotive buying branch, not business sales.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Which car to buy in 2026" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Laptops" @@ -51,9 +57,15 @@ "model_output.classification.iab_content.tier3.label": "Laptops" }, "id": "laptop-buying-maps-to-laptops", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Laptop shopping should resolve into the laptops branch, not business sales.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Which laptop to buy in 2026" }, @@ -61,8 +73,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", 
- "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Laptops" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -71,18 +83,29 @@ "model_output.classification.iab_content.tier3.label": "Laptops" }, "id": "labtop-buying-maps-to-laptops", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Laptops", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Common typo handling should still land in the laptops branch.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Which labtop to buy in 2026" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -91,18 +114,34 @@ "model_output.classification.iab_content.tier3.label": "Software and Applications" }, "id": "crm-awareness-maps-to-sales", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, 
+ "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "CRM education should resolve to the closest business/sales path, not generic software.", - "pass": true, + "pass": false, "status": "must_fix", "text": "What is CRM software?" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet" + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -111,9 +150,25 @@ "model_output.classification.iab_content.tier3.label": "Internet" }, "id": "crm-comparison-maps-to-sales", - "mismatches": [], + "mismatches": [ + { + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Direct CRM vendor comparison should map cleanly into the sales domain.", - "pass": true, + "pass": false, "status": "must_fix", "text": "HubSpot vs Zoho for a small team" }, @@ -121,8 +176,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet" + 
"model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -131,34 +186,51 @@ "model_output.classification.iab_content.tier3.label": "Internet" }, "id": "marketing-tools-map-to-marketing", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Marketing tool discovery should map to the marketing and advertising branch.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best AI SEO tools for content teams" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing" + "model_output.classification.iab_content.tier1.label": "Careers" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing" }, "id": "ml-explanation-maps-to-ai", - "mismatches": [], + "mismatches": [ + { + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + } + ], "notes": "ML and NLP educational prompts should land in the AI branch.", - "pass": true, + "pass": false, "status": "must_fix", "text": "What is intent classification in NLP?" 
}, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet" + "model_output.classification.iab_content.tier1.label": "Personal Finance", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -167,15 +239,31 @@ "model_output.classification.iab_content.tier3.label": "Internet" }, "id": "support-credential-help-maps-to-business-it", - "mismatches": [], + "mismatches": [ + { + "actual": "Personal Finance", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Credential and account help should map to business IT rather than generic business.", - "pass": true, + "pass": false, "status": "must_fix", "text": "How do I reset my password?" 
}, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink", "model_output.classification.iab_content.tier2.label": "Dining Out" }, @@ -185,18 +273,24 @@ "model_output.classification.iab_content.tier2.label": "Dining Out" }, "id": "restaurant-booking-maps-to-dining-out", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Generic dining requests should not inherit the repo's business default.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Book a table for 2 tonight" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Hobbies & Interests", - "model_output.classification.iab_content.tier2.label": "Content Production", - "model_output.classification.iab_content.tier3.label": "Freelance Writing" + "model_output.classification.iab_content.tier1.label": "Sensitive Topics", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -205,19 +299,35 @@ "model_output.classification.iab_content.tier3.label": "Freelance Writing" }, "id": "trial-signup-maps-to-software", - "mismatches": [], + "mismatches": [ + { + "actual": "Sensitive Topics", + "expected": "Hobbies & Interests", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": "Content Production", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Freelance Writing", + "path": "model_output.classification.iab_content.tier3.label" + } + ], 
"notes": "Software action queries should map to the software/application branch.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Start my free trial" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications", - "model_output.classification.iab_content.tier4.label": "Communication" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": "Remote Working", + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -227,15 +337,41 @@ "model_output.classification.iab_content.tier4.label": "Communication" }, "id": "communication-software-maps-to-tier4", - "mismatches": [], + "mismatches": [ + { + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Remote Working", + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Communication", + "path": "model_output.classification.iab_content.tier4.label" + } + ], "notes": "Full taxonomy support should preserve the tier4 communication branch.", - "pass": true, + "pass": false, "status": "must_fix", "text": "best 
communication software for remote teams" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink" }, "expected": { @@ -243,9 +379,15 @@ "model_output.classification.iab_content.tier1.label": "Food & Drink" }, "id": "vodka-query-maps-to-alcoholic-beverages", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Food and beverage prompts should not fall through to the business default.", - "pass": true, + "pass": false, "status": "must_fix", "text": "what is best vodka drink should i try" } @@ -254,21 +396,21 @@ "iab_cross_vertical_behavior_lock_regression": { "by_status": { "must_fix": { - "failed": 0, - "passed": 90, + "failed": 89, + "passed": 1, "total": 90 } }, - "cases_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/examples/iab_cross_vertical_behavior_lock_cases.json", + "cases_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/examples/iab_cross_vertical_behavior_lock_cases.json", "count": 90, - "failed": 0, - "passed": 90, + "failed": 89, + "passed": 1, "results": [ { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Travel", - "model_output.classification.iab_content.tier2.label": "Travel Type" + "model_output.classification.iab_content.tier2.label": "Travel Accessories" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -276,15 +418,26 @@ 
"model_output.classification.iab_content.tier2.label": "Travel Type" }, "id": "auto-buying-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Travel Accessories", + "expected": "Travel Type", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Automotive > Auto Buying and Selling.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Which car should I buy for commuting?" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Automotive", "model_output.classification.iab_content.tier2.label": "Auto Body Styles" }, @@ -294,17 +447,23 @@ "model_output.classification.iab_content.tier2.label": "Auto Body Styles" }, "id": "auto-buying-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping case for Automotive > Auto Buying and Selling.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best used SUV for a family of four" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Automotive", - "model_output.classification.iab_content.tier2.label": "Auto Type" + "model_output.classification.iab_content.tier2.label": "Auto Shows" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -312,18 +471,29 @@ "model_output.classification.iab_content.tier2.label": "Auto Type" }, "id": "auto-buying-hard", - 
"mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Auto Shows", + "expected": "Auto Type", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Automotive > Auto Buying and Selling.", - "pass": true, + "pass": false, "status": "must_fix", "text": "I need a shortlist of practical cars before making a purchase this month" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -332,18 +502,34 @@ "model_output.classification.iab_content.tier3.label": "Software and Applications" }, "id": "sales-crm-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Business and Finance > Business > Sales.", - "pass": true, + "pass": false, "status": "must_fix", "text": "What is CRM software?" 
}, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet" + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -352,9 +538,25 @@ "model_output.classification.iab_content.tier3.label": "Internet" }, "id": "sales-crm-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Business and Finance > Business > Sales.", - "pass": true, + "pass": false, "status": "must_fix", "text": "HubSpot vs Zoho for a small team" }, @@ -362,8 +564,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Business and Finance", - "model_output.classification.iab_content.tier2.label": "Business", - "model_output.classification.iab_content.tier3.label": "Sales" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -372,9 +574,20 @@ "model_output.classification.iab_content.tier3.label": "Sales" }, "id": "sales-crm-hard", - "mismatches": 
[], + "mismatches": [ + { + "actual": null, + "expected": "Business", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Sales", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Business and Finance > Business > Sales.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need software to manage leads and pipeline for a startup sales team" }, @@ -382,8 +595,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -392,9 +605,20 @@ "model_output.classification.iab_content.tier3.label": "Internet" }, "id": "marketing-tools-easy", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Business and Finance > Business > Marketing and Advertising.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best SEO tools for content teams" }, @@ -402,7 +626,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Business and Finance", - "model_output.classification.iab_content.tier2.label": "Business" + "model_output.classification.iab_content.tier2.label": null }, "expected": { 
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -410,18 +634,24 @@ "model_output.classification.iab_content.tier2.label": "Business" }, "id": "marketing-tools-medium", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Business", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Business and Finance > Business > Marketing and Advertising.", - "pass": true, + "pass": false, "status": "must_fix", "text": "How should I compare ad attribution platforms?" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier1.label": "Business and Finance", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -430,9 +660,25 @@ "model_output.classification.iab_content.tier3.label": "Software and Applications" }, "id": "marketing-tools-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "Business and Finance", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Business and Finance > Business > Marketing and Advertising.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need software to 
measure channel performance across paid and organic campaigns" }, @@ -441,7 +687,7 @@ "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet" + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -450,17 +696,23 @@ "model_output.classification.iab_content.tier3.label": "Internet" }, "id": "business-it-easy", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Business and Finance > Business > Business I.T..", - "pass": true, + "pass": false, "status": "must_fix", "text": "How do I reset my work password?" 
}, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Careers", - "model_output.classification.iab_content.tier2.label": "Job Search" + "model_output.classification.iab_content.tier1.label": "Business and Finance", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -468,9 +720,20 @@ "model_output.classification.iab_content.tier2.label": "Job Search" }, "id": "business-it-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "Business and Finance", + "expected": "Careers", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": "Job Search", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Business and Finance > Business > Business I.T..", - "pass": true, + "pass": false, "status": "must_fix", "text": "My employees keep getting locked out of their accounts" }, @@ -479,7 +742,7 @@ "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet" + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -488,15 +751,21 @@ "model_output.classification.iab_content.tier3.label": "Internet" }, "id": "business-it-hard", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Business and Finance > Business > Business I.T..", - "pass": true, + "pass": false, 
"status": "must_fix", "text": "Need identity and access software for login, permissions, and account security" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink", "model_output.classification.iab_content.tier2.label": "Dining Out" }, @@ -506,15 +775,21 @@ "model_output.classification.iab_content.tier2.label": "Dining Out" }, "id": "dining-out-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical easy IAB mapping case for Food & Drink > Dining Out.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Book a table for six tonight" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink", "model_output.classification.iab_content.tier2.label": "Dining Out" }, @@ -524,15 +799,21 @@ "model_output.classification.iab_content.tier2.label": "Dining Out" }, "id": "dining-out-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping case for Food & Drink > Dining Out.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Good restaurants for a client dinner downtown" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink", "model_output.classification.iab_content.tier2.label": "Dining Out" }, @@ -542,15 +823,21 
@@ "model_output.classification.iab_content.tier2.label": "Dining Out" }, "id": "dining-out-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical hard IAB mapping case for Food & Drink > Dining Out.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need a place to eat tonight where I can make a reservation online" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink" }, "expected": { @@ -558,15 +845,21 @@ "model_output.classification.iab_content.tier1.label": "Food & Drink" }, "id": "alcoholic-beverages-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical easy IAB mapping case for Food & Drink > Alcoholic Beverages.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Which whiskey cocktail should I order?" 
}, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink" }, "expected": { @@ -574,15 +867,21 @@ "model_output.classification.iab_content.tier1.label": "Food & Drink" }, "id": "alcoholic-beverages-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping case for Food & Drink > Alcoholic Beverages.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best vodka drinks for beginners" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink" }, "expected": { @@ -590,31 +889,43 @@ "model_output.classification.iab_content.tier1.label": "Food & Drink" }, "id": "alcoholic-beverages-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical hard IAB mapping case for Food & Drink > Alcoholic Beverages.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Want a spirit-forward drink recommendation, not a restaurant suggestion" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing" + "model_output.classification.iab_content.tier1.label": "Careers" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing" }, "id": "artificial-intelligence-easy", - "mismatches": 
[], + "mismatches": [ + { + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Artificial Intelligence.", - "pass": true, + "pass": false, "status": "must_fix", "text": "What is intent classification in NLP?" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Education", "model_output.classification.iab_content.tier2.label": "Language Learning" }, @@ -624,15 +935,21 @@ "model_output.classification.iab_content.tier2.label": "Language Learning" }, "id": "artificial-intelligence-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Artificial Intelligence.", - "pass": true, + "pass": false, "status": "must_fix", "text": "How do large language models handle text classification?" 
}, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Education", "model_output.classification.iab_content.tier2.label": "Language Learning" }, @@ -642,27 +959,44 @@ "model_output.classification.iab_content.tier2.label": "Language Learning" }, "id": "artificial-intelligence-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Artificial Intelligence.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need the machine learning concept behind language understanding, not software to buy" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Business and Finance", - "model_output.classification.iab_content.tier2.label": "Business" - }, + "model_output.classification.iab_content.tier1.label": "Technology & Computing", + "model_output.classification.iab_content.tier2.label": null + }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Business and Finance", "model_output.classification.iab_content.tier2.label": "Business" }, "id": "software-apps-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "Technology & Computing", + "expected": "Business and Finance", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": "Business", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Computing > Computer Software and Applications.", - "pass": true, + 
"pass": false, "status": "must_fix", "text": "Best workflow software for a small operations team" }, @@ -670,9 +1004,9 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet", - "model_output.classification.iab_content.tier4.label": "Cloud Computing" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -682,9 +1016,25 @@ "model_output.classification.iab_content.tier4.label": "Cloud Computing" }, "id": "software-apps-medium", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Cloud Computing", + "path": "model_output.classification.iab_content.tier4.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Computer Software and Applications.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need project management software for a distributed team" }, @@ -692,8 +1042,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier2.label": null, + 
"model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -702,19 +1052,30 @@ "model_output.classification.iab_content.tier3.label": "Software and Applications" }, "id": "software-apps-hard", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Computer Software and Applications.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Looking for a business software platform to organize internal workflows" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications", - "model_output.classification.iab_content.tier4.label": "Communication" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": "Remote Working", + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -724,9 +1085,35 @@ "model_output.classification.iab_content.tier4.label": "Communication" }, "id": "communication-software-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + }, + { 
+ "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Remote Working", + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Communication", + "path": "model_output.classification.iab_content.tier4.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Computing > Computer Software and Applications > Communication.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best communication software for remote teams" }, @@ -735,8 +1122,8 @@ "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications", - "model_output.classification.iab_content.tier4.label": "Communication" + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -746,19 +1133,30 @@ "model_output.classification.iab_content.tier4.label": "Communication" }, "id": "communication-software-medium", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Communication", + "path": "model_output.classification.iab_content.tier4.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Computer Software and Applications > Communication.", - "pass": true, + 
"pass": false, "status": "must_fix", "text": "Slack vs Teams for internal messaging" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications", - "model_output.classification.iab_content.tier4.label": "Communication" + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -768,9 +1166,30 @@ "model_output.classification.iab_content.tier4.label": "Communication" }, "id": "communication-software-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Communication", + "path": "model_output.classification.iab_content.tier4.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Computer Software and Applications > Communication.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need a workplace chat tool for cross-functional collaboration" }, @@ -779,8 +1198,8 @@ "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", 
"model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet", - "model_output.classification.iab_content.tier4.label": "Web Hosting" + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -790,15 +1209,26 @@ "model_output.classification.iab_content.tier4.label": "Web Hosting" }, "id": "web-hosting-easy", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Web Hosting", + "path": "model_output.classification.iab_content.tier4.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Computing > Internet > Web Hosting.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Vercel vs Netlify for website hosting" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Internet", @@ -812,15 +1242,21 @@ "model_output.classification.iab_content.tier4.label": "Web Hosting" }, "id": "web-hosting-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Internet > Web Hosting.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best hosting platform for a startup website" }, { "actual": { - 
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Internet", @@ -834,15 +1270,21 @@ "model_output.classification.iab_content.tier4.label": "Web Hosting" }, "id": "web-hosting-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Internet > Web Hosting.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need a managed hosting provider to deploy and run our marketing site" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Laptops" @@ -854,15 +1296,21 @@ "model_output.classification.iab_content.tier3.label": "Laptops" }, "id": "laptops-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Computing > Laptops.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Which laptop should I buy for college?" 
}, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Laptops" @@ -874,18 +1322,24 @@ "model_output.classification.iab_content.tier3.label": "Laptops" }, "id": "laptops-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Laptops.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best laptop for work and study under 1200" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Consumer Electronics", - "model_output.classification.iab_content.tier3.label": "Smartphones" + "model_output.classification.iab_content.tier2.label": "Computing", + "model_output.classification.iab_content.tier3.label": "Laptops" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -894,9 +1348,25 @@ "model_output.classification.iab_content.tier3.label": "Smartphones" }, "id": "laptops-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Computing", + "expected": "Consumer Electronics", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": "Laptops", + "expected": "Smartphones", + "path": 
"model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Laptops.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need a portable computer with good battery life for everyday work" }, @@ -906,7 +1376,7 @@ "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Software and Applications", - "model_output.classification.iab_content.tier4.label": "Photo Editing Software" + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -916,15 +1386,21 @@ "model_output.classification.iab_content.tier4.label": "Photo Editing Software" }, "id": "desktops-easy", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Photo Editing Software", + "path": "model_output.classification.iab_content.tier4.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Computing > Desktops.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best desktop for video editing" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Desktops" @@ -936,15 +1412,21 @@ "model_output.classification.iab_content.tier3.label": "Desktops" }, "id": "desktops-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping 
case for Technology & Computing > Computing > Desktops.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Which desktop computer should I buy for a home office?" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Desktops" @@ -956,15 +1438,21 @@ "model_output.classification.iab_content.tier3.label": "Desktops" }, "id": "desktops-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Desktops.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need a desktop PC with strong performance for creative work" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Consumer Electronics" }, @@ -974,15 +1462,21 @@ "model_output.classification.iab_content.tier2.label": "Consumer Electronics" }, "id": "smartphones-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Consumer Electronics > Smartphones.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best phone with a good camera under 700" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": 
"nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Consumer Electronics", "model_output.classification.iab_content.tier3.label": "Smartphones" @@ -994,15 +1488,21 @@ "model_output.classification.iab_content.tier3.label": "Smartphones" }, "id": "smartphones-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Consumer Electronics > Smartphones.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Should I buy an iPhone or Pixel this year?" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Consumer Electronics", "model_output.classification.iab_content.tier3.label": "Smartphones" @@ -1014,9 +1514,15 @@ "model_output.classification.iab_content.tier3.label": "Smartphones" }, "id": "smartphones-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Consumer Electronics > Smartphones.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need a new smartphone with strong battery life and a clean software experience" }, @@ -1024,8 +1530,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Style & Fashion", - 
"model_output.classification.iab_content.tier2.label": "Women's Fashion", - "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1034,18 +1540,29 @@ "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" }, "id": "style-fashion-parent-easy", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Women's Fashion", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Women's Shoes and Footwear", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Style & Fashion.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best shoes under 100 dollars" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": "Women's Fashion", - "model_output.classification.iab_content.tier3.label": "Women's Clothing" + "model_output.classification.iab_content.tier2.label": "High Fashion", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1054,18 +1571,34 @@ "model_output.classification.iab_content.tier3.label": "Women's Clothing" }, "id": "style-fashion-parent-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "High Fashion", + "expected": "Women's Fashion", + "path": 
"model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Women's Clothing", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Style & Fashion.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Affordable fashion accessories for everyday wear" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": "Women's Fashion", - "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" + "model_output.classification.iab_content.tier2.label": "Children's Clothing", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1074,18 +1607,34 @@ "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" }, "id": "style-fashion-parent-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Children's Clothing", + "expected": "Women's Fashion", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Women's Shoes and Footwear", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Style & Fashion.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need style recommendations for clothing and footwear without a specific brand in mind" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Style & Fashion", 
- "model_output.classification.iab_content.tier2.label": "Women's Fashion", - "model_output.classification.iab_content.tier3.label": "Women's Clothing" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Sports", + "model_output.classification.iab_content.tier2.label": "Walking", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1094,18 +1643,39 @@ "model_output.classification.iab_content.tier3.label": "Women's Clothing" }, "id": "womens-shoes-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "Sports", + "expected": "Style & Fashion", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Walking", + "expected": "Women's Fashion", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Women's Clothing", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Style & Fashion > Women's Fashion > Women's Shoes and Footwear.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best women's running shoes under 100 dollars" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": "Women's Fashion", - "model_output.classification.iab_content.tier3.label": "Women's Clothing" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Sports", + "model_output.classification.iab_content.tier2.label": "Walking", + 
"model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1114,15 +1684,36 @@ "model_output.classification.iab_content.tier3.label": "Women's Clothing" }, "id": "womens-shoes-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "Sports", + "expected": "Style & Fashion", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Walking", + "expected": "Women's Fashion", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Women's Clothing", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Style & Fashion > Women's Fashion > Women's Shoes and Footwear.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Comfortable women's sneakers for walking all day" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", "model_output.classification.iab_content.tier2.label": "Women's Fashion", "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" @@ -1134,9 +1725,15 @@ "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" }, "id": "womens-shoes-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical hard IAB mapping case for Style & Fashion > Women's Fashion > Women's Shoes and Footwear.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need women's footwear for commuting 
that looks polished but feels comfortable" }, @@ -1145,7 +1742,7 @@ "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Style & Fashion", "model_output.classification.iab_content.tier2.label": "Men's Fashion", - "model_output.classification.iab_content.tier3.label": "Men's Clothing" + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1154,15 +1751,21 @@ "model_output.classification.iab_content.tier3.label": "Men's Clothing" }, "id": "mens-shoes-easy", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Men's Clothing", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Style & Fashion > Men's Fashion > Men's Shoes and Footwear.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best men's sneakers for daily wear" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", "model_output.classification.iab_content.tier2.label": "Men's Fashion", "model_output.classification.iab_content.tier3.label": "Men's Clothing" @@ -1174,15 +1777,21 @@ "model_output.classification.iab_content.tier3.label": "Men's Clothing" }, "id": "mens-shoes-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping case for Style & Fashion > Men's Fashion > Men's Shoes and Footwear.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Good men's dress shoes for office use" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": 
"nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", "model_output.classification.iab_content.tier2.label": "Men's Fashion", "model_output.classification.iab_content.tier3.label": "Men's Shoes and Footwear" @@ -1194,17 +1803,23 @@ "model_output.classification.iab_content.tier3.label": "Men's Shoes and Footwear" }, "id": "mens-shoes-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical hard IAB mapping case for Style & Fashion > Men's Fashion > Men's Shoes and Footwear.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need men's footwear that works for workdays and weekend walking" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Real Estate", - "model_output.classification.iab_content.tier2.label": "Hotel Properties" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Travel", + "model_output.classification.iab_content.tier2.label": "Travel Type" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1212,15 +1827,31 @@ "model_output.classification.iab_content.tier2.label": "Hotel Properties" }, "id": "hotels-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "Travel", + "expected": "Real Estate", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Travel Type", + "expected": "Hotel Properties", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical easy IAB 
mapping case for Travel > Travel Type > Hotels and Motels.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need a hotel in Chicago for two nights" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Travel", "model_output.classification.iab_content.tier2.label": "Travel Type", "model_output.classification.iab_content.tier3.label": "Hotels and Motels" @@ -1232,17 +1863,23 @@ "model_output.classification.iab_content.tier3.label": "Hotels and Motels" }, "id": "hotels-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping case for Travel > Travel Type > Hotels and Motels.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best hotels near Times Square for a weekend trip" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Travel", - "model_output.classification.iab_content.tier2.label": "Travel Type" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1250,15 +1887,26 @@ "model_output.classification.iab_content.tier2.label": "Travel Type" }, "id": "hotels-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Travel Type", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Travel > Travel Type > Hotels and 
Motels.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Looking for a place to stay during a work trip, not general travel advice" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Real Estate", "model_output.classification.iab_content.tier2.label": "Apartments" }, @@ -1268,15 +1916,21 @@ "model_output.classification.iab_content.tier2.label": "Apartments" }, "id": "real-estate-rentals-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical easy IAB mapping case for Real Estate > Real Estate Renting and Leasing.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Apartments for rent near downtown Austin" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Real Estate", "model_output.classification.iab_content.tier2.label": "Apartments" }, @@ -1286,18 +1940,24 @@ "model_output.classification.iab_content.tier2.label": "Apartments" }, "id": "real-estate-rentals-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping case for Real Estate > Real Estate Renting and Leasing.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best neighborhoods to lease a two-bedroom apartment in Seattle" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Personal Finance", - 
"model_output.classification.iab_content.tier2.label": "Personal Debt", - "model_output.classification.iab_content.tier3.label": "Home Financing" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Real Estate", + "model_output.classification.iab_content.tier2.label": "Real Estate Renting and Leasing", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1306,18 +1966,39 @@ "model_output.classification.iab_content.tier3.label": "Home Financing" }, "id": "real-estate-rentals-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "Real Estate", + "expected": "Personal Finance", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Real Estate Renting and Leasing", + "expected": "Personal Debt", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Home Financing", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Real Estate > Real Estate Renting and Leasing.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need rental listings for a short move, not home-buying advice" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Business and Finance", - "model_output.classification.iab_content.tier2.label": "Business", - "model_output.classification.iab_content.tier3.label": "Green Solutions" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Sports", + "model_output.classification.iab_content.tier2.label": "Walking", 
+ "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1326,18 +2007,39 @@ "model_output.classification.iab_content.tier3.label": "Green Solutions" }, "id": "running-and-jogging-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "Sports", + "expected": "Business and Finance", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Walking", + "expected": "Business", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Green Solutions", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Healthy Living > Fitness and Exercise > Running and Jogging.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best running plan for a first 10k" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Healthy Living", - "model_output.classification.iab_content.tier2.label": "Fitness and Exercise", - "model_output.classification.iab_content.tier3.label": "Running and Jogging" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Sports", + "model_output.classification.iab_content.tier2.label": "Walking", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1346,18 +2048,39 @@ "model_output.classification.iab_content.tier3.label": "Running and Jogging" }, "id": "running-and-jogging-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "Sports", + "expected": "Healthy Living", + "path": 
"model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Walking", + "expected": "Fitness and Exercise", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Running and Jogging", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Healthy Living > Fitness and Exercise > Running and Jogging.", - "pass": true, + "pass": false, "status": "must_fix", "text": "How should I train for a half marathon as a beginner?" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Healthy Living", - "model_output.classification.iab_content.tier2.label": "Fitness and Exercise", - "model_output.classification.iab_content.tier3.label": "Running and Jogging" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Sports", + "model_output.classification.iab_content.tier2.label": "Walking", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1366,15 +2089,36 @@ "model_output.classification.iab_content.tier3.label": "Running and Jogging" }, "id": "running-and-jogging-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "Sports", + "expected": "Healthy Living", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Walking", + "expected": "Fitness and Exercise", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Running and 
Jogging", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Healthy Living > Fitness and Exercise > Running and Jogging.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need guidance on building a weekly jogging routine without getting injured" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Sports", "model_output.classification.iab_content.tier2.label": "Soccer" }, @@ -1384,15 +2128,21 @@ "model_output.classification.iab_content.tier2.label": "Soccer" }, "id": "soccer-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical easy IAB mapping case for Sports > Soccer.", - "pass": true, + "pass": false, "status": "must_fix", "text": "How do offside rules work in soccer?" 
}, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Sports", "model_output.classification.iab_content.tier2.label": "Soccer" }, @@ -1402,15 +2152,21 @@ "model_output.classification.iab_content.tier2.label": "Soccer" }, "id": "soccer-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping case for Sports > Soccer.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best soccer drills for beginner players" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Sports" }, "expected": { @@ -1418,17 +2174,23 @@ "model_output.classification.iab_content.tier1.label": "Sports" }, "id": "soccer-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical hard IAB mapping case for Sports > Soccer.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need help understanding football tactics for the Premier League, not fantasy sports" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Books and Literature", - "model_output.classification.iab_content.tier2.label": "Fiction" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Genres", + "model_output.classification.iab_content.tier2.label": "Fantasy" }, "expected": { 
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1436,17 +2198,33 @@ "model_output.classification.iab_content.tier2.label": "Fiction" }, "id": "fiction-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "Genres", + "expected": "Books and Literature", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Fantasy", + "expected": "Fiction", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Books and Literature > Fiction.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Recommend a good fantasy novel to read" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Books and Literature", - "model_output.classification.iab_content.tier2.label": "Fiction" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Travel", + "model_output.classification.iab_content.tier2.label": "Travel Type" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1454,31 +2232,58 @@ "model_output.classification.iab_content.tier2.label": "Fiction" }, "id": "fiction-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "Travel", + "expected": "Books and Literature", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Travel Type", + "expected": "Fiction", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Books and Literature > Fiction.", - "pass": 
true, + "pass": false, "status": "must_fix", "text": "Best fiction books for a long flight" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Books and Literature" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Genres" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Books and Literature" }, "id": "fiction-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "Genres", + "expected": "Books and Literature", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical hard IAB mapping case for Books and Literature > Fiction.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Looking for a character-driven novel, not comics or poetry" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Home & Garden", "model_output.classification.iab_content.tier2.label": "Remodeling & Construction" }, @@ -1488,18 +2293,24 @@ "model_output.classification.iab_content.tier2.label": "Remodeling & Construction" }, "id": "home-improvement-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical easy IAB mapping case for Home & Garden > Home Improvement.", - "pass": true, + "pass": false, "status": "must_fix", "text": "How much does a kitchen remodel usually cost?" 
}, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": "Personal Care", - "model_output.classification.iab_content.tier3.label": "Bath and Shower" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Home & Garden", + "model_output.classification.iab_content.tier2.label": "Indoor Environmental Quality", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1508,17 +2319,38 @@ "model_output.classification.iab_content.tier3.label": "Bath and Shower" }, "id": "home-improvement-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "Home & Garden", + "expected": "Style & Fashion", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Indoor Environmental Quality", + "expected": "Personal Care", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Bath and Shower", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Home & Garden > Home Improvement.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best tools for a DIY bathroom renovation" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Home & Garden", - "model_output.classification.iab_content.tier2.label": "Interior Decorating" + "model_output.classification.iab_content.tier2.label": null }, 
"expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1526,17 +2358,28 @@ "model_output.classification.iab_content.tier2.label": "Interior Decorating" }, "id": "home-improvement-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Interior Decorating", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Home & Garden > Home Improvement.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need practical advice for upgrading an older house, not interior decor inspiration" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Education", - "model_output.classification.iab_content.tier2.label": "Language Learning" + "model_output.classification.iab_content.tier1.label": "Technology & Computing", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1544,15 +2387,26 @@ "model_output.classification.iab_content.tier2.label": "Language Learning" }, "id": "online-education-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "Technology & Computing", + "expected": "Education", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": "Language Learning", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Education > Online Education.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best online courses for learning Python" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + 
"model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Careers", "model_output.classification.iab_content.tier2.label": "Remote Working" }, @@ -1562,9 +2416,15 @@ "model_output.classification.iab_content.tier2.label": "Remote Working" }, "id": "online-education-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping case for Education > Online Education.", - "pass": true, + "pass": false, "status": "must_fix", "text": "What are good platforms for remote professional classes?" }, @@ -1586,7 +2446,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Education", "model_output.classification.iab_content.tier2.label": "College Education" }, @@ -1596,9 +2456,15 @@ "model_output.classification.iab_content.tier2.label": "College Education" }, "id": "postgraduate-education-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical easy IAB mapping case for Education > College Education > Postgraduate Education.", - "pass": true, + "pass": false, "status": "must_fix", "text": "best universities to study masters" }, @@ -1606,9 +2472,9 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Education", - "model_output.classification.iab_content.tier2.label": "College Education", - "model_output.classification.iab_content.tier3.label": "Postgraduate Education", - "model_output.classification.iab_content.tier4.label": "Professional School" + 
"model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1618,15 +2484,31 @@ "model_output.classification.iab_content.tier4.label": "Professional School" }, "id": "postgraduate-education-medium", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "College Education", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Postgraduate Education", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Professional School", + "path": "model_output.classification.iab_content.tier4.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Education > College Education > Postgraduate Education.", - "pass": true, + "pass": false, "status": "must_fix", "text": "which graduate schools have strong data science programs" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Education", "model_output.classification.iab_content.tier2.label": "College Education" }, @@ -1636,9 +2518,15 @@ "model_output.classification.iab_content.tier2.label": "College Education" }, "id": "postgraduate-education-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical hard IAB mapping case for Education > College Education > Postgraduate Education.", - "pass": true, + "pass": false, "status": "must_fix", "text": "need postgraduate options for a master's degree, not short online courses" }, @@ -1646,8 +2534,8 @@ "actual": { 
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Medical Health", - "model_output.classification.iab_content.tier2.label": "Diseases and Conditions", - "model_output.classification.iab_content.tier3.label": "Allergies" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1656,9 +2544,20 @@ "model_output.classification.iab_content.tier3.label": "Allergies" }, "id": "medical-health-easy", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Diseases and Conditions", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Allergies", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Medical Health.", - "pass": true, + "pass": false, "status": "must_fix", "text": "what do these allergy symptoms mean" }, @@ -1667,8 +2566,8 @@ "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Medical Health", "model_output.classification.iab_content.tier2.label": "Diseases and Conditions", - "model_output.classification.iab_content.tier3.label": "Injuries", - "model_output.classification.iab_content.tier4.label": "First Aid" + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1678,18 +2577,29 @@ "model_output.classification.iab_content.tier4.label": "First Aid" }, "id": "medical-health-medium", - "mismatches": [], + "mismatches": [ + { + "actual": null, + "expected": "Injuries", + "path": "model_output.classification.iab_content.tier3.label" 
+ }, + { + "actual": null, + "expected": "First Aid", + "path": "model_output.classification.iab_content.tier4.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Medical Health.", - "pass": true, + "pass": false, "status": "must_fix", "text": "when should i see a doctor for persistent knee pain" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Healthy Living", - "model_output.classification.iab_content.tier2.label": "Wellness", - "model_output.classification.iab_content.tier3.label": "Physical Therapy" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Medical Health", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1698,15 +2608,36 @@ "model_output.classification.iab_content.tier3.label": "Physical Therapy" }, "id": "medical-health-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "Medical Health", + "expected": "Healthy Living", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Wellness", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Physical Therapy", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Medical Health.", - "pass": true, + "pass": false, "status": "must_fix", "text": "need medical advice about symptoms, not wellness or fitness tips" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + 
"model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Careers", "model_output.classification.iab_content.tier2.label": "Remote Working" }, @@ -1716,18 +2647,24 @@ "model_output.classification.iab_content.tier2.label": "Remote Working" }, "id": "careers-job-search-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical easy IAB mapping case for Careers > Job Search.", - "pass": true, + "pass": false, "status": "must_fix", "text": "best remote jobs for data analysts" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Business and Finance", - "model_output.classification.iab_content.tier2.label": "Business", - "model_output.classification.iab_content.tier3.label": "Sales" + "model_output.classification.iab_content.tier2.label": "Industries", + "model_output.classification.iab_content.tier3.label": "Management Consulting Industry" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1736,17 +2673,33 @@ "model_output.classification.iab_content.tier3.label": "Sales" }, "id": "careers-job-search-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Industries", + "expected": "Business", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": "Management Consulting Industry", + "expected": "Sales", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Careers > Job Search.", - "pass": true, + "pass": false, 
"status": "must_fix", "text": "where should i look for product manager openings" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Careers", - "model_output.classification.iab_content.tier2.label": "Job Search" + "model_output.classification.iab_content.tier1.label": "Genres", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1754,17 +2707,28 @@ "model_output.classification.iab_content.tier2.label": "Job Search" }, "id": "careers-job-search-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "Genres", + "expected": "Careers", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": "Job Search", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Careers > Job Search.", - "pass": true, + "pass": false, "status": "must_fix", "text": "need help finding a new role and preparing for interviews" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Food & Drink", - "model_output.classification.iab_content.tier2.label": "Food Movements" + "model_output.classification.iab_content.tier1.label": "Personal Celebrations & Life Events", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1772,18 +2736,29 @@ "model_output.classification.iab_content.tier2.label": "Food Movements" }, "id": "personal-finance-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "Personal Celebrations & Life Events", + "expected": "Food & Drink", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": 
"Food Movements", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical easy IAB mapping case for Personal Finance > Financial Planning.", - "pass": true, + "pass": false, "status": "must_fix", "text": "how much should i save each month" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Business and Finance", - "model_output.classification.iab_content.tier2.label": "Economy", - "model_output.classification.iab_content.tier3.label": "Financial Reform" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Personal Finance", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1792,17 +2767,38 @@ "model_output.classification.iab_content.tier3.label": "Financial Reform" }, "id": "personal-finance-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "Personal Finance", + "expected": "Business and Finance", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Economy", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Financial Reform", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Personal Finance > Financial Planning.", - "pass": true, + "pass": false, "status": "must_fix", "text": "best budgeting approach for a growing family" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + 
"model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Personal Finance", - "model_output.classification.iab_content.tier2.label": "Retirement Planning" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1810,15 +2806,26 @@ "model_output.classification.iab_content.tier2.label": "Retirement Planning" }, "id": "personal-finance-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Retirement Planning", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Personal Finance > Financial Planning.", - "pass": true, + "pass": false, "status": "must_fix", "text": "need help planning savings and retirement, not business finance advice" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Family and Relationships", "model_output.classification.iab_content.tier2.label": "Parenting" }, @@ -1828,17 +2835,23 @@ "model_output.classification.iab_content.tier2.label": "Parenting" }, "id": "parenting-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical easy IAB mapping case for Family and Relationships > Parenting.", - "pass": true, + "pass": false, "status": "must_fix", "text": "tips for parenting a toddler" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Family and 
Relationships", - "model_output.classification.iab_content.tier2.label": "Parenting" + "model_output.classification.iab_content.tier1.label": "Education", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1846,18 +2859,29 @@ "model_output.classification.iab_content.tier2.label": "Parenting" }, "id": "parenting-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "Education", + "expected": "Family and Relationships", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": "Parenting", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Family and Relationships > Parenting.", - "pass": true, + "pass": false, "status": "must_fix", "text": "how do i help my teenager spend less time online" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Family and Relationships", "model_output.classification.iab_content.tier2.label": "Parenting", - "model_output.classification.iab_content.tier3.label": "Special Needs Kids" + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1866,15 +2890,26 @@ "model_output.classification.iab_content.tier3.label": "Special Needs Kids" }, "id": "parenting-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Special Needs Kids", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Family and Relationships > 
Parenting.", - "pass": true, + "pass": false, "status": "must_fix", "text": "need parenting advice for a child starting preschool" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Home & Garden" }, "expected": { @@ -1882,17 +2917,23 @@ "model_output.classification.iab_content.tier1.label": "Home & Garden" }, "id": "gardening-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical easy IAB mapping case for Home & Garden > Gardening.", - "pass": true, + "pass": false, "status": "must_fix", "text": "best plants for a small balcony garden" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Home & Garden", - "model_output.classification.iab_content.tier2.label": "Gardening" + "model_output.classification.iab_content.tier1.label": "Food & Drink", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1900,15 +2941,26 @@ "model_output.classification.iab_content.tier2.label": "Gardening" }, "id": "gardening-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "Food & Drink", + "expected": "Home & Garden", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": null, + "expected": "Gardening", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Home & Garden > Gardening.", - "pass": true, + "pass": false, "status": "must_fix", "text": "how often should i water tomato plants" }, { "actual": { - 
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Home & Garden" }, "expected": { @@ -1916,15 +2968,21 @@ "model_output.classification.iab_content.tier1.label": "Home & Garden" }, "id": "gardening-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical hard IAB mapping case for Home & Garden > Gardening.", - "pass": true, + "pass": false, "status": "must_fix", "text": "need gardening advice for a shady backyard, not interior decor ideas" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Entertainment", "model_output.classification.iab_content.tier2.label": "Movies" }, @@ -1934,18 +2992,24 @@ "model_output.classification.iab_content.tier2.label": "Movies" }, "id": "movies-easy", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical easy IAB mapping case for Entertainment > Movies.", - "pass": true, + "pass": false, "status": "must_fix", "text": "What movie should we watch tonight?" 
}, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Video Gaming", - "model_output.classification.iab_content.tier2.label": "Video Game Genres", - "model_output.classification.iab_content.tier3.label": "Horror Video Games" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Genres", + "model_output.classification.iab_content.tier2.label": "Horror", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1954,18 +3018,39 @@ "model_output.classification.iab_content.tier3.label": "Horror Video Games" }, "id": "movies-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "Genres", + "expected": "Video Gaming", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Horror", + "expected": "Video Game Genres", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Horror Video Games", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Entertainment > Movies.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best thriller movies from the last few years" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Entertainment", - "model_output.classification.iab_content.tier2.label": "Music", - "model_output.classification.iab_content.tier3.label": "Soundtracks, TV and Showtunes" + 
"model_output.classification.iab_content.tier2.label": "Movies", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -1974,9 +3059,25 @@ "model_output.classification.iab_content.tier3.label": "Soundtracks, TV and Showtunes" }, "id": "movies-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Movies", + "expected": "Music", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Soundtracks, TV and Showtunes", + "path": "model_output.classification.iab_content.tier3.label" + } + ], "notes": "Cross-vertical hard IAB mapping case for Entertainment > Movies.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Looking for film recommendations, not TV shows or music" } @@ -1985,21 +3086,21 @@ "iab_cross_vertical_quality_target_eval": { "by_status": { "must_fix": { - "failed": 86, - "passed": 4, + "failed": 57, + "passed": 33, "total": 90 } }, - "cases_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/examples/iab_cross_vertical_mapping_cases.json", + "cases_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/examples/iab_cross_vertical_mapping_cases.json", "count": 90, - "failed": 86, - "passed": 4, + "failed": 57, + "passed": 33, "results": [ { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Travel", - "model_output.classification.iab_content.tier2.label": "Travel Type" + "model_output.classification.iab_content.tier2.label": "Travel 
Accessories" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2014,7 +3115,12 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Travel Type", + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Travel Accessories", "expected": "Auto Buying and Selling", "path": "model_output.classification.iab_content.tier2.label" } @@ -2026,7 +3132,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Automotive", "model_output.classification.iab_content.tier2.label": "Auto Body Styles" }, @@ -2037,6 +3143,11 @@ }, "id": "auto-buying-medium", "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, { "actual": "Auto Body Styles", "expected": "Auto Buying and Selling", @@ -2050,9 +3161,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Automotive", - "model_output.classification.iab_content.tier2.label": "Auto Type" + "model_output.classification.iab_content.tier2.label": "Auto Shows" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -2062,7 +3173,12 @@ "id": "auto-buying-hard", "mismatches": [ { - "actual": "Auto Type", + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Auto Shows", "expected": "Auto Buying and Selling", "path": "model_output.classification.iab_content.tier2.label" } @@ -2074,10 +3190,10 @@ }, { "actual": { - 
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -2093,17 +3209,12 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Computing", + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Software and Applications", + "actual": null, "expected": "Sales", "path": "model_output.classification.iab_content.tier3.label" } @@ -2116,9 +3227,9 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet" + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -2129,7 +3240,7 @@ "id": "sales-crm-medium", "mismatches": [ { - "actual": "Technology & Computing", + "actual": "Careers", "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, @@ -2139,12 +3250,12 @@ "path": 
"model_output.classification.iab_content.mapping_mode" }, { - "actual": "Computing", + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Internet", + "actual": null, "expected": "Sales", "path": "model_output.classification.iab_content.tier3.label" } @@ -2158,8 +3269,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Business and Finance", - "model_output.classification.iab_content.tier2.label": "Business", - "model_output.classification.iab_content.tier3.label": "Sales" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -2173,6 +3284,16 @@ "actual": "nearest_equivalent", "expected": "exact", "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Business", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Sales", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical hard IAB mapping case for Business and Finance > Business > Sales.", @@ -2184,8 +3305,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -2206,12 +3327,12 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Computing", + "actual": null, 
"expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Internet", + "actual": null, "expected": "Marketing and Advertising", "path": "model_output.classification.iab_content.tier3.label" } @@ -2225,7 +3346,7 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Business and Finance", - "model_output.classification.iab_content.tier2.label": "Business", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -2241,6 +3362,11 @@ "expected": "exact", "path": "model_output.classification.iab_content.mapping_mode" }, + { + "actual": null, + "expected": "Business", + "path": "model_output.classification.iab_content.tier2.label" + }, { "actual": null, "expected": "Marketing and Advertising", @@ -2255,9 +3381,9 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier1.label": "Business and Finance", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -2267,23 +3393,18 @@ }, "id": "marketing-tools-hard", "mismatches": [ - { - "actual": "Technology & Computing", - "expected": "Business and Finance", - "path": "model_output.classification.iab_content.tier1.label" - }, { "actual": "nearest_equivalent", "expected": "exact", "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Computing", + "actual": null, "expected": "Business", "path": 
"model_output.classification.iab_content.tier2.label" }, { - "actual": "Software and Applications", + "actual": null, "expected": "Marketing and Advertising", "path": "model_output.classification.iab_content.tier3.label" } @@ -2298,7 +3419,7 @@ "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet" + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -2324,7 +3445,7 @@ "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Internet", + "actual": null, "expected": "Business I.T.", "path": "model_output.classification.iab_content.tier3.label" } @@ -2337,8 +3458,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Careers", - "model_output.classification.iab_content.tier2.label": "Job Search", + "model_output.classification.iab_content.tier1.label": "Business and Finance", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -2349,18 +3470,13 @@ }, "id": "business-it-medium", "mismatches": [ - { - "actual": "Careers", - "expected": "Business and Finance", - "path": "model_output.classification.iab_content.tier1.label" - }, { "actual": "nearest_equivalent", "expected": "exact", "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Job Search", + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, @@ -2380,7 +3496,7 @@ "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": 
"Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet" + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -2406,7 +3522,7 @@ "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Internet", + "actual": null, "expected": "Business I.T.", "path": "model_output.classification.iab_content.tier3.label" } @@ -2418,7 +3534,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink", "model_output.classification.iab_content.tier2.label": "Dining Out" }, @@ -2428,21 +3544,15 @@ "model_output.classification.iab_content.tier2.label": "Dining Out" }, "id": "dining-out-easy", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical easy IAB mapping case for Food & Drink > Dining Out.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Book a table for six tonight" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink", "model_output.classification.iab_content.tier2.label": "Dining Out" }, @@ -2452,21 +3562,15 @@ "model_output.classification.iab_content.tier2.label": "Dining Out" }, "id": "dining-out-medium", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical medium IAB mapping case for Food & Drink > 
Dining Out.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Good restaurants for a client dinner downtown" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink", "model_output.classification.iab_content.tier2.label": "Dining Out" }, @@ -2476,23 +3580,17 @@ "model_output.classification.iab_content.tier2.label": "Dining Out" }, "id": "dining-out-hard", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Food & Drink > Dining Out.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Need a place to eat tonight where I can make a reservation online" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.tier2.label": "Alcoholic Beverages" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -2500,28 +3598,17 @@ "model_output.classification.iab_content.tier2.label": "Alcoholic Beverages" }, "id": "alcoholic-beverages-easy", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Alcoholic Beverages", - "path": "model_output.classification.iab_content.tier2.label" - } - ], + "mismatches": [], "notes": "Cross-vertical easy IAB mapping case for Food & Drink > Alcoholic Beverages.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Which whiskey 
cocktail should I order?" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.tier2.label": "Alcoholic Beverages" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -2529,28 +3616,17 @@ "model_output.classification.iab_content.tier2.label": "Alcoholic Beverages" }, "id": "alcoholic-beverages-medium", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Alcoholic Beverages", - "path": "model_output.classification.iab_content.tier2.label" - } - ], + "mismatches": [], "notes": "Cross-vertical medium IAB mapping case for Food & Drink > Alcoholic Beverages.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Best vodka drinks for beginners" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.tier2.label": "Non-Alcoholic Beverages" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -2560,12 +3636,7 @@ "id": "alcoholic-beverages-hard", "mismatches": [ { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, + "actual": "Non-Alcoholic Beverages", "expected": "Alcoholic Beverages", "path": "model_output.classification.iab_content.tier2.label" } @@ -2578,7 +3649,7 @@ { "actual": { 
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", + "model_output.classification.iab_content.tier1.label": "Careers", "model_output.classification.iab_content.tier2.label": null }, "expected": { @@ -2588,6 +3659,11 @@ }, "id": "artificial-intelligence-easy", "mismatches": [ + { + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + }, { "actual": "nearest_equivalent", "expected": "exact", @@ -2606,7 +3682,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Education", "model_output.classification.iab_content.tier2.label": "Language Learning" }, @@ -2622,11 +3698,6 @@ "expected": "Technology & Computing", "path": "model_output.classification.iab_content.tier1.label" }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, { "actual": "Language Learning", "expected": "Artificial Intelligence", @@ -2640,7 +3711,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Education", "model_output.classification.iab_content.tier2.label": "Language Learning" }, @@ -2656,11 +3727,6 @@ "expected": "Technology & Computing", "path": "model_output.classification.iab_content.tier1.label" }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, { "actual": "Language Learning", "expected": "Artificial Intelligence", @@ -2675,8 +3741,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": 
"nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Business and Finance", - "model_output.classification.iab_content.tier2.label": "Business", + "model_output.classification.iab_content.tier1.label": "Technology & Computing", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -2687,18 +3753,13 @@ }, "id": "software-apps-easy", "mismatches": [ - { - "actual": "Business and Finance", - "expected": "Technology & Computing", - "path": "model_output.classification.iab_content.tier1.label" - }, { "actual": "nearest_equivalent", "expected": "exact", "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Business", + "actual": null, "expected": "Computing", "path": "model_output.classification.iab_content.tier2.label" }, @@ -2717,8 +3778,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -2734,7 +3795,12 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Internet", + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, "expected": "Software and Applications", "path": "model_output.classification.iab_content.tier3.label" } @@ -2748,8 +3814,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - 
"model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -2763,6 +3829,16 @@ "actual": "nearest_equivalent", "expected": "exact", "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Computer Software and Applications.", @@ -2772,11 +3848,11 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications", - "model_output.classification.iab_content.tier4.label": "Communication" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": "Remote Working", + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -2788,9 +3864,24 @@ "id": "communication-software-easy", "mismatches": [ { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": "Careers", + "expected": "Technology & Computing", + 
"path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "Remote Working", + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Communication", + "path": "model_output.classification.iab_content.tier4.label" } ], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Computing > Computer Software and Applications > Communication.", @@ -2803,8 +3894,8 @@ "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications", - "model_output.classification.iab_content.tier4.label": "Communication" + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -2819,6 +3910,16 @@ "actual": "nearest_equivalent", "expected": "exact", "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Communication", + "path": "model_output.classification.iab_content.tier4.label" } ], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Computer Software and Applications > Communication.", @@ -2829,10 +3930,10 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - 
"model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications", - "model_output.classification.iab_content.tier4.label": "Communication" + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -2843,10 +3944,30 @@ }, "id": "communication-software-hard", "mismatches": [ + { + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + }, { "actual": "nearest_equivalent", "expected": "exact", "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Software and Applications", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Communication", + "path": "model_output.classification.iab_content.tier4.label" } ], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Computer Software and Applications > Communication.", @@ -2859,8 +3980,8 @@ "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet", - "model_output.classification.iab_content.tier4.label": "Web Hosting" + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { 
"model_output.classification.iab_content.mapping_mode": "exact", @@ -2875,6 +3996,16 @@ "actual": "nearest_equivalent", "expected": "exact", "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Internet", + "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": null, + "expected": "Web Hosting", + "path": "model_output.classification.iab_content.tier4.label" } ], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Computing > Internet > Web Hosting.", @@ -2884,7 +4015,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Internet", @@ -2898,21 +4029,15 @@ "model_output.classification.iab_content.tier4.label": "Web Hosting" }, "id": "web-hosting-medium", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Internet > Web Hosting.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Best hosting platform for a startup website" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Internet", @@ -2926,21 +4051,15 @@ "model_output.classification.iab_content.tier4.label": "Web Hosting" }, "id": "web-hosting-hard", - 
"mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Internet > Web Hosting.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Need a managed hosting provider to deploy and run our marketing site" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Laptops" @@ -2952,21 +4071,15 @@ "model_output.classification.iab_content.tier3.label": "Laptops" }, "id": "laptops-easy", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Computing > Laptops.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Which laptop should I buy for college?" 
}, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Laptops" @@ -2978,24 +4091,18 @@ "model_output.classification.iab_content.tier3.label": "Laptops" }, "id": "laptops-medium", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Laptops.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Best laptop for work and study under 1200" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Consumer Electronics", - "model_output.classification.iab_content.tier3.label": "Smartphones" + "model_output.classification.iab_content.tier2.label": "Computing", + "model_output.classification.iab_content.tier3.label": "Laptops" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3004,25 +4111,9 @@ "model_output.classification.iab_content.tier3.label": "Laptops" }, "id": "laptops-hard", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Consumer Electronics", - "expected": "Computing", - "path": "model_output.classification.iab_content.tier2.label" - }, - { - "actual": "Smartphones", - "expected": "Laptops", - "path": 
"model_output.classification.iab_content.tier3.label" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Laptops.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Need a portable computer with good battery life for everyday work" }, @@ -3059,7 +4150,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Desktops" @@ -3071,21 +4162,15 @@ "model_output.classification.iab_content.tier3.label": "Desktops" }, "id": "desktops-medium", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Computing > Desktops.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Which desktop computer should I buy for a home office?" 
}, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Desktops" @@ -3097,24 +4182,18 @@ "model_output.classification.iab_content.tier3.label": "Desktops" }, "id": "desktops-hard", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Computing > Desktops.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Need a desktop PC with strong performance for creative work" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Consumer Electronics", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier3.label": "Smartphones" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3123,26 +4202,15 @@ "model_output.classification.iab_content.tier3.label": "Smartphones" }, "id": "smartphones-easy", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Smartphones", - "path": "model_output.classification.iab_content.tier3.label" - } - ], + "mismatches": [], "notes": "Cross-vertical easy IAB mapping case for Technology & Computing > Consumer Electronics > Smartphones.", - "pass": false, + "pass": true, 
"status": "must_fix", "text": "Best phone with a good camera under 700" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Consumer Electronics", "model_output.classification.iab_content.tier3.label": "Smartphones" @@ -3154,21 +4222,15 @@ "model_output.classification.iab_content.tier3.label": "Smartphones" }, "id": "smartphones-medium", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical medium IAB mapping case for Technology & Computing > Consumer Electronics > Smartphones.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Should I buy an iPhone or Pixel this year?" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Consumer Electronics", "model_output.classification.iab_content.tier3.label": "Smartphones" @@ -3180,15 +4242,9 @@ "model_output.classification.iab_content.tier3.label": "Smartphones" }, "id": "smartphones-hard", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Technology & Computing > Consumer Electronics > Smartphones.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Need a new smartphone with strong battery life and a clean software experience" }, @@ -3210,7 +4266,7 @@ }, { "actual": { - 
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion" }, "expected": { @@ -3218,15 +4274,21 @@ "model_output.classification.iab_content.tier1.label": "Style & Fashion" }, "id": "style-fashion-parent-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical medium IAB mapping case for Style & Fashion.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Affordable fashion accessories for everyday wear" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion" }, "expected": { @@ -3234,18 +4296,24 @@ "model_output.classification.iab_content.tier1.label": "Style & Fashion" }, "id": "style-fashion-parent-hard", - "mismatches": [], + "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + } + ], "notes": "Cross-vertical hard IAB mapping case for Style & Fashion.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Need style recommendations for clothing and footwear without a specific brand in mind" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": "Women's Fashion", - "model_output.classification.iab_content.tier3.label": "Women's Clothing" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Sports", + 
"model_output.classification.iab_content.tier2.label": "Walking", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3256,12 +4324,17 @@ "id": "womens-shoes-easy", "mismatches": [ { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": "Sports", + "expected": "Style & Fashion", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "Walking", + "expected": "Women's Fashion", + "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Women's Clothing", + "actual": null, "expected": "Women's Shoes and Footwear", "path": "model_output.classification.iab_content.tier3.label" } @@ -3273,10 +4346,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": "Women's Fashion", - "model_output.classification.iab_content.tier3.label": "Women's Clothing" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Sports", + "model_output.classification.iab_content.tier2.label": "Walking", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3287,12 +4360,17 @@ "id": "womens-shoes-medium", "mismatches": [ { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": "Sports", + "expected": "Style & Fashion", + "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Women's Clothing", + "actual": "Walking", + "expected": "Women's Fashion", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": 
null, "expected": "Women's Shoes and Footwear", "path": "model_output.classification.iab_content.tier3.label" } @@ -3304,7 +4382,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", "model_output.classification.iab_content.tier2.label": "Women's Fashion", "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" @@ -3316,15 +4394,9 @@ "model_output.classification.iab_content.tier3.label": "Women's Shoes and Footwear" }, "id": "womens-shoes-hard", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Style & Fashion > Women's Fashion > Women's Shoes and Footwear.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Need women's footwear for commuting that looks polished but feels comfortable" }, @@ -3333,7 +4405,7 @@ "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Style & Fashion", "model_output.classification.iab_content.tier2.label": "Men's Fashion", - "model_output.classification.iab_content.tier3.label": "Men's Clothing" + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3349,7 +4421,7 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Men's Clothing", + "actual": null, "expected": "Men's Shoes and Footwear", "path": "model_output.classification.iab_content.tier3.label" } @@ -3361,7 +4433,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", 
"model_output.classification.iab_content.tier1.label": "Style & Fashion", "model_output.classification.iab_content.tier2.label": "Men's Fashion", "model_output.classification.iab_content.tier3.label": "Men's Clothing" @@ -3374,11 +4446,6 @@ }, "id": "mens-shoes-medium", "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, { "actual": "Men's Clothing", "expected": "Men's Shoes and Footwear", @@ -3392,7 +4459,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Style & Fashion", "model_output.classification.iab_content.tier2.label": "Men's Fashion", "model_output.classification.iab_content.tier3.label": "Men's Shoes and Footwear" @@ -3404,24 +4471,18 @@ "model_output.classification.iab_content.tier3.label": "Men's Shoes and Footwear" }, "id": "mens-shoes-hard", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Style & Fashion > Men's Fashion > Men's Shoes and Footwear.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Need men's footwear that works for workdays and weekend walking" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Real Estate", - "model_output.classification.iab_content.tier2.label": "Hotel Properties", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Travel", + "model_output.classification.iab_content.tier2.label": "Travel Type", + 
"model_output.classification.iab_content.tier3.label": "Hotels and Motels" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3430,36 +4491,15 @@ "model_output.classification.iab_content.tier3.label": "Hotels and Motels" }, "id": "hotels-easy", - "mismatches": [ - { - "actual": "Real Estate", - "expected": "Travel", - "path": "model_output.classification.iab_content.tier1.label" - }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Hotel Properties", - "expected": "Travel Type", - "path": "model_output.classification.iab_content.tier2.label" - }, - { - "actual": null, - "expected": "Hotels and Motels", - "path": "model_output.classification.iab_content.tier3.label" - } - ], + "mismatches": [], "notes": "Cross-vertical easy IAB mapping case for Travel > Travel Type > Hotels and Motels.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Need a hotel in Chicago for two nights" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Travel", "model_output.classification.iab_content.tier2.label": "Travel Type", "model_output.classification.iab_content.tier3.label": "Hotels and Motels" @@ -3471,23 +4511,17 @@ "model_output.classification.iab_content.tier3.label": "Hotels and Motels" }, "id": "hotels-medium", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical medium IAB mapping case for Travel > Travel Type > Hotels and Motels.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Best hotels near Times Square for a weekend trip" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": 
"nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Travel", - "model_output.classification.iab_content.tier2.label": "Travel Type", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.iab_content.tier3.label": null }, "expected": { @@ -3499,9 +4533,9 @@ "id": "hotels-hard", "mismatches": [ { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": null, + "expected": "Travel Type", + "path": "model_output.classification.iab_content.tier2.label" }, { "actual": null, @@ -3516,7 +4550,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Real Estate", "model_output.classification.iab_content.tier2.label": "Apartments" }, @@ -3527,11 +4561,6 @@ }, "id": "real-estate-rentals-easy", "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, { "actual": "Apartments", "expected": "Real Estate Renting and Leasing", @@ -3545,7 +4574,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Real Estate", "model_output.classification.iab_content.tier2.label": "Apartments" }, @@ -3556,6 +4585,11 @@ }, "id": "real-estate-rentals-medium", "mismatches": [ + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, { "actual": "Apartments", "expected": "Real Estate Renting and Leasing", @@ -3569,9 +4603,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": 
"nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Personal Finance", - "model_output.classification.iab_content.tier2.label": "Personal Debt" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Real Estate", + "model_output.classification.iab_content.tier2.label": "Real Estate Renting and Leasing" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3579,34 +4613,18 @@ "model_output.classification.iab_content.tier2.label": "Real Estate Renting and Leasing" }, "id": "real-estate-rentals-hard", - "mismatches": [ - { - "actual": "Personal Finance", - "expected": "Real Estate", - "path": "model_output.classification.iab_content.tier1.label" - }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Personal Debt", - "expected": "Real Estate Renting and Leasing", - "path": "model_output.classification.iab_content.tier2.label" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Real Estate > Real Estate Renting and Leasing.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Need rental listings for a short move, not home-buying advice" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Business and Finance", - "model_output.classification.iab_content.tier2.label": "Business", - "model_output.classification.iab_content.tier3.label": "Green Solutions" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Sports", + "model_output.classification.iab_content.tier2.label": "Walking", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3617,22 
+4635,17 @@ "id": "running-and-jogging-easy", "mismatches": [ { - "actual": "Business and Finance", + "actual": "Sports", "expected": "Healthy Living", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Business", + "actual": "Walking", "expected": "Fitness and Exercise", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Green Solutions", + "actual": null, "expected": "Running and Jogging", "path": "model_output.classification.iab_content.tier3.label" } @@ -3644,10 +4657,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Healthy Living", - "model_output.classification.iab_content.tier2.label": "Fitness and Exercise", - "model_output.classification.iab_content.tier3.label": "Running and Jogging" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Sports", + "model_output.classification.iab_content.tier2.label": "Walking", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3658,9 +4671,19 @@ "id": "running-and-jogging-medium", "mismatches": [ { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": "Sports", + "expected": "Healthy Living", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "Walking", + "expected": "Fitness and Exercise", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Running and Jogging", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical medium IAB mapping case for Healthy 
Living > Fitness and Exercise > Running and Jogging.", @@ -3670,10 +4693,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Healthy Living", - "model_output.classification.iab_content.tier2.label": "Fitness and Exercise", - "model_output.classification.iab_content.tier3.label": "Running and Jogging" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Sports", + "model_output.classification.iab_content.tier2.label": "Walking", + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3684,9 +4707,19 @@ "id": "running-and-jogging-hard", "mismatches": [ { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": "Sports", + "expected": "Healthy Living", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "Walking", + "expected": "Fitness and Exercise", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Running and Jogging", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical hard IAB mapping case for Healthy Living > Fitness and Exercise > Running and Jogging.", @@ -3696,7 +4729,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Sports", "model_output.classification.iab_content.tier2.label": "Soccer" }, @@ -3706,21 +4739,15 @@ "model_output.classification.iab_content.tier2.label": "Soccer" }, "id": "soccer-easy", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": 
"model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical easy IAB mapping case for Sports > Soccer.", - "pass": false, + "pass": true, "status": "must_fix", "text": "How do offside rules work in soccer?" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Sports", "model_output.classification.iab_content.tier2.label": "Soccer" }, @@ -3730,23 +4757,17 @@ "model_output.classification.iab_content.tier2.label": "Soccer" }, "id": "soccer-medium", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical medium IAB mapping case for Sports > Soccer.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Best soccer drills for beginner players" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Sports", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.tier2.label": "Soccer" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3754,28 +4775,17 @@ "model_output.classification.iab_content.tier2.label": "Soccer" }, "id": "soccer-hard", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Soccer", - "path": "model_output.classification.iab_content.tier2.label" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Sports > Soccer.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Need help 
understanding football tactics for the Premier League, not fantasy sports" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Books and Literature", - "model_output.classification.iab_content.tier2.label": "Fiction" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Genres", + "model_output.classification.iab_content.tier2.label": "Fantasy" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3785,9 +4795,14 @@ "id": "fiction-easy", "mismatches": [ { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": "Genres", + "expected": "Books and Literature", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "Fantasy", + "expected": "Fiction", + "path": "model_output.classification.iab_content.tier2.label" } ], "notes": "Cross-vertical easy IAB mapping case for Books and Literature > Fiction.", @@ -3797,9 +4812,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Books and Literature", - "model_output.classification.iab_content.tier2.label": "Fiction" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Travel", + "model_output.classification.iab_content.tier2.label": "Travel Type" }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -3807,17 +4822,33 @@ "model_output.classification.iab_content.tier2.label": "Fiction" }, "id": "fiction-medium", - "mismatches": [], + "mismatches": [ + { + "actual": "Travel", + "expected": "Books and Literature", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": 
"exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": "Travel Type", + "expected": "Fiction", + "path": "model_output.classification.iab_content.tier2.label" + } + ], "notes": "Cross-vertical medium IAB mapping case for Books and Literature > Fiction.", - "pass": true, + "pass": false, "status": "must_fix", "text": "Best fiction books for a long flight" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Books and Literature", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Genres", + "model_output.classification.iab_content.tier2.label": "Romance" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3827,12 +4858,12 @@ "id": "fiction-hard", "mismatches": [ { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": "Genres", + "expected": "Books and Literature", + "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": null, + "actual": "Romance", "expected": "Fiction", "path": "model_output.classification.iab_content.tier2.label" } @@ -3844,7 +4875,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Home & Garden", "model_output.classification.iab_content.tier2.label": "Remodeling & Construction" }, @@ -3855,11 +4886,6 @@ }, "id": "home-improvement-easy", "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, { "actual": "Remodeling & Construction", "expected": 
"Home Improvement", @@ -3873,9 +4899,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Style & Fashion", - "model_output.classification.iab_content.tier2.label": "Personal Care" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Home & Garden", + "model_output.classification.iab_content.tier2.label": "Indoor Environmental Quality" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3885,17 +4911,7 @@ "id": "home-improvement-medium", "mismatches": [ { - "actual": "Style & Fashion", - "expected": "Home & Garden", - "path": "model_output.classification.iab_content.tier1.label" - }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Personal Care", + "actual": "Indoor Environmental Quality", "expected": "Home Improvement", "path": "model_output.classification.iab_content.tier2.label" } @@ -3907,9 +4923,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Home & Garden", - "model_output.classification.iab_content.tier2.label": "Interior Decorating" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3919,12 +4935,7 @@ "id": "home-improvement-hard", "mismatches": [ { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Interior Decorating", + "actual": null, "expected": "Home Improvement", "path": "model_output.classification.iab_content.tier2.label" } @@ -3937,8 +4948,8 @@ { "actual": { 
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Education", - "model_output.classification.iab_content.tier2.label": "Language Learning" + "model_output.classification.iab_content.tier1.label": "Technology & Computing", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -3947,13 +4958,18 @@ }, "id": "online-education-easy", "mismatches": [ + { + "actual": "Technology & Computing", + "expected": "Education", + "path": "model_output.classification.iab_content.tier1.label" + }, { "actual": "nearest_equivalent", "expected": "exact", "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Language Learning", + "actual": null, "expected": "Online Education", "path": "model_output.classification.iab_content.tier2.label" } @@ -3965,7 +4981,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Careers", "model_output.classification.iab_content.tier2.label": "Remote Working" }, @@ -3981,11 +4997,6 @@ "expected": "Education", "path": "model_output.classification.iab_content.tier1.label" }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, { "actual": "Remote Working", "expected": "Online Education", @@ -4033,10 +5044,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Education", "model_output.classification.iab_content.tier2.label": "College Education", - "model_output.classification.iab_content.tier3.label": null + 
"model_output.classification.iab_content.tier3.label": "Postgraduate Education" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4045,20 +5056,9 @@ "model_output.classification.iab_content.tier3.label": "Postgraduate Education" }, "id": "postgraduate-education-easy", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Postgraduate Education", - "path": "model_output.classification.iab_content.tier3.label" - } - ], + "mismatches": [], "notes": "Cross-vertical easy IAB mapping case for Education > College Education > Postgraduate Education.", - "pass": false, + "pass": true, "status": "must_fix", "text": "best universities to study masters" }, @@ -4066,8 +5066,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Education", - "model_output.classification.iab_content.tier2.label": "College Education", - "model_output.classification.iab_content.tier3.label": "Postgraduate Education" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4081,6 +5081,16 @@ "actual": "nearest_equivalent", "expected": "exact", "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "College Education", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Postgraduate Education", + "path": "model_output.classification.iab_content.tier3.label" } ], "notes": "Cross-vertical medium IAB mapping case for Education > College Education > Postgraduate Education.", @@ -4090,10 +5100,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": 
"nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Education", "model_output.classification.iab_content.tier2.label": "College Education", - "model_output.classification.iab_content.tier3.label": null + "model_output.classification.iab_content.tier3.label": "Postgraduate Education" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4102,20 +5112,9 @@ "model_output.classification.iab_content.tier3.label": "Postgraduate Education" }, "id": "postgraduate-education-hard", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Postgraduate Education", - "path": "model_output.classification.iab_content.tier3.label" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Education > College Education > Postgraduate Education.", - "pass": false, + "pass": true, "status": "must_fix", "text": "need postgraduate options for a master's degree, not short online courses" }, @@ -4165,34 +5164,23 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Healthy Living" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Medical Health" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Medical Health" }, "id": "medical-health-hard", - "mismatches": [ - { - "actual": "Healthy Living", - "expected": "Medical Health", - "path": "model_output.classification.iab_content.tier1.label" - }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": 
"Cross-vertical hard IAB mapping case for Medical Health.", - "pass": false, + "pass": true, "status": "must_fix", "text": "need medical advice about symptoms, not wellness or fitness tips" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Careers", "model_output.classification.iab_content.tier2.label": "Remote Working" }, @@ -4203,11 +5191,6 @@ }, "id": "careers-job-search-easy", "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, { "actual": "Remote Working", "expected": "Job Search", @@ -4221,9 +5204,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Business and Finance", - "model_output.classification.iab_content.tier2.label": "Business" + "model_output.classification.iab_content.tier2.label": "Industries" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4238,12 +5221,7 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Business", + "actual": "Industries", "expected": "Job Search", "path": "model_output.classification.iab_content.tier2.label" } @@ -4256,8 +5234,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Careers", - "model_output.classification.iab_content.tier2.label": "Job Search" + "model_output.classification.iab_content.tier1.label": "Genres", + "model_output.classification.iab_content.tier2.label": null }, "expected": 
{ "model_output.classification.iab_content.mapping_mode": "exact", @@ -4266,10 +5244,20 @@ }, "id": "careers-job-search-hard", "mismatches": [ + { + "actual": "Genres", + "expected": "Careers", + "path": "model_output.classification.iab_content.tier1.label" + }, { "actual": "nearest_equivalent", "expected": "exact", "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Job Search", + "path": "model_output.classification.iab_content.tier2.label" } ], "notes": "Cross-vertical hard IAB mapping case for Careers > Job Search.", @@ -4280,8 +5268,8 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Food & Drink", - "model_output.classification.iab_content.tier2.label": "Food Movements" + "model_output.classification.iab_content.tier1.label": "Personal Celebrations & Life Events", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4291,7 +5279,7 @@ "id": "personal-finance-easy", "mismatches": [ { - "actual": "Food & Drink", + "actual": "Personal Celebrations & Life Events", "expected": "Personal Finance", "path": "model_output.classification.iab_content.tier1.label" }, @@ -4301,7 +5289,7 @@ "path": "model_output.classification.iab_content.mapping_mode" }, { - "actual": "Food Movements", + "actual": null, "expected": "Financial Planning", "path": "model_output.classification.iab_content.tier2.label" } @@ -4313,9 +5301,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Business and Finance", - "model_output.classification.iab_content.tier2.label": "Economy" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Personal Finance", + 
"model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4325,17 +5313,7 @@ "id": "personal-finance-medium", "mismatches": [ { - "actual": "Business and Finance", - "expected": "Personal Finance", - "path": "model_output.classification.iab_content.tier1.label" - }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Economy", + "actual": null, "expected": "Financial Planning", "path": "model_output.classification.iab_content.tier2.label" } @@ -4347,9 +5325,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Personal Finance", - "model_output.classification.iab_content.tier2.label": "Retirement Planning" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4359,12 +5337,7 @@ "id": "personal-finance-hard", "mismatches": [ { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Retirement Planning", + "actual": null, "expected": "Financial Planning", "path": "model_output.classification.iab_content.tier2.label" } @@ -4376,7 +5349,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Family and Relationships", "model_output.classification.iab_content.tier2.label": "Parenting" }, @@ -4386,23 +5359,17 @@ "model_output.classification.iab_content.tier2.label": "Parenting" }, "id": "parenting-easy", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": 
"exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical easy IAB mapping case for Family and Relationships > Parenting.", - "pass": false, + "pass": true, "status": "must_fix", "text": "tips for parenting a toddler" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Family and Relationships", - "model_output.classification.iab_content.tier2.label": "Parenting" + "model_output.classification.iab_content.tier1.label": "Education", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4411,10 +5378,20 @@ }, "id": "parenting-medium", "mismatches": [ + { + "actual": "Education", + "expected": "Family and Relationships", + "path": "model_output.classification.iab_content.tier1.label" + }, { "actual": "nearest_equivalent", "expected": "exact", "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": "Parenting", + "path": "model_output.classification.iab_content.tier2.label" } ], "notes": "Cross-vertical medium IAB mapping case for Family and Relationships > Parenting.", @@ -4424,7 +5401,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Family and Relationships", "model_output.classification.iab_content.tier2.label": "Parenting" }, @@ -4434,23 +5411,17 @@ "model_output.classification.iab_content.tier2.label": "Parenting" }, "id": "parenting-hard", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Family and Relationships 
> Parenting.", - "pass": false, + "pass": true, "status": "must_fix", "text": "need parenting advice for a child starting preschool" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Home & Garden", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.tier2.label": "Gardening" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4458,28 +5429,17 @@ "model_output.classification.iab_content.tier2.label": "Gardening" }, "id": "gardening-easy", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Gardening", - "path": "model_output.classification.iab_content.tier2.label" - } - ], + "mismatches": [], "notes": "Cross-vertical easy IAB mapping case for Home & Garden > Gardening.", - "pass": false, + "pass": true, "status": "must_fix", "text": "best plants for a small balcony garden" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Home & Garden", - "model_output.classification.iab_content.tier2.label": "Gardening" + "model_output.classification.iab_content.tier1.label": "Food & Drink", + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4488,10 +5448,20 @@ }, "id": "gardening-medium", "mismatches": [ + { + "actual": "Food & Drink", + "expected": "Home & Garden", + "path": "model_output.classification.iab_content.tier1.label" + }, { "actual": "nearest_equivalent", "expected": "exact", "path": "model_output.classification.iab_content.mapping_mode" + }, + { + "actual": null, + "expected": 
"Gardening", + "path": "model_output.classification.iab_content.tier2.label" } ], "notes": "Cross-vertical medium IAB mapping case for Home & Garden > Gardening.", @@ -4501,9 +5471,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Home & Garden", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.tier2.label": "Gardening" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4511,26 +5481,15 @@ "model_output.classification.iab_content.tier2.label": "Gardening" }, "id": "gardening-hard", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": null, - "expected": "Gardening", - "path": "model_output.classification.iab_content.tier2.label" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Home & Garden > Gardening.", - "pass": false, + "pass": true, "status": "must_fix", "text": "need gardening advice for a shady backyard, not interior decor ideas" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Entertainment", "model_output.classification.iab_content.tier2.label": "Movies" }, @@ -4540,23 +5499,17 @@ "model_output.classification.iab_content.tier2.label": "Movies" }, "id": "movies-easy", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Cross-vertical easy IAB mapping case for Entertainment > Movies.", - "pass": false, + "pass": true, "status": "must_fix", "text": "What movie 
should we watch tonight?" }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Video Gaming", - "model_output.classification.iab_content.tier2.label": "Video Game Genres" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Genres", + "model_output.classification.iab_content.tier2.label": "Horror" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4566,17 +5519,12 @@ "id": "movies-medium", "mismatches": [ { - "actual": "Video Gaming", + "actual": "Genres", "expected": "Entertainment", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Video Game Genres", + "actual": "Horror", "expected": "Movies", "path": "model_output.classification.iab_content.tier2.label" } @@ -4588,9 +5536,9 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Entertainment", - "model_output.classification.iab_content.tier2.label": "Music" + "model_output.classification.iab_content.tier2.label": "Movies" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4598,20 +5546,9 @@ "model_output.classification.iab_content.tier2.label": "Movies" }, "id": "movies-hard", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - }, - { - "actual": "Music", - "expected": "Movies", - "path": "model_output.classification.iab_content.tier2.label" - } - ], + "mismatches": [], "notes": "Cross-vertical hard IAB mapping case for Entertainment > Movies.", - "pass": 
false, + "pass": true, "status": "must_fix", "text": "Looking for film recommendations, not TV shows or music" } @@ -4620,21 +5557,21 @@ "iab_quality_target_eval": { "by_status": { "must_fix": { - "failed": 12, - "passed": 0, + "failed": 9, + "passed": 3, "total": 12 } }, - "cases_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/examples/iab_mapping_cases.json", + "cases_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/examples/iab_mapping_cases.json", "count": 12, - "failed": 12, - "passed": 0, + "failed": 9, + "passed": 3, "results": [ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Automotive", - "model_output.classification.iab_content.tier2.label": "Auto Type" + "model_output.classification.iab_content.tier2.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4644,7 +5581,7 @@ "id": "car-buying-maps-to-automotive-buying", "mismatches": [ { - "actual": "Auto Type", + "actual": null, "expected": "Auto Buying and Selling", "path": "model_output.classification.iab_content.tier2.label" }, @@ -4661,7 +5598,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", "model_output.classification.iab_content.tier2.label": "Computing", "model_output.classification.iab_content.tier3.label": "Laptops" @@ -4673,15 +5610,9 @@ "model_output.classification.iab_content.tier3.label": "Laptops" }, "id": "laptop-buying-maps-to-laptops", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + 
"mismatches": [], "notes": "Laptop shopping should resolve into the laptops branch, not business sales.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Which laptop to buy in 2026" }, @@ -4689,8 +5620,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Laptops" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4700,6 +5631,16 @@ }, "id": "labtop-buying-maps-to-laptops", "mismatches": [ + { + "actual": null, + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, + "expected": "Laptops", + "path": "model_output.classification.iab_content.tier3.label" + }, { "actual": "nearest_equivalent", "expected": "exact", @@ -4713,10 +5654,10 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -4732,14 +5673,19 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Computing", + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Software 
and Applications", + "actual": null, "expected": "Sales", "path": "model_output.classification.iab_content.tier3.label" + }, + { + "actual": "exact", + "expected": "nearest_equivalent", + "path": "model_output.classification.iab_content.mapping_mode" } ], "notes": "CRM education should resolve to the closest business/sales path, not generic software.", @@ -4750,9 +5696,9 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet" + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4763,17 +5709,17 @@ "id": "crm-comparison-maps-to-sales", "mismatches": [ { - "actual": "Technology & Computing", + "actual": "Careers", "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Computing", + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Internet", + "actual": null, "expected": "Sales", "path": "model_output.classification.iab_content.tier3.label" }, @@ -4792,8 +5738,8 @@ "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet" + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": 
"exact", @@ -4809,12 +5755,12 @@ "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Computing", + "actual": null, "expected": "Business", "path": "model_output.classification.iab_content.tier2.label" }, { - "actual": "Internet", + "actual": null, "expected": "Marketing and Advertising", "path": "model_output.classification.iab_content.tier3.label" }, @@ -4832,7 +5778,7 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", + "model_output.classification.iab_content.tier1.label": "Careers", "model_output.classification.iab_content.tier2.label": null }, "expected": { @@ -4842,6 +5788,11 @@ }, "id": "ml-explanation-maps-to-ai", "mismatches": [ + { + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + }, { "actual": null, "expected": "Artificial Intelligence", @@ -4861,9 +5812,9 @@ { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Internet" + "model_output.classification.iab_content.tier1.label": "Personal Finance", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -4874,17 +5825,17 @@ "id": "support-credential-help-maps-to-business-it", "mismatches": [ { - "actual": "Technology & Computing", + "actual": "Personal Finance", "expected": "Business and Finance", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Computing", + "actual": null, "expected": "Business", "path": 
"model_output.classification.iab_content.tier2.label" }, { - "actual": "Internet", + "actual": null, "expected": "Business I.T.", "path": "model_output.classification.iab_content.tier3.label" } @@ -4896,7 +5847,7 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink", "model_output.classification.iab_content.tier2.label": "Dining Out" }, @@ -4906,24 +5857,18 @@ "model_output.classification.iab_content.tier2.label": "Dining Out" }, "id": "restaurant-booking-maps-to-dining-out", - "mismatches": [ - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Generic dining requests should not inherit the repo's business default.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Book a table for 2 tonight" }, { "actual": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Hobbies & Interests", - "model_output.classification.iab_content.tier2.label": "Content Production", - "model_output.classification.iab_content.tier3.label": "Freelance Writing" + "model_output.classification.iab_content.tier1.label": "Sensitive Topics", + "model_output.classification.iab_content.tier2.label": null, + "model_output.classification.iab_content.tier3.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", @@ -4934,17 +5879,17 @@ "id": "trial-signup-maps-to-software", "mismatches": [ { - "actual": "Hobbies & Interests", + "actual": "Sensitive Topics", "expected": "Technology & Computing", "path": "model_output.classification.iab_content.tier1.label" }, { - "actual": "Content Production", + "actual": null, "expected": "Computing", "path": 
"model_output.classification.iab_content.tier2.label" }, { - "actual": "Freelance Writing", + "actual": null, "expected": "Software and Applications", "path": "model_output.classification.iab_content.tier3.label" } @@ -4956,11 +5901,11 @@ }, { "actual": { - "model_output.classification.iab_content.mapping_mode": "nearest_equivalent", - "model_output.classification.iab_content.tier1.label": "Technology & Computing", - "model_output.classification.iab_content.tier2.label": "Computing", - "model_output.classification.iab_content.tier3.label": "Software and Applications", - "model_output.classification.iab_content.tier4.label": "Communication" + "model_output.classification.iab_content.mapping_mode": "exact", + "model_output.classification.iab_content.tier1.label": "Careers", + "model_output.classification.iab_content.tier2.label": "Remote Working", + "model_output.classification.iab_content.tier3.label": null, + "model_output.classification.iab_content.tier4.label": null }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4972,14 +5917,24 @@ "id": "communication-software-maps-to-tier4", "mismatches": [ { - "actual": "Software and Applications", + "actual": "Careers", + "expected": "Technology & Computing", + "path": "model_output.classification.iab_content.tier1.label" + }, + { + "actual": "Remote Working", + "expected": "Computing", + "path": "model_output.classification.iab_content.tier2.label" + }, + { + "actual": null, "expected": "Computer Software and Applications", "path": "model_output.classification.iab_content.tier3.label" }, { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" + "actual": null, + "expected": "Communication", + "path": "model_output.classification.iab_content.tier4.label" } ], "notes": "Full taxonomy support should preserve the tier4 communication branch.", @@ -4989,9 +5944,9 @@ }, { "actual": { - 
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent", + "model_output.classification.iab_content.mapping_mode": "exact", "model_output.classification.iab_content.tier1.label": "Food & Drink", - "model_output.classification.iab_content.tier2.label": null + "model_output.classification.iab_content.tier2.label": "Alcoholic Beverages" }, "expected": { "model_output.classification.iab_content.mapping_mode": "exact", @@ -4999,20 +5954,9 @@ "model_output.classification.iab_content.tier2.label": "Alcoholic Beverages" }, "id": "vodka-query-maps-to-alcoholic-beverages", - "mismatches": [ - { - "actual": null, - "expected": "Alcoholic Beverages", - "path": "model_output.classification.iab_content.tier2.label" - }, - { - "actual": "nearest_equivalent", - "expected": "exact", - "path": "model_output.classification.iab_content.mapping_mode" - } - ], + "mismatches": [], "notes": "Food and beverage prompts should not fall through to the business default.", - "pass": false, + "pass": true, "status": "must_fix", "text": "what is best vodka drink should i try" } @@ -5026,20 +5970,20 @@ "total": 2 }, "must_fix": { - "failed": 4, - "passed": 11, + "failed": 3, + "passed": 12, "total": 15 } }, - "cases_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/examples/known_failure_cases.json", + "cases_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/examples/known_failure_cases.json", "count": 17, - "failed": 6, - "passed": 11, + "failed": 5, + "passed": 12, "results": [ { "actual": { "model_output.classification.iab_content.tier1.label": "Automotive", - "model_output.classification.iab_content.tier2.label": "Auto Type", + "model_output.classification.iab_content.tier2.label": null, "model_output.classification.intent.type": "commercial", "system_decision.policy.monetization_eligibility": "allowed" }, @@ 
-5052,7 +5996,7 @@ "id": "auto-buying-query-allowed", "mismatches": [ { - "actual": "Auto Type", + "actual": null, "expected": "Auto Buying and Selling", "path": "model_output.classification.iab_content.tier2.label" } @@ -5256,7 +6200,7 @@ "model_output.classification.intent.decision_phase": "consideration", "model_output.classification.intent.subtype": "deal_seeking", "system_decision.opportunity.type": "soft_recommendation", - "system_decision.policy.monetization_eligibility": "restricted" + "system_decision.policy.monetization_eligibility": "allowed_with_caution" }, "expected": { "model_output.classification.intent.decision_phase": "awareness", @@ -5276,6 +6220,11 @@ "expected": "awareness", "path": "model_output.classification.intent.decision_phase" }, + { + "actual": "allowed_with_caution", + "expected": "restricted", + "path": "system_decision.policy.monetization_eligibility" + }, { "actual": "soft_recommendation", "expected": "none", @@ -5327,7 +6276,7 @@ }, { "actual": { - "model_output.classification.intent.decision_phase": "decision", + "model_output.classification.intent.decision_phase": "consideration", "model_output.classification.intent.subtype": "evaluation" }, "expected": { @@ -5335,15 +6284,9 @@ "model_output.classification.intent.subtype": "evaluation" }, "id": "evaluation-subtype-fit-check", - "mismatches": [ - { - "actual": "decision", - "expected": "consideration", - "path": "model_output.classification.intent.decision_phase" - } - ], + "mismatches": [], "notes": "Single-vendor fit checks should map to evaluation rather than broad discovery.", - "pass": false, + "pass": true, "status": "must_fix", "text": "Would ClickUp be a good fit for a remote ops team?" 
}, @@ -5425,12 +6368,12 @@ "heads": { "decision_phase": { "difficulty_benchmark": { - "accepted_accuracy": 0.9524, + "accepted_accuracy": 0.981, "accepted_coverage": 1.0, - "accuracy": 0.9524, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_difficulty_benchmark_confusion_matrix.csv", + "accuracy": 0.981, + "confusion_matrix_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_difficulty_benchmark_confusion_matrix.csv", "count": 105, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/decision_phase_benchmark.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/decision_phase_benchmark.jsonl", "difficulty_breakdown": { "easy": { "accepted_accuracy": 0.9714, @@ -5441,12 +6384,12 @@ "macro_f1": 0.9711 }, "hard": { - "accepted_accuracy": 0.8857, + "accepted_accuracy": 0.9714, "accepted_coverage": 1.0, - "accuracy": 0.8857, + "accuracy": 0.9714, "count": 35, "fallback_rate": 0.0, - "macro_f1": 0.883 + "macro_f1": 0.9711 }, "medium": { "accepted_accuracy": 1.0, @@ -5459,19 +6402,19 @@ }, "fallback_rate": 0.0, "head": "decision_phase", - "macro_f1": 0.9526, + "macro_f1": 0.9812, "per_class_metrics": { - "accuracy": 0.9523809523809523, + "accuracy": 0.9809523809523809, "action": { - "f1-score": 0.9285714285714286, + "f1-score": 0.9655172413793104, "precision": 1.0, - "recall": 0.8666666666666667, + "recall": 0.9333333333333333, "support": 15.0 }, "awareness": { - "f1-score": 0.9655172413793104, + "f1-score": 1.0, "precision": 1.0, - "recall": 0.9333333333333333, + "recall": 1.0, "support": 15.0 }, "consideration": { 
@@ -5481,27 +6424,27 @@ "support": 15.0 }, "decision": { - "f1-score": 0.967741935483871, - "precision": 0.9375, + "f1-score": 1.0, + "precision": 1.0, "recall": 1.0, "support": 15.0 }, "macro avg": { - "f1-score": 0.9525819504665047, - "precision": 0.9564075630252101, - "recall": 0.9523809523809523, + "f1-score": 0.9812192118226601, + "precision": 0.9831932773109244, + "recall": 0.980952380952381, "support": 105.0 }, "post_purchase": { - "f1-score": 0.9375, - "precision": 0.8823529411764706, + "f1-score": 1.0, + "precision": 1.0, "recall": 1.0, "support": 15.0 }, "research": { - "f1-score": 0.9032258064516129, - "precision": 0.875, - "recall": 0.9333333333333333, + "f1-score": 0.9375, + "precision": 0.8823529411764706, + "recall": 1.0, "support": 15.0 }, "support": { @@ -5511,9 +6454,9 @@ "support": 15.0 }, "weighted avg": { - "f1-score": 0.9525819504665048, - "precision": 0.9564075630252101, - "recall": 0.9523809523809523, + "f1-score": 0.9812192118226601, + "precision": 0.9831932773109243, + "recall": 0.9809523809523809, "support": 105.0 } }, @@ -5523,9 +6466,9 @@ "accepted_accuracy": 0.963, "accepted_coverage": 1.0, "accuracy": 0.963, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_final_wave_cases_confusion_matrix.csv", + "confusion_matrix_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_final_wave_cases_confusion_matrix.csv", "count": 27, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/decision_phase/final_wave_cases.jsonl", + "dataset_path": 
"/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/decision_phase/final_wave_cases.jsonl", "fallback_rate": 0.0, "head": "decision_phase", "macro_f1": 0.961, @@ -5589,17 +6532,17 @@ "suite": "final_wave_cases" }, "hard_cases": { - "accepted_accuracy": 0.9231, + "accepted_accuracy": 0.8974, "accepted_coverage": 1.0, - "accuracy": 0.9231, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_hard_cases_confusion_matrix.csv", + "accuracy": 0.8974, + "confusion_matrix_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_hard_cases_confusion_matrix.csv", "count": 39, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/decision_phase/hard_cases.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/decision_phase/hard_cases.jsonl", "fallback_rate": 0.0, "head": "decision_phase", - "macro_f1": 0.9249, + "macro_f1": 0.9008, "per_class_metrics": { - "accuracy": 0.9230769230769231, + "accuracy": 0.8974358974358975, "action": { "f1-score": 0.0, "precision": 0.0, @@ -5613,9 +6556,9 @@ "support": 6.0 }, "consideration": { - "f1-score": 0.9230769230769231, + "f1-score": 0.8333333333333334, "precision": 1.0, - "recall": 0.8571428571428571, + "recall": 0.7142857142857143, "support": 7.0 }, "decision": { @@ -5625,9 +6568,9 @@ "support": 7.0 }, "macro avg": { - "f1-score": 0.792778649921507, - "precision": 0.7976190476190477, - "recall": 0.7959183673469388, + "f1-score": 0.772108843537415, + "precision": 
0.7806122448979592, + "recall": 0.7755102040816327, "support": 39.0 }, "post_purchase": { @@ -5637,8 +6580,8 @@ "support": 6.0 }, "research": { - "f1-score": 0.7692307692307693, - "precision": 0.8333333333333334, + "f1-score": 0.7142857142857143, + "precision": 0.7142857142857143, "recall": 0.7142857142857143, "support": 7.0 }, @@ -5649,26 +6592,26 @@ "support": 6.0 }, "weighted avg": { - "f1-score": 0.9227951535643845, - "precision": 0.9316239316239316, - "recall": 0.9230769230769231, + "f1-score": 0.8968253968253967, + "precision": 0.9102564102564102, + "recall": 0.8974358974358975, "support": 39.0 } }, "suite": "hard_cases" }, "test": { - "accepted_accuracy": 0.8621, + "accepted_accuracy": 0.7931, "accepted_coverage": 1.0, - "accuracy": 0.8621, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_test_confusion_matrix.csv", + "accuracy": 0.7931, + "confusion_matrix_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_test_confusion_matrix.csv", "count": 29, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/decision_phase/test.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/decision_phase/test.jsonl", "fallback_rate": 0.0, "head": "decision_phase", - "macro_f1": 0.8651, + "macro_f1": 0.801, "per_class_metrics": { - "accuracy": 0.8620689655172413, + "accuracy": 0.7931034482758621, "action": { "f1-score": 1.0, "precision": 1.0, @@ -5682,9 +6625,9 @@ "support": 3.0 }, "consideration": { - "f1-score": 0.8888888888888888, + "f1-score": 0.75, "precision": 1.0, - "recall": 0.8, + "recall": 0.6, "support": 5.0 }, 
"decision": { @@ -5694,50 +6637,50 @@ "support": 5.0 }, "macro avg": { - "f1-score": 0.865079365079365, - "precision": 0.8809523809523808, - "recall": 0.8857142857142858, + "f1-score": 0.8010204081632653, + "precision": 0.8285714285714285, + "recall": 0.8214285714285714, "support": 29.0 }, "post_purchase": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.8888888888888888, + "precision": 0.8, "recall": 1.0, "support": 4.0 }, "research": { - "f1-score": 0.5, - "precision": 0.6666666666666666, + "f1-score": 0.4444444444444444, + "precision": 0.5, "recall": 0.4, "support": 5.0 }, "support": { - "f1-score": 1.0, + "f1-score": 0.8571428571428571, "precision": 1.0, - "recall": 1.0, + "recall": 0.75, "support": 4.0 }, "weighted avg": { - "f1-score": 0.8601532567049808, - "precision": 0.8908045977011494, - "recall": 0.8620689655172413, + "f1-score": 0.7915982484948002, + "precision": 0.8344827586206897, + "recall": 0.7931034482758621, "support": 29.0 } }, "suite": "test" }, "train": { - "accepted_accuracy": 0.9902, + "accepted_accuracy": 0.9608, "accepted_coverage": 1.0, - "accuracy": 0.9902, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_train_confusion_matrix.csv", + "accuracy": 0.9608, + "confusion_matrix_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_train_confusion_matrix.csv", "count": 102, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/decision_phase/train.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/decision_phase/train.jsonl", "fallback_rate": 0.0, "head": "decision_phase", - 
"macro_f1": 0.9907, + "macro_f1": 0.9638, "per_class_metrics": { - "accuracy": 0.9901960784313726, + "accuracy": 0.9607843137254902, "action": { "f1-score": 1.0, "precision": 1.0, @@ -5745,27 +6688,27 @@ "support": 10.0 }, "awareness": { - "f1-score": 0.9696969696969697, - "precision": 0.9411764705882353, + "f1-score": 0.9411764705882353, + "precision": 0.8888888888888888, "recall": 1.0, "support": 16.0 }, "consideration": { - "f1-score": 1.0, - "precision": 1.0, - "recall": 1.0, + "f1-score": 0.9411764705882353, + "precision": 0.9411764705882353, + "recall": 0.9411764705882353, "support": 17.0 }, "decision": { - "f1-score": 1.0, + "f1-score": 0.967741935483871, "precision": 1.0, - "recall": 1.0, + "recall": 0.9375, "support": 16.0 }, "macro avg": { - "f1-score": 0.9907448872966115, - "precision": 0.9915966386554622, - "recall": 0.9904761904761905, + "f1-score": 0.9638066572568961, + "precision": 0.9655195411497932, + "recall": 0.9636204481792717, "support": 102.0 }, "post_purchase": { @@ -5775,9 +6718,9 @@ "support": 14.0 }, "research": { - "f1-score": 0.9655172413793104, - "precision": 1.0, - "recall": 0.9333333333333333, + "f1-score": 0.896551724137931, + "precision": 0.9285714285714286, + "recall": 0.8666666666666667, "support": 15.0 }, "support": { @@ -5787,26 +6730,26 @@ "support": 14.0 }, "weighted avg": { - "f1-score": 0.9901755895670704, - "precision": 0.9907727797001153, - "recall": 0.9901960784313726, + "f1-score": 0.9606957878355163, + "precision": 0.9622626828509181, + "recall": 0.9607843137254902, "support": 102.0 } }, "suite": "train" }, "val": { - "accepted_accuracy": 0.8966, + "accepted_accuracy": 0.8621, "accepted_coverage": 1.0, - "accuracy": 0.8966, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_val_confusion_matrix.csv", + "accuracy": 0.8621, + "confusion_matrix_path": 
"/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/decision_phase_val_confusion_matrix.csv", "count": 29, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/decision_phase/val.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/decision_phase/val.jsonl", "fallback_rate": 0.0, "head": "decision_phase", - "macro_f1": 0.8975, + "macro_f1": 0.8618, "per_class_metrics": { - "accuracy": 0.896551724137931, + "accuracy": 0.8620689655172413, "action": { "f1-score": 1.0, "precision": 1.0, @@ -5832,14 +6775,14 @@ "support": 4.0 }, "macro avg": { - "f1-score": 0.8974953617810761, - "precision": 0.9166666666666667, - "recall": 0.8928571428571429, + "f1-score": 0.8617810760667904, + "precision": 0.880952380952381, + "recall": 0.8571428571428571, "support": 29.0 }, "post_purchase": { - "f1-score": 0.8571428571428571, - "precision": 1.0, + "f1-score": 0.75, + "precision": 0.75, "recall": 0.75, "support": 4.0 }, @@ -5850,15 +6793,15 @@ "support": 4.0 }, "support": { - "f1-score": 1.0, + "f1-score": 0.8571428571428571, "precision": 1.0, - "recall": 1.0, + "recall": 0.75, "support": 4.0 }, "weighted avg": { - "f1-score": 0.8947604120017911, - "precision": 0.9080459770114944, - "recall": 0.896551724137931, + "f1-score": 0.8602776533811015, + "precision": 0.8735632183908046, + "recall": 0.8620689655172413, "support": 29.0 } }, @@ -5867,349 +6810,379 @@ }, "iab_content": { "cross_vertical_benchmark": { - "accepted_accuracy": 0.3444, - "accepted_coverage": 1.0, - "accuracy": 0.3444, + "accepted_accuracy": 0.4103, + "accepted_coverage": 0.8667, + "accuracy": 0.3667, "count": 90, - "dataset_path": 
"/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/iab_cross_vertical_benchmark.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/iab_cross_vertical_benchmark.jsonl", "difficulty_breakdown": { "easy": { - "accepted_accuracy": 0.2667, - "accepted_coverage": 1.0, - "accuracy": 0.2667, + "accepted_accuracy": 0.3846, + "accepted_coverage": 0.8667, + "accuracy": 0.3667, "count": 30, - "fallback_rate": 0.0, - "macro_f1": 0.1633 + "fallback_rate": 0.1333, + "macro_f1": 0.2619 }, "hard": { - "accepted_accuracy": 0.3667, - "accepted_coverage": 1.0, - "accuracy": 0.3667, + "accepted_accuracy": 0.5385, + "accepted_coverage": 0.8667, + "accuracy": 0.4667, "count": 30, - "fallback_rate": 0.0, - "macro_f1": 0.2174 + "fallback_rate": 0.1333, + "macro_f1": 0.3182 }, "medium": { - "accepted_accuracy": 0.4, - "accepted_coverage": 1.0, - "accuracy": 0.4, + "accepted_accuracy": 0.3077, + "accepted_coverage": 0.8667, + "accuracy": 0.2667, "count": 30, - "fallback_rate": 0.0, - "macro_f1": 0.2667 + "fallback_rate": 0.1333, + "macro_f1": 0.1633 } }, - "fallback_rate": 0.0, + "fallback_rate": 0.1333, "head": "iab_content", - "macro_f1": 0.1808, - "primary_source": "embedding_retrieval", + "macro_f1": 0.2081, + "primary_source": "supervised_classifier", "suite": "cross_vertical_benchmark", "tier_metrics": { - "average_prediction_depth": 2.5333, + "average_prediction_depth": 1.9889, "error_buckets": { - "exact_match": 31, - "parent_safe_stop": 5, - "right_tier1_wrong_tier2": 19, - "wrong_deep_leaf": 13, - "wrong_tier1": 22 - }, - "exact_path_accuracy": 0.3444, - "parent_safe_accuracy": 0.4889, - "tier1_accuracy": 0.7556, - "tier2_accuracy": 0.5238, - "tier3_accuracy": 0.4762, - "tier4_accuracy": 1.0 + "exact_match": 33, + "parent_safe_stop": 3, + "right_tier1_wrong_tier2": 20, + "wrong_deep_leaf": 6, + 
"wrong_tier1": 28 + }, + "exact_path_accuracy": 0.3667, + "parent_safe_accuracy": 0.5333, + "tier1_accuracy": 0.6889, + "tier2_accuracy": 0.4286, + "tier3_accuracy": 0.381, + "tier4_accuracy": 0.3333 }, "view_metrics": { + "classifier": { + "average_prediction_depth": 1.9889, + "error_buckets": { + "exact_match": 33, + "parent_safe_stop": 3, + "right_tier1_wrong_tier2": 20, + "wrong_deep_leaf": 6, + "wrong_tier1": 28 + }, + "exact_path_accuracy": 0.3667, + "parent_safe_accuracy": 0.5333, + "tier1_accuracy": 0.6889, + "tier2_accuracy": 0.4286, + "tier3_accuracy": 0.381, + "tier4_accuracy": 0.3333 + }, "combined_path": { - "average_prediction_depth": 2.5333, + "average_prediction_depth": 1.9889, "error_buckets": { - "exact_match": 27, - "parent_safe_stop": 5, - "right_tier1_wrong_tier2": 19, - "wrong_deep_leaf": 17, - "wrong_tier1": 22 - }, - "exact_path_accuracy": 0.3, - "fallback_overuse_count": 12, - "fallback_rate": 0.1333, - "parent_safe_accuracy": 0.4444, - "tier1_accuracy": 0.7556, - "tier2_accuracy": 0.5238, + "exact_match": 33, + "parent_safe_stop": 3, + "right_tier1_wrong_tier2": 20, + "wrong_deep_leaf": 6, + "wrong_tier1": 28 + }, + "exact_path_accuracy": 0.3667, + "fallback_overuse_count": 18, + "fallback_rate": 0.2, + "parent_safe_accuracy": 0.5333, + "tier1_accuracy": 0.6889, + "tier2_accuracy": 0.4286, "tier3_accuracy": 0.381, - "tier4_accuracy": 0.5 + "tier4_accuracy": 0.3333 }, "disagreements": { - "retrieval_vs_combined": 0 + "classifier_vs_combined": 0 }, - "embedding_retrieval": { - "average_prediction_depth": 2.5333, - "error_buckets": { - "exact_match": 27, - "parent_safe_stop": 5, - "right_tier1_wrong_tier2": 19, - "wrong_deep_leaf": 17, - "wrong_tier1": 22 - }, - "exact_path_accuracy": 0.3, - "parent_safe_accuracy": 0.4444, - "tier1_accuracy": 0.7556, - "tier2_accuracy": 0.5238, - "tier3_accuracy": 0.381, - "tier4_accuracy": 0.5 + "shadow_embedding_retrieval": { + "hint": "Set IAB_EVAL_INCLUDE_SHADOW_RETRIEVAL=1 to run shadow embedding 
retrieval (downloads/loads gte-Qwen2 when index is present).", + "reason": "disabled_by_default", + "skipped": true } } }, "difficulty_benchmark": { - "accepted_accuracy": 0.3782, - "accepted_coverage": 1.0, - "accuracy": 0.3782, + "accepted_accuracy": 0.4959, + "accepted_coverage": 0.7885, + "accuracy": 0.391, "count": 156, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/iab_benchmark.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/iab_benchmark.jsonl", "difficulty_breakdown": { "easy": { - "accepted_accuracy": 0.4038, - "accepted_coverage": 1.0, - "accuracy": 0.4038, + "accepted_accuracy": 0.5778, + "accepted_coverage": 0.8654, + "accuracy": 0.5, "count": 52, - "fallback_rate": 0.0, - "macro_f1": 0.2171 + "fallback_rate": 0.1346, + "macro_f1": 0.3025 }, "hard": { - "accepted_accuracy": 0.3077, - "accepted_coverage": 1.0, - "accuracy": 0.3077, + "accepted_accuracy": 0.35, + "accepted_coverage": 0.7692, + "accuracy": 0.2692, "count": 52, - "fallback_rate": 0.0, - "macro_f1": 0.1626 + "fallback_rate": 0.2308, + "macro_f1": 0.1505 }, "medium": { - "accepted_accuracy": 0.4231, - "accepted_coverage": 1.0, - "accuracy": 0.4231, + "accepted_accuracy": 0.5526, + "accepted_coverage": 0.7308, + "accuracy": 0.4038, "count": 52, - "fallback_rate": 0.0, - "macro_f1": 0.2265 + "fallback_rate": 0.2692, + "macro_f1": 0.2184 } }, - "fallback_rate": 0.0, + "fallback_rate": 0.2115, "head": "iab_content", - "macro_f1": 0.1593, - "primary_source": "embedding_retrieval", + "macro_f1": 0.1715, + "primary_source": "supervised_classifier", "suite": "difficulty_benchmark", "tier_metrics": { - "average_prediction_depth": 2.5833, + "average_prediction_depth": 1.9936, "error_buckets": { - "exact_match": 59, - "parent_safe_stop": 17, - "right_tier1_wrong_tier2": 42, - "wrong_deep_leaf": 13, 
- "wrong_tier1": 25 - }, - "exact_path_accuracy": 0.3782, - "parent_safe_accuracy": 0.6154, - "tier1_accuracy": 0.8397, - "tier2_accuracy": 0.5705, - "tier3_accuracy": 0.5648, - "tier4_accuracy": 0.5833 + "exact_match": 61, + "parent_safe_stop": 4, + "right_tier1_wrong_tier2": 41, + "wrong_tier1": 50 + }, + "exact_path_accuracy": 0.391, + "parent_safe_accuracy": 0.6218, + "tier1_accuracy": 0.6795, + "tier2_accuracy": 0.4167, + "tier3_accuracy": 0.4259, + "tier4_accuracy": 0.4167 }, "view_metrics": { + "classifier": { + "average_prediction_depth": 1.9936, + "error_buckets": { + "exact_match": 56, + "parent_safe_stop": 4, + "right_tier1_wrong_tier2": 41, + "wrong_deep_leaf": 5, + "wrong_tier1": 50 + }, + "exact_path_accuracy": 0.359, + "parent_safe_accuracy": 0.5897, + "tier1_accuracy": 0.6795, + "tier2_accuracy": 0.4167, + "tier3_accuracy": 0.3796, + "tier4_accuracy": 0.25 + }, "combined_path": { - "average_prediction_depth": 2.5833, + "average_prediction_depth": 1.9936, "error_buckets": { - "exact_match": 48, - "parent_safe_stop": 17, - "right_tier1_wrong_tier2": 42, - "wrong_deep_leaf": 24, - "wrong_tier1": 25 + "exact_match": 56, + "parent_safe_stop": 4, + "right_tier1_wrong_tier2": 41, + "wrong_deep_leaf": 5, + "wrong_tier1": 50 }, - "exact_path_accuracy": 0.3077, + "exact_path_accuracy": 0.359, "fallback_overuse_count": 11, "fallback_rate": 0.0705, - "parent_safe_accuracy": 0.5449, - "tier1_accuracy": 0.8397, - "tier2_accuracy": 0.5705, - "tier3_accuracy": 0.4352, + "parent_safe_accuracy": 0.5897, + "tier1_accuracy": 0.6795, + "tier2_accuracy": 0.4167, + "tier3_accuracy": 0.3796, "tier4_accuracy": 0.25 }, "disagreements": { - "retrieval_vs_combined": 0 + "classifier_vs_combined": 0 }, - "embedding_retrieval": { - "average_prediction_depth": 2.5833, - "error_buckets": { - "exact_match": 48, - "parent_safe_stop": 17, - "right_tier1_wrong_tier2": 42, - "wrong_deep_leaf": 24, - "wrong_tier1": 25 - }, - "exact_path_accuracy": 0.3077, - "parent_safe_accuracy": 
0.5449, - "tier1_accuracy": 0.8397, - "tier2_accuracy": 0.5705, - "tier3_accuracy": 0.4352, - "tier4_accuracy": 0.25 + "shadow_embedding_retrieval": { + "hint": "Set IAB_EVAL_INCLUDE_SHADOW_RETRIEVAL=1 to run shadow embedding retrieval (downloads/loads gte-Qwen2 when index is present).", + "reason": "disabled_by_default", + "skipped": true } } }, "extended_cases": { - "accepted_accuracy": 0.25, - "accepted_coverage": 1.0, - "accuracy": 0.25, + "accepted_accuracy": 0.6, + "accepted_coverage": 0.625, + "accuracy": 0.5, "count": 8, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/iab/extended_cases.jsonl", - "fallback_rate": 0.0, + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/iab/extended_cases.jsonl", + "fallback_rate": 0.375, "head": "iab_content", - "macro_f1": 0.1429, - "primary_source": "embedding_retrieval", + "macro_f1": 0.3333, + "primary_source": "supervised_classifier", "suite": "extended_cases", "tier_metrics": { - "average_prediction_depth": 2.375, + "average_prediction_depth": 1.75, "error_buckets": { - "exact_match": 2, - "right_tier1_wrong_tier2": 3, - "wrong_deep_leaf": 2, + "exact_match": 4, + "right_tier1_wrong_tier2": 2, + "wrong_deep_leaf": 1, "wrong_tier1": 1 }, - "exact_path_accuracy": 0.25, - "parent_safe_accuracy": 0.375, + "exact_path_accuracy": 0.5, + "parent_safe_accuracy": 0.625, "tier1_accuracy": 0.875, - "tier2_accuracy": 0.4286, - "tier3_accuracy": 1.0, + "tier2_accuracy": 0.5714, + "tier3_accuracy": 0.0, "tier4_accuracy": 0.0 }, "view_metrics": { - "combined_path": { - "average_prediction_depth": 2.375, + "classifier": { + "average_prediction_depth": 1.75, "error_buckets": { - "exact_match": 2, - "right_tier1_wrong_tier2": 3, - "wrong_deep_leaf": 2, + "exact_match": 4, + "right_tier1_wrong_tier2": 2, + "wrong_deep_leaf": 1, "wrong_tier1": 1 
}, - "exact_path_accuracy": 0.25, - "fallback_overuse_count": 1, - "fallback_rate": 0.125, - "parent_safe_accuracy": 0.375, + "exact_path_accuracy": 0.5, + "parent_safe_accuracy": 0.625, "tier1_accuracy": 0.875, - "tier2_accuracy": 0.4286, + "tier2_accuracy": 0.5714, "tier3_accuracy": 0.0, "tier4_accuracy": 0.0 }, - "disagreements": { - "retrieval_vs_combined": 0 - }, - "embedding_retrieval": { - "average_prediction_depth": 2.375, + "combined_path": { + "average_prediction_depth": 1.75, "error_buckets": { - "exact_match": 2, - "right_tier1_wrong_tier2": 3, - "wrong_deep_leaf": 2, + "exact_match": 4, + "right_tier1_wrong_tier2": 2, + "wrong_deep_leaf": 1, "wrong_tier1": 1 }, - "exact_path_accuracy": 0.25, - "parent_safe_accuracy": 0.375, + "exact_path_accuracy": 0.5, + "fallback_overuse_count": 2, + "fallback_rate": 0.25, + "parent_safe_accuracy": 0.625, "tier1_accuracy": 0.875, - "tier2_accuracy": 0.4286, + "tier2_accuracy": 0.5714, "tier3_accuracy": 0.0, "tier4_accuracy": 0.0 + }, + "disagreements": { + "classifier_vs_combined": 0 + }, + "shadow_embedding_retrieval": { + "hint": "Set IAB_EVAL_INCLUDE_SHADOW_RETRIEVAL=1 to run shadow embedding retrieval (downloads/loads gte-Qwen2 when index is present).", + "reason": "disabled_by_default", + "skipped": true } } }, "hard_cases": { - "accepted_accuracy": 0.25, - "accepted_coverage": 1.0, - "accuracy": 0.25, + "accepted_accuracy": 0.5, + "accepted_coverage": 0.75, + "accuracy": 0.375, "count": 8, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/iab/hard_cases.jsonl", - "fallback_rate": 0.0, + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/iab/hard_cases.jsonl", + "fallback_rate": 0.25, "head": "iab_content", - "macro_f1": 0.1429, - "primary_source": "embedding_retrieval", + "macro_f1": 0.2308, + "primary_source": 
"supervised_classifier", "suite": "hard_cases", "tier_metrics": { - "average_prediction_depth": 2.375, + "average_prediction_depth": 1.75, "error_buckets": { - "exact_match": 2, - "parent_safe_stop": 1, - "right_tier1_wrong_tier2": 2, - "wrong_tier1": 3 + "exact_match": 3, + "right_tier1_wrong_tier2": 1, + "wrong_tier1": 4 }, - "exact_path_accuracy": 0.25, + "exact_path_accuracy": 0.375, "parent_safe_accuracy": 0.5, - "tier1_accuracy": 0.625, + "tier1_accuracy": 0.5, "tier2_accuracy": 0.375, - "tier3_accuracy": 0.2, - "tier4_accuracy": 1.0 + "tier3_accuracy": 0.4, + "tier4_accuracy": 0.0 }, "view_metrics": { + "classifier": { + "average_prediction_depth": 1.75, + "error_buckets": { + "exact_match": 3, + "right_tier1_wrong_tier2": 1, + "wrong_tier1": 4 + }, + "exact_path_accuracy": 0.375, + "parent_safe_accuracy": 0.5, + "tier1_accuracy": 0.5, + "tier2_accuracy": 0.375, + "tier3_accuracy": 0.4, + "tier4_accuracy": 0.0 + }, "combined_path": { - "average_prediction_depth": 2.375, + "average_prediction_depth": 1.75, "error_buckets": { - "exact_match": 1, - "parent_safe_stop": 1, - "right_tier1_wrong_tier2": 2, - "wrong_deep_leaf": 1, - "wrong_tier1": 3 + "exact_match": 3, + "right_tier1_wrong_tier2": 1, + "wrong_tier1": 4 }, - "exact_path_accuracy": 0.125, + "exact_path_accuracy": 0.375, "fallback_overuse_count": 1, "fallback_rate": 0.125, - "parent_safe_accuracy": 0.375, - "tier1_accuracy": 0.625, + "parent_safe_accuracy": 0.5, + "tier1_accuracy": 0.5, "tier2_accuracy": 0.375, - "tier3_accuracy": 0.0, + "tier3_accuracy": 0.4, "tier4_accuracy": 0.0 }, "disagreements": { - "retrieval_vs_combined": 0 + "classifier_vs_combined": 0 }, - "embedding_retrieval": { - "average_prediction_depth": 2.375, - "error_buckets": { - "exact_match": 1, - "parent_safe_stop": 1, - "right_tier1_wrong_tier2": 2, - "wrong_deep_leaf": 1, - "wrong_tier1": 3 - }, - "exact_path_accuracy": 0.125, - "parent_safe_accuracy": 0.375, - "tier1_accuracy": 0.625, - "tier2_accuracy": 0.375, - 
"tier3_accuracy": 0.0, - "tier4_accuracy": 0.0 + "shadow_embedding_retrieval": { + "hint": "Set IAB_EVAL_INCLUDE_SHADOW_RETRIEVAL=1 to run shadow embedding retrieval (downloads/loads gte-Qwen2 when index is present).", + "reason": "disabled_by_default", + "skipped": true } } }, "test": { - "accepted_accuracy": 0.6527, - "accepted_coverage": 1.0, - "accuracy": 0.6527, + "accepted_accuracy": 0.916, + "accepted_coverage": 0.9973, + "accuracy": 0.915, "count": 3282, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/iab/test.jsonl", - "fallback_rate": 0.0, + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/iab/test.jsonl", + "fallback_rate": 0.0027, "head": "iab_content", - "macro_f1": 0.6922, - "primary_source": "embedding_retrieval", + "macro_f1": 0.8686, + "primary_source": "supervised_classifier", "suite": "test", "tier_metrics": { - "average_prediction_depth": 2.1889, + "average_prediction_depth": 2.1804, "error_buckets": { - "exact_match": 2142, - "parent_safe_stop": 115, - "right_tier1_wrong_tier2": 674, - "wrong_deep_leaf": 236, - "wrong_tier1": 115 - }, - "exact_path_accuracy": 0.6527, - "parent_safe_accuracy": 0.7721, - "tier1_accuracy": 0.965, - "tier2_accuracy": 0.7587, - "tier3_accuracy": 0.8041, - "tier4_accuracy": 0.7929 + "exact_match": 3003, + "parent_safe_stop": 65, + "right_tier1_wrong_tier2": 73, + "wrong_deep_leaf": 90, + "wrong_tier1": 51 + }, + "exact_path_accuracy": 0.915, + "parent_safe_accuracy": 0.9442, + "tier1_accuracy": 0.9845, + "tier2_accuracy": 0.9606, + "tier3_accuracy": 0.8528, + "tier4_accuracy": 0.5286 }, "view_metrics": { + "classifier": { + "average_prediction_depth": 2.1804, + "error_buckets": { + "exact_match": 2965, + "parent_safe_stop": 63, + "right_tier1_wrong_tier2": 85, + "wrong_deep_leaf": 118, + "wrong_tier1": 51 + }, + 
"exact_path_accuracy": 0.9034, + "parent_safe_accuracy": 0.9321, + "tier1_accuracy": 0.9845, + "tier2_accuracy": 0.9565, + "tier3_accuracy": 0.8218, + "tier4_accuracy": 0.3429 + }, "combined_path": { "count": 3282, "max_combined_rows": 500, @@ -6222,52 +7195,57 @@ "reason": "dataset_too_large_for_combined_view", "skipped": true }, - "embedding_retrieval": { - "average_prediction_depth": 2.1889, - "error_buckets": { - "exact_match": 2107, - "parent_safe_stop": 109, - "right_tier1_wrong_tier2": 680, - "wrong_deep_leaf": 271, - "wrong_tier1": 115 - }, - "exact_path_accuracy": 0.642, - "parent_safe_accuracy": 0.7596, - "tier1_accuracy": 0.965, - "tier2_accuracy": 0.7566, - "tier3_accuracy": 0.7679, - "tier4_accuracy": 0.6071 + "shadow_embedding_retrieval": { + "hint": "Set IAB_EVAL_INCLUDE_SHADOW_RETRIEVAL=1 to run shadow embedding retrieval (downloads/loads gte-Qwen2 when index is present).", + "reason": "disabled_by_default", + "skipped": true } } }, "train": { - "accepted_accuracy": 0.8115, - "accepted_coverage": 1.0, - "accuracy": 0.8115, + "accepted_accuracy": 0.9221, + "accepted_coverage": 0.998, + "accuracy": 0.9212, "count": 13211, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/iab/train.jsonl", - "fallback_rate": 0.0, + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/iab/train.jsonl", + "fallback_rate": 0.002, "head": "iab_content", - "macro_f1": 0.8293, - "primary_source": "embedding_retrieval", + "macro_f1": 0.8805, + "primary_source": "supervised_classifier", "suite": "train", "tier_metrics": { - "average_prediction_depth": 2.2368, + "average_prediction_depth": 2.1738, "error_buckets": { - "exact_match": 10721, - "parent_safe_stop": 346, - "right_tier1_wrong_tier2": 812, - "wrong_deep_leaf": 809, - "wrong_tier1": 523 - }, - "exact_path_accuracy": 0.8115, - 
"parent_safe_accuracy": 0.8753, - "tier1_accuracy": 0.9604, - "tier2_accuracy": 0.9208, - "tier3_accuracy": 0.8788, - "tier4_accuracy": 0.8732 + "exact_match": 12170, + "parent_safe_stop": 238, + "right_tier1_wrong_tier2": 294, + "wrong_deep_leaf": 337, + "wrong_tier1": 172 + }, + "exact_path_accuracy": 0.9212, + "parent_safe_accuracy": 0.9492, + "tier1_accuracy": 0.987, + "tier2_accuracy": 0.9629, + "tier3_accuracy": 0.8617, + "tier4_accuracy": 0.5554 }, "view_metrics": { + "classifier": { + "average_prediction_depth": 2.1738, + "error_buckets": { + "exact_match": 12011, + "parent_safe_stop": 232, + "right_tier1_wrong_tier2": 342, + "wrong_deep_leaf": 454, + "wrong_tier1": 172 + }, + "exact_path_accuracy": 0.9092, + "parent_safe_accuracy": 0.9367, + "tier1_accuracy": 0.987, + "tier2_accuracy": 0.9588, + "tier3_accuracy": 0.8293, + "tier4_accuracy": 0.3607 + }, "combined_path": { "count": 13211, "max_combined_rows": 500, @@ -6280,52 +7258,57 @@ "reason": "dataset_too_large_for_combined_view", "skipped": true }, - "embedding_retrieval": { - "average_prediction_depth": 2.2368, - "error_buckets": { - "exact_match": 10569, - "parent_safe_stop": 338, - "right_tier1_wrong_tier2": 834, - "wrong_deep_leaf": 947, - "wrong_tier1": 523 - }, - "exact_path_accuracy": 0.8, - "parent_safe_accuracy": 0.8631, - "tier1_accuracy": 0.9604, - "tier2_accuracy": 0.9189, - "tier3_accuracy": 0.843, - "tier4_accuracy": 0.6589 + "shadow_embedding_retrieval": { + "hint": "Set IAB_EVAL_INCLUDE_SHADOW_RETRIEVAL=1 to run shadow embedding retrieval (downloads/loads gte-Qwen2 when index is present).", + "reason": "disabled_by_default", + "skipped": true } } }, "val": { - "accepted_accuracy": 0.6545, - "accepted_coverage": 1.0, - "accuracy": 0.6545, + "accepted_accuracy": 0.9138, + "accepted_coverage": 0.9963, + "accuracy": 0.9126, "count": 3282, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/iab/val.jsonl", - "fallback_rate": 0.0, + "dataset_path": 
"/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/iab/val.jsonl", + "fallback_rate": 0.0037, "head": "iab_content", - "macro_f1": 0.6957, - "primary_source": "embedding_retrieval", + "macro_f1": 0.8708, + "primary_source": "supervised_classifier", "suite": "val", "tier_metrics": { - "average_prediction_depth": 2.1813, + "average_prediction_depth": 2.1795, "error_buckets": { - "exact_match": 2148, - "parent_safe_stop": 105, - "right_tier1_wrong_tier2": 684, - "wrong_deep_leaf": 234, - "wrong_tier1": 111 - }, - "exact_path_accuracy": 0.6545, - "parent_safe_accuracy": 0.7821, - "tier1_accuracy": 0.9662, - "tier2_accuracy": 0.7577, - "tier3_accuracy": 0.8352, - "tier4_accuracy": 0.7214 + "exact_match": 2995, + "parent_safe_stop": 63, + "right_tier1_wrong_tier2": 81, + "wrong_deep_leaf": 90, + "wrong_tier1": 53 + }, + "exact_path_accuracy": 0.9126, + "parent_safe_accuracy": 0.9427, + "tier1_accuracy": 0.9839, + "tier2_accuracy": 0.9565, + "tier3_accuracy": 0.8549, + "tier4_accuracy": 0.5429 }, "view_metrics": { + "classifier": { + "average_prediction_depth": 2.1795, + "error_buckets": { + "exact_match": 2958, + "parent_safe_stop": 60, + "right_tier1_wrong_tier2": 93, + "wrong_deep_leaf": 118, + "wrong_tier1": 53 + }, + "exact_path_accuracy": 0.9013, + "parent_safe_accuracy": 0.9305, + "tier1_accuracy": 0.9839, + "tier2_accuracy": 0.9524, + "tier3_accuracy": 0.8238, + "tier4_accuracy": 0.3643 + }, "combined_path": { "count": 3282, "max_combined_rows": 500, @@ -6338,104 +7321,93 @@ "reason": "dataset_too_large_for_combined_view", "skipped": true }, - "embedding_retrieval": { - "average_prediction_depth": 2.1813, - "error_buckets": { - "exact_match": 2116, - "parent_safe_stop": 100, - "right_tier1_wrong_tier2": 689, - "wrong_deep_leaf": 266, - "wrong_tier1": 111 - }, - "exact_path_accuracy": 0.6447, - "parent_safe_accuracy": 0.7709, - 
"tier1_accuracy": 0.9662, - "tier2_accuracy": 0.756, - "tier3_accuracy": 0.799, - "tier4_accuracy": 0.55 + "shadow_embedding_retrieval": { + "hint": "Set IAB_EVAL_INCLUDE_SHADOW_RETRIEVAL=1 to run shadow embedding retrieval (downloads/loads gte-Qwen2 when index is present).", + "reason": "disabled_by_default", + "skipped": true } } } }, "intent_subtype": { "difficulty_benchmark": { - "accepted_accuracy": 0.9386, - "accepted_coverage": 1.0, - "accuracy": 0.9386, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_difficulty_benchmark_confusion_matrix.csv", + "accepted_accuracy": 0.8982, + "accepted_coverage": 0.9928, + "accuracy": 0.8917, + "confusion_matrix_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_difficulty_benchmark_confusion_matrix.csv", "count": 277, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/subtype_benchmark.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/subtype_benchmark.jsonl", "difficulty_breakdown": { "easy": { - "accepted_accuracy": 0.9565, + "accepted_accuracy": 0.9239, "accepted_coverage": 1.0, - "accuracy": 0.9565, + "accuracy": 0.9239, "count": 92, "fallback_rate": 0.0, - "macro_f1": 0.9579 + "macro_f1": 0.924 }, "hard": { - "accepted_accuracy": 0.8901, - "accepted_coverage": 1.0, - "accuracy": 0.8901, + "accepted_accuracy": 0.8539, + "accepted_coverage": 0.978, + "accuracy": 0.8352, "count": 91, - "fallback_rate": 0.0, - "macro_f1": 0.8913 + "fallback_rate": 0.022, + "macro_f1": 0.8241 }, "medium": { - "accepted_accuracy": 0.9681, + "accepted_accuracy": 0.9149, 
"accepted_coverage": 1.0, - "accuracy": 0.9681, + "accuracy": 0.9149, "count": 94, "fallback_rate": 0.0, - "macro_f1": 0.9671 + "macro_f1": 0.9094 } }, - "fallback_rate": 0.0, + "fallback_rate": 0.0072, "head": "intent_subtype", - "macro_f1": 0.9401, + "macro_f1": 0.8876, "per_class_metrics": { "account_help": { - "f1-score": 0.8888888888888888, - "precision": 1.0, - "recall": 0.8, + "f1-score": 0.7586206896551724, + "precision": 0.7857142857142857, + "recall": 0.7333333333333333, "support": 15.0 }, - "accuracy": 0.9386281588447654, + "accuracy": 0.8916967509025271, "billing_help": { - "f1-score": 0.967741935483871, - "precision": 0.9375, - "recall": 1.0, + "f1-score": 0.6086956521739131, + "precision": 0.875, + "recall": 0.4666666666666667, "support": 15.0 }, "booking": { - "f1-score": 0.9285714285714286, + "f1-score": 0.8461538461538461, "precision": 1.0, - "recall": 0.8666666666666667, + "recall": 0.7333333333333333, "support": 15.0 }, "comparison": { - "f1-score": 0.896551724137931, - "precision": 0.9285714285714286, - "recall": 0.8666666666666667, + "f1-score": 0.8571428571428571, + "precision": 0.9230769230769231, + "recall": 0.8, "support": 15.0 }, "contact_sales": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.9375, + "precision": 0.8823529411764706, "recall": 1.0, "support": 15.0 }, "deal_seeking": { - "f1-score": 0.9333333333333333, - "precision": 0.9333333333333333, + "f1-score": 0.9655172413793104, + "precision": 1.0, "recall": 0.9333333333333333, "support": 15.0 }, "download": { - "f1-score": 0.9655172413793104, - "precision": 1.0, - "recall": 0.9333333333333333, + "f1-score": 0.8666666666666667, + "precision": 0.8666666666666667, + "recall": 0.8666666666666667, "support": 15.0 }, "education": { @@ -6451,8 +7423,8 @@ "support": 15.0 }, "evaluation": { - "f1-score": 0.9655172413793104, - "precision": 1.0, + "f1-score": 0.9333333333333333, + "precision": 0.9333333333333333, "recall": 0.9333333333333333, "support": 15.0 }, @@ -6463,57 +7435,57 
@@ "support": 15.0 }, "macro avg": { - "f1-score": 0.9401067100194944, - "precision": 0.9476910208527856, - "recall": 0.9383215323166303, + "f1-score": 0.8875885571005383, + "precision": 0.9016030130000718, + "recall": 0.8895061728395063, "support": 277.0 }, "onboarding_setup": { - "f1-score": 0.9411764705882353, - "precision": 0.9411764705882353, - "recall": 0.9411764705882353, + "f1-score": 0.8947368421052632, + "precision": 0.8095238095238095, + "recall": 1.0, "support": 17.0 }, "product_discovery": { - "f1-score": 0.9285714285714286, + "f1-score": 0.9655172413793104, "precision": 1.0, - "recall": 0.8666666666666667, + "recall": 0.9333333333333333, "support": 15.0 }, "provider_selection": { - "f1-score": 0.9375, - "precision": 0.9375, - "recall": 0.9375, + "f1-score": 0.9696969696969697, + "precision": 0.9411764705882353, + "recall": 1.0, "support": 16.0 }, "purchase": { - "f1-score": 0.9655172413793104, + "f1-score": 0.8888888888888888, "precision": 1.0, - "recall": 0.9333333333333333, + "recall": 0.8, "support": 15.0 }, "signup": { - "f1-score": 0.8888888888888888, - "precision": 0.8, + "f1-score": 0.9696969696969697, + "precision": 0.9411764705882353, "recall": 1.0, "support": 16.0 }, "task_execution": { - "f1-score": 0.8717948717948718, - "precision": 0.8095238095238095, + "f1-score": 0.8947368421052632, + "precision": 0.85, "recall": 0.9444444444444444, "support": 18.0 }, "troubleshooting": { - "f1-score": 0.9655172413793104, - "precision": 1.0, - "recall": 0.9333333333333333, + "f1-score": 0.7428571428571429, + "precision": 0.65, + "recall": 0.8666666666666667, "support": 15.0 }, "weighted avg": { - "f1-score": 0.9391802821325396, - "precision": 0.9455776173285199, - "recall": 0.9386281588447654, + "f1-score": 0.8883104280399479, + "precision": 0.9006650327445612, + "recall": 0.8916967509025271, "support": 277.0 } }, @@ -6523,16 +7495,16 @@ "accepted_accuracy": 0.8491, "accepted_coverage": 1.0, "accuracy": 0.8491, - "confusion_matrix_path": 
"/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_extended_cases_confusion_matrix.csv", + "confusion_matrix_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_extended_cases_confusion_matrix.csv", "count": 53, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/subtype/extended_cases.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/subtype/extended_cases.jsonl", "fallback_rate": 0.0, "head": "intent_subtype", - "macro_f1": 0.8146, + "macro_f1": 0.7764, "per_class_metrics": { "account_help": { - "f1-score": 0.6666666666666666, - "precision": 0.6666666666666666, + "f1-score": 0.8, + "precision": 1.0, "recall": 0.6666666666666666, "support": 3.0 }, @@ -6550,9 +7522,9 @@ "support": 0.0 }, "comparison": { - "f1-score": 0.6666666666666666, + "f1-score": 1.0, "precision": 1.0, - "recall": 0.5, + "recall": 1.0, "support": 2.0 }, "contact_sales": { @@ -6562,8 +7534,8 @@ "support": 0.0 }, "deal_seeking": { - "f1-score": 0.8181818181818182, - "precision": 0.6923076923076923, + "f1-score": 0.9, + "precision": 0.8181818181818182, "recall": 1.0, "support": 9.0 }, @@ -6574,8 +7546,8 @@ "support": 0.0 }, "education": { - "f1-score": 0.9333333333333333, - "precision": 0.875, + "f1-score": 0.875, + "precision": 0.7777777777777778, "recall": 1.0, "support": 7.0 }, @@ -6586,9 +7558,9 @@ "support": 0.0 }, "evaluation": { - "f1-score": 0.5, - "precision": 1.0, - "recall": 0.3333333333333333, + "f1-score": 0.0, + "precision": 0.0, + "recall": 0.0, "support": 3.0 }, "follow_up": { @@ -6598,9 +7570,9 @@ "support": 12.0 }, "macro avg": { - "f1-score": 
0.4978114478114478, - "precision": 0.531517094017094, - "recall": 0.5092592592592592, + "f1-score": 0.474472286972287, + "precision": 0.46035754369087706, + "recall": 0.5185185185185186, "support": 53.0 }, "onboarding_setup": { @@ -6610,8 +7582,8 @@ "support": 4.0 }, "product_discovery": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.9230769230769231, + "precision": 0.8571428571428571, "recall": 1.0, "support": 6.0 }, @@ -6634,8 +7606,8 @@ "support": 0.0 }, "task_execution": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.6666666666666666, + "precision": 0.5, "recall": 1.0, "support": 1.0 }, @@ -6646,8 +7618,8 @@ "support": 1.0 }, "weighted avg": { - "f1-score": 0.8404230989136648, - "precision": 0.887215771649734, + "f1-score": 0.823438668249989, + "precision": 0.8324076342944268, "recall": 0.8490566037735849, "support": 53.0 } @@ -6655,23 +7627,23 @@ "suite": "extended_cases" }, "hard_cases": { - "accepted_accuracy": 0.9468, + "accepted_accuracy": 0.883, "accepted_coverage": 1.0, - "accuracy": 0.9468, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_hard_cases_confusion_matrix.csv", + "accuracy": 0.883, + "confusion_matrix_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_hard_cases_confusion_matrix.csv", "count": 94, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/subtype/hard_cases.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/subtype/hard_cases.jsonl", "fallback_rate": 0.0, "head": "intent_subtype", - "macro_f1": 0.9191, + "macro_f1": 0.8137, "per_class_metrics": { 
"account_help": { - "f1-score": 0.8, - "precision": 0.6666666666666666, - "recall": 1.0, + "f1-score": 0.5, + "precision": 0.5, + "recall": 0.5, "support": 2.0 }, - "accuracy": 0.9468085106382979, + "accuracy": 0.8829787234042553, "billing_help": { "f1-score": 1.0, "precision": 1.0, @@ -6697,9 +7669,9 @@ "support": 0.0 }, "deal_seeking": { - "f1-score": 1.0, + "f1-score": 0.8, "precision": 1.0, - "recall": 1.0, + "recall": 0.6666666666666666, "support": 3.0 }, "download": { @@ -6709,8 +7681,8 @@ "support": 0.0 }, "education": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.9666666666666667, + "precision": 0.9354838709677419, "recall": 1.0, "support": 29.0 }, @@ -6721,9 +7693,9 @@ "support": 0.0 }, "evaluation": { - "f1-score": 0.7272727272727273, - "precision": 0.8, - "recall": 0.6666666666666666, + "f1-score": 0.25, + "precision": 0.5, + "recall": 0.16666666666666666, "support": 6.0 }, "follow_up": { @@ -6733,9 +7705,9 @@ "support": 12.0 }, "macro avg": { - "f1-score": 0.7659288023895194, - "precision": 0.7648148148148147, - "recall": 0.786111111111111, + "f1-score": 0.6780983255239549, + "precision": 0.693301292494841, + "recall": 0.6935185185185184, "support": 94.0 }, "onboarding_setup": { @@ -6745,26 +7717,26 @@ "support": 6.0 }, "product_discovery": { - "f1-score": 0.8888888888888888, - "precision": 0.8, + "f1-score": 0.8421052631578947, + "precision": 0.7272727272727273, "recall": 1.0, "support": 8.0 }, "provider_selection": { - "f1-score": 0.9473684210526315, - "precision": 1.0, + "f1-score": 0.9, + "precision": 0.9, "recall": 0.9, "support": 10.0 }, "purchase": { - "f1-score": 1.0, + "f1-score": 0.8, "precision": 1.0, - "recall": 1.0, + "recall": 0.6666666666666666, "support": 3.0 }, "signup": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.8571428571428571, + "precision": 0.75, "recall": 1.0, "support": 3.0 }, @@ -6775,38 +7747,38 @@ "support": 1.0 }, "troubleshooting": { - "f1-score": 0.8, - "precision": 1.0, + "f1-score": 
0.6666666666666666, + "precision": 0.6666666666666666, "recall": 0.6666666666666666, "support": 3.0 }, "weighted avg": { - "f1-score": 0.9478016938458051, - "precision": 0.9578014184397163, - "recall": 0.9468085106382979, + "f1-score": 0.8700694845346483, + "precision": 0.879757596555812, + "recall": 0.8829787234042553, "support": 94.0 } }, "suite": "hard_cases" }, "test": { - "accepted_accuracy": 0.9, + "accepted_accuracy": 0.8714, "accepted_coverage": 1.0, - "accuracy": 0.9, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_test_confusion_matrix.csv", + "accuracy": 0.8714, + "confusion_matrix_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_test_confusion_matrix.csv", "count": 70, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/subtype/test.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/subtype/test.jsonl", "fallback_rate": 0.0, "head": "intent_subtype", - "macro_f1": 0.863, + "macro_f1": 0.7807, "per_class_metrics": { "account_help": { - "f1-score": 1.0, + "f1-score": 0.6666666666666666, "precision": 1.0, - "recall": 1.0, + "recall": 0.5, "support": 2.0 }, - "accuracy": 0.9, + "accuracy": 0.8714285714285714, "billing_help": { "f1-score": 0.0, "precision": 0.0, @@ -6820,8 +7792,8 @@ "support": 3.0 }, "comparison": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.8571428571428571, + "precision": 0.75, "recall": 1.0, "support": 3.0 }, @@ -6844,8 +7816,8 @@ "support": 0.0 }, "education": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.9655172413793104, + "precision": 
0.9333333333333333, "recall": 1.0, "support": 14.0 }, @@ -6856,9 +7828,9 @@ "support": 5.0 }, "evaluation": { - "f1-score": 0.4, - "precision": 0.3333333333333333, - "recall": 0.5, + "f1-score": 0.0, + "precision": 0.0, + "recall": 0.0, "support": 2.0 }, "follow_up": { @@ -6868,9 +7840,9 @@ "support": 11.0 }, "macro avg": { - "f1-score": 0.6712084293224644, - "precision": 0.6671296296296296, - "recall": 0.6908670033670034, + "f1-score": 0.6071895459070292, + "precision": 0.6101851851851853, + "recall": 0.632996632996633, "support": 70.0 }, "onboarding_setup": { @@ -6880,9 +7852,9 @@ "support": 4.0 }, "product_discovery": { - "f1-score": 0.875, - "precision": 0.875, - "recall": 0.875, + "f1-score": 1.0, + "precision": 1.0, + "recall": 1.0, "support": 8.0 }, "provider_selection": { @@ -6898,15 +7870,15 @@ "support": 0.0 }, "signup": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.8, + "precision": 0.6666666666666666, "recall": 1.0, "support": 2.0 }, "task_execution": { - "f1-score": 1.0, - "precision": 1.0, - "recall": 1.0, + "f1-score": 0.8333333333333334, + "precision": 0.8333333333333334, + "recall": 0.8333333333333334, "support": 6.0 }, "troubleshooting": { @@ -6916,32 +7888,32 @@ "support": 2.0 }, "weighted avg": { - "f1-score": 0.9058084605453025, - "precision": 0.9266666666666667, - "recall": 0.9, + "f1-score": 0.8661227931749063, + "precision": 0.8835714285714285, + "recall": 0.8714285714285714, "support": 70.0 } }, "suite": "test" }, "train": { - "accepted_accuracy": 0.9649, + "accepted_accuracy": 0.9042, "accepted_coverage": 1.0, - "accuracy": 0.9649, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_train_confusion_matrix.csv", + "accuracy": 0.9042, + "confusion_matrix_path": 
"/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_train_confusion_matrix.csv", "count": 313, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/subtype/train.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/subtype/train.jsonl", "fallback_rate": 0.0, "head": "intent_subtype", - "macro_f1": 0.9649, + "macro_f1": 0.8791, "per_class_metrics": { "account_help": { - "f1-score": 0.9333333333333333, - "precision": 0.875, - "recall": 1.0, + "f1-score": 0.8, + "precision": 0.75, + "recall": 0.8571428571428571, "support": 7.0 }, - "accuracy": 0.9648562300319489, + "accuracy": 0.9041533546325878, "billing_help": { "f1-score": 1.0, "precision": 1.0, @@ -6949,27 +7921,27 @@ "support": 6.0 }, "booking": { - "f1-score": 1.0, - "precision": 1.0, - "recall": 1.0, + "f1-score": 0.6, + "precision": 0.6, + "recall": 0.6, "support": 5.0 }, "comparison": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.967741935483871, + "precision": 0.9375, "recall": 1.0, "support": 15.0 }, "contact_sales": { - "f1-score": 0.875, + "f1-score": 0.8, "precision": 1.0, - "recall": 0.7777777777777778, + "recall": 0.6666666666666666, "support": 9.0 }, "deal_seeking": { - "f1-score": 1.0, - "precision": 1.0, - "recall": 1.0, + "f1-score": 0.9090909090909091, + "precision": 0.9090909090909091, + "recall": 0.9090909090909091, "support": 11.0 }, "download": { @@ -6979,8 +7951,8 @@ "support": 8.0 }, "education": { - "f1-score": 0.9904761904761905, - "precision": 0.9811320754716981, + "f1-score": 0.9719626168224299, + "precision": 0.9454545454545454, "recall": 1.0, "support": 52.0 }, @@ -6991,84 +7963,84 @@ "support": 20.0 }, "evaluation": { 
- "f1-score": 0.9032258064516129, + "f1-score": 0.64, "precision": 1.0, - "recall": 0.8235294117647058, + "recall": 0.47058823529411764, "support": 17.0 }, "follow_up": { - "f1-score": 0.927536231884058, - "precision": 0.9696969696969697, - "recall": 0.8888888888888888, + "f1-score": 0.8732394366197183, + "precision": 0.8857142857142857, + "recall": 0.8611111111111112, "support": 36.0 }, "macro avg": { - "f1-score": 0.9649442256020961, - "precision": 0.9689347311202658, - "recall": 0.9651818334171275, + "f1-score": 0.8791291644367831, + "precision": 0.9052373525167643, + "recall": 0.8737167303612591, "support": 313.0 }, "onboarding_setup": { - "f1-score": 1.0, + "f1-score": 0.9696969696969697, "precision": 1.0, - "recall": 1.0, + "recall": 0.9411764705882353, "support": 17.0 }, "product_discovery": { - "f1-score": 0.96875, - "precision": 0.9393939393939394, - "recall": 1.0, + "f1-score": 0.8923076923076924, + "precision": 0.8529411764705882, + "recall": 0.9354838709677419, "support": 31.0 }, "provider_selection": { - "f1-score": 0.9795918367346939, - "precision": 1.0, + "f1-score": 0.96, + "precision": 0.96, "recall": 0.96, "support": 25.0 }, "purchase": { - "f1-score": 1.0, + "f1-score": 0.9090909090909091, "precision": 1.0, - "recall": 1.0, + "recall": 0.8333333333333334, "support": 6.0 }, "signup": { - "f1-score": 0.9411764705882353, - "precision": 0.8888888888888888, + "f1-score": 0.8648648648648649, + "precision": 0.7619047619047619, "recall": 1.0, "support": 16.0 }, "task_execution": { - "f1-score": 0.926829268292683, - "precision": 0.8636363636363636, + "f1-score": 0.8837209302325582, + "precision": 0.7916666666666666, "recall": 1.0, "support": 19.0 }, "troubleshooting": { - "f1-score": 0.9230769230769231, - "precision": 0.9230769230769231, - "recall": 0.9230769230769231, + "f1-score": 0.782608695652174, + "precision": 0.9, + "recall": 0.6923076923076923, "support": 13.0 }, "weighted avg": { - "f1-score": 0.9643733669039578, - "precision": 0.967429661617075, 
- "recall": 0.9648562300319489, + "f1-score": 0.8996108171948927, + "precision": 0.9128919168596861, + "recall": 0.9041533546325878, "support": 313.0 } }, "suite": "train" }, "val": { - "accepted_accuracy": 0.875, + "accepted_accuracy": 0.9, "accepted_coverage": 1.0, - "accuracy": 0.875, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_val_confusion_matrix.csv", + "accuracy": 0.9, + "confusion_matrix_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/intent_subtype_val_confusion_matrix.csv", "count": 80, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/subtype/val.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/subtype/val.jsonl", "fallback_rate": 0.0, "head": "intent_subtype", - "macro_f1": 0.725, + "macro_f1": 0.7496, "per_class_metrics": { "account_help": { "f1-score": 0.5, @@ -7076,11 +8048,11 @@ "recall": 0.5, "support": 2.0 }, - "accuracy": 0.875, + "accuracy": 0.9, "billing_help": { - "f1-score": 1.0, - "precision": 1.0, - "recall": 1.0, + "f1-score": 0.0, + "precision": 0.0, + "recall": 0.0, "support": 1.0 }, "booking": { @@ -7090,9 +8062,9 @@ "support": 3.0 }, "comparison": { - "f1-score": 0.4, + "f1-score": 0.8571428571428571, "precision": 1.0, - "recall": 0.25, + "recall": 0.75, "support": 4.0 }, "contact_sales": { @@ -7102,8 +8074,8 @@ "support": 0.0 }, "deal_seeking": { - "f1-score": 0.5714285714285714, - "precision": 0.4, + "f1-score": 0.6666666666666666, + "precision": 0.5, "recall": 1.0, "support": 2.0 }, @@ -7126,32 +8098,32 @@ "support": 5.0 }, "evaluation": { - "f1-score": 0.6666666666666666, - 
"precision": 0.5, + "f1-score": 0.8, + "precision": 0.6666666666666666, "recall": 1.0, "support": 2.0 }, "follow_up": { - "f1-score": 0.9523809523809523, + "f1-score": 0.9, "precision": 1.0, - "recall": 0.9090909090909091, + "recall": 0.8181818181818182, "support": 11.0 }, "macro avg": { - "f1-score": 0.6444203944203944, - "precision": 0.6542087542087542, - "recall": 0.687121212121212, + "f1-score": 0.6662846956964604, + "precision": 0.6697530864197531, + "recall": 0.6931818181818182, "support": 80.0 }, "onboarding_setup": { - "f1-score": 0.8, - "precision": 0.8, - "recall": 0.8, + "f1-score": 0.9090909090909091, + "precision": 0.8333333333333334, + "recall": 1.0, "support": 5.0 }, "product_discovery": { - "f1-score": 0.9090909090909091, - "precision": 0.9090909090909091, + "f1-score": 0.9523809523809523, + "precision": 1.0, "recall": 0.9090909090909091, "support": 11.0 }, @@ -7162,9 +8134,9 @@ "support": 7.0 }, "purchase": { - "f1-score": 0.0, - "precision": 0.0, - "recall": 0.0, + "f1-score": 0.6666666666666666, + "precision": 1.0, + "recall": 0.5, "support": 2.0 }, "signup": { @@ -7174,8 +8146,8 @@ "support": 2.0 }, "task_execution": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.9411764705882353, + "precision": 0.8888888888888888, "recall": 1.0, "support": 8.0 }, @@ -7186,9 +8158,9 @@ "support": 1.0 }, "weighted avg": { - "f1-score": 0.8644047619047619, - "precision": 0.8891666666666665, - "recall": 0.875, + "f1-score": 0.8968286860198624, + "precision": 0.9118055555555555, + "recall": 0.9, "support": 80.0 } }, @@ -7197,12 +8169,12 @@ }, "intent_type": { "difficulty_benchmark": { - "accepted_accuracy": 0.9867, + "accepted_accuracy": 0.98, "accepted_coverage": 1.0, - "accuracy": 0.9867, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_difficulty_benchmark_confusion_matrix.csv", + "accuracy": 0.98, + "confusion_matrix_path": 
"/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_difficulty_benchmark_confusion_matrix.csv", "count": 150, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/intent_type_benchmark.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/intent_type_benchmark.jsonl", "difficulty_breakdown": { "easy": { "accepted_accuracy": 1.0, @@ -7221,23 +8193,23 @@ "macro_f1": 0.9596 }, "medium": { - "accepted_accuracy": 1.0, + "accepted_accuracy": 0.98, "accepted_coverage": 1.0, - "accuracy": 1.0, + "accuracy": 0.98, "count": 50, "fallback_rate": 0.0, - "macro_f1": 1.0 + "macro_f1": 0.9798 } }, "fallback_rate": 0.0, "head": "intent_type", - "macro_f1": 0.9867, + "macro_f1": 0.98, "per_class_metrics": { - "accuracy": 0.9866666666666667, + "accuracy": 0.98, "ambiguous": { - "f1-score": 1.0, + "f1-score": 0.9655172413793104, "precision": 1.0, - "recall": 1.0, + "recall": 0.9333333333333333, "support": 15.0 }, "chit_chat": { @@ -7265,15 +8237,15 @@ "support": 15.0 }, "informational": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.967741935483871, + "precision": 0.9375, "recall": 1.0, "support": 15.0 }, "macro avg": { - "f1-score": 0.9866518353726363, - "precision": 0.9875, - "recall": 0.9866666666666667, + "f1-score": 0.9799777530589543, + "precision": 0.98125, + "recall": 0.9800000000000001, "support": 150.0 }, "personal_reflection": { @@ -7301,9 +8273,9 @@ "support": 15.0 }, "weighted avg": { - "f1-score": 0.9866518353726362, - "precision": 0.9875, - "recall": 0.9866666666666667, + "f1-score": 0.9799777530589544, + "precision": 0.98125, + "recall": 0.98, "support": 150.0 } }, @@ -7313,9 +8285,9 @@ "accepted_accuracy": 
1.0, "accepted_coverage": 1.0, "accuracy": 1.0, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_hard_cases_confusion_matrix.csv", + "confusion_matrix_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_hard_cases_confusion_matrix.csv", "count": 61, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/hard_cases.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/hard_cases.jsonl", "fallback_rate": 0.0, "head": "intent_type", "macro_f1": 1.0, @@ -7397,17 +8369,17 @@ "suite": "hard_cases" }, "test": { - "accepted_accuracy": 0.9149, + "accepted_accuracy": 0.8936, "accepted_coverage": 1.0, - "accuracy": 0.9149, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_test_confusion_matrix.csv", + "accuracy": 0.8936, + "confusion_matrix_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_test_confusion_matrix.csv", "count": 47, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/test.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/test.jsonl", "fallback_rate": 0.0, "head": "intent_type", - "macro_f1": 0.9131, + "macro_f1": 0.811, "per_class_metrics": { - "accuracy": 
0.9148936170212766, + "accuracy": 0.8936170212765957, "ambiguous": { "f1-score": 0.875, "precision": 1.0, @@ -7421,15 +8393,15 @@ "support": 1.0 }, "commercial": { - "f1-score": 0.9, - "precision": 0.9, - "recall": 0.9, + "f1-score": 0.9523809523809523, + "precision": 0.9090909090909091, + "recall": 1.0, "support": 10.0 }, "creative_generation": { - "f1-score": 1.0, - "precision": 1.0, - "recall": 1.0, + "f1-score": 0.0, + "precision": 0.0, + "recall": 0.0, "support": 1.0 }, "exploratory": { @@ -7439,15 +8411,15 @@ "support": 1.0 }, "informational": { - "f1-score": 0.8888888888888888, - "precision": 0.8, + "f1-score": 0.9411764705882353, + "precision": 0.8888888888888888, "recall": 1.0, "support": 8.0 }, "macro avg": { - "f1-score": 0.9130555555555555, - "precision": 0.9199999999999999, - "recall": 0.9344444444444445, + "f1-score": 0.8110224089635854, + "precision": 0.8172979797979798, + "recall": 0.8319444444444443, "support": 47.0 }, "personal_reflection": { @@ -7469,15 +8441,15 @@ "support": 3.0 }, "transactional": { - "f1-score": 1.0, - "precision": 1.0, - "recall": 1.0, + "f1-score": 0.875, + "precision": 0.875, + "recall": 0.875, "support": 8.0 }, "weighted avg": { - "f1-score": 0.916016548463357, - "precision": 0.9340425531914893, - "recall": 0.9148936170212766, + "f1-score": 0.893508254365576, + "precision": 0.9085536213195787, + "recall": 0.8936170212765957, "support": 47.0 } }, @@ -7487,18 +8459,18 @@ "accepted_accuracy": 0.8846, "accepted_coverage": 1.0, "accuracy": 0.8846, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_third_wave_cases_confusion_matrix.csv", + "confusion_matrix_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_third_wave_cases_confusion_matrix.csv", "count": 26, - 
"dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/third_wave_cases.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/third_wave_cases.jsonl", "fallback_rate": 0.0, "head": "intent_type", - "macro_f1": 0.8294, + "macro_f1": 0.8209, "per_class_metrics": { "accuracy": 0.8846153846153846, "ambiguous": { - "f1-score": 0.8888888888888888, + "f1-score": 0.8235294117647058, "precision": 1.0, - "recall": 0.8, + "recall": 0.7, "support": 10.0 }, "chit_chat": { @@ -7508,9 +8480,9 @@ "support": 1.0 }, "commercial": { - "f1-score": 0.9166666666666666, - "precision": 0.9166666666666666, - "recall": 0.9166666666666666, + "f1-score": 0.9230769230769231, + "precision": 0.8571428571428571, + "recall": 1.0, "support": 12.0 }, "creative_generation": { @@ -7532,9 +8504,9 @@ "support": 0.0 }, "macro avg": { - "f1-score": 0.5805555555555555, - "precision": 0.5916666666666666, - "recall": 0.5716666666666667, + "f1-score": 0.5746606334841629, + "precision": 0.5857142857142857, + "recall": 0.5700000000000001, "support": 26.0 }, "personal_reflection": { @@ -7562,8 +8534,8 @@ "support": 0.0 }, "weighted avg": { - "f1-score": 0.9188034188034189, - "precision": 0.9615384615384616, + "f1-score": 0.8966237382526975, + "precision": 0.9340659340659341, "recall": 0.8846153846153846, "support": 26.0 } @@ -7571,17 +8543,17 @@ "suite": "third_wave_cases" }, "train": { - "accepted_accuracy": 1.0, + "accepted_accuracy": 0.9945, "accepted_coverage": 1.0, - "accuracy": 1.0, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_train_confusion_matrix.csv", + "accuracy": 0.9945, + "confusion_matrix_path": 
"/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_train_confusion_matrix.csv", "count": 183, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/train.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/train.jsonl", "fallback_rate": 0.0, "head": "intent_type", - "macro_f1": 1.0, + "macro_f1": 0.9891, "per_class_metrics": { - "accuracy": 1.0, + "accuracy": 0.994535519125683, "ambiguous": { "f1-score": 1.0, "precision": 1.0, @@ -7601,8 +8573,8 @@ "support": 36.0 }, "creative_generation": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.9090909090909091, + "precision": 0.8333333333333334, "recall": 1.0, "support": 5.0 }, @@ -7619,9 +8591,9 @@ "support": 38.0 }, "macro avg": { - "f1-score": 1.0, - "precision": 1.0, - "recall": 1.0, + "f1-score": 0.989090909090909, + "precision": 0.9833333333333334, + "recall": 0.9964285714285716, "support": 183.0 }, "personal_reflection": { @@ -7643,36 +8615,36 @@ "support": 10.0 }, "transactional": { - "f1-score": 1.0, + "f1-score": 0.9818181818181818, "precision": 1.0, - "recall": 1.0, + "recall": 0.9642857142857143, "support": 28.0 }, "weighted avg": { - "f1-score": 1.0, - "precision": 1.0, - "recall": 1.0, + "f1-score": 0.9947342275211128, + "precision": 0.9954462659380693, + "recall": 0.994535519125683, "support": 183.0 } }, "suite": "train" }, "val": { - "accepted_accuracy": 0.9574, + "accepted_accuracy": 0.9149, "accepted_coverage": 1.0, - "accuracy": 0.9574, - "confusion_matrix_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_val_confusion_matrix.csv", + "accuracy": 0.9149, + 
"confusion_matrix_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/artifacts/evaluation/latest/intent_type_val_confusion_matrix.csv", "count": 47, - "dataset_path": "/Users/manikumargouni/Desktop/AdMesh/protocol/agentic-intent-classifier/data/val.jsonl", + "dataset_path": "/root/.cache/huggingface/hub/models--admesh--agentic-intent-classifier/snapshots/0584798f8efee6beccd778b0afa06782ab5add60/agentic-intent-classifier/agentic-intent-classifier/data/val.jsonl", "fallback_rate": 0.0, "head": "intent_type", - "macro_f1": 0.9067, + "macro_f1": 0.8575, "per_class_metrics": { - "accuracy": 0.9574468085106383, + "accuracy": 0.9148936170212766, "ambiguous": { - "f1-score": 1.0, + "f1-score": 0.9411764705882353, "precision": 1.0, - "recall": 1.0, + "recall": 0.8888888888888888, "support": 9.0 }, "chit_chat": { @@ -7682,9 +8654,9 @@ "support": 1.0 }, "commercial": { - "f1-score": 1.0, - "precision": 1.0, - "recall": 1.0, + "f1-score": 0.9, + "precision": 0.9, + "recall": 0.9, "support": 10.0 }, "creative_generation": { @@ -7694,8 +8666,8 @@ "support": 1.0 }, "exploratory": { - "f1-score": 1.0, - "precision": 1.0, + "f1-score": 0.6666666666666666, + "precision": 0.5, "recall": 1.0, "support": 1.0 }, @@ -7706,9 +8678,9 @@ "support": 8.0 }, "macro avg": { - "f1-score": 0.9066666666666666, - "precision": 0.9, - "recall": 0.9541666666666666, + "f1-score": 0.8574509803921568, + "precision": 0.8400000000000001, + "recall": 0.9330555555555555, "support": 47.0 }, "personal_reflection": { @@ -7736,9 +8708,9 @@ "support": 8.0 }, "weighted avg": { - "f1-score": 0.9617021276595744, - "precision": 0.9787234042553191, - "recall": 0.9574468085106383, + "f1-score": 0.9220692532332081, + "precision": 0.9468085106382979, + "recall": 0.9148936170212766, "support": 47.0 } }, diff --git a/combined_inference.py b/combined_inference.py index 
5a9d46353a324e76673a4bd91a10450d865bbdc9..2e40abc7c013bee8ebf6852aacaf03cabe6d8059 100644 --- a/combined_inference.py +++ b/combined_inference.py @@ -19,6 +19,8 @@ from inference_intent_type import predict as predict_intent_type from inference_decision_phase import predict as predict_decision_phase from inference_iab_classifier import predict as predict_iab_content_classifier from inference_subtype import predict as predict_intent_subtype +from model_runtime import get_head +from multitask_runtime import get_multitask_runtime from schemas import validate_classify_response # Degraded fallback only: production requires `training/train_iab.py` and @@ -362,6 +364,39 @@ def build_iab_content( return classifier_pred["content"], classifier_pred +def _classify_multitask_fused( + text: str, + threshold_overrides: dict[str, float], +) -> tuple[dict, dict, dict]: + """Run the shared DistilBERT encoder exactly once and decode all three heads. + + This is the hot-path replacement for the three separate predict_intent_type / + predict_intent_subtype / predict_decision_phase calls. On CPU with + DistilBERT it cuts encoder invocations from 3 → 1, roughly halving the + per-query latency for the multitask heads. 
+ """ + runtime = get_multitask_runtime() + all_logits = runtime.predict_all_heads_batch([text]) + + intent_proxy = get_head("intent_type") + subtype_proxy = get_head("intent_subtype") + phase_proxy = get_head("decision_phase") + + intent_pred = intent_proxy.predict_from_logits( + all_logits["intent_type_logits"][0], + confidence_threshold=threshold_overrides.get("intent_type"), + ) + subtype_pred = subtype_proxy.predict_from_logits( + all_logits["intent_subtype_logits"][0], + confidence_threshold=threshold_overrides.get("intent_subtype"), + ) + phase_pred = phase_proxy.predict_from_logits( + all_logits["decision_phase_logits"][0], + confidence_threshold=threshold_overrides.get("decision_phase"), + ) + return intent_pred, subtype_pred, phase_pred + + def classify_query( text: str, threshold_overrides: dict[str, float] | None = None, @@ -370,9 +405,9 @@ def classify_query( ) -> dict: threshold_overrides = threshold_overrides or {} force_iab_placeholder = _force_iab_placeholder(force_iab_placeholder) - intent_pred = predict_intent_type(text, confidence_threshold=threshold_overrides.get("intent_type")) - subtype_pred = predict_intent_subtype(text, confidence_threshold=threshold_overrides.get("intent_subtype")) - phase_pred = predict_decision_phase(text, confidence_threshold=threshold_overrides.get("decision_phase")) + + # Single encoder pass for all three multitask heads (hot path). 
+ intent_pred, subtype_pred, phase_pred = _classify_multitask_fused(text, threshold_overrides) intent_type = intent_pred["label"] subtype = subtype_pred["label"] diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..2be36173a046c80cf0cf71a3fde2efeeff86f738 --- /dev/null +++ b/config.json @@ -0,0 +1,11 @@ +{ + "model_type": "admesh-intent-classifier", + "pipeline_tag": "text-classification", + "custom_pipelines": { + "admesh-intent": { + "impl": "pipeline.AdmeshIntentPipeline", + "pt": [], + "tf": [] + } + } +} diff --git a/iab_classifier_model_output/train_metrics.json b/iab_classifier_model_output/train_metrics.json index 9a0657ee90f1b5891cf3d2331f5d6a3602e7e0cd..64144dfdb23ede2a1707b10056586ef3fe543320 100644 --- a/iab_classifier_model_output/train_metrics.json +++ b/iab_classifier_model_output/train_metrics.json @@ -4,22 +4,22 @@ "test_count": 3282, "test_metrics": { "epoch": 3.0, - "test_accuracy": 0.9408897014015845, - "test_loss": 1.6729823350906372, - "test_macro_f1": 0.9183354517881372, - "test_runtime": 10.3715, - "test_samples_per_second": 316.444, - "test_steps_per_second": 19.862 + "test_accuracy": 0.9195612431444241, + "test_loss": 1.8592056035995483, + "test_macro_f1": 0.8793483981890693, + "test_runtime": 10.4204, + "test_samples_per_second": 314.96, + "test_steps_per_second": 19.769 }, "train_count": 13211, "val_count": 3282, "val_metrics": { "epoch": 3.0, - "val_accuracy": 0.9442413162705667, - "val_loss": 1.6669942140579224, - "val_macro_f1": 0.9219584149840212, - "val_runtime": 10.3648, - "val_samples_per_second": 316.65, - "val_steps_per_second": 19.875 + "val_accuracy": 0.9159049360146252, + "val_loss": 1.8564603328704834, + "val_macro_f1": 0.8782665132955957, + "val_runtime": 10.3978, + "val_samples_per_second": 315.645, + "val_steps_per_second": 19.812 } } diff --git a/model_runtime.py b/model_runtime.py index 
4a80e5eeb46e014201b02c8231bca1187eac903a..8fae752be0312a248d6a6e8ed56f486ad12dd716 100644 --- a/model_runtime.py +++ b/model_runtime.py @@ -139,7 +139,7 @@ class SequenceClassifierHead: def _predict_probs(self, texts: list[str]) -> tuple[torch.Tensor, torch.Tensor]: inputs = self._encode(texts) - with torch.no_grad(): + with torch.inference_mode(): outputs = self.model(**inputs) raw_probs = torch.softmax(outputs.logits, dim=-1) calibrated_probs = torch.softmax(outputs.logits / self.calibration.temperature, dim=-1) diff --git a/multitask_intent_model_output/multitask_intent.onnx b/multitask_intent_model_output/multitask_intent.onnx index 78d83e3c3cff0e49e795545fd031cdd905f1d4a9..6a8415e11c84ba2c361ff7414ffdef11532c102a 100644 --- a/multitask_intent_model_output/multitask_intent.onnx +++ b/multitask_intent_model_output/multitask_intent.onnx @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f4b5ddd35af5f1384a78f3e6e308eaa10f60782c8bf86120b66aceddcee6318d +oid sha256:87577db2f1291b648f221382702632ed4a55126cc22fadc1688f55b40bed6f6a size 59456 diff --git a/multitask_intent_model_output/multitask_intent.onnx.data b/multitask_intent_model_output/multitask_intent.onnx.data index 56c5d50b11b48a226c7055364587d7e4ab485ca3..95d84a3ce8d78a2ea33184633678cb16641cc8b3 100644 --- a/multitask_intent_model_output/multitask_intent.onnx.data +++ b/multitask_intent_model_output/multitask_intent.onnx.data @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9e43ca569ca9418fd2801e6be9ba3a4561c7c2593906c8e86c0f6490fb0ac7dd +oid sha256:5faa733b1c72e1539c4b11390d09b46dae4301196a1e60b18aceb87c143a1a15 size 265598976 diff --git a/multitask_intent_model_output/multitask_model.pt b/multitask_intent_model_output/multitask_model.pt index 998c90afe5070b4848f4d510ff3d98cf768388d6..02651daa9b0b13a7abab1ffe35c921ea31d7b377 100644 --- a/multitask_intent_model_output/multitask_model.pt +++ b/multitask_intent_model_output/multitask_model.pt @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:387c9939f451c52244e30a7ffc0437189c7707371b1f71d9be0caf4118adb7b3 +oid sha256:da4caa467a41f326b47e8f9fcbe475d6753cf947d9208ddfd77adceccd4fb857 size 265602027 diff --git a/multitask_intent_model_output/train_metrics.json b/multitask_intent_model_output/train_metrics.json index bbce384d78801b0045e28bef8e6d0ec7d4bc0c13..e766273255ad5373610744eb38905749d7fb9f88 100644 --- a/multitask_intent_model_output/train_metrics.json +++ b/multitask_intent_model_output/train_metrics.json @@ -9,29 +9,29 @@ "test_metrics": { "epoch": 4.0, "test_decision_phase_accuracy": 0.7931034482758621, - "test_decision_phase_macro_f1": 0.796485260770975, - "test_intent_subtype_accuracy": 0.8857142857142857, - "test_intent_subtype_macro_f1": 0.8093880020247664, + "test_decision_phase_macro_f1": 0.8010204081632653, + "test_intent_subtype_accuracy": 0.8714285714285714, + "test_intent_subtype_macro_f1": 0.7806722733090375, "test_intent_type_accuracy": 0.8936170212765957, "test_intent_type_macro_f1": 0.8110224089635854, - "test_loss": 1.488637924194336, - "test_runtime": 0.1528, - "test_samples_per_second": 458.026, - "test_steps_per_second": 32.716 + "test_loss": 1.4918248653411865, + "test_runtime": 0.151, + "test_samples_per_second": 463.681, + "test_steps_per_second": 33.12 }, "train_count": 1590, "val_count": 473, "val_metrics": { "epoch": 4.0, - "val_decision_phase_accuracy": 0.9707317073170731, - "val_decision_phase_macro_f1": 0.964698796984366, - "val_intent_subtype_accuracy": 0.9290123456790124, - "val_intent_subtype_macro_f1": 0.923158190894806, + "val_decision_phase_accuracy": 0.9560975609756097, + "val_decision_phase_macro_f1": 0.9475689564578877, + "val_intent_subtype_accuracy": 0.9197530864197531, + "val_intent_subtype_macro_f1": 0.9028879526721083, "val_intent_type_accuracy": 0.9792387543252595, - "val_intent_type_macro_f1": 0.9630328385586321, - "val_loss": 0.5629223585128784, - "val_runtime": 1.0257, - "val_samples_per_second": 461.15, - 
"val_steps_per_second": 29.248 + "val_intent_type_macro_f1": 0.9703240839950261, + "val_loss": 0.5390442609786987, + "val_runtime": 0.9971, + "val_samples_per_second": 474.357, + "val_steps_per_second": 30.086 } } diff --git a/multitask_runtime.py b/multitask_runtime.py index e426f430732133a92bd9589e62a0cf9f7144af51..96d6a9c6b3a58b112d78d561718c6279169f91f8 100644 --- a/multitask_runtime.py +++ b/multitask_runtime.py @@ -100,10 +100,38 @@ class MultiTaskRuntime: def _predict_logits(self, task: str, texts: list[str]) -> torch.Tensor: config = TASK_TO_CONFIG[task] inputs = self._encode(texts, config.max_length) - with torch.no_grad(): + with torch.inference_mode(): outputs = self.model(**inputs) return outputs[TASK_TO_LOGIT_KEY[task]] + def predict_all_heads_batch( + self, texts: list[str] + ) -> dict[str, torch.Tensor]: + """Single encoder pass returning logits for all three heads at once. + + This is the hot-path entry point. Compared with calling + ``_predict_logits`` once per head it cuts the number of DistilBERT + forward passes from 3 → 1, roughly halving CPU latency for a single + query. + + Returns + ------- + dict with keys ``intent_type_logits``, ``intent_subtype_logits``, + ``decision_phase_logits`` — raw (pre-softmax) float tensors of shape + ``(len(texts), n_classes_for_head)``. + """ + # Use the maximum of the three head max_lengths so all heads see the + # same truncation boundary. 
+ max_len = max(cfg.max_length for cfg in TASK_TO_CONFIG.values()) + inputs = self._encode(texts, max_len) + with torch.inference_mode(): + outputs = self.model(**inputs) + return { + "intent_type_logits": outputs["intent_type_logits"], + "intent_subtype_logits": outputs["intent_subtype_logits"], + "decision_phase_logits": outputs["decision_phase_logits"], + } + class MultiTaskHeadProxy: def __init__(self, task: str): @@ -126,7 +154,7 @@ class MultiTaskHeadProxy: config = type("ConfigView", (), {"id2label": proxy.config.id2label})() def forward(self, input_ids=None, attention_mask=None, **kwargs): - with torch.no_grad(): + with torch.inference_mode(): outputs = proxy.runtime.model(input_ids=input_ids, attention_mask=attention_mask) logits = outputs[TASK_TO_LOGIT_KEY[proxy.task]] return type("OutputView", (), {"logits": logits})() @@ -160,10 +188,48 @@ class MultiTaskHeadProxy: def _predict_probs(self, texts: list[str]) -> tuple[torch.Tensor, torch.Tensor]: logits = self.runtime._predict_logits(self.task, texts) - raw_probs = torch.softmax(logits, dim=-1) - calibrated_probs = torch.softmax(logits / self.calibration.temperature, dim=-1) + with torch.inference_mode(): + raw_probs = torch.softmax(logits, dim=-1) + calibrated_probs = torch.softmax(logits / self.calibration.temperature, dim=-1) return raw_probs, calibrated_probs + def predict_probs_from_logits( + self, logits: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor]: + """Compute calibrated probs from pre-computed logits (hot-path helper). + + Called by ``classify_query_fused`` after a single shared encoder pass + so that each ``MultiTaskHeadProxy`` does not re-run the encoder. 
+ """ + with torch.inference_mode(): + raw_probs = torch.softmax(logits, dim=-1) + calibrated_probs = torch.softmax(logits / self.calibration.temperature, dim=-1) + return raw_probs, calibrated_probs + + def predict_from_logits( + self, logits: torch.Tensor, confidence_threshold: float | None = None + ) -> dict: + """Return a single prediction dict from pre-computed logits.""" + effective_threshold = ( + self.calibration.confidence_threshold + if confidence_threshold is None + else min(max(float(confidence_threshold), 0.0), 1.0) + ) + raw_probs, calibrated_probs = self.predict_probs_from_logits(logits.unsqueeze(0)) + raw_row = raw_probs[0] + calibrated_row = calibrated_probs[0] + pred_id = int(torch.argmax(calibrated_row).item()) + confidence = float(calibrated_row[pred_id].item()) + raw_confidence = float(raw_row[pred_id].item()) + return { + "label": self.config.id2label[pred_id], + "confidence": round_score(confidence), + "raw_confidence": round_score(raw_confidence), + "confidence_threshold": round_score(effective_threshold), + "calibrated": self.calibration.calibrated, + "meets_confidence_threshold": confidence >= effective_threshold, + } + def predict_probs_batch(self, texts: list[str]) -> tuple[torch.Tensor, torch.Tensor]: if not texts: empty = torch.empty((0, len(self.config.labels)), dtype=torch.float32) diff --git a/pipeline.py b/pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..eacb1b68fc3b6903b7f48ab7bbc024434f16d725 --- /dev/null +++ b/pipeline.py @@ -0,0 +1,305 @@ +""" +AdmeshIntentPipeline — transformers.Pipeline subclass for +admesh/agentic-intent-classifier. + +Because config.json declares "pt": [] the transformers pipeline() loader +skips AutoModel.from_pretrained() entirely and passes model=None straight +to this class. 
All model loading is handled internally via combined_inference, +which resolves paths relative to __file__ so it works wherever HF downloads +the repo (Inference Endpoints, Spaces, local snapshot_download, etc.). + +Supported HF deployment surfaces +--------------------------------- +1. transformers.pipeline() direct call (trust_remote_code=True): + + from transformers import pipeline + clf = pipeline( + "admesh-intent", + model="admesh/agentic-intent-classifier", + trust_remote_code=True, + ) + result = clf("Which laptop should I buy for college?") + +2. HF Inference Endpoints — Standard (PyTorch, trust_remote_code=True): + Deploy from https://ui.endpoints.huggingface.co — no custom container + needed; HF loads this pipeline class automatically. + +3. HF Spaces (Gradio / Streamlit): + + import sys + from huggingface_hub import snapshot_download + local_dir = snapshot_download("admesh/agentic-intent-classifier", repo_type="model") + sys.path.insert(0, local_dir) + from pipeline import AdmeshIntentPipeline + clf = AdmeshIntentPipeline() + result = clf("I need a CRM for a 5-person startup") + +4. Anywhere via from_pretrained(): + + from pipeline import AdmeshIntentPipeline + clf = AdmeshIntentPipeline.from_pretrained("admesh/agentic-intent-classifier") +""" + +from __future__ import annotations + +import sys +from pathlib import Path +from typing import Union + +# ── try to import transformers.Pipeline; fall back gracefully if absent ─────── +try: + from transformers import Pipeline as _HFPipeline + _TRANSFORMERS_AVAILABLE = True +except ImportError: + _HFPipeline = object # bare object as base when transformers is not installed + _TRANSFORMERS_AVAILABLE = False + + +class AdmeshIntentPipeline(_HFPipeline): + """ + Full intent + IAB classification pipeline. + + Inherits from ``transformers.Pipeline`` so it works natively with + ``pipeline()``, HF Inference Endpoints (standard mode), and HF Spaces. 
+ + When ``transformers`` is not installed it falls back to a plain callable + class so the same code works in minimal environments too. + + Parameters + ---------- + model: + Ignored — we load all models internally. Present only to satisfy + the ``transformers.Pipeline`` interface when HF calls + ``PipelineClass(model=None, ...)``. + **kwargs: + Forwarded to ``transformers.Pipeline.__init__`` if transformers is + available, otherwise ignored. + """ + + # ── init ────────────────────────────────────────────────────────────────── + + def __init__(self, model=None, tokenizer=None, **kwargs): + # Ensure this repo's directory is on sys.path so all relative imports + # in combined_inference / config / model_runtime resolve correctly. + # Path(__file__) points to wherever HF cached the repo snapshot. + _repo_dir = Path(__file__).resolve().parent + if str(_repo_dir) not in sys.path: + sys.path.insert(0, str(_repo_dir)) + + if _TRANSFORMERS_AVAILABLE: + import torch + + # transformers.Pipeline requires certain attributes to be set. + # Because config.json has "pt": [] HF passes model=None here — + # we satisfy the interface by setting the minimum required attrs + # manually instead of calling super().__init__(model=None, ...) + # which would raise inside infer_framework_load_model(). 
+ self.task = kwargs.pop("task", "admesh-intent") + self.model = model # None — unused, kept for interface compat + self.tokenizer = tokenizer # None — unused + self.feature_extractor = None + self.image_processor = None + self.modelcard = None + self.framework = "pt" + self.device = torch.device(kwargs.pop("device", "cpu")) + self.torch_dtype = kwargs.pop("torch_dtype", None) + self.binary_output = kwargs.pop("binary_output", False) + self.call_count = 0 + self._batch_size = kwargs.pop("batch_size", 1) + self._num_workers = kwargs.pop("num_workers", 0) + self._preprocess_params: dict = {} + self._forward_params: dict = {} + self._postprocess_params: dict = {} + # else: plain object, no init needed + + self._classify_fn = None # lazy-loaded on first __call__ + + # ── transformers.Pipeline abstract methods ──────────────────────────────── + # These are required by the ABC but our __call__ override bypasses them. + # They are still implemented in case a caller invokes them directly. + + def _sanitize_parameters(self, **kwargs): + forward_kwargs = {} + if "threshold_overrides" in kwargs: + forward_kwargs["threshold_overrides"] = kwargs["threshold_overrides"] + if "force_iab_placeholder" in kwargs: + forward_kwargs["force_iab_placeholder"] = kwargs["force_iab_placeholder"] + return {}, forward_kwargs, {} + + def preprocess(self, inputs): + return {"text": inputs if isinstance(inputs, str) else str(inputs)} + + def _forward(self, model_inputs, threshold_overrides=None, force_iab_placeholder=False): + self._ensure_loaded() + return self._classify_fn( + model_inputs["text"], + threshold_overrides=threshold_overrides, + force_iab_placeholder=force_iab_placeholder, + ) + + def postprocess(self, model_outputs): + return model_outputs + + # ── __call__ override ───────────────────────────────────────────────────── + # We bypass Pipeline's preprocess→_forward→postprocess chain entirely so + # we never touch self.model and keep full control over batching logic. 
+ + def __call__( + self, + inputs: Union[str, list[str]], + *, + threshold_overrides: dict[str, float] | None = None, + force_iab_placeholder: bool = False, + ) -> Union[dict, list[dict]]: + """ + Classify one or more query strings. + + Parameters + ---------- + inputs: + A single query string or a list of query strings. + threshold_overrides: + Optional per-head confidence threshold overrides, e.g. + ``{"intent_type": 0.5, "iab_content": 0.3}``. + force_iab_placeholder: + Skip IAB classifier and return placeholder values (faster, + no IAB accuracy). + + Returns + ------- + dict or list[dict]: + Full classification payload matching the combined_inference schema. + Returns a single dict for a string input, list of dicts for a list. + + Examples + -------- + :: + + clf = pipeline("admesh-intent", model="admesh/agentic-intent-classifier", + trust_remote_code=True) + + # single + result = clf("Which laptop should I buy for college?") + + # batch + results = clf(["Best running shoes", "How does TCP work?"]) + + # custom thresholds + result = clf("Buy headphones", threshold_overrides={"intent_type": 0.6}) + """ + self._ensure_loaded() + + single = isinstance(inputs, str) + texts: list[str] = [inputs] if single else list(inputs) + + results = [ + self._classify_fn( + text, + threshold_overrides=threshold_overrides, + force_iab_placeholder=force_iab_placeholder, + ) + for text in texts + ] + return results[0] if single else results + + # ── warm-up / compile ───────────────────────────────────────────────────── + + def warm_up(self, compile: bool = False) -> "AdmeshIntentPipeline": + """ + Pre-load all models and optionally compile them with torch.compile(). + + Call once after instantiation so the first real request pays no + model-load cost. HF Inference Endpoints automatically sends a + warm-up probe before routing live traffic, so this is optional there. 
+ + Parameters + ---------- + compile: + If ``True``, call ``torch.compile()`` on the DistilBERT encoder + and IAB classifier (requires PyTorch >= 2.0). Gives ~15-30 % + CPU speedup after the first traced call. + """ + self._ensure_loaded() + + if compile: + import torch # noqa: PLC0415 + if not hasattr(torch, "compile"): + import warnings + warnings.warn( + "torch.compile() is not available (PyTorch >= 2.0 required). " + "Skipping.", + stacklevel=2, + ) + else: + from multitask_runtime import get_multitask_runtime # noqa: PLC0415 + from model_runtime import get_head # noqa: PLC0415 + + rt = get_multitask_runtime() + if rt._model is not None: + rt._model = torch.compile(rt._model) + iab_head = get_head("iab_content") + if iab_head._model is not None: + iab_head._model = torch.compile(iab_head._model) + + # Dry run — triggers any remaining lazy init (calibration JSON reads, etc.) + self("warm up query for intent classification", force_iab_placeholder=True) + return self + + # ── factory ─────────────────────────────────────────────────────────────── + + @classmethod + def from_pretrained( + cls, + repo_id: str = "admesh/agentic-intent-classifier", + *, + revision: str | None = None, + token: str | None = None, + ) -> "AdmeshIntentPipeline": + """ + Download the model bundle from HF Hub and return a ready-to-use instance. + + Parameters + ---------- + repo_id: + HF Hub model id. + revision: + Optional git commit hash to pin a specific release. + token: + Optional HF auth token for private repos. + + Example + ------- + :: + + from pipeline import AdmeshIntentPipeline + clf = AdmeshIntentPipeline.from_pretrained("admesh/agentic-intent-classifier") + print(clf("I need a CRM for a 5-person startup")) + """ + try: + from huggingface_hub import snapshot_download # noqa: PLC0415 + except ImportError as exc: + raise ImportError( + "huggingface_hub is required. 
Install: pip install huggingface_hub" + ) from exc + + kwargs: dict = {"repo_type": "model"} + if revision: + kwargs["revision"] = revision + if token: + kwargs["token"] = token + + local_dir = snapshot_download(repo_id=repo_id, **kwargs) + if str(local_dir) not in sys.path: + sys.path.insert(0, str(local_dir)) + return cls() + + # ── internal ────────────────────────────────────────────────────────────── + + def _ensure_loaded(self) -> None: + if self._classify_fn is None: + from combined_inference import classify_query # noqa: PLC0415 + self._classify_fn = classify_query + + def __repr__(self) -> str: + state = "loaded" if self._classify_fn is not None else "not yet loaded" + return f"AdmeshIntentPipeline(classify_fn={state})" diff --git a/requirements.txt b/requirements.txt index ea4b63d88fba03fffa45ff43a8da04a17f423da7..e3d7d8b2c7ad9507577a567f556e816ccda1e4f1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ pandas>=2.0.0 safetensors>=0.4.0 onnx>=1.14.0 onnxscript>=0.1.0 +huggingface_hub>=0.20.0 diff --git a/training/upload_to_hf.py b/training/upload_to_hf.py new file mode 100644 index 0000000000000000000000000000000000000000..2a988813f93c357591f7b777d7307cac85aaba5f --- /dev/null +++ b/training/upload_to_hf.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 +""" +Upload trained artifacts to Hugging Face Hub. + +This repo uses local-path inference. The upload is intended so you can later +download these directories into the same folder layout and run inference. +""" + +from __future__ import annotations + +import argparse +import os +import sys +from pathlib import Path + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Upload trained intent/IAB artifacts to Hugging Face Hub.") + parser.add_argument( + "--repo-id", + required=True, + help="HF repo id, e.g. 'yourname/admesh-intent-iab-v1'.", + ) + parser.add_argument( + "--token", + default=os.environ.get("HF_TOKEN"), + help="HF token. 
If omitted, uses env HF_TOKEN.", + ) + parser.add_argument( + "--private", + action="store_true", + help="Create the repo as private.", + ) + parser.add_argument( + "--include-multitask", + action="store_true", + help="Upload multitask intent model output directory.", + ) + parser.add_argument( + "--include-iab", + action="store_true", + help="Upload IAB classifier model output directory.", + ) + parser.add_argument( + "--include-calibration", + action="store_true", + help="Upload artifacts/calibration directory.", + ) + parser.add_argument( + "--multitask-dir", + default="multitask_intent_model_output", + help="Path to multitask intent output directory (relative to this script's base).", + ) + parser.add_argument( + "--iab-dir", + default="iab_classifier_model_output", + help="Path to IAB classifier model output directory (relative to this script's base).", + ) + parser.add_argument( + "--calibration-dir", + default="artifacts/calibration", + help="Path to calibration artifacts directory (relative to this script's base).", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print what would be uploaded without actually uploading.", + ) + return parser.parse_args() + + +def main() -> int: + args = _parse_args() + if not args.token: + print("Missing HF token. Provide --token or set env HF_TOKEN.", file=sys.stderr) + return 2 + + repo_root = Path(__file__).resolve().parent.parent + + multitask_dir = (repo_root / args.multitask_dir).resolve() + iab_dir = (repo_root / args.iab_dir).resolve() + calibration_dir = (repo_root / args.calibration_dir).resolve() + + to_upload: list[tuple[str, Path]] = [] + if args.include_multitask: + to_upload.append(("multitask_intent_model_output", multitask_dir)) + if args.include_iab: + to_upload.append(("iab_classifier_model_output", iab_dir)) + if args.include_calibration: + to_upload.append(("artifacts/calibration", calibration_dir)) + + if not to_upload: + print("Nothing to upload. 
Pass --include-multitask, --include-iab, and/or --include-calibration.", file=sys.stderr) + return 2 + + # Import lazily so `--dry-run` works without extra deps. + try: + from huggingface_hub import HfApi + except ModuleNotFoundError: + print("Missing dependency: huggingface_hub. Install with: pip install huggingface_hub", file=sys.stderr) + return 2 + + api = HfApi(token=args.token) + api.create_repo(repo_id=args.repo_id, repo_type="model", private=args.private, exist_ok=True) + + for repo_path, local_dir in to_upload: + if not local_dir.exists(): + print(f"[SKIP] {repo_path}: local path does not exist: {local_dir}", file=sys.stderr) + continue + if args.dry_run: + print(f"[DRY] Would upload {local_dir} -> {args.repo_id}:{repo_path}") + continue + print(f"[UPLOAD] {local_dir} -> {args.repo_id}:{repo_path}") + api.upload_folder( + repo_id=args.repo_id, + repo_type="model", + folder_path=str(local_dir), + path_in_repo=repo_path, + ) + + print("Upload complete.") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) +