agentic-intent-classifier / artifacts /evaluation /latest /iab_quality_target_eval.json
manikumargouni's picture
Upload folder using huggingface_hub
53d5d9f verified
{
"by_status": {
"must_fix": {
"failed": 9,
"passed": 3,
"total": 12
}
},
"cases_path": "/content/agentic-intent-classifier/examples/iab_mapping_cases.json",
"count": 12,
"failed": 9,
"passed": 3,
"results": [
{
"actual": {
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent",
"model_output.classification.iab_content.tier1.label": "Automotive",
"model_output.classification.iab_content.tier2.label": null
},
"expected": {
"model_output.classification.iab_content.mapping_mode": "exact",
"model_output.classification.iab_content.tier1.label": "Automotive",
"model_output.classification.iab_content.tier2.label": "Auto Buying and Selling"
},
"id": "car-buying-maps-to-automotive-buying",
"mismatches": [
{
"actual": null,
"expected": "Auto Buying and Selling",
"path": "model_output.classification.iab_content.tier2.label"
},
{
"actual": "nearest_equivalent",
"expected": "exact",
"path": "model_output.classification.iab_content.mapping_mode"
}
],
"notes": "Vehicle shopping queries should map into the automotive buying branch, not business sales.",
"pass": false,
"status": "must_fix",
"text": "Which car to buy in 2026"
},
{
"actual": {
"model_output.classification.iab_content.mapping_mode": "exact",
"model_output.classification.iab_content.tier1.label": "Technology & Computing",
"model_output.classification.iab_content.tier2.label": "Computing",
"model_output.classification.iab_content.tier3.label": "Laptops"
},
"expected": {
"model_output.classification.iab_content.mapping_mode": "exact",
"model_output.classification.iab_content.tier1.label": "Technology & Computing",
"model_output.classification.iab_content.tier2.label": "Computing",
"model_output.classification.iab_content.tier3.label": "Laptops"
},
"id": "laptop-buying-maps-to-laptops",
"mismatches": [],
"notes": "Laptop shopping should resolve into the laptops branch, not business sales.",
"pass": true,
"status": "must_fix",
"text": "Which laptop to buy in 2026"
},
{
"actual": {
"model_output.classification.iab_content.mapping_mode": "exact",
"model_output.classification.iab_content.tier1.label": "Technology & Computing",
"model_output.classification.iab_content.tier2.label": null,
"model_output.classification.iab_content.tier3.label": null
},
"expected": {
"model_output.classification.iab_content.mapping_mode": "exact",
"model_output.classification.iab_content.tier1.label": "Technology & Computing",
"model_output.classification.iab_content.tier2.label": "Computing",
"model_output.classification.iab_content.tier3.label": "Laptops"
},
"id": "labtop-buying-maps-to-laptops",
"mismatches": [
{
"actual": null,
"expected": "Computing",
"path": "model_output.classification.iab_content.tier2.label"
},
{
"actual": null,
"expected": "Laptops",
"path": "model_output.classification.iab_content.tier3.label"
}
],
"notes": "Common typo handling should still land in the laptops branch.",
"pass": false,
"status": "must_fix",
"text": "Which labtop to buy in 2026"
},
{
"actual": {
"model_output.classification.iab_content.mapping_mode": "exact",
"model_output.classification.iab_content.tier1.label": "Technology & Computing",
"model_output.classification.iab_content.tier2.label": null,
"model_output.classification.iab_content.tier3.label": null
},
"expected": {
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent",
"model_output.classification.iab_content.tier1.label": "Business and Finance",
"model_output.classification.iab_content.tier2.label": "Business",
"model_output.classification.iab_content.tier3.label": "Sales"
},
"id": "crm-awareness-maps-to-sales",
"mismatches": [
{
"actual": "Technology & Computing",
"expected": "Business and Finance",
"path": "model_output.classification.iab_content.tier1.label"
},
{
"actual": null,
"expected": "Business",
"path": "model_output.classification.iab_content.tier2.label"
},
{
"actual": null,
"expected": "Sales",
"path": "model_output.classification.iab_content.tier3.label"
},
{
"actual": "exact",
"expected": "nearest_equivalent",
"path": "model_output.classification.iab_content.mapping_mode"
}
],
"notes": "CRM education should resolve to the closest business/sales path, not generic software.",
"pass": false,
"status": "must_fix",
"text": "What is CRM software?"
},
{
"actual": {
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent",
"model_output.classification.iab_content.tier1.label": "Technology & Computing",
"model_output.classification.iab_content.tier2.label": null,
"model_output.classification.iab_content.tier3.label": null
},
"expected": {
"model_output.classification.iab_content.mapping_mode": "exact",
"model_output.classification.iab_content.tier1.label": "Business and Finance",
"model_output.classification.iab_content.tier2.label": "Business",
"model_output.classification.iab_content.tier3.label": "Sales"
},
"id": "crm-comparison-maps-to-sales",
"mismatches": [
{
"actual": "Technology & Computing",
"expected": "Business and Finance",
"path": "model_output.classification.iab_content.tier1.label"
},
{
"actual": null,
"expected": "Business",
"path": "model_output.classification.iab_content.tier2.label"
},
{
"actual": null,
"expected": "Sales",
"path": "model_output.classification.iab_content.tier3.label"
},
{
"actual": "nearest_equivalent",
"expected": "exact",
"path": "model_output.classification.iab_content.mapping_mode"
}
],
"notes": "Direct CRM vendor comparison should map cleanly into the sales domain.",
"pass": false,
"status": "must_fix",
"text": "HubSpot vs Zoho for a small team"
},
{
"actual": {
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent",
"model_output.classification.iab_content.tier1.label": "Careers",
"model_output.classification.iab_content.tier2.label": null,
"model_output.classification.iab_content.tier3.label": null
},
"expected": {
"model_output.classification.iab_content.mapping_mode": "exact",
"model_output.classification.iab_content.tier1.label": "Business and Finance",
"model_output.classification.iab_content.tier2.label": "Business",
"model_output.classification.iab_content.tier3.label": "Marketing and Advertising"
},
"id": "marketing-tools-map-to-marketing",
"mismatches": [
{
"actual": "Careers",
"expected": "Business and Finance",
"path": "model_output.classification.iab_content.tier1.label"
},
{
"actual": null,
"expected": "Business",
"path": "model_output.classification.iab_content.tier2.label"
},
{
"actual": null,
"expected": "Marketing and Advertising",
"path": "model_output.classification.iab_content.tier3.label"
},
{
"actual": "nearest_equivalent",
"expected": "exact",
"path": "model_output.classification.iab_content.mapping_mode"
}
],
"notes": "Marketing tool discovery should map to the marketing and advertising branch.",
"pass": false,
"status": "must_fix",
"text": "Best AI SEO tools for content teams"
},
{
"actual": {
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent",
"model_output.classification.iab_content.tier1.label": "Real Estate",
"model_output.classification.iab_content.tier2.label": null
},
"expected": {
"model_output.classification.iab_content.mapping_mode": "exact",
"model_output.classification.iab_content.tier1.label": "Technology & Computing",
"model_output.classification.iab_content.tier2.label": "Artificial Intelligence"
},
"id": "ml-explanation-maps-to-ai",
"mismatches": [
{
"actual": "Real Estate",
"expected": "Technology & Computing",
"path": "model_output.classification.iab_content.tier1.label"
},
{
"actual": null,
"expected": "Artificial Intelligence",
"path": "model_output.classification.iab_content.tier2.label"
},
{
"actual": "nearest_equivalent",
"expected": "exact",
"path": "model_output.classification.iab_content.mapping_mode"
}
],
"notes": "ML and NLP educational prompts should land in the AI branch.",
"pass": false,
"status": "must_fix",
"text": "What is intent classification in NLP?"
},
{
"actual": {
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent",
"model_output.classification.iab_content.tier1.label": "Personal Finance",
"model_output.classification.iab_content.tier2.label": null,
"model_output.classification.iab_content.tier3.label": null
},
"expected": {
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent",
"model_output.classification.iab_content.tier1.label": "Business and Finance",
"model_output.classification.iab_content.tier2.label": "Business",
"model_output.classification.iab_content.tier3.label": "Business I.T."
},
"id": "support-credential-help-maps-to-business-it",
"mismatches": [
{
"actual": "Personal Finance",
"expected": "Business and Finance",
"path": "model_output.classification.iab_content.tier1.label"
},
{
"actual": null,
"expected": "Business",
"path": "model_output.classification.iab_content.tier2.label"
},
{
"actual": null,
"expected": "Business I.T.",
"path": "model_output.classification.iab_content.tier3.label"
}
],
"notes": "Credential and account help should map to business IT rather than generic business.",
"pass": false,
"status": "must_fix",
"text": "How do I reset my password?"
},
{
"actual": {
"model_output.classification.iab_content.mapping_mode": "exact",
"model_output.classification.iab_content.tier1.label": "Food & Drink",
"model_output.classification.iab_content.tier2.label": "Dining Out"
},
"expected": {
"model_output.classification.iab_content.mapping_mode": "exact",
"model_output.classification.iab_content.tier1.label": "Food & Drink",
"model_output.classification.iab_content.tier2.label": "Dining Out"
},
"id": "restaurant-booking-maps-to-dining-out",
"mismatches": [],
"notes": "Generic dining requests should not inherit the repo's business default.",
"pass": true,
"status": "must_fix",
"text": "Book a table for 2 tonight"
},
{
"actual": {
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent",
"model_output.classification.iab_content.tier1.label": "Sports",
"model_output.classification.iab_content.tier2.label": null,
"model_output.classification.iab_content.tier3.label": null
},
"expected": {
"model_output.classification.iab_content.mapping_mode": "nearest_equivalent",
"model_output.classification.iab_content.tier1.label": "Technology & Computing",
"model_output.classification.iab_content.tier2.label": "Computing",
"model_output.classification.iab_content.tier3.label": "Software and Applications"
},
"id": "trial-signup-maps-to-software",
"mismatches": [
{
"actual": "Sports",
"expected": "Technology & Computing",
"path": "model_output.classification.iab_content.tier1.label"
},
{
"actual": null,
"expected": "Computing",
"path": "model_output.classification.iab_content.tier2.label"
},
{
"actual": null,
"expected": "Software and Applications",
"path": "model_output.classification.iab_content.tier3.label"
}
],
"notes": "Software action queries should map to the software/application branch.",
"pass": false,
"status": "must_fix",
"text": "Start my free trial"
},
{
"actual": {
"model_output.classification.iab_content.mapping_mode": "exact",
"model_output.classification.iab_content.tier1.label": "Careers",
"model_output.classification.iab_content.tier2.label": "Remote Working",
"model_output.classification.iab_content.tier3.label": null,
"model_output.classification.iab_content.tier4.label": null
},
"expected": {
"model_output.classification.iab_content.mapping_mode": "exact",
"model_output.classification.iab_content.tier1.label": "Technology & Computing",
"model_output.classification.iab_content.tier2.label": "Computing",
"model_output.classification.iab_content.tier3.label": "Computer Software and Applications",
"model_output.classification.iab_content.tier4.label": "Communication"
},
"id": "communication-software-maps-to-tier4",
"mismatches": [
{
"actual": "Careers",
"expected": "Technology & Computing",
"path": "model_output.classification.iab_content.tier1.label"
},
{
"actual": "Remote Working",
"expected": "Computing",
"path": "model_output.classification.iab_content.tier2.label"
},
{
"actual": null,
"expected": "Computer Software and Applications",
"path": "model_output.classification.iab_content.tier3.label"
},
{
"actual": null,
"expected": "Communication",
"path": "model_output.classification.iab_content.tier4.label"
}
],
"notes": "Full taxonomy support should preserve the tier4 communication branch.",
"pass": false,
"status": "must_fix",
"text": "best communication software for remote teams"
},
{
"actual": {
"model_output.classification.iab_content.mapping_mode": "exact",
"model_output.classification.iab_content.tier1.label": "Food & Drink",
"model_output.classification.iab_content.tier2.label": "Alcoholic Beverages"
},
"expected": {
"model_output.classification.iab_content.mapping_mode": "exact",
"model_output.classification.iab_content.tier1.label": "Food & Drink",
"model_output.classification.iab_content.tier2.label": "Alcoholic Beverages"
},
"id": "vodka-query-maps-to-alcoholic-beverages",
"mismatches": [],
"notes": "Food and beverage prompts should not fall through to the business default.",
"pass": true,
"status": "must_fix",
"text": "what is best vodka drink should i try"
}
]
}