vgecbot / classifier-demo.py
harsh-dev's picture
docker deployment
4225666
from app.services.filter_classifier import FilterClassifier
# Module-level classifier instance; run_batch_tests calls classifier.classify()
# on it for every test question.
classifier = FilterClassifier()
from typing import Optional, List, Dict, Any
from dataclasses import dataclass
@dataclass
class TestCase:
    """A single classifier expectation: one query plus the fields it should yield.

    Instances are built positionally in the order
    ``name, question, type, category, topic, year, intent, notes``.

    An ``expected_*`` field left as ``None`` is not compared by the batch
    runner, so ``None`` effectively means "do not check this field".
    """

    # The class name starts with "Test", which pytest would otherwise try to
    # collect as a test class; this plain (un-annotated) attribute is not a
    # dataclass field and only opts the class out of collection.
    __test__ = False

    name: str  # Unique identifier; its prefix is used to group failures
    question: str  # Raw user query passed to the classifier
    expected_type: Optional[str] = None
    expected_category: Optional[str] = None
    expected_topic: Optional[str] = None
    expected_year: Optional[int] = None
    expected_intent: Optional[str] = None
    notes: str = ""  # Why this test matters
# ============================================================================
# BATCH TEST SUITE - Organized by weakness category
# ============================================================================
# Positional argument order for every TestCase below:
#   name, question, expected_type, expected_category, expected_topic,
#   expected_year, expected_intent, notes
# Notes that start with "CRITICAL:" are picked up by the report's
# critical-issues section, which searches the notes text for "CRITICAL".
TEST_SUITE = [
    # ─────────────────────────────────────────────────────────────────────────
    # GROUP 1: Basic Sanity (Should Pass)
    # ─────────────────────────────────────────────────────────────────────────
    TestCase("basic_dept_list", "show me all departments",
             "department", None, None, None, "list",
             "Basic type detection + list intent"),
    TestCase("basic_hostel_fees", "what are the hostel fees",
             "hostel", None, "fees", None, None,
             "Type + topic combo"),
    TestCase("basic_library", "library books catalog",
             "library", None, None, None, None,
             "Direct type match"),
    TestCase("basic_placement_stats_year", "campus placement statistics 2023",
             "placement", None, "stats", 2023, None,
             "Type + topic + year extraction"),
    # ─────────────────────────────────────────────────────────────────────────
    # GROUP 2: Abbreviation Expansion (Critical)
    # ─────────────────────────────────────────────────────────────────────────
    TestCase("abbr_ce", "CE department faculty",
             "department", "computer", "faculty", None, None,
             "'ce' -> computer engineering"),
    TestCase("abbr_cse_ds", "CSE DS syllabus",
             "department", "cse_ds", "syllabus", None, None,
             "'cse' + 'ds' -> cse_ds"),
    TestCase("abbr_ece_missing", "ECE lab equipment",  # ⚠️ WILL FAIL - not in ABBREVIATIONS
             "department", "electronics_comm", "lab", None, None,
             "'ece' NOT in ABBREVIATIONS dict - expect failure"),
    TestCase("abbr_ei", "EI sensors calibration",
             "department", "electronics_inst", None, None, None,
             "'ei' -> electronics instrumentation"),
    TestCase("abbr_it", "IT department placement stats",
             "department", "it", "stats", None, None,
             "'it' -> information technology"),
    TestCase("abbr_me", "ME workshop schedule",
             "department", "mechanical", None, None, None,
             "'me' -> mechanical"),
    TestCase("abbr_am", "AM stress analysis project",
             "department", "applied_mechanics", "project", None, None,
             "'am' -> applied mechanics"),
    TestCase("abbr_ict", "ICT networking lab",
             "department", "ict", "lab", None, None,
             "'ict' in ABBREVIATIONS"),
    # ─────────────────────────────────────────────────────────────────────────
    # GROUP 3: Type/Category/Topic Conflicts (Ambiguity Hell)
    # ─────────────────────────────────────────────────────────────────────────
    TestCase("conflict_club_type_vs_cat", "adventure club activities",
             "club", "adventure", None, None, None,
             "'club' word in TYPE, 'adventure' in CATEGORY"),
    TestCase("conflict_ieee_club", "join the ieee club",
             "club", "ieee", None, None, None,
             "ieee category + club type word"),
    TestCase("conflict_event_ambiguity", "technical events this month",
             None, None, "event", None, None,
             "'events' = topic, but clubs host events - type confusion risk"),
    # The two "process" cases carry the CRITICAL tag in their notes (not just
    # in a source comment) so that a failure is reported as a critical issue.
    TestCase("conflict_process_topic", "what is the admission process",  # ⚠️ CRITICAL
             "admission", None, "process", None, None,
             "CRITICAL: 'process' = TOPIC (procedure) - NOT intent"),
    TestCase("conflict_process_intent", "how to apply step by step process",  # ⚠️ CRITICAL
             "admission", None, None, None, "process",
             "CRITICAL: 'process' = INTENT (how to) - same word, different meaning!"),
    TestCase("conflict_fees_finance", "finance department fees structure",
             "service", "finance", "fees", None, None,
             "'finance' category + 'fees' topic overlap"),
    # ─────────────────────────────────────────────────────────────────────────
    # GROUP 4: Semantic Similarity Traps
    # ─────────────────────────────────────────────────────────────────────────
    TestCase("trap_computer_vs_cseds_hardware", "computer hardware VLSI design",
             "department", "computer", None, None, None,
             "MUST be 'computer' (VLSI), NOT cse_ds despite 'computer'"),
    TestCase("trap_cseds_ml", "computer science machine learning course",
             "department", "cse_ds", None, None, None,
             "'computer science' + 'ML' = cse_ds (ML in cse_ds anchors)"),
    TestCase("trap_ece_vs_ei_wireless", "electronics wireless communication",
             "department", "electronics_comm", None, None, None,
             "'wireless' = ECE anchor"),
    TestCase("trap_ece_vs_ei_sensors", "electronics sensor measurement biomedical",
             "department", "electronics_inst", None, None, None,
             "'sensors' + 'biomedical' = EI anchor"),
    TestCase("trap_it_vs_ict_software", "information technology software development ERP",
             "department", "it", None, None, None,
             "'software' + 'ERP' = IT anchor"),
    TestCase("trap_it_vs_ict_fiber", "information communication fiber optic bandwidth",
             "department", "ict", None, None, None,
             "'fiber optic' = ICT anchor"),
    # ─────────────────────────────────────────────────────────────────────────
    # GROUP 5: Intent Detection
    # ─────────────────────────────────────────────────────────────────────────
    TestCase("intent_list_explicit", "list all mechanical engineering labs",
             "department", "mechanical", "lab", None, "list",
             "Explicit 'list' keyword"),
    TestCase("intent_list_implicit", "show every available facility",
             "facility", None, None, None, "list",
             "Implicit list: 'show every'"),
    TestCase("intent_count_how_many", "how many students placed in 2024",
             "placement", None, None, 2024, "count",
             "'how many' = count intent"),
    TestCase("intent_count_total", "total number of hostels",
             "hostel", None, None, None, "count",
             "'total number' = count intent"),
    TestCase("intent_detail_explain", "explain the civil engineering syllabus",
             "department", "civil", "syllabus", None, "detail",
             "'explain' = detail intent"),
    TestCase("intent_detail_tell_me", "tell me about research publications",
             "research", "publication", None, None, "detail",
             "'tell me about' = detail intent"),
    TestCase("intent_greeting_hello", "hello good morning",
             None, None, None, None, "greeting",
             "Pure greeting - no other fields"),
    TestCase("intent_greeting_casual", "hi, how are you",
             None, None, None, None, "greeting",
             "Casual greeting"),
    # ─────────────────────────────────────────────────────────────────────────
    # GROUP 6: Year Extraction Edge Cases
    # ─────────────────────────────────────────────────────────────────────────
    TestCase("year_simple", "placement records 2023",
             None, None, None, 2023, None,
             "Standard year extraction"),
    TestCase("year_batch_of", "batch of 2024 students",
             None, None, None, 2024, None,
             "'batch of' prefix"),
    TestCase("year_explicit", "year 2025 admission process",
             "admission", None, None, 2025, None,
             "Explicit 'year' keyword"),
    TestCase("year_ordinal_fail", "2nd year computer science",  # ⚠️ LIKELY FAIL
             "department", "cse_ds", None, None, None,
             "'2nd year' - NO year should be extracted (regex is \\b20\\d{2}\\b)"),
    TestCase("year_range_first", "2020-2024 batch",  # ⚠️ AMBIGUOUS
             None, None, None, 2020, None,
             "Range - regex finds first 20xx (2020)"),
    TestCase("year_written_fail", "twenty twenty three",  # ⚠️ WILL FAIL
             None, None, None, None, None,
             "Written year - regex only matches digits"),
    # ─────────────────────────────────────────────────────────────────────────
    # GROUP 7: MASTER_INDEX Constraint Violations
    # ─────────────────────────────────────────────────────────────────────────
    TestCase("master_invalid_cat_for_type", "hostel admission process",
             "hostel", None, None, None, None,
             "'admission' invalid category for 'hostel' type -> should be None"),
    TestCase("master_invalid_topic_for_type", "placement syllabus",
             "placement", None, None, None, None,
             "'syllabus' invalid topic for 'placement' -> should be None"),
    TestCase("master_invalid_topic_research", "research timetable",
             "research", None, None, None, None,
             "'timetable' invalid for 'research' type -> should be None"),
    # ─────────────────────────────────────────────────────────────────────────
    # GROUP 8: Complex Real-World Queries
    # ─────────────────────────────────────────────────────────────────────────
    TestCase("complex_bonafide", "how to get bonafide certificate from finance office",
             "service", "finance", "document", None, "process",
             "4-field complex: service+finance+document+process"),
    TestCase("complex_five_fields", "show me the 2024 civil engineering final year project list",
             "department", "civil", "project", 2024, "list",
             "5 fields: type+category+topic+year+intent"),
    TestCase("complex_nss", "NSS volunteer induction schedule 2023",
             "club", "nss", "induction", 2023, None,
             "4 fields: club+nss+induction+year"),
    TestCase("vague_bus", "when does the bus leave",
             "service", "transport", None, None, None,
             "Short query - transport detection"),
    TestCase("ambiguous_hod", "where can I find the HOD",
             None, None, "contact", None, None,
             "Ambiguous: HOD in faculty topic, but asking contact/location"),
    # ─────────────────────────────────────────────────────────────────────────
    # GROUP 9: Adversarial / Edge Cases
    # ─────────────────────────────────────────────────────────────────────────
    TestCase("edge_empty", "",
             None, None, None, None, None,
             "Empty string - all None expected"),
    TestCase("edge_nonsense", "the quick brown fox",
             None, None, None, None, None,
             "No relevant keywords"),
    TestCase("edge_all_caps", "LIBRARY BOOKS",
             "library", None, None, None, None,
             "Case sensitivity test"),
    TestCase("edge_mixed_case", "Ce DePaRtMeNt",
             "department", "computer", None, None, None,
             "Mixed case abbreviation"),
    TestCase("edge_punctuation", "civil-engineering!!! lab???",
             "department", "civil", "lab", None, None,
             "Punctuation handling"),
    TestCase("edge_parentheses", "cse_ds (data science) syllabus",
             "department", "cse_ds", "syllabus", None, None,
             "Parentheses handling"),
]
# ============================================================================
# BATCH PROCESSING ENGINE
# ============================================================================
def run_batch_tests(test_cases: List[TestCase]) -> Dict[str, Any]:
    """Run every test case through the classifier and return the analysis report.

    Args:
        test_cases: Cases to evaluate. Each case's ``question`` is passed to
            the module-level ``classifier.classify``; the result is indexed
            by the keys ``type``, ``category``, ``topic``, ``year`` and
            ``intent``.

    Returns:
        The report dictionary produced by ``analyze_results``.

    Note:
        An expected value of ``None`` means "not checked": the field is
        skipped rather than required to be ``None``. Cases that expect a
        field to be absent therefore cannot fail on that field.
    """
    results = []
    for test in test_cases:
        actual = classifier.classify(test.question)

        # Built once per case: used for the comparison below and stored
        # verbatim in the result record.
        expected = {
            "type": test.expected_type,
            "category": test.expected_category,
            "topic": test.expected_topic,
            "year": test.expected_year,
            "intent": test.expected_intent,
        }

        # One entry per mismatching field; fields expected as None are skipped.
        errors = [
            {
                "field": field_name,
                "expected": wanted,
                "actual": actual[field_name],
            }
            for field_name, wanted in expected.items()
            if wanted is not None and actual[field_name] != wanted
        ]

        results.append({
            "name": test.name,
            "question": test.question,
            "notes": test.notes,
            "expected": expected,
            "actual": actual,
            "passed": not errors,
            "errors": errors,
        })
    return analyze_results(results)
def analyze_results(results: List[Dict]) -> Dict[str, Any]:
    """Aggregate per-test result records into a summary report.

    Args:
        results: Result records, each a dict with at least the keys
            ``name``, ``notes``, ``passed`` and ``errors`` (a list of dicts
            that each carry a ``field`` key).

    Returns:
        A dict with:
          * ``summary``: total / passed / failed counts and ``pass_rate``
            formatted as a percentage string (``"0%"`` for empty input).
          * ``field_error_rates``: number of mismatches per field.
          * ``critical_issues``: failures whose notes contain ``"CRITICAL"``.
          * ``abbreviation_issues``: failures whose name contains ``"abbr"``.
          * ``ambiguity_issues``: failures whose name contains ``"conflict"``.
          * ``all_results``: the input list, unchanged.
          * ``failed_tests``: the failing records.
    """
    total = len(results)
    failed_tests = [r for r in results if not r["passed"]]
    failed = len(failed_tests)
    passed = total - failed

    # Error breakdown by field
    field_errors = {"type": 0, "category": 0, "topic": 0, "year": 0, "intent": 0}
    for record in failed_tests:
        for err in record["errors"]:
            field_errors[err["field"]] += 1

    # Categorize failures
    critical_failures = [r for r in failed_tests if "CRITICAL" in r["notes"]]
    # Abbreviation cases in this suite are named "abbr_*"; matching on
    # "abbrev" never hit any of them. "abbr" also still matches any
    # "abbrev..." name.
    abbr_failures = [r for r in failed_tests if "abbr" in r["name"].lower()]
    ambiguity_failures = [r for r in failed_tests if "conflict" in r["name"].lower()]

    return {
        "summary": {
            "total": total,
            "passed": passed,
            "failed": failed,
            "pass_rate": f"{passed/total*100:.1f}%" if total > 0 else "0%",
        },
        "field_error_rates": field_errors,
        "critical_issues": {
            "count": len(critical_failures),
            "tests": [r["name"] for r in critical_failures],
        },
        "abbreviation_issues": {
            "count": len(abbr_failures),
            "tests": [r["name"] for r in abbr_failures],
        },
        "ambiguity_issues": {
            "count": len(ambiguity_failures),
            "tests": [r["name"] for r in ambiguity_failures],
        },
        "all_results": results,
        "failed_tests": failed_tests,
    }
def print_report(report: Dict[str, Any], verbose: bool = False) -> None:
    """Pretty-print a report produced by ``analyze_results`` to stdout.

    Args:
        report: Report dict with the keys ``summary``, ``field_error_rates``,
            ``critical_issues``, ``abbreviation_issues``, ``ambiguity_issues``
            and ``failed_tests``.
        verbose: When true, also print a detailed entry (query, note,
            expected, actual, per-field mismatches) for every failed test.
    """
    rule = "=" * 70
    summary = report["summary"]

    print(f"\n{rule}")
    print(f"BATCH TEST RESULTS: {summary['passed']}/{summary['total']} passed ({summary['pass_rate']})")
    print(rule)

    print("\nFIELD ERROR BREAKDOWN:")
    for field, count in report["field_error_rates"].items():
        if count > 0:  # only fields that actually had mismatches
            print(f" {field:12s}: {count} errors")

    # The three issue sections share one layout; a section is printed only
    # when it has at least one entry.
    sections = (
        ("critical_issues", "CRITICAL ISSUES"),
        ("abbreviation_issues", "ABBREVIATION FAILURES"),
        ("ambiguity_issues", "AMBIGUITY FAILURES"),
    )
    for key, title in sections:
        issues = report[key]
        if issues["count"] > 0:
            print(f"\n{title} ({issues['count']}):")
            for name in issues["tests"]:
                print(f" • {name}")

    if verbose and report["failed_tests"]:
        print(f"\n{rule}")
        print("DETAILED FAILURE LOG:")
        print(rule)
        for r in report["failed_tests"]:
            print(f"\n[X] {r['name']}")
            print(f" Query: '{r['question']}'")
            print(f" Note: {r['notes']}")
            print(f" Expected: {r['expected']}")
            print(f" Actual: {r['actual']}")
            for err in r["errors"]:
                print(f" [!] {err['field']}: expected '{err['expected']}', got '{err['actual']}'")

    # Closing rule is always printed, with or without the verbose section.
    print(f"\n{rule}")
# ============================================================================
# USAGE
# ============================================================================
# Run every case in TEST_SUITE through the classifier and build the report.
# NOTE: there is no `if __name__ == "__main__":` guard here, so this runs
# whenever the module is executed or imported.
report = run_batch_tests(TEST_SUITE)
# Print the summary; verbose=True also prints the detailed log of each failure.
print_report(report, verbose=True)
# Access specific results
# print(report["failed_tests"]) # List of all failures
# print(report["field_error_rates"]) # Error counts per field
# Run specific group only
# group_2_tests = [t for t in TEST_SUITE if t.name.startswith("abbr_")]
# abbr_report = run_batch_tests(group_2_tests)
# print_report(abbr_report)