"""Batch regression suite for ``FilterClassifier``.

Each ``TestCase`` pairs a user question with the classifier fields it is
expected to produce.  ``run_batch_tests`` runs the cases through the
module-level ``classifier``, ``analyze_results`` aggregates the outcome and
``print_report`` renders it.  The full suite is executed when this module
is run or imported.
"""

from dataclasses import dataclass
from typing import Optional, List, Dict, Any, Tuple

from app.services.filter_classifier import FilterClassifier

classifier = FilterClassifier()

# Classifier output fields compared by the harness, in report order.
_FIELD_NAMES = ("type", "category", "topic", "year", "intent")


@dataclass
class TestCase:
    """One classifier expectation.

    An ``expected_*`` value of ``None`` means "do not check this field".
    To assert that the classifier must return ``None`` for a field, list
    that field's name (one of ``type``, ``category``, ``topic``, ``year``,
    ``intent``) in ``expect_none``.
    """

    name: str
    question: str
    expected_type: Optional[str] = None
    expected_category: Optional[str] = None
    expected_topic: Optional[str] = None
    expected_year: Optional[int] = None
    expected_intent: Optional[str] = None
    notes: str = ""  # Why this test matters
    # Fields that MUST come back as None (otherwise None == "don't care").
    expect_none: Tuple[str, ...] = ()

    def __post_init__(self) -> None:
        # A misspelled field name would silently turn the check back into
        # a no-op, so reject it up front.
        unknown = set(self.expect_none) - set(_FIELD_NAMES)
        if unknown:
            raise ValueError(
                f"{self.name}: unknown expect_none field(s): {sorted(unknown)}"
            )


# ============================================================================
# BATCH TEST SUITE - Organized by weakness category
# ============================================================================

TEST_SUITE = [
    # ─────────────────────────────────────────────────────────────────────────
    # GROUP 1: Basic Sanity (Should Pass)
    # ─────────────────────────────────────────────────────────────────────────
    TestCase("basic_dept_list", "show me all departments",
             "department", None, None, None, "list",
             "Basic type detection + list intent"),
    TestCase("basic_hostel_fees", "what are the hostel fees",
             "hostel", None, "fees", None, None,
             "Type + topic combo"),
    TestCase("basic_library", "library books catalog",
             "library", None, None, None, None,
             "Direct type match"),
    TestCase("basic_placement_stats_year", "campus placement statistics 2023",
             "placement", None, "stats", 2023, None,
             "Type + topic + year extraction"),

    # ─────────────────────────────────────────────────────────────────────────
    # GROUP 2: Abbreviation Expansion (Critical)
    # ─────────────────────────────────────────────────────────────────────────
    TestCase("abbr_ce", "CE department faculty",
             "department", "computer", "faculty", None, None,
             "'ce' -> computer engineering"),
    TestCase("abbr_cse_ds", "CSE DS syllabus",
             "department", "cse_ds", "syllabus", None, None,
             "'cse' + 'ds' -> cse_ds"),
    TestCase("abbr_ece_missing", "ECE lab equipment",  # ⚠️ WILL FAIL - not in ABBREVIATIONS
             "department", "electronics_comm", "lab", None, None,
             "'ece' NOT in ABBREVIATIONS dict - expect failure"),
    TestCase("abbr_ei", "EI sensors calibration",
             "department", "electronics_inst", None, None, None,
             "'ei' -> electronics instrumentation"),
    TestCase("abbr_it", "IT department placement stats",
             "department", "it", "stats", None, None,
             "'it' -> information technology"),
    TestCase("abbr_me", "ME workshop schedule",
             "department", "mechanical", None, None, None,
             "'me' -> mechanical"),
    TestCase("abbr_am", "AM stress analysis project",
             "department", "applied_mechanics", "project", None, None,
             "'am' -> applied mechanics"),
    TestCase("abbr_ict", "ICT networking lab",
             "department", "ict", "lab", None, None,
             "'ict' in ABBREVIATIONS"),

    # ─────────────────────────────────────────────────────────────────────────
    # GROUP 3: Type/Category/Topic Conflicts (Ambiguity Hell)
    # ─────────────────────────────────────────────────────────────────────────
    TestCase("conflict_club_type_vs_cat", "adventure club activities",
             "club", "adventure", None, None, None,
             "'club' word in TYPE, 'adventure' in CATEGORY"),
    TestCase("conflict_ieee_club", "join the ieee club",
             "club", "ieee", None, None, None,
             "ieee category + club type word"),
    TestCase("conflict_event_ambiguity", "technical events this month",
             None, None, "event", None, None,
             "'events' = topic, but clubs host events - type confusion risk"),
    # The "CRITICAL" marker lives in the notes string because the report's
    # critical bucket is selected by searching the notes, not the comments.
    TestCase("conflict_process_topic", "what is the admission process",  # ⚠️ CRITICAL
             "admission", None, "process", None, None,
             "CRITICAL: 'process' = TOPIC (procedure) - NOT intent"),
    TestCase("conflict_process_intent", "how to apply step by step process",  # ⚠️ CRITICAL
             "admission", None, None, None, "process",
             "CRITICAL: 'process' = INTENT (how to) - same word, different meaning!"),
    TestCase("conflict_fees_finance", "finance department fees structure",
             "service", "finance", "fees", None, None,
             "'finance' category + 'fees' topic overlap"),

    # ─────────────────────────────────────────────────────────────────────────
    # GROUP 4: Semantic Similarity Traps
    # ─────────────────────────────────────────────────────────────────────────
    TestCase("trap_computer_vs_cseds_hardware", "computer hardware VLSI design",
             "department", "computer", None, None, None,
             "MUST be 'computer' (VLSI), NOT cse_ds despite 'computer'"),
    TestCase("trap_cseds_ml", "computer science machine learning course",
             "department", "cse_ds", None, None, None,
             "'computer science' + 'ML' = cse_ds (ML in cse_ds anchors)"),
    TestCase("trap_ece_vs_ei_wireless", "electronics wireless communication",
             "department", "electronics_comm", None, None, None,
             "'wireless' = ECE anchor"),
    TestCase("trap_ece_vs_ei_sensors", "electronics sensor measurement biomedical",
             "department", "electronics_inst", None, None, None,
             "'sensors' + 'biomedical' = EI anchor"),
    TestCase("trap_it_vs_ict_software", "information technology software development ERP",
             "department", "it", None, None, None,
             "'software' + 'ERP' = IT anchor"),
    TestCase("trap_it_vs_ict_fiber", "information communication fiber optic bandwidth",
             "department", "ict", None, None, None,
             "'fiber optic' = ICT anchor"),

    # ─────────────────────────────────────────────────────────────────────────
    # GROUP 5: Intent Detection
    # ─────────────────────────────────────────────────────────────────────────
    TestCase("intent_list_explicit", "list all mechanical engineering labs",
             "department", "mechanical", "lab", None, "list",
             "Explicit 'list' keyword"),
    TestCase("intent_list_implicit", "show every available facility",
             "facility", None, None, None, "list",
             "Implicit list: 'show every'"),
    TestCase("intent_count_how_many", "how many students placed in 2024",
             "placement", None, None, 2024, "count",
             "'how many' = count intent"),
    TestCase("intent_count_total", "total number of hostels",
             "hostel", None, None, None, "count",
             "'total number' = count intent"),
    TestCase("intent_detail_explain", "explain the civil engineering syllabus",
             "department", "civil", "syllabus", None, "detail",
             "'explain' = detail intent"),
    TestCase("intent_detail_tell_me", "tell me about research publications",
             "research", "publication", None, None, "detail",
             "'tell me about' = detail intent"),
    TestCase("intent_greeting_hello", "hello good morning",
             None, None, None, None, "greeting",
             "Pure greeting - no other fields",
             expect_none=("type", "category", "topic", "year")),
    TestCase("intent_greeting_casual", "hi, how are you",
             None, None, None, None, "greeting",
             "Casual greeting"),

    # ─────────────────────────────────────────────────────────────────────────
    # GROUP 6: Year Extraction Edge Cases
    # ─────────────────────────────────────────────────────────────────────────
    TestCase("year_simple", "placement records 2023",
             None, None, None, 2023, None,
             "Standard year extraction"),
    TestCase("year_batch_of", "batch of 2024 students",
             None, None, None, 2024, None,
             "'batch of' prefix"),
    TestCase("year_explicit", "year 2025 admission process",
             "admission", None, None, 2025, None,
             "Explicit 'year' keyword"),
    TestCase("year_ordinal_fail", "2nd year computer science",  # ⚠️ LIKELY FAIL
             "department", "cse_ds", None, None, None,
             "'2nd year' - NO year should be extracted (regex is \\b20\\d{2}\\b)",
             expect_none=("year",)),
    TestCase("year_range_first", "2020-2024 batch",  # ⚠️ AMBIGUOUS
             None, None, None, 2020, None,
             "Range - regex finds first 20xx (2020)"),
    TestCase("year_written_fail", "twenty twenty three",  # ⚠️ WILL FAIL
             None, None, None, None, None,
             "Written year - regex only matches digits"),

    # ─────────────────────────────────────────────────────────────────────────
    # GROUP 7: MASTER_INDEX Constraint Violations
    # ─────────────────────────────────────────────────────────────────────────
    TestCase("master_invalid_cat_for_type", "hostel admission process",
             "hostel", None, None, None, None,
             "'admission' invalid category for 'hostel' type -> should be None",
             expect_none=("category",)),
    TestCase("master_invalid_topic_for_type", "placement syllabus",
             "placement", None, None, None, None,
             "'syllabus' invalid topic for 'placement' -> should be None",
             expect_none=("topic",)),
    TestCase("master_invalid_topic_research", "research timetable",
             "research", None, None, None, None,
             "'timetable' invalid for 'research' type -> should be None",
             expect_none=("topic",)),

    # ─────────────────────────────────────────────────────────────────────────
    # GROUP 8: Complex Real-World Queries
    # ─────────────────────────────────────────────────────────────────────────
    TestCase("complex_bonafide", "how to get bonafide certificate from finance office",
             "service", "finance", "document", None, "process",
             "4-field complex: service+finance+document+process"),
    TestCase("complex_five_fields", "show me the 2024 civil engineering final year project list",
             "department", "civil", "project", 2024, "list",
             "5 fields: type+category+topic+year+intent"),
    TestCase("complex_nss", "NSS volunteer induction schedule 2023",
             "club", "nss", "induction", 2023, None,
             "4 fields: club+nss+induction+year"),
    TestCase("vague_bus", "when does the bus leave",
             "service", "transport", None, None, None,
             "Short query - transport detection"),
    TestCase("ambiguous_hod", "where can I find the HOD",
             None, None, "contact", None, None,
             "Ambiguous: HOD in faculty topic, but asking contact/location"),

    # ─────────────────────────────────────────────────────────────────────────
    # GROUP 9: Adversarial / Edge Cases
    # ─────────────────────────────────────────────────────────────────────────
    TestCase("edge_empty", "",
             None, None, None, None, None,
             "Empty string - all None expected",
             expect_none=_FIELD_NAMES),
    TestCase("edge_nonsense", "the quick brown fox",
             None, None, None, None, None,
             "No relevant keywords"),
    TestCase("edge_all_caps", "LIBRARY BOOKS",
             "library", None, None, None, None,
             "Case sensitivity test"),
    TestCase("edge_mixed_case", "Ce DePaRtMeNt",
             "department", "computer", None, None, None,
             "Mixed case abbreviation"),
    TestCase("edge_punctuation", "civil-engineering!!! lab???",
             "department", "civil", "lab", None, None,
             "Punctuation handling"),
    TestCase("edge_parentheses", "cse_ds (data science) syllabus",
             "department", "cse_ds", "syllabus", None, None,
             "Parentheses handling"),
]


# ============================================================================
# BATCH PROCESSING ENGINE
# ============================================================================

def run_batch_tests(test_cases: List[TestCase]) -> Dict[str, Any]:
    """Run every test case through the classifier and return the report.

    For each case the classifier output is compared field by field.  A
    field is checked when its expected value is not ``None`` or when it is
    listed in the case's ``expect_none``; all other fields are ignored.

    Args:
        test_cases: Cases to execute, in order.

    Returns:
        The report dictionary built by ``analyze_results``.
    """
    results = []

    for test in test_cases:
        # Run classifier
        actual = classifier.classify(test.question)

        expected = {
            "type": test.expected_type,
            "category": test.expected_category,
            "topic": test.expected_topic,
            "year": test.expected_year,
            "intent": test.expected_intent,
        }

        errors = []
        for field_name, want in expected.items():
            # None means "unconstrained" unless the case explicitly demands
            # that the classifier leave this field empty.
            if want is None and field_name not in test.expect_none:
                continue
            got = actual[field_name]
            if got != want:
                errors.append({
                    "field": field_name,
                    "expected": want,
                    "actual": got,
                })

        results.append({
            "name": test.name,
            "question": test.question,
            "notes": test.notes,
            "expected": expected,
            "actual": actual,
            "passed": not errors,
            "errors": errors,
        })

    return analyze_results(results)


def analyze_results(results: List[Dict]) -> Dict[str, Any]:
    """Aggregate per-test results into a summary report.

    Args:
        results: Result dictionaries as produced by ``run_batch_tests``
            (keys used here: ``name``, ``notes``, ``passed``, ``errors``).

    Returns:
        A dictionary with the pass/fail summary, per-field error counts,
        the critical / abbreviation / ambiguity failure buckets, the full
        result list and the list of failed results.
    """
    total = len(results)
    failed_tests = [r for r in results if not r["passed"]]
    failed = len(failed_tests)
    passed = total - failed

    # Error breakdown by field
    field_errors = {"type": 0, "category": 0, "topic": 0, "year": 0, "intent": 0}
    for r in results:
        for err in r["errors"]:
            field_errors[err["field"]] += 1

    # Categorize failures
    critical_failures = [r for r in failed_tests if "CRITICAL" in r["notes"]]
    # Abbreviation tests are named "abbr_*"; matching on "abbrev" never hit.
    abbr_failures = [
        r for r in failed_tests if r["name"].lower().startswith("abbr_")
    ]
    ambiguity_failures = [
        r for r in failed_tests if "conflict" in r["name"].lower()
    ]

    return {
        "summary": {
            "total": total,
            "passed": passed,
            "failed": failed,
            "pass_rate": f"{passed/total*100:.1f}%" if total > 0 else "0%",
        },
        "field_error_rates": field_errors,
        "critical_issues": {
            "count": len(critical_failures),
            "tests": [r["name"] for r in critical_failures],
        },
        "abbreviation_issues": {
            "count": len(abbr_failures),
            "tests": [r["name"] for r in abbr_failures],
        },
        "ambiguity_issues": {
            "count": len(ambiguity_failures),
            "tests": [r["name"] for r in ambiguity_failures],
        },
        "all_results": results,
        "failed_tests": failed_tests,
    }


def print_report(report: Dict[str, Any], verbose: bool = False) -> None:
    """Print the report produced by ``analyze_results`` to stdout.

    Args:
        report: Report dictionary returned by ``analyze_results``.
        verbose: When true, also print a detailed entry for every failed
            test (query, notes, expected/actual values, per-field errors).
    """
    s = report["summary"]
    rule = "=" * 70

    print(f"\n{rule}")
    print(f"BATCH TEST RESULTS: {s['passed']}/{s['total']} passed ({s['pass_rate']})")
    print(rule)

    print("\nFIELD ERROR BREAKDOWN:")
    for field, count in report["field_error_rates"].items():
        if count > 0:
            print(f" {field:12s}: {count} errors")

    # The three failure buckets share one layout; only key and title differ.
    sections = (
        ("critical_issues", "CRITICAL ISSUES"),
        ("abbreviation_issues", "ABBREVIATION FAILURES"),
        ("ambiguity_issues", "AMBIGUITY FAILURES"),
    )
    for key, title in sections:
        bucket = report[key]
        if bucket["count"] > 0:
            print(f"\n{title} ({bucket['count']}):")
            for name in bucket["tests"]:
                print(f" • {name}")

    if verbose and report["failed_tests"]:
        print(f"\n{rule}")
        print("DETAILED FAILURE LOG:")
        print(rule)
        for r in report["failed_tests"]:
            print(f"\n[X] {r['name']}")
            print(f" Query: '{r['question']}'")
            print(f" Note: {r['notes']}")
            print(f" Expected: {r['expected']}")
            print(f" Actual: {r['actual']}")
            for err in r["errors"]:
                print(f" [!] {err['field']}: expected '{err['expected']}', got '{err['actual']}'")

    print(f"\n{rule}")


# ============================================================================
# USAGE
# ============================================================================

# Run all tests
report = run_batch_tests(TEST_SUITE)

# Print summary
print_report(report, verbose=True)

# Access specific results
# print(report["failed_tests"])  # List of all failures
# print(report["field_error_rates"])  # Error counts per field

# Run specific group only
# group_2_tests = [t for t in TEST_SUITE if t.name.startswith("abbr_")]
# abbr_report = run_batch_tests(group_2_tests)
# print_report(abbr_report)