"""Full evaluation suite — verifies every sample gets the RIGHT diagnosis. Run: python -m pytest tests/test_eval.py -v """ import os, sys, json, tempfile import numpy as np import pytest sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) from audio_analyzer import extract_features, AudioFeatures from fault_rules import rank_candidates, RULES from feature_prompt import build_diagnosis_prompt from json_guard import validate, DiagnosisResult from mock_model import mock_generate as _mock_generate ASSETS = os.path.join(os.path.dirname(os.path.dirname(__file__)), "assets") # ============================================================================= # EVALUATION 1: Each sample gets correct diagnosis with correct appliance # ============================================================================= SAMPLE_DIAGNOSES = { "sample_washer_bearing.wav": { "appliance": "Washing machine", "expected_fault_contains": ["bearing", "drum"], "expected_urgency": ["HIGH", "CRITICAL"], "expected_min_weight": 0.6, }, "sample_fan_imbalanced.wav": { "appliance": "Electric fan", "expected_fault_contains": ["imbalance", "blade"], "expected_urgency": ["MEDIUM", "HIGH"], "expected_min_weight": 0.5, }, "sample_motor_squeal.wav": { "appliance": "Electric motor (generic)", "expected_fault_contains": ["squeal", "whine", "bearing", "belt"], "expected_urgency": ["MEDIUM", "HIGH"], "expected_min_weight": 0.5, }, "sample_washer_good.wav": { "appliance": "Washing machine", "expected_fault_contains": ["inconclusive"], "expected_urgency": ["LOW"], "expected_max_weight": 0.1, }, } class TestSampleDiagnoses: """Each sample WAV should produce a specific, correct diagnosis.""" @pytest.mark.parametrize("wav_name,expected", list(SAMPLE_DIAGNOSES.items())) def test_sample_diagnosis(self, wav_name, expected): path = os.path.join(ASSETS, wav_name) if not os.path.exists(path): pytest.skip(f"Sample {wav_name} not found") features = extract_features(path) candidates = rank_candidates(features, expected["appliance"]) assert len(candidates) >= 1 top = candidates[0] # Check fault name contains expected keyword fault_lower = top.name.lower() assert any(kw in fault_lower for kw in expected["expected_fault_contains"]), \ f"Expected fault containing {expected['expected_fault_contains']}, got '{top.name}'" # Check urgency assert top.urgency in expected["expected_urgency"], \ f"Expected urgency in {expected['expected_urgency']}, got '{top.urgency}'" # Check weight bounds if "expected_min_weight" in expected: assert top.weight >= expected["expected_min_weight"], \ f"Expected weight >= {expected['expected_min_weight']}, got {top.weight}" if "expected_max_weight" in expected: assert top.weight <= expected["expected_max_weight"], \ f"Expected weight <= {expected['expected_max_weight']}, got {top.weight}" def test_all_four_samples_give_different_diagnoses(self): """The 4 samples should produce 3+ distinct fault names.""" faults = [] for wav_name, info in SAMPLE_DIAGNOSES.items(): path = os.path.join(ASSETS, wav_name) if not os.path.exists(path): continue features = extract_features(path) candidates = rank_candidates(features, info["appliance"]) faults.append(candidates[0].name) assert len(set(faults)) >= 3, f"Expected 3+ distinct faults, got: {faults}" # ============================================================================= # EVALUATION 2: Mock generate uses correct candidate # ============================================================================= class TestMockGenerate: """The mock should return the top candidate's fault name, not 'Inconclusive'.""" def test_bearing_sample_returns_bearing(self): path = os.path.join(ASSETS, "sample_washer_bearing.wav") if not os.path.exists(path): pytest.skip("Sample not found") features = extract_features(path) candidates = rank_candidates(features, "Washing machine") prompt = build_diagnosis_prompt(features, candidates, "Washing machine") raw = _mock_generate(prompt, candidates, features) parsed = json.loads(raw) assert parsed["fault"].lower() in candidates[0].name.lower() or \ candidates[0].name.lower() in parsed["fault"].lower(), \ f"Mock should return '{candidates[0].name}', got '{parsed['fault']}'" assert parsed["confidence"] >= 60, f"Bearing should have high confidence, got {parsed['confidence']}" def test_fan_sample_returns_fan_fault(self): path = os.path.join(ASSETS, "sample_fan_imbalanced.wav") if not os.path.exists(path): pytest.skip("Sample not found") features = extract_features(path) candidates = rank_candidates(features, "Electric fan") prompt = build_diagnosis_prompt(features, candidates, "Electric fan") raw = _mock_generate(prompt, candidates, features) parsed = json.loads(raw) assert "inconclusive" not in parsed["fault"].lower(), \ f"Fan sample should NOT be Inconclusive, got '{parsed['fault']}'" def test_good_sample_returns_inconclusive(self): path = os.path.join(ASSETS, "sample_washer_good.wav") if not os.path.exists(path): pytest.skip("Sample not found") features = extract_features(path) candidates = rank_candidates(features, "Washing machine") prompt = build_diagnosis_prompt(features, candidates, "Washing machine") raw = _mock_generate(prompt, candidates, features) parsed = json.loads(raw) assert parsed["fault"].lower() == "inconclusive", \ f"Good sample should be Inconclusive, got '{parsed['fault']}'" assert parsed["urgency"] == "LOW" def test_mock_always_returns_valid_json(self): """Every sample should produce parseable JSON with required fields.""" for wav_name, info in SAMPLE_DIAGNOSES.items(): path = os.path.join(ASSETS, wav_name) if not os.path.exists(path): continue features = extract_features(path) candidates = rank_candidates(features, info["appliance"]) prompt = build_diagnosis_prompt(features, candidates, info["appliance"]) raw = _mock_generate(prompt, candidates, features) parsed = json.loads(raw) assert "fault" in parsed assert "urgency" in parsed assert "checks" in parsed and len(parsed["checks"]) >= 1 assert "safety" in parsed assert "confidence" in parsed assert 0 <= parsed["confidence"] <= 100 # ============================================================================= # EVALUATION 3: Full pipeline end-to-end # ============================================================================= class TestFullPipeline: """End-to-end: audio file -> features -> rules -> mock -> validate -> result.""" def test_bearing_pipeline(self): path = os.path.join(ASSETS, "sample_washer_bearing.wav") if not os.path.exists(path): pytest.skip("Sample not found") features = extract_features(path) candidates = rank_candidates(features, "Washing machine") prompt = build_diagnosis_prompt(features, candidates, "Washing machine") raw = _mock_generate(prompt, candidates, features) result = validate(raw, candidates) assert isinstance(result, DiagnosisResult) assert result.grounded assert "bearing" in result.fault.lower() or "drum" in result.fault.lower() assert result.urgency in ("HIGH", "CRITICAL") assert result.confidence >= 60 assert len(result.checks) >= 1 def test_fan_pipeline(self): path = os.path.join(ASSETS, "sample_fan_imbalanced.wav") if not os.path.exists(path): pytest.skip("Sample not found") features = extract_features(path) candidates = rank_candidates(features, "Electric fan") prompt = build_diagnosis_prompt(features, candidates, "Electric fan") raw = _mock_generate(prompt, candidates, features) result = validate(raw, candidates) assert isinstance(result, DiagnosisResult) assert result.grounded assert "inconclusive" not in result.fault.lower() assert result.confidence >= 50 def test_good_sample_pipeline(self): path = os.path.join(ASSETS, "sample_washer_good.wav") if not os.path.exists(path): pytest.skip("Sample not found") features = extract_features(path) candidates = rank_candidates(features, "Washing machine") prompt = build_diagnosis_prompt(features, candidates, "Washing machine") raw = _mock_generate(prompt, candidates, features) result = validate(raw, candidates) assert isinstance(result, DiagnosisResult) assert result.fault.lower() == "inconclusive" assert result.urgency == "LOW" def test_compare_produces_different_results(self): """Comparing bearing vs good washer should show clear improvement.""" f_bearing = extract_features(os.path.join(ASSETS, "sample_washer_bearing.wav")) f_good = extract_features(os.path.join(ASSETS, "sample_washer_good.wav")) c_bearing = rank_candidates(f_bearing, "Washing machine") c_good = rank_candidates(f_good, "Washing machine") # Bearing should have high anomaly, good should have low assert f_bearing.anomaly_score > f_good.anomaly_score, \ f"Bearing ({f_bearing.anomaly_score}) should be more anomalous than good ({f_good.anomaly_score})" # Bearing should fire rules, good should not (or fire fewer) assert c_bearing[0].name != "Inconclusive" or len(c_bearing) > len(c_good) # ============================================================================= # EVALUATION 4: Rule engine correctness for all appliances # ============================================================================= class TestRuleEngine: """Verify each appliance has rules and they fire correctly.""" def test_all_appliances_have_rules(self): expected = [ "Washing machine", "Tumble dryer", "Refrigerator/Freezer", "Electric fan", "Air conditioner", "Vacuum cleaner", "Dishwasher", "Microwave", "Electric motor (generic)", "Car engine", "Bicycle (chain/gears)", "Power drill", ] for appliance in expected: assert appliance in RULES, f"Missing rules for '{appliance}'" assert len(RULES[appliance]) >= 2 def test_typical_bad_input_fires_rules_for_every_appliance(self): """A 'typical bad' feature set should fire at least one rule per appliance.""" bad = AudioFeatures( duration_s=8.0, rms_db=-25.0, rms_variance=0.03, zero_crossing_rate=0.1, spectral_centroid_hz=2000, spectral_bandwidth_hz=2000, spectral_rolloff_hz=4500, dominant_frequency_hz=150.0, harmonic_ratio=0.5, onset_rate_per_sec=3.0, has_regular_pattern=True, pattern_interval_ms=120.0, peak_db=-18.0, anomaly_score=0.7, ) for appliance in RULES: cands = rank_candidates(bad, appliance) assert len(cands) >= 1, f"No rules fired for {appliance}" def test_normal_input_returns_inconclusive(self): """A quiet, normal-sounding input should be Inconclusive for most appliances.""" normal = AudioFeatures( duration_s=8.0, rms_db=-45.0, rms_variance=0.002, zero_crossing_rate=0.02, spectral_centroid_hz=600, spectral_bandwidth_hz=500, spectral_rolloff_hz=1200, dominant_frequency_hz=50.0, harmonic_ratio=0.4, onset_rate_per_sec=0.1, has_regular_pattern=False, pattern_interval_ms=0.0, peak_db=-40.0, anomaly_score=0.05, ) for appliance in ["Washing machine", "Electric fan", "Car engine"]: cands = rank_candidates(normal, appliance) assert cands[0].name == "Inconclusive", \ f"Normal input should be Inconclusive for {appliance}, got {cands[0].name}" # ============================================================================= # EVALUATION 5: Edge cases # ============================================================================= class TestEdgeCases: """Defensive checks on degenerate inputs.""" def test_empty_audio(self): import soundfile as sf path = tempfile.mktemp(suffix=".wav") sf.write(path, np.zeros(1600, dtype="float32"), 16000) try: f = extract_features(path) cands = rank_candidates(f, "Washing machine") assert cands[0].name == "Inconclusive" finally: os.unlink(path) def test_garbage_audio(self): path = tempfile.mktemp(suffix=".wav") import soundfile as sf sf.write(path, np.random.randn(22050 * 2).astype(np.float32) * 0.01, 22050) try: f = extract_features(path) cands = rank_candidates(f, "Electric fan") assert len(cands) >= 1 finally: os.unlink(path) def test_validate_malformed_json(self): f = AudioFeatures( duration_s=8.0, rms_db=-30.0, rms_variance=0.01, zero_crossing_rate=0.05, spectral_centroid_hz=500, spectral_bandwidth_hz=800, spectral_rolloff_hz=1500, dominant_frequency_hz=60.0, harmonic_ratio=0.5, onset_rate_per_sec=0.8, has_regular_pattern=False, pattern_interval_ms=0.0, peak_db=-24.0, anomaly_score=0.45, ) cands = rank_candidates(f, "Electric fan") result = validate("not json at all", cands) assert result.grounded assert result.fault == cands[0].name def test_validate_ungrounded_output(self): f = AudioFeatures( duration_s=8.0, rms_db=-25.0, rms_variance=0.03, zero_crossing_rate=0.08, spectral_centroid_hz=2200, spectral_bandwidth_hz=1800, spectral_rolloff_hz=4500, dominant_frequency_hz=180.0, harmonic_ratio=0.65, onset_rate_per_sec=3.5, has_regular_pattern=True, pattern_interval_ms=150.0, peak_db=-18.0, anomaly_score=0.75, ) cands = rank_candidates(f, "Washing machine") response = json.dumps({ "fault": "Exploding capacitor", "urgency": "CRITICAL", "checks": ["Check it"], "safety": "Unplug", "confidence": 95, }) result = validate(response, cands) assert result.grounded assert result.fault != "Exploding capacitor" def test_candidates_always_returned(self): extreme = AudioFeatures( duration_s=0.0, rms_db=-80.0, rms_variance=0.0, zero_crossing_rate=0.0, spectral_centroid_hz=0.0, spectral_bandwidth_hz=0.0, spectral_rolloff_hz=0.0, dominant_frequency_hz=0.0, harmonic_ratio=0.0, onset_rate_per_sec=0.0, has_regular_pattern=False, pattern_interval_ms=0.0, peak_db=-80.0, anomaly_score=0.0, ) for appliance in RULES: cands = rank_candidates(extreme, appliance) assert len(cands) >= 1