"""Integrity checks for the evaluation and training JSONL datasets.

Every test reads one or more JSON Lines files relative to the current
working directory and asserts on row counts, required keys, risk labels and
basic sanitization of the ``input`` text.
"""

import json
from pathlib import Path

from jawbreaker.schema import RISK_LEVELS

# Keys every hand-curated evaluation row is required to carry.
_EVAL_REQUIRED_FIELDS = {
    "id",
    "category",
    "input",
    "expected_risk_level",
    "expected_scam_type",
    "expected_tactics",
}

# Keys every generated training row is required to carry.
_TRAINING_REQUIRED_FIELDS = {"id", "messages", "input", "prediction"}

_FRESH_EVAL_PATH = Path("eval/fresh_2026_scam_eval.jsonl")


def _load_jsonl(path: Path) -> list:
    """Return the parsed JSON objects of *path*, one per non-blank line."""
    text = path.read_text(encoding="utf-8")
    return [json.loads(line) for line in text.splitlines() if line.strip()]


def _has_safe_urls(text: str) -> bool:
    """Return True if *text* contains ".example" or has no "http" substring.

    The "http" lookup is case-insensitive; the ".example" lookup is not.
    """
    return ".example" in text or "http" not in text.lower()


def _check_versioned_training_data(version: str, expected_counts: dict, hard_categories: set) -> None:
    """Run the shared checks for one versioned training set and its hard eval.

    Args:
        version: Suffix used in the file names, e.g. ``"v7"``.
        expected_counts: Expected row count per split, keyed by ``"train"``,
            ``"dev"`` and ``"test"``; splits are checked in this dict's order.
        hard_categories: Categories that must all appear in
            ``eval/hard_<version>_eval.jsonl``.
    """
    fresh_eval_inputs = {row["input"] for row in _load_jsonl(_FRESH_EVAL_PATH)}

    for split, expected in expected_counts.items():
        rows = _load_jsonl(Path(f"training/data/{split}_{version}.jsonl"))
        assert len(rows) == expected

        # No training input may also appear in the fresh evaluation set.
        overlap = {row["input"] for row in rows} & fresh_eval_inputs
        assert not overlap

        for row in rows:
            assert _TRAINING_REQUIRED_FIELDS <= set(row)
            assert row["prediction"]["risk_level"] in RISK_LEVELS
            assert _has_safe_urls(row["input"])
            assert "+1" not in row["input"]
            assert "@" not in row["input"]

    hard_rows = _load_jsonl(Path(f"eval/hard_{version}_eval.jsonl"))
    # The hard eval file is expected to have as many rows as the test split.
    assert len(hard_rows) == expected_counts["test"]
    assert hard_categories <= {row["category"] for row in hard_rows}


def test_eval_dataset_has_100_unique_cases() -> None:
    """The base eval set has exactly 100 rows, each with a distinct id."""
    rows = _load_jsonl(Path("eval/scam_eval.jsonl"))
    assert len(rows) == 100
    assert len({row["id"] for row in rows}) == 100


def test_eval_dataset_required_fields_and_risk_levels() -> None:
    """Each base eval row has the required keys, a known risk level and safe URLs."""
    rows = _load_jsonl(Path("eval/scam_eval.jsonl"))
    for row in rows:
        assert _EVAL_REQUIRED_FIELDS <= set(row)
        assert row["expected_risk_level"] in RISK_LEVELS
        assert isinstance(row["expected_tactics"], list)
        assert _has_safe_urls(row["input"])


def test_generated_training_splits_exist_and_do_not_overlap() -> None:
    """The train/dev/test splits have fixed sizes and share no input text."""
    paths = {
        "train": Path("training/data/train.jsonl"),
        "dev": Path("training/data/dev.jsonl"),
        "test": Path("training/data/test.jsonl"),
    }
    expected_counts = {"train": 720, "dev": 120, "test": 180}

    # Accumulated across all splits, so a duplicate is caught both within a
    # split and between splits.
    seen_messages = set()
    for split, path in paths.items():
        rows = _load_jsonl(path)
        assert len(rows) == expected_counts[split]
        for row in rows:
            assert _TRAINING_REQUIRED_FIELDS <= set(row)
            assert row["input"] not in seen_messages
            seen_messages.add(row["input"])

            prediction = row["prediction"]
            assert prediction["risk_level"] in RISK_LEVELS
            assert set(prediction["scam_dna"]) == {"impersonates", "pressure", "ask", "risk"}
            assert row["messages"][-1]["role"] == "assistant"


def test_generated_eval_has_test_count_and_safe_urls() -> None:
    """The generated eval set has 180 uniquely identified rows with safe URLs."""
    rows = _load_jsonl(Path("eval/generated_eval.jsonl"))
    assert len(rows) == 180
    assert len({row["id"] for row in rows}) == 180
    for row in rows:
        assert row["expected_risk_level"] in RISK_LEVELS
        assert _has_safe_urls(row["input"])


def test_field_examples_are_sanitized_and_valid() -> None:
    """Field examples use phone placeholders and omit the listed identifying strings."""
    rows = _load_jsonl(Path("eval/field_examples.jsonl"))
    assert len(rows) >= 2
    for row in rows:
        assert _EVAL_REQUIRED_FIELDS <= set(row)
        assert row["expected_risk_level"] in RISK_LEVELS
        assert "[phone number]" in row["input"] or "[callback number]" in row["input"]
        assert "Vineel" not in row["input"]
        assert "+1" not in row["input"]


def test_fresh_2026_eval_is_sanitized_and_balanced() -> None:
    """The fresh 2026 eval set is sanitized and has the expected label mix."""
    rows = _load_jsonl(_FRESH_EVAL_PATH)
    assert len(rows) == 100
    assert len({row["id"] for row in rows}) == 100

    risk_counts = {}
    categories = set()
    for row in rows:
        assert _EVAL_REQUIRED_FIELDS <= set(row)
        assert row["expected_risk_level"] in RISK_LEVELS
        assert isinstance(row["expected_tactics"], list)
        assert _has_safe_urls(row["input"])
        assert "+1" not in row["input"]
        assert "@" not in row["input"]

        level = row["expected_risk_level"]
        risk_counts[level] = risk_counts.get(level, 0) + 1
        categories.add(row["category"])

    assert risk_counts == {"dangerous": 72, "needs_check": 16, "safe": 12}
    assert {
        "toll_smishing",
        "package_phishing",
        "callback_phishing",
        "job_scam",
        "investment_scam",
        "credential_theft",
        "government_impersonation",
        "marketplace_scam",
        "tech_support",
        "legitimate_needs_check",
        "safe_benign",
    } <= categories


def test_v7_training_data_is_sanitized_and_separate_from_fresh_eval() -> None:
    """The v7 splits are sanitized, sized as expected and disjoint from the fresh eval."""
    _check_versioned_training_data(
        "v7",
        {"train": 2192, "dev": 498, "test": 558},
        {
            "public_pattern_wrong_number_crypto_v7",
            "public_pattern_marketplace_money_v7",
            "public_pattern_task_job_v7",
            "public_pattern_mfa_code_v7",
            "public_pattern_toll_tax_benefit_v7",
            "safe_everyday_family_v7",
            "needs_check_official_route_v7",
        },
    )


def test_v8_training_data_is_sanitized_and_separate_from_fresh_eval() -> None:
    """The v8 splits are sanitized, sized as expected and disjoint from the fresh eval."""
    _check_versioned_training_data(
        "v8",
        {"train": 2488, "dev": 572, "test": 632},
        {
            "wrong_number_investment_danger_v8",
            "wrong_number_social_no_money_v8",
            "safe_family_logistics_v8",
            "safe_school_pickup_v8",
            "safe_pharmacy_clinic_v8",
            "school_clinic_payment_link_danger_v8",
            "official_route_needs_check_v8",
        },
    )