#!/usr/bin/env python
"""
Task 4.1 Verification Script.
Runs the verification code from TASKS.md and validates acceptance criteria:
- 1000+ total samples
- 60% scam, 40% legitimate
- 50% English, 40% Hindi, 10% Hinglish
- All samples validated
"""
import json
import os
import sys
# Set UTF-8 output encoding for Windows console.
# Guarded: stdout may be None or a replaced stream without ``reconfigure``
# (for example an io.StringIO), in which case the stream is left untouched.
_stdout_encoding = (getattr(sys.stdout, "encoding", None) or "").lower()
if _stdout_encoding != "utf-8" and hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")

# Dataset path: <parent of this script's directory>/data/scam_detection_train.jsonl
DATASET_PATH = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
    "data",
    "scam_detection_train.jsonl",
)
def main(dataset_path=None):
    """Verify the Task 4.1 dataset against its acceptance criteria.

    Loads the JSONL dataset, prints label and language statistics, then
    evaluates four criteria:

    - AC-1: at least 1000 samples
    - AC-2: scam ratio within 55-65%
    - AC-3: English 45-55%, Hindi 35-45%, Hinglish 5-15%
    - AC-4: every sample carries the required fields and metadata keys

    Args:
        dataset_path: Optional path to the JSONL file. When omitted, the
            module-level ``DATASET_PATH`` is used.

    Returns:
        0 when every criterion passes; 1 otherwise, including when the file
        is missing, contains invalid JSON, or holds no samples.
    """
    path = DATASET_PATH if dataset_path is None else dataset_path

    print("=" * 60)
    print("Task 4.1: Dataset Creation - Verification")
    print("=" * 60)

    # Check if file exists
    if not os.path.exists(path):
        print(f"\n[ERROR] Dataset file not found: {path}")
        print("Run 'python scripts/generate_dataset.py' first.")
        return 1

    # Load dataset
    print(f"\nLoading: {path}")
    try:
        data = _load_samples(path)
    except ValueError as exc:
        print(f"\n[ERROR] Invalid JSON in dataset ({exc})")
        return 1

    total = len(data)
    print(f"\nTotal samples: {total}")
    if total == 0:
        # Every ratio below divides by the sample count; fail cleanly instead.
        print("\n[ERROR] Dataset is empty: no samples to verify.")
        return 1

    # Label distribution
    scam_count = _count_matching(data, "label", "scam")
    legit_count = _count_matching(data, "label", "legitimate")
    scam_ratio = scam_count / total
    print(f"Scam ratio: {scam_ratio:.2%}")

    # Extended statistics
    print("\n--- Detailed Statistics ---")
    print(f"Scam: {scam_count} ({scam_count/total:.1%})")
    print(f"Legitimate: {legit_count} ({legit_count/total:.1%})")

    # Language distribution
    en_count = _count_matching(data, "language", "en")
    hi_count = _count_matching(data, "language", "hi")
    hinglish_count = _count_matching(data, "language", "hinglish")
    print(f"\nEnglish: {en_count} ({en_count/total:.1%})")
    print(f"Hindi: {hi_count} ({hi_count/total:.1%})")
    print(f"Hinglish: {hinglish_count} ({hinglish_count/total:.1%})")

    # Acceptance Criteria Validation
    print("\n" + "=" * 60)
    print("Acceptance Criteria")
    print("=" * 60)

    # AC-1: 1000+ total samples
    ac1_pass = total >= 1000
    print("\nAC-1: 1000+ total samples")
    print(f" Value: {total}")
    print(f" Status: {_status(ac1_pass)}")

    # AC-2: 60% scam, 40% legitimate (5 percentage points of tolerance)
    ac2_pass = 0.55 <= scam_ratio <= 0.65
    print("\nAC-2: 60% scam, 40% legitimate")
    print(f" Scam: {scam_ratio:.1%} (expected: 55-65%)")
    print(f" Legitimate: {1-scam_ratio:.1%} (expected: 35-45%)")
    print(f" Status: {_status(ac2_pass)}")

    # AC-3: 50% English, 40% Hindi, 10% Hinglish (5 points of tolerance each)
    en_ratio = en_count / total
    hi_ratio = hi_count / total
    hinglish_ratio = hinglish_count / total
    ac3_en = 0.45 <= en_ratio <= 0.55
    ac3_hi = 0.35 <= hi_ratio <= 0.45
    ac3_hinglish = 0.05 <= hinglish_ratio <= 0.15
    ac3_pass = ac3_en and ac3_hi and ac3_hinglish
    print("\nAC-3: 50% English, 40% Hindi, 10% Hinglish")
    print(f" English: {en_ratio:.1%} (expected: 45-55%) - {_status(ac3_en)}")
    print(f" Hindi: {hi_ratio:.1%} (expected: 35-45%) - {_status(ac3_hi)}")
    print(f" Hinglish: {hinglish_ratio:.1%} (expected: 5-15%) - {_status(ac3_hinglish)}")
    print(f" Status: {_status(ac3_pass)}")

    # AC-4: All samples validated (check schema)
    validation_errors = _count_validation_errors(data)
    ac4_pass = validation_errors == 0
    print("\nAC-4: All samples validated")
    print(f" Validation errors: {validation_errors}")
    print(f" Status: {_status(ac4_pass)}")

    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"\nAC-1 (1000+ samples): {_status(ac1_pass)}")
    print(f"AC-2 (60% scam ratio): {_status(ac2_pass)}")
    print(f"AC-3 (Language distribution): {_status(ac3_pass)}")
    print(f"AC-4 (All validated): {_status(ac4_pass)}")

    if ac1_pass and ac2_pass and ac3_pass and ac4_pass:
        print("\n[SUCCESS] ALL ACCEPTANCE CRITERIA PASSED")
        return 0
    print("\n[FAILURE] SOME ACCEPTANCE CRITERIA FAILED")
    return 1


# Top-level fields checked on every sample, and the keys checked inside "metadata".
_REQUIRED_FIELDS = (
    "id",
    "message",
    "language",
    "label",
    "confidence",
    "scam_type",
    "indicators",
    "metadata",
)
_REQUIRED_METADATA_KEYS = ("source", "annotator", "annotation_date", "difficulty")


def _status(passed):
    """Return the report label for a criterion outcome."""
    return "PASS" if passed else "FAIL"


def _load_samples(path):
    """Parse a JSONL file into a list of records, skipping blank lines.

    Raises:
        ValueError: if a non-blank line is not valid JSON; the message names
            the 1-based line number.
    """
    samples = []
    with open(path, encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue  # tolerate blank/trailing lines in the file
            try:
                samples.append(json.loads(line))
            except json.JSONDecodeError as exc:
                raise ValueError(f"line {line_no}: {exc}") from exc
    return samples


def _count_matching(samples, key, value):
    """Count dict records whose ``key`` equals ``value``.

    Records that are not dicts or lack the key are not counted, so malformed
    records surface through the schema check rather than as a crash here.
    """
    return sum(
        1 for s in samples if isinstance(s, dict) and s.get(key) == value
    )


def _count_validation_errors(samples):
    """Count schema errors across all samples.

    A record contributes one error if it is not a dict or misses any required
    top-level field, and one more if its "metadata" is present but is not a
    dict containing every required metadata key.
    """
    errors = 0
    for sample in samples:
        if not isinstance(sample, dict):
            errors += 1
            continue
        if any(field not in sample for field in _REQUIRED_FIELDS):
            errors += 1
        if "metadata" in sample:
            meta = sample["metadata"]
            if not isinstance(meta, dict) or any(
                key not in meta for key in _REQUIRED_METADATA_KEYS
            ):
                errors += 1
    return errors
# Script entry point: hand main()'s return value to the interpreter as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|