scam / scripts /verify_task_4_1.py
Gankit12's picture
Upload 129 files
31f0e50 verified
Raw
History Blame Contribute Delete
5.27 kB
#!/usr/bin/env python
"""
Task 4.1 Verification Script.
Runs the verification code from TASKS.md and validates acceptance criteria:
- 1000+ total samples
- 60% scam, 40% legitimate
- 50% English, 40% Hindi, 10% Hinglish
- All samples validated
"""
import json
import os
import sys
# Set UTF-8 output encoding for Windows console
if sys.stdout.encoding != 'utf-8':
sys.stdout.reconfigure(encoding='utf-8')
# Dataset path
DATASET_PATH = os.path.join(
os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
"data",
"scam_detection_train.jsonl"
)
def main():
print("=" * 60)
print("Task 4.1: Dataset Creation - Verification")
print("=" * 60)
# Check if file exists
if not os.path.exists(DATASET_PATH):
print(f"\n[ERROR] Dataset file not found: {DATASET_PATH}")
print("Run 'python scripts/generate_dataset.py' first.")
return 1
# Load dataset (as per TASKS.md verification code)
print(f"\nLoading: {DATASET_PATH}")
with open(DATASET_PATH, encoding="utf-8") as f:
data = [json.loads(line) for line in f]
# Print statistics (as per TASKS.md)
print(f"\nTotal samples: {len(data)}")
scam_ratio = sum(1 for d in data if d['label'] == 'scam') / len(data)
print(f"Scam ratio: {scam_ratio:.2%}")
# Extended statistics
print("\n--- Detailed Statistics ---")
# Label distribution
scam_count = sum(1 for d in data if d['label'] == 'scam')
legit_count = sum(1 for d in data if d['label'] == 'legitimate')
print(f"Scam: {scam_count} ({scam_count/len(data):.1%})")
print(f"Legitimate: {legit_count} ({legit_count/len(data):.1%})")
# Language distribution
en_count = sum(1 for d in data if d['language'] == 'en')
hi_count = sum(1 for d in data if d['language'] == 'hi')
hinglish_count = sum(1 for d in data if d['language'] == 'hinglish')
print(f"\nEnglish: {en_count} ({en_count/len(data):.1%})")
print(f"Hindi: {hi_count} ({hi_count/len(data):.1%})")
print(f"Hinglish: {hinglish_count} ({hinglish_count/len(data):.1%})")
# Acceptance Criteria Validation
print("\n" + "=" * 60)
print("Acceptance Criteria")
print("=" * 60)
criteria_results = []
# AC-1: 1000+ total samples
ac1_pass = len(data) >= 1000
criteria_results.append(ac1_pass)
print(f"\nAC-1: 1000+ total samples")
print(f" Value: {len(data)}")
print(f" Status: {'PASS' if ac1_pass else 'FAIL'}")
# AC-2: 60% scam, 40% legitimate
ac2_pass = 0.55 <= scam_ratio <= 0.65
criteria_results.append(ac2_pass)
print(f"\nAC-2: 60% scam, 40% legitimate")
print(f" Scam: {scam_ratio:.1%} (expected: 55-65%)")
print(f" Legitimate: {1-scam_ratio:.1%} (expected: 35-45%)")
print(f" Status: {'PASS' if ac2_pass else 'FAIL'}")
# AC-3: 50% English, 40% Hindi, 10% Hinglish
en_ratio = en_count / len(data)
hi_ratio = hi_count / len(data)
hinglish_ratio = hinglish_count / len(data)
ac3_en = 0.45 <= en_ratio <= 0.55
ac3_hi = 0.35 <= hi_ratio <= 0.45
ac3_hinglish = 0.05 <= hinglish_ratio <= 0.15
ac3_pass = ac3_en and ac3_hi and ac3_hinglish
criteria_results.append(ac3_pass)
print(f"\nAC-3: 50% English, 40% Hindi, 10% Hinglish")
print(f" English: {en_ratio:.1%} (expected: 45-55%) - {'PASS' if ac3_en else 'FAIL'}")
print(f" Hindi: {hi_ratio:.1%} (expected: 35-45%) - {'PASS' if ac3_hi else 'FAIL'}")
print(f" Hinglish: {hinglish_ratio:.1%} (expected: 5-15%) - {'PASS' if ac3_hinglish else 'FAIL'}")
print(f" Status: {'PASS' if ac3_pass else 'FAIL'}")
# AC-4: All samples validated (check schema)
validation_errors = 0
required_fields = ["id", "message", "language", "label", "confidence", "scam_type", "indicators", "metadata"]
for sample in data:
for field in required_fields:
if field not in sample:
validation_errors += 1
break
if "metadata" in sample:
meta = sample["metadata"]
if not all(k in meta for k in ["source", "annotator", "annotation_date", "difficulty"]):
validation_errors += 1
ac4_pass = validation_errors == 0
criteria_results.append(ac4_pass)
print(f"\nAC-4: All samples validated")
print(f" Validation errors: {validation_errors}")
print(f" Status: {'PASS' if ac4_pass else 'FAIL'}")
# Summary
print("\n" + "=" * 60)
print("SUMMARY")
print("=" * 60)
all_pass = all(criteria_results)
print(f"\nAC-1 (1000+ samples): {'PASS' if criteria_results[0] else 'FAIL'}")
print(f"AC-2 (60% scam ratio): {'PASS' if criteria_results[1] else 'FAIL'}")
print(f"AC-3 (Language distribution): {'PASS' if criteria_results[2] else 'FAIL'}")
print(f"AC-4 (All validated): {'PASS' if criteria_results[3] else 'FAIL'}")
if all_pass:
print("\n[SUCCESS] ALL ACCEPTANCE CRITERIA PASSED")
return 0
else:
print("\n[FAILURE] SOME ACCEPTANCE CRITERIA FAILED")
return 1
if __name__ == "__main__":
sys.exit(main())