import json
import random
from pathlib import Path


# Institution names sampled for records flagged as diploma mills.
# NOTE(review): the list's name says these are real-world fraud entities;
# nothing in this file verifies that, so confirm against an authoritative
# source before relying on it.
REAL_FRAUD_INSTITUTIONS = [
    "Pacific Western University",
    "St. Clements University",
    "Columbia State University",
    "Belford University",
    "Rochville University",
    "Almeda University",
    "McFord University",
    "Glencullen University",
    "University of Wolverton",
    "Parkwood University",
    "Preston University",
    "Barrington University",
    "Canterbury University",
    "Dublin Metropolitan University",
    "International University of America",
    "Yorker International University",
    "Western Michigan State University (Fake)",
    "Kingsbridge University",
    "Atlantic International University",
    "Mid-Atlantic University",
]


# Degree titles sampled uniformly for the "degree_claimed" field of each record.
DEGREES = [
    "B.Sc. in Computer Science",
    "Master of Business Administration (MBA)",
    "PhD in Quantum Physics",
    "Bachelor of Laws (LLB)",
    "Master of Arts in Global Governance",
]

# Candidate audit status labels.
# NOTE(review): not referenced by any code visible in this file; the generator
# writes "Flagged"/"Approved" instead. Confirm whether this is used elsewhere.
STATUSES = [
    "Blacklisted",
    "High Risk",
    "Verified (False Positive Test)",
    "Accreditation Expired",
]


def generate_hi_fi_benchmark(
    n: int = 100,
    output_path: str = 'data/full_benchmark_v1.json',
    seed: "int | None" = None,
) -> None:
    """Generate synthetic degree-audit records and write them to a JSON file.

    Every record whose index is a multiple of 10 is a non-fraud row for the
    fixed institution "Atlanta College of Liberal Arts and Sciences"; all
    other rows use an institution drawn from ``REAL_FRAUD_INSTITUTIONS`` and
    are marked as diploma mills. Called with no arguments, the function
    writes 100 records to ``data/full_benchmark_v1.json``.

    Args:
        n: Number of records to generate. Must be non-negative.
        output_path: Destination JSON file. Missing parent directories are
            created.
        seed: When given, records are drawn from a private
            ``random.Random(seed)`` so the output is reproducible. When
            ``None``, the module-level ``random`` functions are used, so any
            seeding the caller has done on the global generator is honoured.

    Raises:
        ValueError: If ``n`` is negative.
        OSError: If the output file or its directory cannot be created.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    # The `random` module exposes the same choice/randint/uniform API as a
    # Random instance, so either can stand in as the generator.
    rng = random if seed is None else random.Random(seed)

    data = []
    for i in range(n):
        # Drawn unconditionally (even when overwritten below) so the sequence
        # of random draws per record stays the same for every row.
        institution = rng.choice(REAL_FRAUD_INSTITUTIONS)
        is_fraud = True

        if i % 10 == 0:
            institution = "Atlanta College of Liberal Arts and Sciences"
            is_fraud = False

        # Fraud rows score in [0.1, 0.3]; non-fraud rows in [0.9, 1.0].
        raw_score = rng_score(rng, is_fraud) if False else None  # placeholder removed below
        record = {
            "audit_id": f"AG-2026-{1000 + i}",
            "institution_name": institution,
            "degree_claimed": rng.choice(DEGREES),
            "graduation_year": rng.randint(1995, 2024),
            "reputation_score": round(
                rng.uniform(0.1, 0.3) if is_fraud else rng.uniform(0.9, 1.0), 2
            ),
            "is_diploma_mill": is_fraud,
            "audit_status": "Flagged" if is_fraud else "Approved",
            # NOTE(review): non-fraud ("Approved") rows also receive
            # "ROR/OpenAlex Mismatch"; confirm whether that is intended.
            "reasoning_tag": (
                "Temporal Paradox"
                if i % 3 == 0 and is_fraud
                else "ROR/OpenAlex Mismatch"
            ),
        }
        data.append(record)

    target = Path(output_path)
    # Create the destination directory so a fresh checkout without `data/`
    # does not fail with FileNotFoundError.
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open('w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)
    print(f"SUCCESS: Generated {len(data)} high-fidelity audit records using real-world fraud entities.")


# Script entry point: generate the benchmark file with the default settings.
if __name__ == "__main__":
    generate_hi_fi_benchmark()