File size: 4,555 Bytes
64e5ee2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import os

# GST rate used to derive TaxAmount from Amount for every generated invoice.
_GST_RATE = 0.18


def _make_record(invoice_id, vendor, amount, invoice_date):
    """Build one invoice row; TaxAmount is ``amount * _GST_RATE`` rounded to 2 places."""
    return {
        "InvoiceID": invoice_id,
        "VendorName": vendor,
        "Amount": amount,
        "InvoiceDate": invoice_date,
        "TaxAmount": round(amount * _GST_RATE, 2),
    }


def create_sample_data(num_records: int = 100, output_dir: str = "sample_data",
                       seed: "int | None" = None) -> dict:
    """Generate paired "books" and "GST" invoice data sets with seeded discrepancies.

    For each of ``num_records`` invoices a books row and a GST row are built.
    Roughly 60% of invoices match exactly; the rest are split evenly between an
    amount difference, a date difference, a row missing from GST, and a row
    missing from books. Vendor names are drawn independently for each side from
    a list of spelling variations (where one exists) so that name matching has
    to be fuzzy. Afterwards a few books rows get an inflated amount, and three
    ``FRAUD-RING-*`` invoices forming a vendor cycle are appended to both sides.

    Both data sets are written to ``books.csv`` and ``gst.csv`` inside
    ``output_dir`` (created if it does not exist).

    Args:
        num_records: Number of regular invoices to generate. Must be >= 0.
        output_dir: Directory that receives ``books.csv`` and ``gst.csv``.
        seed: Optional seed for reproducible output. When ``None`` the global
            ``random`` module state is used.

    Returns:
        A dict with the books DataFrame under ``"source"`` and the GST
        DataFrame under ``"target"``.

    Raises:
        ValueError: If ``num_records`` is negative.
    """
    if num_records < 0:
        raise ValueError(f"num_records must be >= 0, got {num_records}")

    # A private generator keeps seeded runs independent of global random state;
    # without a seed we keep using the module-level generator.
    rng = random if seed is None else random.Random(seed)

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_dir, exist_ok=True)

    companies = ["Acme Corp", "Global Tech", "Stark Industries", "Wayne Enterprises", "Cyberdyne",
                 "Umbrella Corp", "Tyrell Corporation", "Weyland-Yutani", "Omni Consumer Products", "Initech",
                 "Hooli", "Pied Piper", "Massive Dynamic", "Aperture Science", "Black Mesa"]

    # Typos and variations for fuzzy matching
    variations = {
        "Acme Corp": ["Acme Corp", "Acme Corporation", "Acm Corp", "Acme Corpration"],
        "Global Tech": ["Global Tech", "Global Technologies", "Gloabl Tech", "Global Tech Ltd."],
        "Stark Industries": ["Stark Industries", "Stark Ind", "Strk Industries", "Stark Industries Inc."],
        "Wayne Enterprises": ["Wayne Enterprises", "Wayne Ent", "Wayne Enterpises", "Wayne Enterprises LLC"],
    }

    discrepancy_kinds = ["none", "amount_diff", "missing_in_gst", "missing_in_books", "date_diff"]
    discrepancy_weights = [0.6, 0.1, 0.1, 0.1, 0.1]
    date_format = '%Y-%m-%d'

    books_data = []
    gst_data = []

    start_date = datetime(2023, 1, 1)

    for i in range(1, num_records + 1):
        invoice_id = f"INV-{1000 + i}"
        base_company = rng.choice(companies)

        # Each side picks its own spelling, so the two names may differ.
        name_options = variations.get(base_company, [base_company])
        books_company = rng.choice(name_options)
        gst_company = rng.choice(name_options)

        base_amount = round(rng.uniform(100, 10000), 2)

        discrepancy_type = rng.choices(discrepancy_kinds, weights=discrepancy_weights, k=1)[0]

        invoice_date = start_date + timedelta(days=rng.randint(0, 365))
        books_date = invoice_date.strftime(date_format)
        gst_date = books_date
        gst_amount = base_amount

        # Amount and date discrepancies are applied to the GST side only.
        if discrepancy_type == "amount_diff":
            gst_amount = round(base_amount * rng.choice([0.9, 1.1, 0.5, 1.05]), 2)
        elif discrepancy_type == "date_diff":
            shifted = invoice_date + timedelta(days=rng.choice([-1, 1, -5, 5]))
            gst_date = shifted.strftime(date_format)

        if discrepancy_type != "missing_in_books":
            books_data.append(_make_record(invoice_id, books_company, base_amount, books_date))
        if discrepancy_type != "missing_in_gst":
            gst_data.append(_make_record(invoice_id, gst_company, gst_amount, gst_date))

    # Add some random anomalies (high amount) to distinct books rows. Sampling
    # without replacement stops one row from being inflated repeatedly, and
    # capping at len(books_data) makes an empty books list a no-op, not a crash.
    anomaly_count = min(len(books_data), max(1, num_records // 20))
    for idx in rng.sample(range(len(books_data)), anomaly_count):
        inflated = round(books_data[idx]["Amount"] * rng.uniform(5, 10), 2)
        books_data[idx]["Amount"] = inflated
        books_data[idx]["TaxAmount"] = round(inflated * _GST_RATE, 2)

    # INJECT CIRCULAR TRADING FRAUD RING FOR TESTING
    ring_vendors = ["Shell Corp Alpha", "Ghost Entity Beta", "Phantom Traders Gamma"]
    ring_amount = 55000.00
    ring_date = "2023-11-15"
    for idx, books_vendor in enumerate(ring_vendors):
        # Create a cycle: Alpha -> Beta -> Gamma -> Alpha
        gst_vendor = ring_vendors[(idx + 1) % len(ring_vendors)]
        inv_id = f"FRAUD-RING-{idx+1}"
        books_data.append(_make_record(inv_id, books_vendor, ring_amount, ring_date))
        gst_data.append(_make_record(inv_id, gst_vendor, ring_amount, ring_date))

    books_df = pd.DataFrame(books_data)
    gst_df = pd.DataFrame(gst_data)

    books_df.to_csv(os.path.join(output_dir, "books.csv"), index=False)
    gst_df.to_csv(os.path.join(output_dir, "gst.csv"), index=False)

    return {"source": books_df, "target": gst_df}

if __name__ == "__main__":
    # Script entry point: build a 200-invoice data set in the default
    # output directory, then report completion on stdout.
    create_sample_data(num_records=200)
    print("Sample data generated in sample_data directory.")