import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import os
def create_sample_data(num_records=100, output_dir="sample_data", seed=None):
    """Generate synthetic "books" and "GST" invoice ledgers and write them to CSV.

    Each generated invoice gets a record for both ledgers; a randomly chosen
    discrepancy type then decides whether the two records agree, differ in
    amount or date, or whether one side is dropped entirely. A few books
    records additionally get an inflated amount, and a fixed three-vendor
    circular-trading ring is always appended to both ledgers.

    Args:
        num_records: Number of base invoices to generate (``INV-1001`` ...).
            Values <= 0 produce only the fraud-ring records.
        output_dir: Directory that receives ``books.csv`` and ``gst.csv``.
            It is created (including parents) when missing.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the output is reproducible and the global ``random``
            state is left untouched. When ``None``, the module-level
            ``random`` functions are used, as before.

    Returns:
        ``{"source": books_df, "target": gst_df}`` — the two DataFrames that
        were written to disk.
    """
    # A dedicated generator keeps seeded runs from disturbing global state.
    rng = random.Random(seed) if seed is not None else random

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    tax_rate = 0.18
    date_format = '%Y-%m-%d'

    companies = ["Acme Corp", "Global Tech", "Stark Industries", "Wayne Enterprises", "Cyberdyne",
                 "Umbrella Corp", "Tyrell Corporation", "Weyland-Yutani", "Omni Consumer Products", "Initech",
                 "Hooli", "Pied Piper", "Massive Dynamic", "Aperture Science", "Black Mesa"]

    # Typos and variations for fuzzy matching. Companies without an entry
    # here always appear under their base name.
    variations = {
        "Acme Corp": ["Acme Corp", "Acme Corporation", "Acm Corp", "Acme Corpration"],
        "Global Tech": ["Global Tech", "Global Technologies", "Gloabl Tech", "Global Tech Ltd."],
        "Stark Industries": ["Stark Industries", "Stark Ind", "Strk Industries", "Stark Industries Inc."],
        "Wayne Enterprises": ["Wayne Enterprises", "Wayne Ent", "Wayne Enterpises", "Wayne Enterprises LLC"]
    }

    def make_record(invoice_id, vendor, amount, invoice_date):
        # Single place that defines the column layout shared by both ledgers.
        return {
            "InvoiceID": invoice_id,
            "VendorName": vendor,
            "Amount": amount,
            "InvoiceDate": invoice_date,
            "TaxAmount": round(amount * tax_rate, 2)
        }

    books_data = []
    gst_data = []
    start_date = datetime(2023, 1, 1)

    for i in range(1, num_records + 1):
        invoice_id = f"INV-{1000 + i}"
        base_company = rng.choice(companies)

        # Each ledger independently picks a spelling of the vendor name.
        name_options = variations.get(base_company, [base_company])
        books_company = rng.choice(name_options)
        gst_company = rng.choice(name_options)

        base_amount = round(rng.uniform(100, 10000), 2)

        # 60% of invoices match; the rest split evenly across four
        # discrepancy kinds.
        discrepancy_type = rng.choices(
            ["none", "amount_diff", "missing_in_gst", "missing_in_books", "date_diff"],
            weights=[0.6, 0.1, 0.1, 0.1, 0.1],
            k=1
        )[0]

        invoice_date = start_date + timedelta(days=rng.randint(0, 365))
        books_amount = base_amount
        gst_amount = base_amount
        books_date = invoice_date.strftime(date_format)
        gst_date = books_date

        # Only the GST side is perturbed; the books side keeps the base values.
        if discrepancy_type == "amount_diff":
            gst_amount = round(base_amount * rng.choice([0.9, 1.1, 0.5, 1.05]), 2)
        elif discrepancy_type == "date_diff":
            shifted = invoice_date + timedelta(days=rng.choice([-1, 1, -5, 5]))
            gst_date = shifted.strftime(date_format)

        if discrepancy_type != "missing_in_books":
            books_data.append(make_record(invoice_id, books_company, books_amount, books_date))
        if discrepancy_type != "missing_in_gst":
            gst_data.append(make_record(invoice_id, gst_company, gst_amount, gst_date))

    # Add some random anomalies (high amount). Skipped when there is nothing
    # to inflate (e.g. num_records <= 0), which previously raised ValueError.
    if books_data:
        for _ in range(max(1, num_records // 20)):
            record = rng.choice(books_data)
            # Keep the inflated amount at 2 decimals like every other amount.
            record["Amount"] = round(record["Amount"] * rng.uniform(5, 10), 2)
            record["TaxAmount"] = round(record["Amount"] * tax_rate, 2)

    # INJECT CIRCULAR TRADING FRAUD RING FOR TESTING
    ring_vendors = ["Shell Corp Alpha", "Ghost Entity Beta", "Phantom Traders Gamma"]
    ring_amount = 55000.00
    ring_date = "2023-11-15"
    ring_size = len(ring_vendors)
    for idx, books_vendor in enumerate(ring_vendors):
        # Create a cycle: Alpha -> Beta -> Gamma -> Alpha. The books side
        # names vendor idx, the GST side names the next vendor in the ring.
        gst_vendor = ring_vendors[(idx + 1) % ring_size]
        inv_id = f"FRAUD-RING-{idx+1}"
        books_data.append(make_record(inv_id, books_vendor, ring_amount, ring_date))
        gst_data.append(make_record(inv_id, gst_vendor, ring_amount, ring_date))

    books_df = pd.DataFrame(books_data)
    gst_df = pd.DataFrame(gst_data)

    books_df.to_csv(os.path.join(output_dir, "books.csv"), index=False)
    gst_df.to_csv(os.path.join(output_dir, "gst.csv"), index=False)

    return {"source": books_df, "target": gst_df}
if __name__ == "__main__":
    # Script entry point: build a 200-invoice data set in the default
    # output directory, then report completion on stdout.
    record_count = 200
    create_sample_data(record_count)
    print("Sample data generated in sample_data directory.")
|