| import pandas as pd |
| import numpy as np |
| import random |
| from datetime import datetime, timedelta |
| import os |
|
|
_TAX_RATE = 0.18  # TaxAmount is always this fraction of Amount


def _build_record(invoice_id, vendor, amount, invoice_date):
    """Return one invoice row; TaxAmount is derived from *amount*."""
    return {
        "InvoiceID": invoice_id,
        "VendorName": vendor,
        "Amount": amount,
        "InvoiceDate": invoice_date,
        "TaxAmount": round(amount * _TAX_RATE, 2),
    }


def create_sample_data(num_records=100, output_dir="sample_data", seed=None):
    """Generate two synthetic invoice ledgers with injected discrepancies.

    For each of ``num_records`` invoices a "books" row and a "gst" row are
    built from the same base data, then one of five discrepancy types is
    applied (none, different amount, missing in gst, missing in books,
    different date). Afterwards a few books rows are inflated into amount
    outliers and three fixed "fraud ring" invoices are appended to both
    ledgers, with the vendor names rotated between the two sides.

    Args:
        num_records: Number of random invoices to generate before the three
            fixed fraud-ring invoices are appended.
        output_dir: Directory that receives ``books.csv`` and ``gst.csv``;
            created if it does not exist.
        seed: Optional seed for reproducible output. When ``None`` (default)
            the global ``random`` module state is used, so callers that seed
            it themselves keep getting the same sequence as before.

    Returns:
        dict with keys ``"source"`` (books DataFrame) and ``"target"``
        (gst DataFrame).
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # A private generator keeps seeded runs from disturbing global state.
    rng = random if seed is None else random.Random(seed)

    companies = [
        "Acme Corp", "Global Tech", "Stark Industries", "Wayne Enterprises",
        "Cyberdyne", "Umbrella Corp", "Tyrell Corporation", "Weyland-Yutani",
        "Omni Consumer Products", "Initech", "Hooli", "Pied Piper",
        "Massive Dynamic", "Aperture Science", "Black Mesa",
    ]

    # Spelling/suffix variants; companies without an entry use their own name.
    variations = {
        "Acme Corp": ["Acme Corp", "Acme Corporation", "Acm Corp", "Acme Corpration"],
        "Global Tech": ["Global Tech", "Global Technologies", "Gloabl Tech", "Global Tech Ltd."],
        "Stark Industries": ["Stark Industries", "Stark Ind", "Strk Industries", "Stark Industries Inc."],
        "Wayne Enterprises": ["Wayne Enterprises", "Wayne Ent", "Wayne Enterpises", "Wayne Enterprises LLC"],
    }

    discrepancy_kinds = ["none", "amount_diff", "missing_in_gst", "missing_in_books", "date_diff"]
    discrepancy_weights = [0.6, 0.1, 0.1, 0.1, 0.1]
    date_format = '%Y-%m-%d'
    start_date = datetime(2023, 1, 1)

    books_data = []
    gst_data = []

    for i in range(1, num_records + 1):
        invoice_id = f"INV-{1000 + i}"
        base_company = rng.choice(companies)
        name_options = variations.get(base_company, [base_company])

        # Each ledger picks its vendor spelling independently.
        books_company = rng.choice(name_options)
        gst_company = rng.choice(name_options)

        base_amount = round(rng.uniform(100, 10000), 2)
        discrepancy_type = rng.choices(discrepancy_kinds, weights=discrepancy_weights, k=1)[0]

        invoice_date = start_date + timedelta(days=rng.randint(0, 365))
        books_date = invoice_date.strftime(date_format)
        gst_date = books_date
        gst_amount = base_amount

        if discrepancy_type == "amount_diff":
            gst_amount = round(base_amount * rng.choice([0.9, 1.1, 0.5, 1.05]), 2)
        elif discrepancy_type == "date_diff":
            shifted = invoice_date + timedelta(days=rng.choice([-1, 1, -5, 5]))
            gst_date = shifted.strftime(date_format)

        if discrepancy_type != "missing_in_books":
            books_data.append(_build_record(invoice_id, books_company, base_amount, books_date))
        if discrepancy_type != "missing_in_gst":
            gst_data.append(_build_record(invoice_id, gst_company, gst_amount, gst_date))

    # Inflate a few books rows into outliers. Skipped when there are no books
    # rows (e.g. num_records=0), where randint(0, -1) would raise ValueError.
    if books_data:
        for _ in range(max(1, num_records // 20)):
            row = books_data[rng.randint(0, len(books_data) - 1)]
            # Rounded so outliers keep the two-decimal form of other amounts.
            row["Amount"] = round(row["Amount"] * rng.uniform(5, 10), 2)
            row["TaxAmount"] = round(row["Amount"] * _TAX_RATE, 2)

    # Fixed fraud-ring invoices: same id/amount/date in both ledgers, but the
    # gst side names the next vendor in the ring.
    ring_vendors = ["Shell Corp Alpha", "Ghost Entity Beta", "Phantom Traders Gamma"]
    ring_amount = 55000.00
    ring_date = "2023-11-15"
    for idx, books_vendor in enumerate(ring_vendors):
        gst_vendor = ring_vendors[(idx + 1) % len(ring_vendors)]
        inv_id = f"FRAUD-RING-{idx+1}"
        books_data.append(_build_record(inv_id, books_vendor, ring_amount, ring_date))
        gst_data.append(_build_record(inv_id, gst_vendor, ring_amount, ring_date))

    books_df = pd.DataFrame(books_data)
    gst_df = pd.DataFrame(gst_data)

    books_df.to_csv(os.path.join(output_dir, "books.csv"), index=False)
    gst_df.to_csv(os.path.join(output_dir, "gst.csv"), index=False)

    return {"source": books_df, "target": gst_df}
|
|
if __name__ == "__main__":
    # Script entry point: build a 200-invoice dataset in the default
    # output directory, then report completion on stdout.
    create_sample_data(num_records=200)
    print("Sample data generated in sample_data directory.")
|
|