import os
import random
from datetime import datetime, timedelta
from typing import Dict, Optional

import numpy as np
import pandas as pd


def create_sample_data(
    num_records: int = 100,
    output_dir: str = "sample_data",
    seed: Optional[int] = None,
) -> Dict[str, pd.DataFrame]:
    """Generate paired "books" and "GST" invoice datasets with known discrepancies.

    Each generated invoice gets one discrepancy type, drawn with weights
    60% ``none`` and 10% each for ``amount_diff``, ``missing_in_gst``,
    ``missing_in_books`` and ``date_diff``. Vendor names for four of the
    companies are drawn independently per side from a list of spelling
    variants, so the two sides may disagree on the name. Afterwards a few
    books-side amounts are inflated 5-10x, and three fixed ``FRAUD-RING-*``
    invoices forming a vendor cycle are appended to both sides.

    Both datasets are written to ``books.csv`` and ``gst.csv`` inside
    *output_dir* (created if missing).

    Args:
        num_records: Number of regular invoices to generate. Values <= 0
            produce only the three fraud-ring invoices.
        output_dir: Directory that receives the two CSV files.
        seed: Optional seed for reproducible output. When ``None`` the
            module-level ``random`` generator is used, so any seeding done
            by the caller via ``random.seed`` still applies.

    Returns:
        ``{"source": books_df, "target": gst_df}`` with columns
        ``InvoiceID``, ``VendorName``, ``Amount``, ``InvoiceDate``,
        ``TaxAmount``.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    # A private generator keeps seeded runs from disturbing global state;
    # the ``random`` module itself exposes the same methods for the default.
    rng = random.Random(seed) if seed is not None else random

    tax_rate = 0.18
    date_format = '%Y-%m-%d'

    companies = [
        "Acme Corp", "Global Tech", "Stark Industries", "Wayne Enterprises",
        "Cyberdyne", "Umbrella Corp", "Tyrell Corporation", "Weyland-Yutani",
        "Omni Consumer Products", "Initech", "Hooli", "Pied Piper",
        "Massive Dynamic", "Aperture Science", "Black Mesa",
    ]

    # Typos and variations for fuzzy matching
    variations = {
        "Acme Corp": ["Acme Corp", "Acme Corporation", "Acm Corp", "Acme Corpration"],
        "Global Tech": ["Global Tech", "Global Technologies", "Gloabl Tech", "Global Tech Ltd."],
        "Stark Industries": ["Stark Industries", "Stark Ind", "Strk Industries", "Stark Industries Inc."],
        "Wayne Enterprises": ["Wayne Enterprises", "Wayne Ent", "Wayne Enterpises", "Wayne Enterprises LLC"],
    }

    books_data = []
    gst_data = []
    start_date = datetime(2023, 1, 1)

    for i in range(1, num_records + 1):
        invoice_id = f"INV-{1000 + i}"
        base_company = rng.choice(companies)

        # Each side picks its own spelling; companies without variants
        # always use the canonical name.
        name_options = variations.get(base_company, [base_company])
        books_company = rng.choice(name_options)
        gst_company = rng.choice(name_options)

        base_amount = round(rng.uniform(100, 10000), 2)

        # Introduce discrepancies
        discrepancy_type = rng.choices(
            ["none", "amount_diff", "missing_in_gst", "missing_in_books", "date_diff"],
            weights=[0.6, 0.1, 0.1, 0.1, 0.1],
            k=1,
        )[0]

        books_amount = base_amount
        gst_amount = base_amount

        invoice_date = start_date + timedelta(days=rng.randint(0, 365))
        books_date = invoice_date.strftime(date_format)
        gst_date = books_date

        if discrepancy_type == "amount_diff":
            gst_amount = round(base_amount * rng.choice([0.9, 1.1, 0.5, 1.05]), 2)
        elif discrepancy_type == "date_diff":
            shifted = invoice_date + timedelta(days=rng.choice([-1, 1, -5, 5]))
            gst_date = shifted.strftime(date_format)

        books_record = {
            "InvoiceID": invoice_id,
            "VendorName": books_company,
            "Amount": books_amount,
            "InvoiceDate": books_date,
            "TaxAmount": round(books_amount * tax_rate, 2),
        }
        gst_record = {
            "InvoiceID": invoice_id,
            "VendorName": gst_company,
            "Amount": gst_amount,
            "InvoiceDate": gst_date,
            "TaxAmount": round(gst_amount * tax_rate, 2),
        }

        if discrepancy_type != "missing_in_books":
            books_data.append(books_record)
        if discrepancy_type != "missing_in_gst":
            gst_data.append(gst_record)

    # Add some random anomalies (high amount). Skipped when no books rows
    # exist, since randint(0, -1) would raise ValueError.
    if books_data:
        for _ in range(max(1, num_records // 20)):
            record = books_data[rng.randint(0, len(books_data) - 1)]
            # Round so anomalous rows keep the same 2-decimal precision as
            # every other amount in the dataset.
            record["Amount"] = round(record["Amount"] * rng.uniform(5, 10), 2)
            record["TaxAmount"] = round(record["Amount"] * tax_rate, 2)

    # INJECT CIRCULAR TRADING FRAUD RING FOR TESTING
    ring_vendors = ["Shell Corp Alpha", "Ghost Entity Beta", "Phantom Traders Gamma"]
    ring_amount = 55000.00
    ring_tax = round(ring_amount * tax_rate, 2)
    ring_size = len(ring_vendors)

    for idx, books_v in enumerate(ring_vendors):
        # Create a cycle: Alpha -> Beta -> Gamma -> Alpha
        gst_v = ring_vendors[(idx + 1) % ring_size]
        inv_id = f"FRAUD-RING-{idx+1}"

        books_data.append({
            "InvoiceID": inv_id,
            "VendorName": books_v,
            "Amount": ring_amount,
            "InvoiceDate": "2023-11-15",
            "TaxAmount": ring_tax,
        })
        gst_data.append({
            "InvoiceID": inv_id,
            "VendorName": gst_v,
            "Amount": ring_amount,
            "InvoiceDate": "2023-11-15",
            "TaxAmount": ring_tax,
        })

    books_df = pd.DataFrame(books_data)
    gst_df = pd.DataFrame(gst_data)

    books_df.to_csv(os.path.join(output_dir, "books.csv"), index=False)
    gst_df.to_csv(os.path.join(output_dir, "gst.csv"), index=False)

    return {"source": books_df, "target": gst_df}


if __name__ == "__main__":
    create_sample_data(200)
    print("Sample data generated in sample_data directory.")