Spaces:

ACA050
/

ReconAI

Sleeping

File size: 4,555 Bytes

64e5ee2

import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import os

_TAX_RATE = 0.18  # TaxAmount is always Amount * 0.18, rounded to 2 decimals


def _invoice_record(invoice_id, vendor, amount, invoice_date):
    """Build one invoice row; TaxAmount is derived from ``amount``."""
    return {
        "InvoiceID": invoice_id,
        "VendorName": vendor,
        "Amount": amount,
        "InvoiceDate": invoice_date,
        "TaxAmount": round(amount * _TAX_RATE, 2),
    }


def create_sample_data(num_records: int = 100, output_dir: str = "sample_data", seed=None):
    """Generate paired "books" and "gst" invoice datasets with seeded discrepancies.

    For each of ``num_records`` invoices a books row and a gst row are built
    from the same base values, then one of five discrepancy types is drawn
    (weights 0.6/0.1/0.1/0.1/0.1): no difference, a different gst amount, a
    shifted gst date, or the row missing from one side. Vendor names for four
    of the companies are independently picked from a list of spelling
    variations on each side. Afterwards a few books rows get their amount
    inflated 5-10x, and three fixed "FRAUD-RING-n" invoices are appended whose
    books/gst vendors form the cycle Alpha -> Beta -> Gamma -> Alpha.

    Both datasets are written to ``books.csv`` and ``gst.csv`` inside
    ``output_dir`` (created if missing).

    Args:
        num_records: Number of regular invoices to generate. Zero or a
            negative value yields only the three fraud-ring invoices.
        output_dir: Directory that receives the two CSV files.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the output is reproducible; when ``None`` the shared
            ``random`` module state is used, as before.

    Returns:
        ``{"source": books_df, "target": gst_df}`` as pandas DataFrames.
    """
    # Both the module and a Random instance expose choice/choices/uniform/randint.
    rng = random if seed is None else random.Random(seed)

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    companies = ["Acme Corp", "Global Tech", "Stark Industries", "Wayne Enterprises", "Cyberdyne",
                 "Umbrella Corp", "Tyrell Corporation", "Weyland-Yutani", "Omni Consumer Products", "Initech",
                 "Hooli", "Pied Piper", "Massive Dynamic", "Aperture Science", "Black Mesa"]

    # Typos and variations for fuzzy matching
    variations = {
        "Acme Corp": ["Acme Corp", "Acme Corporation", "Acm Corp", "Acme Corpration"],
        "Global Tech": ["Global Tech", "Global Technologies", "Gloabl Tech", "Global Tech Ltd."],
        "Stark Industries": ["Stark Industries", "Stark Ind", "Strk Industries", "Stark Industries Inc."],
        "Wayne Enterprises": ["Wayne Enterprises", "Wayne Ent", "Wayne Enterpises", "Wayne Enterprises LLC"]
    }

    discrepancy_types = ["none", "amount_diff", "missing_in_gst", "missing_in_books", "date_diff"]
    discrepancy_weights = [0.6, 0.1, 0.1, 0.1, 0.1]

    books_data = []
    gst_data = []

    start_date = datetime(2023, 1, 1)

    for i in range(1, num_records + 1):
        invoice_id = f"INV-{1000 + i}"
        base_company = rng.choice(companies)

        # Each side picks its own spelling; companies without variations
        # fall back to the canonical name.
        name_options = variations.get(base_company, [base_company])
        books_company = rng.choice(name_options)
        gst_company = rng.choice(name_options)

        base_amount = round(rng.uniform(100, 10000), 2)

        discrepancy_type = rng.choices(discrepancy_types, weights=discrepancy_weights, k=1)[0]

        # randint is inclusive, so dates span 2023-01-01 through 2024-01-01.
        invoice_date = start_date + timedelta(days=rng.randint(0, 365))
        books_date = invoice_date.strftime('%Y-%m-%d')
        gst_date = books_date
        gst_amount = base_amount

        if discrepancy_type == "amount_diff":
            gst_amount = round(base_amount * rng.choice([0.9, 1.1, 0.5, 1.05]), 2)
        elif discrepancy_type == "date_diff":
            shifted = invoice_date + timedelta(days=rng.choice([-1, 1, -5, 5]))
            gst_date = shifted.strftime('%Y-%m-%d')

        if discrepancy_type != "missing_in_books":
            books_data.append(_invoice_record(invoice_id, books_company, base_amount, books_date))
        if discrepancy_type != "missing_in_gst":
            gst_data.append(_invoice_record(invoice_id, gst_company, gst_amount, gst_date))

    # Add some random anomalies (high amount). Skipped when there are no books
    # rows, because randint(0, -1) would raise ValueError.
    if books_data:
        for _ in range(max(1, num_records // 20)):
            row = books_data[rng.randint(0, len(books_data) - 1)]
            # Round so the inflated amount stays a 2-decimal currency value.
            row["Amount"] = round(row["Amount"] * rng.uniform(5, 10), 2)
            row["TaxAmount"] = round(row["Amount"] * _TAX_RATE, 2)

    # INJECT CIRCULAR TRADING FRAUD RING FOR TESTING
    ring_vendors = ["Shell Corp Alpha", "Ghost Entity Beta", "Phantom Traders Gamma"]
    ring_amount = 55000.00
    ring_date = "2023-11-15"
    for idx, books_v in enumerate(ring_vendors):
        # Create a cycle: Alpha -> Beta -> Gamma -> Alpha
        gst_v = ring_vendors[(idx + 1) % len(ring_vendors)]
        inv_id = f"FRAUD-RING-{idx+1}"
        books_data.append(_invoice_record(inv_id, books_v, ring_amount, ring_date))
        gst_data.append(_invoice_record(inv_id, gst_v, ring_amount, ring_date))

    books_df = pd.DataFrame(books_data)
    gst_df = pd.DataFrame(gst_data)

    books_df.to_csv(os.path.join(output_dir, "books.csv"), index=False)
    gst_df.to_csv(os.path.join(output_dir, "gst.csv"), index=False)

    return {"source": books_df, "target": gst_df}

if __name__ == "__main__":
    # Script entry point: write a 200-invoice dataset to the default directory.
    create_sample_data(num_records=200)
    print("Sample data generated in sample_data directory.")