| """ |
| Synthetic SAP Finance Data Generator |
| |
| Generates synthetic datasets for: |
| - General Ledger accounts with transactions |
| - Financial Statements (P&L and Balance Sheet) |
| - Sales Order data |
| """ |
|
|
| import pandas as pd |
| import numpy as np |
| from datetime import datetime, timedelta |
| import os |
|
|
|
|
| def generate_gl_accounts(num_transactions=1000, output_path="data/synthetic_gl_accounts.csv"): |
| """Generate synthetic General Ledger accounts with transactions.""" |
| |
| |
| account_codes = [ |
| "100000", "110000", "120000", "130000", "140000", |
| "200000", "210000", "220000", "230000", |
| "300000", "310000", "320000", |
| "400000", "410000", "420000", "430000", |
| "500000", "510000", "520000", "530000", "540000", |
| ] |
| |
| account_descriptions = [ |
| "Cash and Cash Equivalents", "Accounts Receivable", "Inventory", |
| "Prepaid Expenses", "Property, Plant & Equipment", |
| "Accounts Payable", "Accrued Liabilities", "Short-term Debt", "Long-term Debt", |
| "Common Stock", "Retained Earnings", "Other Equity", |
| "Sales Revenue", "Service Revenue", "Interest Income", "Other Income", |
| "Cost of Goods Sold", "Salaries and Wages", "Rent Expense", |
| "Utilities Expense", "Marketing Expense" |
| ] |
| |
| np.random.seed(42) |
| |
| transactions = [] |
| base_date = datetime(2024, 1, 1) |
| |
| for i in range(num_transactions): |
| account_idx = np.random.randint(0, len(account_codes)) |
| transaction_date = base_date + timedelta(days=np.random.randint(0, 365)) |
| |
| |
| account_num = int(account_codes[account_idx][0]) |
| if account_num in [1, 5]: |
| debit = np.random.uniform(100, 50000) |
| credit = 0 |
| else: |
| debit = 0 |
| credit = np.random.uniform(100, 50000) |
| |
| transactions.append({ |
| "Transaction_ID": f"TXN{str(i+1).zfill(6)}", |
| "Date": transaction_date.strftime("%Y-%m-%d"), |
| "Account_Code": account_codes[account_idx], |
| "Account_Description": account_descriptions[account_idx], |
| "Debit": round(debit, 2), |
| "Credit": round(credit, 2), |
| "Balance": round(debit - credit, 2), |
| "Document_Number": f"DOC{str(np.random.randint(1000, 9999))}", |
| "Posting_Period": transaction_date.strftime("%Y-%m") |
| }) |
| |
| df = pd.DataFrame(transactions) |
| os.makedirs(os.path.dirname(output_path), exist_ok=True) |
| df.to_csv(output_path, index=False) |
| print(f"Generated {num_transactions} GL transactions: {output_path}") |
| return df |
|
|
|
|
| def generate_financial_statements(num_periods=12, output_path="data/synthetic_financial_statements.csv"): |
| """Generate synthetic Financial Statements (P&L and Balance Sheet).""" |
| |
| np.random.seed(42) |
| base_date = datetime(2024, 1, 1) |
| |
| statements = [] |
| |
| for period in range(num_periods): |
| period_date = base_date + timedelta(days=period * 30) |
| period_str = period_date.strftime("%Y-%m") |
| |
| |
| revenue = np.random.uniform(500000, 1000000) |
| cogs = revenue * np.random.uniform(0.4, 0.6) |
| gross_profit = revenue - cogs |
| |
| operating_expenses = np.random.uniform(200000, 400000) |
| ebitda = gross_profit - operating_expenses |
| depreciation = np.random.uniform(20000, 50000) |
| ebit = ebitda - depreciation |
| interest_expense = np.random.uniform(10000, 30000) |
| ebt = ebit - interest_expense |
| tax = ebt * 0.25 |
| net_income = ebt - tax |
| |
| |
| cash = np.random.uniform(100000, 500000) |
| accounts_receivable = np.random.uniform(200000, 400000) |
| inventory = np.random.uniform(150000, 300000) |
| current_assets = cash + accounts_receivable + inventory |
| ppe = np.random.uniform(2000000, 5000000) |
| total_assets = current_assets + ppe |
| |
| accounts_payable = np.random.uniform(100000, 200000) |
| short_term_debt = np.random.uniform(50000, 150000) |
| current_liabilities = accounts_payable + short_term_debt |
| long_term_debt = np.random.uniform(1000000, 2000000) |
| total_liabilities = current_liabilities + long_term_debt |
| |
| equity = total_assets - total_liabilities |
| |
| statements.append({ |
| "Period": period_str, |
| "Statement_Type": "P&L", |
| "Revenue": round(revenue, 2), |
| "Cost_of_Goods_Sold": round(cogs, 2), |
| "Gross_Profit": round(gross_profit, 2), |
| "Operating_Expenses": round(operating_expenses, 2), |
| "EBITDA": round(ebitda, 2), |
| "Depreciation": round(depreciation, 2), |
| "EBIT": round(ebit, 2), |
| "Interest_Expense": round(interest_expense, 2), |
| "EBT": round(ebt, 2), |
| "Tax": round(tax, 2), |
| "Net_Income": round(net_income, 2), |
| "Cash": round(cash, 2), |
| "Accounts_Receivable": round(accounts_receivable, 2), |
| "Inventory": round(inventory, 2), |
| "Current_Assets": round(current_assets, 2), |
| "PPE": round(ppe, 2), |
| "Total_Assets": round(total_assets, 2), |
| "Accounts_Payable": round(accounts_payable, 2), |
| "Short_Term_Debt": round(short_term_debt, 2), |
| "Current_Liabilities": round(current_liabilities, 2), |
| "Long_Term_Debt": round(long_term_debt, 2), |
| "Total_Liabilities": round(total_liabilities, 2), |
| "Equity": round(equity, 2) |
| }) |
| |
| df = pd.DataFrame(statements) |
| os.makedirs(os.path.dirname(output_path), exist_ok=True) |
| df.to_csv(output_path, index=False) |
| print(f"Generated {num_periods} financial statement periods: {output_path}") |
| return df |
|
|
|
|
| def generate_sales_orders(num_orders=500, output_path="data/synthetic_sales_orders.csv"): |
| """Generate synthetic Sales Order data.""" |
| |
| np.random.seed(42) |
| base_date = datetime(2024, 1, 1) |
| |
| products = [ |
| "Product A", "Product B", "Product C", "Product D", "Product E", |
| "Product F", "Product G", "Product H", "Product I", "Product J" |
| ] |
| |
| customers = [f"CUST{str(i).zfill(5)}" for i in range(1, 101)] |
| regions = ["North", "South", "East", "West", "Central"] |
| |
| orders = [] |
| |
| for i in range(num_orders): |
| order_date = base_date + timedelta(days=np.random.randint(0, 365)) |
| delivery_date = order_date + timedelta(days=np.random.randint(7, 30)) |
| |
| customer = np.random.choice(customers) |
| product = np.random.choice(products) |
| quantity = np.random.randint(1, 100) |
| unit_price = np.random.uniform(10, 500) |
| total_amount = quantity * unit_price |
| region = np.random.choice(regions) |
| |
| order_status = np.random.choice( |
| ["Open", "In Process", "Delivered", "Cancelled"], |
| p=[0.2, 0.3, 0.4, 0.1] |
| ) |
| |
| orders.append({ |
| "Order_Number": f"SO{str(i+1).zfill(6)}", |
| "Order_Date": order_date.strftime("%Y-%m-%d"), |
| "Delivery_Date": delivery_date.strftime("%Y-%m-%d"), |
| "Customer_ID": customer, |
| "Customer_Name": f"Customer {customer}", |
| "Product_Code": f"PRD{str(np.random.randint(1, 100)).zfill(3)}", |
| "Product_Name": product, |
| "Quantity": quantity, |
| "Unit_Price": round(unit_price, 2), |
| "Total_Amount": round(total_amount, 2), |
| "Currency": "USD", |
| "Region": region, |
| "Status": order_status, |
| "Sales_Rep": f"REP{str(np.random.randint(1, 20)).zfill(2)}" |
| }) |
| |
| df = pd.DataFrame(orders) |
| os.makedirs(os.path.dirname(output_path), exist_ok=True) |
| df.to_csv(output_path, index=False) |
| print(f"Generated {num_orders} sales orders: {output_path}") |
| return df |
|
|
|
|
| def generate_all_datasets(): |
| """Generate all synthetic datasets.""" |
| print("Generating synthetic SAP finance datasets...") |
| generate_gl_accounts() |
| generate_financial_statements() |
| generate_sales_orders() |
| print("All datasets generated successfully!") |
|
|
|
|
| if __name__ == "__main__": |
| generate_all_datasets() |
|
|
|
|