|
|
""" |
|
|
Synthetic SAP Finance Data Generator |
|
|
|
|
|
Generates synthetic datasets for: |
|
|
- General Ledger accounts with transactions |
|
|
- Financial Statements (P&L and Balance Sheet) |
|
|
- Sales Order data |
|
|
""" |
|
|
|
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
from datetime import datetime, timedelta |
|
|
import os |
|
|
|
|
|
|
|
|
def generate_gl_accounts(num_transactions=1000, output_path="data/synthetic_gl_accounts.csv"): |
|
|
"""Generate synthetic General Ledger accounts with transactions.""" |
|
|
|
|
|
|
|
|
account_codes = [ |
|
|
"100000", "110000", "120000", "130000", "140000", |
|
|
"200000", "210000", "220000", "230000", |
|
|
"300000", "310000", "320000", |
|
|
"400000", "410000", "420000", "430000", |
|
|
"500000", "510000", "520000", "530000", "540000", |
|
|
] |
|
|
|
|
|
account_descriptions = [ |
|
|
"Cash and Cash Equivalents", "Accounts Receivable", "Inventory", |
|
|
"Prepaid Expenses", "Property, Plant & Equipment", |
|
|
"Accounts Payable", "Accrued Liabilities", "Short-term Debt", "Long-term Debt", |
|
|
"Common Stock", "Retained Earnings", "Other Equity", |
|
|
"Sales Revenue", "Service Revenue", "Interest Income", "Other Income", |
|
|
"Cost of Goods Sold", "Salaries and Wages", "Rent Expense", |
|
|
"Utilities Expense", "Marketing Expense" |
|
|
] |
|
|
|
|
|
np.random.seed(42) |
|
|
|
|
|
transactions = [] |
|
|
base_date = datetime(2024, 1, 1) |
|
|
|
|
|
for i in range(num_transactions): |
|
|
account_idx = np.random.randint(0, len(account_codes)) |
|
|
transaction_date = base_date + timedelta(days=np.random.randint(0, 365)) |
|
|
|
|
|
|
|
|
account_num = int(account_codes[account_idx][0]) |
|
|
if account_num in [1, 5]: |
|
|
debit = np.random.uniform(100, 50000) |
|
|
credit = 0 |
|
|
else: |
|
|
debit = 0 |
|
|
credit = np.random.uniform(100, 50000) |
|
|
|
|
|
transactions.append({ |
|
|
"Transaction_ID": f"TXN{str(i+1).zfill(6)}", |
|
|
"Date": transaction_date.strftime("%Y-%m-%d"), |
|
|
"Account_Code": account_codes[account_idx], |
|
|
"Account_Description": account_descriptions[account_idx], |
|
|
"Debit": round(debit, 2), |
|
|
"Credit": round(credit, 2), |
|
|
"Balance": round(debit - credit, 2), |
|
|
"Document_Number": f"DOC{str(np.random.randint(1000, 9999))}", |
|
|
"Posting_Period": transaction_date.strftime("%Y-%m") |
|
|
}) |
|
|
|
|
|
df = pd.DataFrame(transactions) |
|
|
os.makedirs(os.path.dirname(output_path), exist_ok=True) |
|
|
df.to_csv(output_path, index=False) |
|
|
print(f"Generated {num_transactions} GL transactions: {output_path}") |
|
|
return df |
|
|
|
|
|
|
|
|
def generate_financial_statements(num_periods=12, output_path="data/synthetic_financial_statements.csv"): |
|
|
"""Generate synthetic Financial Statements (P&L and Balance Sheet).""" |
|
|
|
|
|
np.random.seed(42) |
|
|
base_date = datetime(2024, 1, 1) |
|
|
|
|
|
statements = [] |
|
|
|
|
|
for period in range(num_periods): |
|
|
period_date = base_date + timedelta(days=period * 30) |
|
|
period_str = period_date.strftime("%Y-%m") |
|
|
|
|
|
|
|
|
revenue = np.random.uniform(500000, 1000000) |
|
|
cogs = revenue * np.random.uniform(0.4, 0.6) |
|
|
gross_profit = revenue - cogs |
|
|
|
|
|
operating_expenses = np.random.uniform(200000, 400000) |
|
|
ebitda = gross_profit - operating_expenses |
|
|
depreciation = np.random.uniform(20000, 50000) |
|
|
ebit = ebitda - depreciation |
|
|
interest_expense = np.random.uniform(10000, 30000) |
|
|
ebt = ebit - interest_expense |
|
|
tax = ebt * 0.25 |
|
|
net_income = ebt - tax |
|
|
|
|
|
|
|
|
cash = np.random.uniform(100000, 500000) |
|
|
accounts_receivable = np.random.uniform(200000, 400000) |
|
|
inventory = np.random.uniform(150000, 300000) |
|
|
current_assets = cash + accounts_receivable + inventory |
|
|
ppe = np.random.uniform(2000000, 5000000) |
|
|
total_assets = current_assets + ppe |
|
|
|
|
|
accounts_payable = np.random.uniform(100000, 200000) |
|
|
short_term_debt = np.random.uniform(50000, 150000) |
|
|
current_liabilities = accounts_payable + short_term_debt |
|
|
long_term_debt = np.random.uniform(1000000, 2000000) |
|
|
total_liabilities = current_liabilities + long_term_debt |
|
|
|
|
|
equity = total_assets - total_liabilities |
|
|
|
|
|
statements.append({ |
|
|
"Period": period_str, |
|
|
"Statement_Type": "P&L", |
|
|
"Revenue": round(revenue, 2), |
|
|
"Cost_of_Goods_Sold": round(cogs, 2), |
|
|
"Gross_Profit": round(gross_profit, 2), |
|
|
"Operating_Expenses": round(operating_expenses, 2), |
|
|
"EBITDA": round(ebitda, 2), |
|
|
"Depreciation": round(depreciation, 2), |
|
|
"EBIT": round(ebit, 2), |
|
|
"Interest_Expense": round(interest_expense, 2), |
|
|
"EBT": round(ebt, 2), |
|
|
"Tax": round(tax, 2), |
|
|
"Net_Income": round(net_income, 2), |
|
|
"Cash": round(cash, 2), |
|
|
"Accounts_Receivable": round(accounts_receivable, 2), |
|
|
"Inventory": round(inventory, 2), |
|
|
"Current_Assets": round(current_assets, 2), |
|
|
"PPE": round(ppe, 2), |
|
|
"Total_Assets": round(total_assets, 2), |
|
|
"Accounts_Payable": round(accounts_payable, 2), |
|
|
"Short_Term_Debt": round(short_term_debt, 2), |
|
|
"Current_Liabilities": round(current_liabilities, 2), |
|
|
"Long_Term_Debt": round(long_term_debt, 2), |
|
|
"Total_Liabilities": round(total_liabilities, 2), |
|
|
"Equity": round(equity, 2) |
|
|
}) |
|
|
|
|
|
df = pd.DataFrame(statements) |
|
|
os.makedirs(os.path.dirname(output_path), exist_ok=True) |
|
|
df.to_csv(output_path, index=False) |
|
|
print(f"Generated {num_periods} financial statement periods: {output_path}") |
|
|
return df |
|
|
|
|
|
|
|
|
def generate_sales_orders(num_orders=500, output_path="data/synthetic_sales_orders.csv"): |
|
|
"""Generate synthetic Sales Order data.""" |
|
|
|
|
|
np.random.seed(42) |
|
|
base_date = datetime(2024, 1, 1) |
|
|
|
|
|
products = [ |
|
|
"Product A", "Product B", "Product C", "Product D", "Product E", |
|
|
"Product F", "Product G", "Product H", "Product I", "Product J" |
|
|
] |
|
|
|
|
|
customers = [f"CUST{str(i).zfill(5)}" for i in range(1, 101)] |
|
|
regions = ["North", "South", "East", "West", "Central"] |
|
|
|
|
|
orders = [] |
|
|
|
|
|
for i in range(num_orders): |
|
|
order_date = base_date + timedelta(days=np.random.randint(0, 365)) |
|
|
delivery_date = order_date + timedelta(days=np.random.randint(7, 30)) |
|
|
|
|
|
customer = np.random.choice(customers) |
|
|
product = np.random.choice(products) |
|
|
quantity = np.random.randint(1, 100) |
|
|
unit_price = np.random.uniform(10, 500) |
|
|
total_amount = quantity * unit_price |
|
|
region = np.random.choice(regions) |
|
|
|
|
|
order_status = np.random.choice( |
|
|
["Open", "In Process", "Delivered", "Cancelled"], |
|
|
p=[0.2, 0.3, 0.4, 0.1] |
|
|
) |
|
|
|
|
|
orders.append({ |
|
|
"Order_Number": f"SO{str(i+1).zfill(6)}", |
|
|
"Order_Date": order_date.strftime("%Y-%m-%d"), |
|
|
"Delivery_Date": delivery_date.strftime("%Y-%m-%d"), |
|
|
"Customer_ID": customer, |
|
|
"Customer_Name": f"Customer {customer}", |
|
|
"Product_Code": f"PRD{str(np.random.randint(1, 100)).zfill(3)}", |
|
|
"Product_Name": product, |
|
|
"Quantity": quantity, |
|
|
"Unit_Price": round(unit_price, 2), |
|
|
"Total_Amount": round(total_amount, 2), |
|
|
"Currency": "USD", |
|
|
"Region": region, |
|
|
"Status": order_status, |
|
|
"Sales_Rep": f"REP{str(np.random.randint(1, 20)).zfill(2)}" |
|
|
}) |
|
|
|
|
|
df = pd.DataFrame(orders) |
|
|
os.makedirs(os.path.dirname(output_path), exist_ok=True) |
|
|
df.to_csv(output_path, index=False) |
|
|
print(f"Generated {num_orders} sales orders: {output_path}") |
|
|
return df |
|
|
|
|
|
|
|
|
def generate_all_datasets(): |
|
|
"""Generate all synthetic datasets.""" |
|
|
print("Generating synthetic SAP finance datasets...") |
|
|
generate_gl_accounts() |
|
|
generate_financial_statements() |
|
|
generate_sales_orders() |
|
|
print("All datasets generated successfully!") |
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
generate_all_datasets() |
|
|
|
|
|
|