MLOps-risk-model / src /generate_synthetic_data.py
github-actions[bot]
deploy: sync from GitHub main
1e5b98a
import numpy as np
import pandas as pd
def generate_credit_risk_dataset(n_samples: int = 10000, random_state: int = 42) -> pd.DataFrame:
    """Generate a synthetic credit-application dataset with a default label.

    Every column is drawn from a single ``numpy`` generator seeded with
    ``random_state``, so the same arguments always yield the same frame.
    The order of the random draws below is part of that contract: moving
    a draw changes every value generated after it.

    Args:
        n_samples: Number of rows (applications) to generate.
        random_state: Seed passed to ``np.random.default_rng``.

    Returns:
        A DataFrame with 20 columns: 19 applicant/loan features and the
        binary target ``default_90d``.
    """
    rng = np.random.default_rng(random_state)
    n = n_samples

    def pick(options, weights):
        # Categorical draw, one value per row, with explicit probabilities.
        return rng.choice(options, size=n, p=weights)

    def uniform_int(low, high):
        # Integer draw per row from [low, high) -- `high` is exclusive.
        return rng.integers(low, high, size=n)

    # 1) customer_id: "C" followed by a zero-padded, 1-based sequence number.
    customer_id = [f"C{i:06d}" for i in range(1, n + 1)]

    # 2) application_date: spread over roughly three years.
    first_day = np.datetime64("2022-01-01")
    last_day = np.datetime64("2024-12-31")
    span_days = (last_day - first_day).astype(int)
    day_offsets = uniform_int(0, span_days)
    application_date = first_day + day_offsets.astype("timedelta64[D]")

    # 3-6) demographics.
    age = uniform_int(18, 76)
    gender = pick(["M", "F"], [0.5, 0.5])
    marital_status = pick(
        ["single", "married", "divorced", "widowed"],
        [0.45, 0.4, 0.1, 0.05],
    )
    dependents = uniform_int(0, 6)

    # 7) monthly_income: right-skewed (low incomes are more likely),
    #    shifted by 800 and capped at 15000.
    raw_income = rng.gamma(shape=2.0, scale=1500.0, size=n) + 800
    monthly_income = raw_income.clip(800, 15000)

    # 8) employment_type
    employment_type = pick(
        ["permanent", "contract", "self_employed", "unemployed"],
        [0.55, 0.2, 0.2, 0.05],
    )

    # 9) employment_months: start from a wide range, then redraw shorter
    #    tenures for unemployed and contract applicants (in that order).
    employment_months = uniform_int(0, 361)
    for kind, upper in (("unemployed", 6), ("contract", 60)):
        rows = employment_type == kind
        employment_months[rows] = rng.integers(0, upper, size=rows.sum())

    # 10-12) loan terms.
    requested_amount = rng.normal(loc=30000, scale=12000, size=n).clip(5000, 80000)
    loan_term_months = pick([12, 24, 36, 48, 60], [0.1, 0.2, 0.35, 0.2, 0.15])
    interest_rate = rng.normal(loc=18, scale=5, size=n).clip(8, 35)

    # 13) installment: rough approximation using simple (non-compounding)
    #     interest over the term; financial precision is not required.
    simple_interest = interest_rate / 100 * loan_term_months / 12
    total_repayable = requested_amount * (1 + simple_interest)
    installment = total_repayable / loan_term_months

    # 14) debt_to_income: total debt is assumed to be about three
    #     installments plus noise, floored at zero; the ratio is capped at 1.5.
    debt_noise = rng.normal(0, 500, size=n)
    approx_total_debt = np.clip(installment * 3 + debt_noise, 0, None)
    debt_to_income = np.clip(approx_total_debt / monthly_income, 0, 1.5)

    # 15-17) existing credit products.
    num_open_loans = uniform_int(0, 11)
    num_credit_cards = uniform_int(0, 9)
    has_mortgage = rng.binomial(1, 0.3, size=n)

    # 18-19) acquisition channel and region.
    channel = pick(
        ["branch", "web", "partner", "call_center"],
        [0.4, 0.25, 0.25, 0.10],
    )
    region = pick(
        ["capital", "north", "south", "east", "west"],
        [0.4, 0.2, 0.15, 0.15, 0.1],
    )

    # 20) default_90d: synthetic logistic model. The terms are accumulated
    #     in a fixed order so the floating-point result is reproducible.
    risk_raisers = [
        (1.8, debt_to_income),
        (0.6, num_open_loans > 4),
        (0.4, num_credit_cards > 4),
        (0.8, employment_type == "unemployed"),
        (0.4, employment_type == "contract"),
        (0.3, channel == "web"),
        (0.3, channel == "partner"),
        (0.5, monthly_income < 2000),
    ]
    risk_reducers = [
        (0.3, has_mortgage == 1),
        (0.2, (age >= 30) & (age <= 50)),
    ]
    score = -2.0
    for weight, feature in risk_raisers:
        score = score + weight * feature
    for weight, feature in risk_reducers:
        score = score - weight * feature

    # Sigmoid turns the score into a probability; a little Gaussian noise
    # keeps the label from being too deterministic, then the probability
    # is clipped to [0.01, 0.95].
    p_default = 1 / (1 + np.exp(-score))
    p_noise = rng.normal(0, 0.02, size=n)
    p_default = np.clip(p_default + p_noise, 0.01, 0.95)
    default_90d = rng.binomial(1, p_default, size=n)

    columns = {
        "customer_id": customer_id,
        "application_date": application_date,
        "age": age,
        "gender": gender,
        "marital_status": marital_status,
        "dependents": dependents,
        "monthly_income": np.round(monthly_income, 2),
        "employment_type": employment_type,
        "employment_months": employment_months,
        "requested_amount": np.round(requested_amount, 2),
        "loan_term_months": loan_term_months,
        "interest_rate": np.round(interest_rate, 2),
        "installment": np.round(installment, 2),
        "debt_to_income": np.round(debt_to_income, 3),
        "num_open_loans": num_open_loans,
        "num_credit_cards": num_credit_cards,
        "has_mortgage": has_mortgage,
        "channel": channel,
        "region": region,
        "default_90d": default_90d,
    }
    return pd.DataFrame(columns)
if __name__ == "__main__":
    # Script entry point: generate the dataset, persist it as CSV and print
    # a quick sanity check (first rows and the observed default rate).
    from pathlib import Path

    output_path = Path("data/raw/credit_risk_synthetic.csv")
    # DataFrame.to_csv does not create missing parent directories, so make
    # sure the target folder exists before writing (e.g. on a fresh clone).
    output_path.parent.mkdir(parents=True, exist_ok=True)

    df = generate_credit_risk_dataset(n_samples=15000, random_state=42)
    df.to_csv(output_path, index=False)
    print(df.head())
    print(df["default_90d"].mean())