# MLOps-risk-model / src/simulate_production_data.py
# github-actions[bot]
# deploy: sync from GitHub main
# 2127e22
# src/simulate_production_data.py
import os
import sys
from pathlib import Path
import numpy as np
import pandas as pd
# Handle both module and direct execution
if __name__ == "__main__":
# Add parent directory to path for direct execution
sys.path.insert(0, str(Path(__file__).parent.parent))
from src.config import PRODUCTION_DATA_DIR, PRODUCTION_DATA_PATH, RANDOM_STATE
from src.generate_synthetic_data import generate_credit_risk_dataset
else:
# Relative imports for module execution
from .config import PRODUCTION_DATA_DIR, PRODUCTION_DATA_PATH, RANDOM_STATE
from .generate_synthetic_data import generate_credit_risk_dataset
def apply_data_drift(df: pd.DataFrame, rng: np.random.Generator) -> pd.DataFrame:
    """Simulate data drift by shifting the channel mix and lowering incomes.

    The input frame is left untouched; all changes are made on a copy.

    Args:
        df: Batch to perturb. Must contain a ``monthly_income`` column; any
            existing ``channel`` column is overwritten.
        rng: NumPy random generator used for every random draw.

    Returns:
        A new DataFrame whose ``channel`` column is resampled from the
        drifted channel mix and whose ``monthly_income`` is scaled by a
        per-row factor in [0.6, 0.9) and floored at 800.
    """
    drifted = df.copy()
    n_rows = len(drifted)

    # More digital traffic: 'web' and 'partner' take most of the share
    # (the original note says 'branch' used to be 0.4).
    channel_mix = {"branch": 0.15, "web": 0.45, "partner": 0.3, "call_center": 0.10}
    drifted["channel"] = rng.choice(
        list(channel_mix.keys()),
        size=n_rows,
        p=list(channel_mix.values()),
    )

    # Riskier customers: shrink every income, but never below 800.
    shrink_factor = rng.uniform(0.6, 0.9, size=n_rows)
    drifted["monthly_income"] = (drifted["monthly_income"] * shrink_factor).clip(lower=800)

    return drifted
def apply_model_drift(df: pd.DataFrame, rng: np.random.Generator) -> pd.DataFrame:
    """Simulate model drift by changing the X -> y (``default_90d``) relation.

    Any existing ``default_90d`` column is ignored and the label is re-drawn
    from a new logistic rule. The input frame is left untouched.

    Args:
        df: Batch containing ``debt_to_income``, ``num_open_loans``,
            ``monthly_income``, ``channel`` and ``employment_type``.
        rng: NumPy random generator used for the noise and the label draw.

    Returns:
        A copy of ``df`` whose ``default_90d`` column holds the new 0/1 labels.
    """
    drifted = df.copy()
    n_rows = len(drifted)

    dti = drifted["debt_to_income"].values
    open_loans = drifted["num_open_loans"].values
    income = drifted["monthly_income"].values
    sales_channel = drifted["channel"].values
    job_type = drifted["employment_type"].values

    # New "world": the 'partner' channel is now much riskier and
    # debt_to_income carries a larger weight. Terms are accumulated in this
    # exact order so the floating-point result is stable.
    weighted_flags = (
        (0.8, open_loans > 3),
        (0.7, sales_channel == "partner"),
        (0.4, sales_channel == "web"),
        (0.6, job_type == "unemployed"),
        (0.3, income < 1800),
    )
    logit = -1.0 + 2.3 * dti
    for weight, flag in weighted_flags:
        logit = logit + weight * flag

    # Logistic link, then a little Gaussian noise, kept inside [0.01, 0.98].
    base_prob = 1 / (1 + np.exp(-logit))
    jitter = rng.normal(0, 0.03, size=n_rows)
    default_prob = np.clip(base_prob + jitter, 0.01, 0.98)

    drifted["default_90d"] = rng.binomial(1, default_prob, size=n_rows)
    return drifted
def main() -> None:
    """Generate one production batch with simulated drift and save it as CSV.

    Configuration is read from the environment:

    * ``DRIFT_SCENARIO`` -- ``data``, ``model`` or ``both`` (default ``both``).
      Matching is case-insensitive and ignores surrounding whitespace. Any
      other value applies no drift; a warning is printed to stderr so that a
      typo does not silently produce an undrifted batch.
    * ``PRODUCTION_SAMPLES`` -- number of rows to generate (default 5000).

    The batch is written to ``PRODUCTION_DATA_PATH``; ``PRODUCTION_DATA_DIR``
    is created if it does not exist.

    Raises:
        ValueError: If ``PRODUCTION_SAMPLES`` is not a valid integer.
    """
    rng = np.random.default_rng(RANDOM_STATE + 123)
    scenario = os.environ.get("DRIFT_SCENARIO", "both").strip().lower()
    n_samples = int(os.environ.get("PRODUCTION_SAMPLES", 5000))

    print(f"Generando batch de producción con escenario: {scenario} (n={n_samples})")

    if scenario not in ("data", "model", "both"):
        # Unknown values still yield a (baseline) batch, but make it visible.
        print(
            f"Advertencia: DRIFT_SCENARIO={scenario!r} no reconocido "
            "(se esperaba 'data', 'model' o 'both'); no se aplica drift.",
            file=sys.stderr,
        )

    # Start from a "normal" dataset, seeded independently of the drift RNG.
    df = generate_credit_risk_dataset(n_samples=n_samples, random_state=RANDOM_STATE + 999)

    # Data drift runs first so the re-drawn labels see the drifted features.
    if scenario in ("data", "both"):
        df = apply_data_drift(df, rng)
    if scenario in ("model", "both"):
        df = apply_model_drift(df, rng)

    PRODUCTION_DATA_DIR.mkdir(parents=True, exist_ok=True)
    df.to_csv(PRODUCTION_DATA_PATH, index=False)
    print(f"Batch de producción guardado en: {PRODUCTION_DATA_PATH}")


if __name__ == "__main__":
    main()