| | import io |
| | import time |
| | import pandas as pd |
| | import numpy as np |
| | import streamlit as st |
| | try: |
| | from sdmetrics.reports.single_table import QualityReport |
| | except Exception: |
| | QualityReport = None |
| |
|
# ---------------------------------------------------------------------------
# Page setup and static content
# ---------------------------------------------------------------------------
st.set_page_config(layout="wide", page_title="Sintetizador de Datos")

# Global stylesheet injected once as raw HTML: font, header banner, cards and
# rounded buttons.
CUSTOM_CSS = """
<style>
@import url("https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap");
html, body, [class*="css"] { font-family: "Inter", sans-serif; }

#app-center { max-width: 1100px; margin: 0 auto; padding: 8px 16px 24px; }

.app-header {
background: linear-gradient(135deg, #0ea5e9 0%, #6366f1 60%, #9333ea 100%);
color: #fff; border-radius: 16px; padding: 20px 24px;
box-shadow: 0 8px 28px rgba(99,102,241,.25);
}
.app-header h1 { margin: 0 0 6px 0; font-weight: 700; letter-spacing: -.015em; }
.app-header p { margin: 0; opacity: .95; }

.card {
background: #ffffff; border-radius: 14px; border: 1px solid #eef2f7;
box-shadow: 0 6px 18px rgba(15,23,42,.06); padding: 18px;
}
[data-theme="dark"] .card { background:#0b1220; border-color:#1f2937; }

.stButton>button, .stDownloadButton>button { border-radius: 10px; padding:.6rem 1rem; }
</style>
"""
st.markdown(CUSTOM_CSS, unsafe_allow_html=True)

# NOTE(review): this opening <div> is emitted by its own st.markdown call and
# no matching closing tag is emitted in the visible script; presumably it
# does not wrap the elements rendered after it -- confirm in the browser.
st.markdown('<div id="app-center">', unsafe_allow_html=True)

# Gradient banner with the app title and author.
st.markdown("""
<div class="app-header">
<h1>🧪 Sintetizador de Datos</h1>
<p>Por Ricardo Urdaneta</p>
</div>
""", unsafe_allow_html=True)

# Short introduction: what the app does and typical use cases.
st.markdown("""

Genera datos sintéticos a partir de un CSV.

La app aprende la **distribución estadística** de tus datos y crea **filas nuevas** que preservan patrones globales (distribuciones, correlaciones), sin copiar registros originales.

**Usos típicos:**
- Compartir datos con menor riesgo de **exposición**.
- **Balancear** clases raras o aumentar datos (data augmentation).
- **Probar** pipelines/ETL y entrenar modelos sin tocar datos sensibles.
""")

# Collapsible technical summary of the pipeline.
st.expander("Resumen técnico").markdown("""
- Se infiere **SingleTableMetadata** a partir del DataFrame.
- El modelo aplica **transformaciones** internas (ej., codificación de categóricas) y aprende la **distribución conjunta**.
- Síntesis:
- **GaussianCopula**: ajusta marginales y dependencias con **copulas gaussianas**.
- **CTGAN**: **GAN** tabular (generador vs. discriminador) adaptada a mixto; usamos `pac=1` para compatibilidad en batches.
- **TVAE**: **Variational Autoencoder** tabular; aprende una latente que modela la distribución global.
- Generación: se **muestran y descargan** filas nuevas en CSV; si está disponible `sdmetrics`, se reporta una **puntuación de calidad** (0–1).
""")

# Collapsible guidance on choosing a model and its parameters.
st.expander("ℹ️ Recomendaciones para elegir modelo y parámetros").markdown("""
**Cobertura de casos (recomendaciones rápidas):**
- **GaussianCopula** *(no requiere PyTorch)* → rápido en CPU, datasets pequeños/medianos, baseline estable.
- **CTGAN** *(requiere PyTorch)* → datos **mixtos** con muchas **categóricas** o **alta cardinalidad**; mejor para modas múltiples y clases minoritarias.
- **TVAE** *(requiere PyTorch)* → **numérico dominante** o cuando buscas **estabilidad** y estructura global sin adversarial.

**Consejos prácticos:**
- En CPU, usa **5–15 epochs**.
- Si el CSV es grande, **muestra** (ej.: `df.sample(100_000)`) antes de entrenar.
""")
| |
|
| | |
| | |
def torch_available():
    """Report whether PyTorch can be imported in the current environment.

    Returns:
        bool: ``True`` when ``import torch`` succeeds, ``False`` when the
        import raises any ``Exception``.
    """
    try:
        import torch  # noqa: F401 -- imported only to probe availability
    except Exception:
        return False
    else:
        return True
| |
|
| | def _info_columns(df): |
| | num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])] |
| | cat_cols = [c for c in df.columns if not pd.api.types.is_numeric_dtype(df[c])] |
| | high_card = sum(df[c].nunique(dropna=True) > 50 for c in cat_cols) |
| | return { |
| | "rows": len(df), |
| | "cols": len(df.columns), |
| | "num": len(num_cols), |
| | "cat": len(cat_cols), |
| | "high_card_cat": high_card, |
| | } |
| |
|
def recommend_model(df):
    """Suggest a synthesizer and starting hyper-parameters for ``df``.

    Rules are evaluated in this order:

    1. PyTorch cannot be imported -> ``GaussianCopula``.
    2. Fewer than 800 rows -> ``TVAE`` with 8 epochs.
    3. At least as many non-numeric as numeric columns, or at least one
       non-numeric column with more than 50 distinct values -> ``CTGAN``.
    4. Anything else -> ``TVAE``.

    Args:
        df: the uploaded training DataFrame.

    Returns:
        dict with keys ``model``, ``epochs``, ``batch``, ``sample`` and
        ``reason`` (a user-facing explanation).
    """
    stats = _info_columns(df)
    torch_ok = torch_available()
    n_rows = stats["rows"]

    # Rule 1: without PyTorch only the copula model is usable.
    if not torch_ok:
        return {
            "model": "GaussianCopula",
            "epochs": 10,
            "batch": 64,
            "sample": min(1000, max(300, n_rows)),
            "reason": "Baseline rápido y estable (CPU). PyTorch no disponible.",
        }

    # Rule 2: small datasets.
    if n_rows < 800:
        return {
            "model": "TVAE",
            "epochs": 8,
            "batch": max(16, min(64, n_rows)),
            "sample": min(1000, max(300, n_rows)),
            "reason": "Dataset pequeño; TVAE es estable y generaliza sin adversarial.",
        }

    # Rules 3 and 4 share every numeric setting and differ only in the model
    # name and the explanation.
    categorical_heavy = stats["cat"] >= stats["num"] or stats["high_card_cat"] >= 1
    if categorical_heavy:
        choice = "CTGAN"
        why = "Mixto con categóricas/alta cardinalidad; CTGAN captura modas múltiples mejor."
    else:
        choice = "TVAE"
        why = "Numérico dominante; TVAE es estable y veloz en CPU."

    return {
        "model": choice,
        "epochs": 10,
        "batch": min(128, max(32, n_rows // 10)),
        "sample": min(2000, max(500, n_rows)),
        "reason": why,
    }
| |
|
def get_model(model_name: str, epochs: int, batch_size: int, df_train: pd.DataFrame):
    """Build an untrained SDV synthesizer for ``df_train``.

    The modern SDV API (``sdv.metadata.SingleTableMetadata`` plus
    ``sdv.single_table``) is used when it can be imported; only when that
    import fails does the function fall back to the legacy ``sdv.tabular``
    classes. Errors raised by the modern API itself (metadata detection,
    constructor validation) propagate unchanged so the caller can show the
    real cause instead of an unrelated import error from the fallback.

    Args:
        model_name: ``"CTGAN"``, ``"TVAE"`` (case-insensitive); any other
            value selects the Gaussian copula model.
        epochs: training epochs for CTGAN/TVAE (ignored by the copula model).
        batch_size: requested batch size for CTGAN/TVAE. It is reduced to
            ``max(8, len(df_train))`` when the table has fewer rows than the
            batch, and rounded down to an even number for CTGAN.
        df_train: training data, used to detect the table metadata.

    Returns:
        tuple ``(model, metadata)`` where ``metadata`` is the detected
        ``SingleTableMetadata`` or ``None`` when the legacy API is used.

    Raises:
        ImportError: when neither SDV API (or a dependency of the selected
            model) can be imported.
    """
    name = model_name.lower()

    # A batch larger than the table is shrunk to the table size (floor of 8).
    if len(df_train) < batch_size:
        batch_size = max(8, len(df_train))

    # CTGAN requires an even batch size; the clamp above (or the caller) can
    # produce an odd one, so round down while keeping it positive.
    if name == "ctgan":
        batch_size = max(2, batch_size - batch_size % 2)

    try:
        from sdv.metadata import SingleTableMetadata
    except ImportError:
        SingleTableMetadata = None

    if SingleTableMetadata is not None:
        md = SingleTableMetadata()
        md.detect_from_dataframe(df_train)

        if name == "ctgan":
            from sdv.single_table import CTGANSynthesizer
            return CTGANSynthesizer(md, epochs=epochs, batch_size=batch_size, pac=1), md
        if name == "tvae":
            from sdv.single_table import TVAESynthesizer
            return TVAESynthesizer(md, epochs=epochs, batch_size=batch_size), md
        from sdv.single_table import GaussianCopulaSynthesizer
        return GaussianCopulaSynthesizer(md), md

    # Legacy SDV API: models are built without a metadata object.
    if name == "ctgan":
        from sdv.tabular import CTGAN
        return CTGAN(epochs=epochs, batch_size=batch_size, verbose=True, pac=1), None
    if name == "tvae":
        from sdv.tabular import TVAE
        return TVAE(epochs=epochs, batch_size=batch_size, verbose=True), None
    from sdv.tabular import GaussianCopula
    return GaussianCopula(), None
| |
|
| | |
# ---------------------------------------------------------------------------
# 1) CSV upload
# ---------------------------------------------------------------------------
MAX_UPLOAD_MB = 5
MAX_UPLOAD_BYTES = MAX_UPLOAD_MB * 1024 * 1024


def _read_uploaded_csv(file_bytes):
    """Parse raw CSV bytes into a DataFrame.

    Tries delimiter sniffing with the Python engine first, then an explicit
    ``;`` and finally ``,``. Returns the first successful parse and re-raises
    the last parser error when every attempt fails.
    """
    attempts = (
        {"sep": None, "engine": "python"},
        {"sep": ";"},
        {"sep": ","},
    )
    last_error = None
    for options in attempts:
        try:
            return pd.read_csv(io.BytesIO(file_bytes), **options)
        except Exception as exc:  # any parser failure moves on to the next attempt
            last_error = exc
    raise last_error


left, right = st.columns([1, 1])

with left:
    st.subheader("1) Sube tu CSV")
    st.caption(f"Máximo {MAX_UPLOAD_MB} MB · Formato: CSV")
    up = st.file_uploader("Selecciona un archivo .csv", type=["csv"], key="csv_uploader")

    # df_real stays None unless a usable table was loaded; the sections that
    # follow rely on that to decide what to render.
    df_real = None
    if up is None:
        st.info("Sube un CSV para continuar.")
    else:
        file_bytes = up.getvalue()
        size_bytes = getattr(up, "size", None) or len(file_bytes)

        if size_bytes > MAX_UPLOAD_BYTES:
            st.error(f"El archivo pesa {size_bytes/1024/1024:.2f} MB y excede el límite de {MAX_UPLOAD_MB} MB. "
                     "Sube un archivo más liviano.")
        else:
            try:
                df_real = _read_uploaded_csv(file_bytes)
            except Exception as exc:
                # Report unreadable files instead of crashing the app.
                st.error(f"No se pudo leer el CSV: {exc}")
            else:
                st.success(f"Cargado: {df_real.shape[0]} filas × {df_real.shape[1]} columnas")
                st.dataframe(df_real.head())

                # Drop columns that are entirely empty and cap the width.
                df_real = df_real.dropna(axis=1, how="all")
                if df_real.shape[1] > 25:
                    df_real = df_real.iloc[:, :25]
                    st.info("Se detectaron muchas columnas. Para este demo se usan solo las primeras 25.")

                # A table with no rows or no columns cannot be used for training.
                if df_real.empty:
                    st.error("El CSV no contiene datos utilizables (sin filas o sin columnas).")
                    df_real = None
| |
|
# ---------------------------------------------------------------------------
# 2) Model and parameters
# ---------------------------------------------------------------------------
with right:
    st.subheader("2) Modelo y parámetros")

    model_options = ["GaussianCopula", "CTGAN", "TVAE"]
    batch_options = [32, 64, 128, 256]

    # Without data there is nothing to analyse, so fixed defaults are used.
    rec = recommend_model(df_real) if df_real is not None else None

    if rec is None:
        default_index = 0
        default_epochs = 10
        default_batch = 64
        default_sample = 1000
        model_help = "Sube un CSV para obtener recomendaciones automáticas."
    else:
        default_index = model_options.index(rec["model"])
        default_epochs = max(1, min(25, rec["epochs"]))
        # select_slider only accepts a value that is one of its options, and
        # the recommended batch can be any integer (e.g. rows // 10). Snap it
        # to the largest option not above the recommendation, or to the
        # smallest option when the recommendation is below all of them.
        default_batch = max(
            (option for option in batch_options if option <= rec["batch"]),
            default=batch_options[0],
        )
        # Keep the default on the slider's 100-step grid.
        default_sample = min(5000, max(300, rec["sample"])) // 100 * 100
        model_help = f"Recomendación: {rec['reason']}"

    model_name = st.selectbox("Modelo", model_options, index=default_index, help=model_help)
    epochs = st.slider("Epochs (demo CPU)", 1, 25, value=default_epochs, step=1)
    batch_size = st.select_slider("Batch size", options=batch_options, value=default_batch)
    sample_size = st.slider("Filas sintéticas a generar", 100, 5000, value=default_sample, step=100)

    if rec is None:
        st.caption("💡 Sube un CSV para que la app recomiende modelo y parámetros.")
    else:
        st.caption(f"Sugerencia automática → **{rec['model']}** · {rec['reason']}")
| |
|
| | |
# ---------------------------------------------------------------------------
# 3) Training, evaluation and download
# ---------------------------------------------------------------------------
def _finite_values(series):
    """Return the finite values of a numeric column as a float array."""
    values = pd.to_numeric(series, errors="coerce").to_numpy(dtype=float, na_value=np.nan)
    return values[np.isfinite(values)]


def _chart_data(series, edges):
    """Build the per-column data shown in the comparison bar charts.

    Numeric columns are counted over the shared bin ``edges`` and indexed by
    bin centre; other columns show their 30 most frequent values as strings.
    """
    if pd.api.types.is_numeric_dtype(series):
        counts, _ = np.histogram(_finite_values(series), bins=edges)
        centres = (edges[:-1] + edges[1:]) / 2
        return pd.Series(counts, index=centres, name="count")
    return series.astype(str).value_counts().head(30)


st.subheader("3) Entrenamiento y generación")

train_btn = st.button("Entrenar modelo y generar datos sintéticos")

# NOTE(review): everything below is rendered only while train_btn is truthy;
# presumably the results disappear on the next rerun -- confirm whether they
# should be kept in st.session_state.
if train_btn:
    if df_real is None:
        st.warning("Primero sube un CSV en la sección 1.")
        st.stop()

    df_train = df_real.copy()
    # The upload step currently keeps at most 25 columns, so this guard only
    # matters if that cap is raised.
    if df_train.shape[1] >= 40 and sample_size > 2000:
        sample_size = 2000
        st.warning("Muchas columnas detectadas; se limita sample_size a 2000.")

    try:
        model, metadata = get_model(
            model_name=model_name,
            epochs=epochs,
            batch_size=batch_size,
            df_train=df_train,
        )
    except Exception as e:
        st.error("No se pudo inicializar el modelo. Posible falta de dependencias (torch para CTGAN/TVAE) o versión de SDV.")
        st.code(str(e))
        st.stop()

    # Training and sampling failures are reported in the page instead of
    # surfacing as an unhandled traceback.
    fit_error = None
    with st.spinner("Entrenando modelo…"):
        t0 = time.perf_counter()
        try:
            model.fit(df_train)
        except Exception as e:
            fit_error = e
        dt = time.perf_counter() - t0
    if fit_error is not None:
        st.error("El entrenamiento falló. Revisa los datos o prueba con otro modelo/parámetros.")
        st.code(str(fit_error))
        st.stop()
    st.success(f"✅ Entrenamiento completado en {dt:.1f} s")

    sample_error = None
    with st.spinner("Generando datos sintéticos…"):
        try:
            df_syn = model.sample(sample_size)
        except Exception as e:
            sample_error = e
    if sample_error is not None:
        st.error("No se pudieron generar los datos sintéticos.")
        st.code(str(sample_error))
        st.stop()

    st.write("### Muestra de datos sintéticos")
    st.dataframe(df_syn.head())

    st.write("### Evaluación de calidad")
    if QualityReport is None:
        st.info("`sdmetrics` no está disponible; añade `sdmetrics>=0.12.0` a requirements.txt.")
    else:
        try:
            report = QualityReport()

            # Pass the metadata as a dict when the object offers to_dict().
            md = None
            if metadata is not None:
                md = metadata.to_dict() if hasattr(metadata, "to_dict") else metadata

            if md is not None:
                report.generate(real_data=df_train, synthetic_data=df_syn, metadata=md)
            else:
                report.generate(real_data=df_train, synthetic_data=df_syn)

            st.metric("Calidad global (0–1)", f"{report.get_score():.3f}")
        except Exception as e:
            st.warning(f"No se pudo calcular la calidad: {e}")

    st.write("### Comparación univariante (hasta 6 columnas)")
    cols = df_train.columns[:6]
    if len(cols) > 0:
        tabs = st.tabs([str(c) for c in cols])
        for tab, c in zip(tabs, cols):
            real_col = df_train[c]
            syn_col = df_syn[c] if c in df_syn.columns else None

            # Real and synthetic histograms share one set of bin edges so the
            # two charts of a tab are directly comparable.
            numeric_parts = [
                _finite_values(s)
                for s in (real_col, syn_col)
                if s is not None and pd.api.types.is_numeric_dtype(s)
            ]
            edges = (
                np.histogram_bin_edges(np.concatenate(numeric_parts), bins=20)
                if numeric_parts
                else None
            )

            with tab:
                c1, c2 = st.columns(2)
                with c1:
                    st.write("Real")
                    st.bar_chart(_chart_data(real_col, edges))
                with c2:
                    st.write("Sintético")
                    if syn_col is None:
                        st.info("Columna no generada.")
                    else:
                        st.bar_chart(_chart_data(syn_col, edges))

    st.write("### Descargar CSV sintético")
    st.download_button(
        "Descargar synthetic.csv",
        data=df_syn.to_csv(index=False).encode("utf-8"),
        file_name="synthetic.csv",
        mime="text/csv",
    )