Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import joblib | |
| import gradio as gr | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from sklearn.decomposition import PCA | |
| from sklearn.preprocessing import StandardScaler | |
# Load the fitted pipeline and the cleaned dataset.
# The project-relative locations (../models, ../data) are tried first; if
# those files cannot be opened, fall back to files in the current directory.
try:
    pipeline = joblib.load('../models/final_pipe.joblib')
    df_original = pd.read_csv('../data/clean_df.csv')
except OSError:
    # Only a missing/unreadable file triggers the fallback. Any other failure
    # (corrupt artifact, parse error, Ctrl-C) now surfaces instead of being
    # silently masked by a bare ``except``.
    pipeline = joblib.load('final_pipe.joblib')
    df_original = pd.read_csv('clean_df.csv')
# Feature table handed to the pipeline: the original data without the
# ID, Education and Marital_Status columns.
df_features = df_original.drop(columns=['ID', 'Education', 'Marital_Status'])

# Cluster label predicted for every row of the full dataset.
cluster_predictions = pipeline.predict(df_features)

# Copy of the original data with the predicted label attached as 'Cluster'.
df_viz = df_original.assign(Cluster=cluster_predictions)

# Run the pipeline's own scaler and PCA steps by hand to obtain the
# coordinates used for plotting.
_scaler_step = pipeline.named_steps['scaler']
_pca_step = pipeline.named_steps['pca']
X_scaled = _scaler_step.transform(df_features)
X_pca = _pca_step.transform(X_scaled)
# Maps dataset column names to the Spanish labels shown in the UI.
VARIABLE_NAMES = {
    # Household / demographics
    "Year_Birth": "Año de Nacimiento",
    "Income": "Ingresos (log)",
    "Kidhome": "Niños en Casa",
    "Teenhome": "Adolescentes en Casa",
    "Recency": "Días desde Última Compra",
    # Spending per product category
    "MntWines": "Gastos en Vinos (log)",
    "MntFruits": "Gastos en Frutas (log)",
    "MntMeatProducts": "Gastos en Carnes (log)",
    "MntFishProducts": "Gastos en Pescados (log)",
    "MntSweetProducts": "Gastos en Dulces (log)",
    "MntGoldProds": "Gastos en Productos Premium (log)",
    # Purchase channels
    "NumWebPurchases": "Compras Web",
    "NumCatalogPurchases": "Compras por Catálogo",
    "NumStorePurchases": "Compras en Tienda",
    "NumWebVisitsMonth": "Visitas Web por Mes",
}
def analyze_clusters(clusters=(0, 1), data=None):
    """Summarise the key characteristics of each customer cluster.

    Args:
        clusters: Iterable of cluster labels to describe. Defaults to
            ``(0, 1)``, the two labels this function previously hard-coded.
        data: DataFrame with a ``Cluster`` column plus the feature columns
            read below. Defaults to the module-level ``df_viz``.

    Returns:
        dict mapping each requested label to a dict with the keys
        ``avg_income``, ``avg_wines``, ``avg_meat``, ``avg_gold``,
        ``avg_web``, ``avg_store``, ``avg_kids``, ``avg_teens`` and
        ``count``. A label with no rows gets NaN averages and a count of 0.
    """
    if data is None:
        data = df_viz

    # Columns treated as log1p-scaled (per the original "undo log1p" note).
    # The mean is taken on the stored scale and then mapped back with expm1,
    # so the result is the back-transformed mean of the logs, not the
    # arithmetic mean of the raw amounts.
    log_scaled = {
        'avg_income': 'Income',
        'avg_wines': 'MntWines',
        'avg_meat': 'MntMeatProducts',
        'avg_gold': 'MntGoldProds',
    }
    # Columns averaged as stored.
    plain = {
        'avg_web': 'NumWebPurchases',
        'avg_store': 'NumStorePurchases',
        'avg_kids': 'Kidhome',
        'avg_teens': 'Teenhome',
    }

    cluster_descriptions = {}
    for cluster in clusters:
        cluster_data = data[data['Cluster'] == cluster]
        stats = {
            key: np.expm1(cluster_data[column].mean())
            for key, column in log_scaled.items()
        }
        stats.update(
            (key, cluster_data[column].mean()) for key, column in plain.items()
        )
        stats['count'] = len(cluster_data)
        cluster_descriptions[cluster] = stats
    return cluster_descriptions
def get_cluster_interpretation():
    """Build the Markdown text that profiles clusters 0 and 1.

    Uses the per-cluster statistics from ``analyze_clusters()`` to pick a
    label for income, wine spending, family size and preferred channel, and
    returns one Markdown string with a section per cluster.
    """

    def _income_label(avg_income):
        # Thresholds apply to the back-transformed average income.
        if avg_income > 50000:
            return "ingresos altos"
        if avg_income > 30000:
            return "ingresos medios"
        return "ingresos bajos"

    def _wine_label(avg_wines):
        if avg_wines > 200:
            return "alto gasto en vinos"
        if avg_wines > 50:
            return "gasto moderado en vinos"
        return "bajo gasto en vinos"

    def _family_label(children):
        # `children` is the average number of kids plus teens at home.
        if children > 1:
            return "con familias más grandes"
        if children > 0.5:
            return "con hijos"
        return "sin hijos o familias pequeñas"

    stats_by_cluster = analyze_clusters()
    sections = ["""
## 🧠 Interpretación de los Clusters
Basándose en el análisis de los datos, nuestro modelo ha identificado dos segmentos principales de clientes:
"""]

    for cluster in (0, 1):
        stats = stats_by_cluster[cluster]
        income_level = _income_label(stats['avg_income'])
        wine_spending = _wine_label(stats['avg_wines'])
        family_status = _family_label(stats['avg_kids'] + stats['avg_teens'])
        channel_pref = (
            "prefieren compras online"
            if stats['avg_web'] > stats['avg_store']
            else "prefieren compras en tienda física"
        )
        sections.append(f"""
### 🎯 **Cluster {cluster}** ({stats['count']} clientes)
**Perfil:** Clientes con {income_level}, {wine_spending}, {family_status} y que {channel_pref}.
**Características principales:**
- 💰 Ingresos promedio: ${stats['avg_income']:,.0f}
- 🍷 Gasto en vinos: ${stats['avg_wines']:.0f}
- 🥩 Gasto en carnes: ${stats['avg_meat']:.0f}
- ✨ Productos premium: ${stats['avg_gold']:.0f}
- 🛒 Compras web: {stats['avg_web']:.1f} | Tienda: {stats['avg_store']:.1f}
- 👶 Niños: {stats['avg_kids']:.1f} | Adolescentes: {stats['avg_teens']:.1f}
""")

    return "".join(sections)
def create_cluster_plot():
    """Draw the 2-D scatter plot of the customers, coloured by cluster.

    Reads the module-level ``cluster_predictions`` and ``X_pca`` and plots
    the first two columns of ``X_pca``, one scatter series per distinct
    label.

    Returns:
        The ``matplotlib.pyplot`` module, whose current figure is the newly
        created plot (kept as-is so existing callers continue to work).
    """
    plt.figure(figsize=(10, 6))

    palette = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']
    for i, cluster in enumerate(np.unique(cluster_predictions)):
        mask = cluster_predictions == cluster
        # Cycle through the palette so more than len(palette) clusters no
        # longer raises IndexError.
        plt.scatter(X_pca[mask, 0], X_pca[mask, 1],
                    c=palette[i % len(palette)], label=f'Cluster {cluster}',
                    alpha=0.6, s=50)

    plt.xlabel('Componente Principal 1')
    plt.ylabel('Componente Principal 2')
    plt.title('Segmentación de Clientes - Visualización 2D con PCA')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    return plt
def get_cluster_summary(selected_cluster):
    """Return a two-column table with the mean of each tracked variable.

    Rows of ``df_viz`` whose ``Cluster`` equals ``selected_cluster`` are
    averaged column by column; columns missing from the data are skipped.
    The result has the columns ``Variable`` (display label from
    ``VARIABLE_NAMES``) and ``Promedio`` (mean rounded to 2 decimals).
    """
    members = df_viz.loc[df_viz['Cluster'] == selected_cluster]

    tracked_columns = (
        'Year_Birth', 'Income', 'Kidhome', 'Teenhome', 'Recency',
        'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
        'MntSweetProducts', 'MntGoldProds', 'NumWebPurchases',
        'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
    )

    table_rows = [
        [VARIABLE_NAMES.get(column, column), round(members[column].mean(), 2)]
        for column in tracked_columns
        if column in members.columns
    ]
    return pd.DataFrame(table_rows, columns=['Variable', 'Promedio'])
def predict_new_customer(customer_id, year_birth, income, kidhome, teenhome, recency,
                         mnt_wines, mnt_fruits, mnt_meat, mnt_fish, mnt_sweet, mnt_gold,
                         num_deals, num_web, num_catalog, num_store, num_web_visits,
                         cmp3, cmp4, cmp5, cmp1, cmp2, complain, response):
    """Predict the cluster of a new customer and describe that cluster.

    Income and the six spending amounts are passed through ``np.log1p``
    before prediction (the original comment states this mirrors the
    preprocessing). ``customer_id`` is only echoed in the output; it is not
    a model feature.

    Returns:
        Markdown string with the predicted cluster and its group statistics.
    """
    # Single-row feature dict, without the ID column.
    feature_row = {
        'Year_Birth': year_birth,
        'Income': np.log1p(income),
        'Kidhome': kidhome,
        'Teenhome': teenhome,
        'Recency': recency,
        'MntWines': np.log1p(mnt_wines),
        'MntFruits': np.log1p(mnt_fruits),
        'MntMeatProducts': np.log1p(mnt_meat),
        'MntFishProducts': np.log1p(mnt_fish),
        'MntSweetProducts': np.log1p(mnt_sweet),
        'MntGoldProds': np.log1p(mnt_gold),
        'NumDealsPurchases': num_deals,
        'NumWebPurchases': num_web,
        'NumCatalogPurchases': num_catalog,
        'NumStorePurchases': num_store,
        'NumWebVisitsMonth': num_web_visits,
        'AcceptedCmp3': cmp3,
        'AcceptedCmp4': cmp4,
        'AcceptedCmp5': cmp5,
        'AcceptedCmp1': cmp1,
        'AcceptedCmp2': cmp2,
        'Complain': complain,
        'Z_CostContact': 3,   # constant in the dataset, per the original note
        'Z_Revenue': 11,      # constant in the dataset, per the original note
        'Response': response,
    }
    new_customer = pd.DataFrame([feature_row])

    # The pipeline is fed the raw row, so any further transformation
    # happens inside it.
    predicted_cluster = pipeline.predict(new_customer)[0]

    # Group statistics of the cluster the customer landed in.
    info = analyze_clusters()[predicted_cluster]

    return f"""
## 🎯 Resultado de la Predicción
**El cliente ID {customer_id} pertenece al Cluster {predicted_cluster}**
### 📊 Características del Cluster {predicted_cluster}:
- 👥 Total de clientes similares: {info['count']}
- 💰 Ingresos promedio del grupo: ${info['avg_income']:,.0f}
- 🍷 Gasto promedio en vinos: ${info['avg_wines']:.0f}
- 🥩 Gasto promedio en carnes: ${info['avg_meat']:.0f}
- 🛒 Compras web vs tienda: {info['avg_web']:.1f} vs {info['avg_store']:.1f}
"""
def update_cluster_info(selected_cluster):
    """Return the summary table and headline text for the chosen cluster.

    Returns:
        Tuple ``(summary_df, headline)`` where ``summary_df`` comes from
        ``get_cluster_summary`` and ``headline`` is a Markdown string with
        the number of rows of ``df_viz`` in that cluster.
    """
    summary_table = get_cluster_summary(selected_cluster)
    member_count = int((df_viz['Cluster'] == selected_cluster).sum())
    headline = f"📊 **Cluster {selected_cluster}** - Total de clientes: {member_count}"
    return summary_table, headline
# Build the interface with Gradio Blocks.
# NOTE(review): the nesting of rows/columns below is reconstructed from the
# comments and component order; confirm it against the intended layout.
with gr.Blocks(title="Customer Personality Analysis", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# 🎯 Análisis de Personalidad de Clientes - Demo de Clustering
**Para que puedan comprender el modelo, hagan de cuenta que son los dueños de un negocio.**
Quieren entender a sus clientes: cuáles compran más en ciertas categorías, cuáles gastan menos,
y en general cómo se pueden segmentar. Esto es solo una demostración con un dataset ficticio,
pero en la práctica se aplicaría con los datos de su empresa.
El modelo utiliza **K-Means con PCA** para segmentar clientes en 2 grupos basándose en sus patrones de compra y comportamiento.
""")

    # Static cluster interpretation, computed once while the UI is built.
    gr.Markdown(get_cluster_interpretation())
    gr.Markdown("---")

    with gr.Row():
        with gr.Column():
            # Cluster scatter plot.
            gr.Markdown("## 📈 Visualización de Clusters")
            cluster_plot = gr.Plot(value=create_cluster_plot(), label="Clusters de Clientes")
        with gr.Column():
            # Dropdown to pick the cluster to inspect.
            gr.Markdown("## 🔍 Explorar Clusters")
            cluster_dropdown = gr.Dropdown(
                choices=[0, 1],
                value=0,
                label="Seleccionar Cluster para Análisis"
            )
            cluster_info = gr.Markdown(
                f"📊 **Cluster 0** - Total de clientes: {len(df_viz[df_viz['Cluster'] == 0])}"
            )
            # Summary table, initialised for cluster 0 to match the dropdown.
            cluster_summary = gr.Dataframe(
                value=get_cluster_summary(0),
                label="Resumen Estadístico del Cluster",
                interactive=False
            )

    gr.Markdown("---")
    gr.Markdown("## 🆕 Predecir Cluster para Nuevo Cliente")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### 👤 Información Personal")
            customer_id = gr.Textbox(label="ID del Cliente", value="NUEVO_001", placeholder="Ej: CLIENTE_123")
            # Explicit step=1: without it Gradio derives the step from the
            # slider range, which for 1940-2000 permits fractional years.
            year_birth = gr.Slider(1940, 2000, value=1985, step=1, label="Año de Nacimiento")
            income = gr.Slider(0, 200000, value=50000, step=1000, label="Ingresos Anuales ($)")
            kidhome = gr.Slider(0, 5, value=0, step=1, label="Niños en Casa")
            teenhome = gr.Slider(0, 5, value=0, step=1, label="Adolescentes en Casa")
            recency = gr.Slider(0, 100, value=30, step=1, label="Días desde última compra")
        with gr.Column():
            gr.Markdown("### 🛒 Gastos por Categoría ($)")
            mnt_wines = gr.Slider(0, 2000, value=100, step=10, label="Gastos en Vinos")
            mnt_fruits = gr.Slider(0, 500, value=20, step=5, label="Gastos en Frutas")
            mnt_meat = gr.Slider(0, 2000, value=150, step=10, label="Gastos en Carnes")
            mnt_fish = gr.Slider(0, 500, value=30, step=5, label="Gastos en Pescados")
            mnt_sweet = gr.Slider(0, 500, value=15, step=5, label="Gastos en Dulces")
            mnt_gold = gr.Slider(0, 500, value=50, step=5, label="Gastos en Productos Premium")
        with gr.Column():
            gr.Markdown("### 🛍️ Comportamiento de Compra")
            num_deals = gr.Slider(0, 20, value=2, step=1, label="Compras con Descuento")
            num_web = gr.Slider(0, 20, value=3, step=1, label="Compras Web")
            num_catalog = gr.Slider(0, 20, value=1, step=1, label="Compras por Catálogo")
            num_store = gr.Slider(0, 20, value=5, step=1, label="Compras en Tienda")
            num_web_visits = gr.Slider(0, 20, value=4, step=1, label="Visitas Web/Mes")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### 📢 Respuesta a Campañas")
            cmp1 = gr.Checkbox(label="Campaña 1", value=False)
            cmp2 = gr.Checkbox(label="Campaña 2", value=False)
            cmp3 = gr.Checkbox(label="Campaña 3", value=False)
            cmp4 = gr.Checkbox(label="Campaña 4", value=False)
            cmp5 = gr.Checkbox(label="Campaña 5", value=False)
        with gr.Column():
            gr.Markdown("### 📋 Otros")
            complain = gr.Checkbox(label="Ha presentado quejas", value=False)
            response = gr.Checkbox(label="Respondió última campaña", value=False)

    # Prediction button.
    predict_btn = gr.Button("🔮 Predecir Cluster", variant="primary", size="lg")

    # Prediction output.
    prediction_result = gr.Markdown("")

    # Event handlers.
    cluster_dropdown.change(
        fn=update_cluster_info,
        inputs=[cluster_dropdown],
        outputs=[cluster_summary, cluster_info]
    )

    # The input order must match predict_new_customer's positional
    # parameters (note cmp3, cmp4, cmp5 come before cmp1, cmp2).
    predict_btn.click(
        fn=predict_new_customer,
        inputs=[
            customer_id, year_birth, income, kidhome, teenhome, recency,
            mnt_wines, mnt_fruits, mnt_meat, mnt_fish, mnt_sweet, mnt_gold,
            num_deals, num_web, num_catalog, num_store, num_web_visits,
            cmp3, cmp4, cmp5, cmp1, cmp2, complain, response
        ],
        outputs=[prediction_result]
    )

if __name__ == "__main__":
    demo.launch()