""" Iris Flower Classification con XGBoost ======================================= Pipeline completo: EDA → Feature Engineering → Entrenamiento → Evaluación Dataset: https://www.kaggle.com/datasets/sims22/irisflowerdatasets """ import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import xgboost as xgb import joblib import json import os from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score from sklearn.preprocessing import LabelEncoder from sklearn.metrics import classification_report, confusion_matrix, accuracy_score from datetime import datetime # ============================================================ # 1. CARGA DE DATOS # ============================================================ print("=" * 60) print("1. CARGA DE DATOS") print("=" * 60) df = pd.read_csv("data/IRIS.csv") print(f"Shape: {df.shape}") print(f"\nPrimeras filas:\n{df.head()}") print(f"\nInfo:") df.info() print(f"\nEstadísticas descriptivas:\n{df.describe()}") print(f"\nValores nulos:\n{df.isnull().sum()}") print(f"\nDistribución de clases:\n{df['species'].value_counts()}") # ============================================================ # 2. EDA - ANÁLISIS EXPLORATORIO # ============================================================ print("\n" + "=" * 60) print("2. EDA - ANÁLISIS EXPLORATORIO") print("=" * 60) os.makedirs("outputs", exist_ok=True) numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist() # 2.1 Distribuciones por feature fig, axes = plt.subplots(2, 2, figsize=(12, 10)) for i, col in enumerate(numeric_cols): ax = axes[i // 2][i % 2] for species in df['species'].unique(): subset = df[df['species'] == species] ax.hist(subset[col], alpha=0.6, label=species, bins=15) ax.set_title(f"Distribución de {col}") ax.legend(fontsize=8) plt.tight_layout() plt.savefig("outputs/distribuciones.png", dpi=150) plt.close() print("✓ Distribuciones guardadas en outputs/distribuciones.png") # 2.2 Boxplots por especie fig, axes = plt.subplots(2, 2, figsize=(12, 10)) for i, col in enumerate(numeric_cols): ax = axes[i // 2][i % 2] sns.boxplot(x='species', y=col, data=df, ax=ax) ax.set_title(f"{col} por especie") ax.tick_params(axis='x', rotation=15) plt.tight_layout() plt.savefig("outputs/boxplots.png", dpi=150) plt.close() print("✓ Boxplots guardados en outputs/boxplots.png") # 2.3 Matriz de correlación fig, ax = plt.subplots(figsize=(8, 6)) corr = df[numeric_cols].corr() mask = np.triu(np.ones_like(corr, dtype=bool)) sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", cmap="coolwarm", center=0, ax=ax) ax.set_title("Matriz de Correlación") plt.tight_layout() plt.savefig("outputs/correlacion.png", dpi=150) plt.close() print("✓ Correlación guardada en outputs/correlacion.png") # 2.4 Pairplot pairplot = sns.pairplot(df, hue='species', diag_kind='kde', height=2.5) pairplot.savefig("outputs/pairplot.png", dpi=150) plt.close() print("✓ Pairplot guardado en outputs/pairplot.png") # ============================================================ # 3. PREPARACIÓN DE DATOS # ============================================================ print("\n" + "=" * 60) print("3. PREPARACIÓN DE DATOS") print("=" * 60) le = LabelEncoder() df['species_encoded'] = le.fit_transform(df['species']) print(f"Clases: {dict(zip(le.classes_, le.transform(le.classes_)))}") X = df[numeric_cols] y = df['species_encoded'] X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42, stratify=y ) print(f"Train: {X_train.shape}, Test: {X_test.shape}") # ============================================================ # 4. ENTRENAMIENTO CON XGBOOST # ============================================================ print("\n" + "=" * 60) print("4. ENTRENAMIENTO CON XGBOOST") print("=" * 60) # Cross-validation primero (sin early stopping) xgb_cv = xgb.XGBClassifier( n_estimators=200, max_depth=4, learning_rate=0.1, subsample=0.8, colsample_bytree=0.8, random_state=42, eval_metric='mlogloss', ) cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) cv_scores = cross_val_score(xgb_cv, X, y, cv=cv, scoring='accuracy') print(f"CV Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}") # Modelo final con early stopping xgb_model = xgb.XGBClassifier( n_estimators=200, max_depth=4, learning_rate=0.1, subsample=0.8, colsample_bytree=0.8, random_state=42, eval_metric='mlogloss', early_stopping_rounds=20 ) xgb_model.fit( X_train, y_train, eval_set=[(X_test, y_test)], verbose=10 ) # ============================================================ # 5. EVALUACIÓN # ============================================================ print("\n" + "=" * 60) print("5. EVALUACIÓN") print("=" * 60) y_pred = xgb_model.predict(X_test) accuracy = accuracy_score(y_test, y_pred) print(f"Test Accuracy: {accuracy:.4f}") print(f"\nClassification Report:") report = classification_report(y_test, y_pred, target_names=le.classes_) print(report) # 5.1 Confusion Matrix fig, ax = plt.subplots(figsize=(8, 6)) cm = confusion_matrix(y_test, y_pred) sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=le.classes_, yticklabels=le.classes_, ax=ax) ax.set_xlabel('Predicted') ax.set_ylabel('Actual') ax.set_title(f'Confusion Matrix (Accuracy: {accuracy:.4f})') plt.tight_layout() plt.savefig("outputs/confusion_matrix.png", dpi=150) plt.close() print("✓ Confusion matrix guardada en outputs/confusion_matrix.png") # 5.2 Feature Importance fig, ax = plt.subplots(figsize=(8, 5)) importance = pd.Series( xgb_model.feature_importances_, index=numeric_cols ).sort_values(ascending=True) importance.plot(kind='barh', ax=ax, color='steelblue') ax.set_title("Feature Importance (XGBoost)") ax.set_xlabel("Importance") plt.tight_layout() plt.savefig("outputs/feature_importance.png", dpi=150) plt.close() print("✓ Feature importance guardada en outputs/feature_importance.png") # ============================================================ # 6. GUARDAR MODELO Y METADATA # ============================================================ print("\n" + "=" * 60) print("6. GUARDAR MODELO Y METADATA") print("=" * 60) joblib.dump(xgb_model, "model.joblib") print("✓ Modelo guardado en model.joblib") # Label encoder para la app joblib.dump(le, "label_encoder.joblib") print("✓ Label encoder guardado en label_encoder.joblib") model_info = { "framework": "xgboost", "model_type": "XGBClassifier", "dataset": "sims22/irisflowerdatasets", "task": "multiclass_classification", "classes": list(le.classes_), "features": numeric_cols, "metrics": { "test_accuracy": float(accuracy), "cv_accuracy_mean": float(cv_scores.mean()), "cv_accuracy_std": float(cv_scores.std()), }, "hyperparameters": { "n_estimators": 200, "max_depth": 4, "learning_rate": 0.1, "subsample": 0.8, "colsample_bytree": 0.8, }, "training_samples": int(len(X_train)), "test_samples": int(len(X_test)), "trained_at": datetime.now().isoformat(), } with open("model_info.json", "w") as f: json.dump(model_info, f, indent=2) print("✓ Metadata guardada en model_info.json") print("\n" + "=" * 60) print("PIPELINE COMPLETADO") print(f"Test Accuracy: {accuracy:.4f}") print(f"CV Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}") print("=" * 60)