"""
Iris Flower Classification with XGBoost
=======================================
Full pipeline: data loading → EDA → data preparation → training →
evaluation → model and metadata export.
Dataset: https://www.kaggle.com/datasets/sims22/irisflowerdatasets
"""

import json
import os
from datetime import datetime, timezone

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder


# ------------------------------------------------------------
# 1. Data loading
# ------------------------------------------------------------
banner = "=" * 60
print(banner, "1. CARGA DE DATOS", banner, sep="\n")

# The path is relative, so it resolves against the current working directory.
df = pd.read_csv("data/IRIS.csv")

print(f"Shape: {df.shape}")
print(f"\nPrimeras filas:\n{df.head()}")
print("\nInfo:")
df.info()  # prints its own report; there is no return value to format

# The remaining overviews share one layout (blank line, title, table).
# Each entry is a callable so the summaries are still computed one at a
# time, in the same order as they are printed.
for title, build_summary in (
    ("Estadísticas descriptivas", df.describe),
    ("Valores nulos", lambda: df.isnull().sum()),
    ("Distribución de clases", lambda: df['species'].value_counts()),
):
    print(f"\n{title}:\n{build_summary()}")


# ------------------------------------------------------------
# 2. Exploratory data analysis (EDA)
# ------------------------------------------------------------
print("\n" + "=" * 60)
print("2. EDA - ANÁLISIS EXPLORATORIO")
print("=" * 60)

os.makedirs("outputs", exist_ok=True)

# Every numeric column of the raw frame is treated as a feature.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()


def _feature_grid(n_plots, n_cols=2):
    """Create a figure with one visible axes per feature.

    The grid grows with the number of features instead of assuming exactly
    four, so a dataset with extra numeric columns no longer triggers an
    IndexError. With four features the layout and figure size match the
    previous fixed 2x2 / (12, 10) grid.

    Args:
        n_plots: Number of panels that will actually be drawn.
        n_cols: Number of columns in the grid.

    Returns:
        A ``(figure, axes)`` tuple where ``axes`` is a flat, row-major list
        holding exactly ``n_plots`` axes; surplus grid cells are hidden.
    """
    n_rows = max(1, -(-n_plots // n_cols))  # ceiling division, at least one row
    figure, grid = plt.subplots(
        n_rows, n_cols, figsize=(6 * n_cols, 5 * n_rows), squeeze=False
    )
    cells = list(grid.ravel())
    for spare in cells[n_plots:]:
        spare.set_visible(False)
    return figure, cells[:n_plots]


# Per-feature histograms, one overlaid series per species.
fig, axes = _feature_grid(len(numeric_cols))
for ax, col in zip(axes, numeric_cols):
    for species in df['species'].unique():
        subset = df[df['species'] == species]
        ax.hist(subset[col], alpha=0.6, label=species, bins=15)
    ax.set_title(f"Distribución de {col}")
    ax.legend(fontsize=8)
fig.tight_layout()
fig.savefig("outputs/distribuciones.png", dpi=150)
plt.close(fig)
print("✓ Distribuciones guardadas en outputs/distribuciones.png")

# Per-feature boxplots grouped by species.
fig, axes = _feature_grid(len(numeric_cols))
for ax, col in zip(axes, numeric_cols):
    sns.boxplot(x='species', y=col, data=df, ax=ax)
    ax.set_title(f"{col} por especie")
    ax.tick_params(axis='x', rotation=15)
fig.tight_layout()
fig.savefig("outputs/boxplots.png", dpi=150)
plt.close(fig)
print("✓ Boxplots guardados en outputs/boxplots.png")

# Correlation heatmap; the upper triangle (diagonal included) is masked
# because it only mirrors the lower one.
fig, ax = plt.subplots(figsize=(8, 6))
corr = df[numeric_cols].corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", cmap="coolwarm", center=0, ax=ax)
ax.set_title("Matriz de Correlación")
fig.tight_layout()
fig.savefig("outputs/correlacion.png", dpi=150)
plt.close(fig)
print("✓ Correlación guardada en outputs/correlacion.png")

# Pairwise scatter matrix coloured by species, KDE curves on the diagonal.
pairplot = sns.pairplot(df, hue='species', diag_kind='kde', height=2.5)
pairplot.savefig("outputs/pairplot.png", dpi=150)
plt.close()
print("✓ Pairplot guardado en outputs/pairplot.png")


# ------------------------------------------------------------
# 3. Data preparation
# ------------------------------------------------------------
rule = "=" * 60
print("\n" + "\n".join([rule, "3. PREPARACIÓN DE DATOS", rule]))

# Fit the encoder once on the species names, then map them to integer codes.
# The fitted encoder is kept so the codes can be translated back later.
le = LabelEncoder().fit(df['species'])
df['species_encoded'] = le.transform(df['species'])
class_codes = {
    label: code for label, code in zip(le.classes_, le.transform(le.classes_))
}
print(f"Clases: {class_codes}")

# Feature matrix and encoded target.
X, y = df[numeric_cols], df['species_encoded']

# Stratified 80/20 hold-out split with a fixed seed.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
print(f"Train: {X_train.shape}, Test: {X_test.shape}")


# ------------------------------------------------------------
# 4. Training
# ------------------------------------------------------------
print("\n" + "=" * 60)
print("4. ENTRENAMIENTO CON XGBOOST")
print("=" * 60)

# Settings shared by the cross-validation model and the final model,
# defined once so the two cannot drift apart.
xgb_params = {
    "n_estimators": 200,
    "max_depth": 4,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "eval_metric": "mlogloss",
}

# 5-fold stratified cross-validation over the whole dataset. This model has
# no early stopping: nothing here passes an eval_set to fit().
xgb_cv = xgb.XGBClassifier(**xgb_params)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(xgb_cv, X, y, cv=cv, scoring='accuracy')
print(f"CV Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Early stopping needs a monitoring set. Monitoring the test split would pick
# the number of trees on the very rows used for the final score and make that
# score optimistic, so a validation subset is carved out of the training
# split and the test split stays untouched until evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)
print(f"Fit: {X_fit.shape}, Validation: {X_val.shape}")

# Final model: same settings plus early stopping on the validation subset.
xgb_model = xgb.XGBClassifier(**xgb_params, early_stopping_rounds=20)
xgb_model.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    verbose=10,
)


# ------------------------------------------------------------
# 5. Evaluation
# ------------------------------------------------------------
section_rule = "=" * 60
print(f"\n{section_rule}\n5. EVALUACIÓN\n{section_rule}")

# Score the trained model on the held-out test split.
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
report = classification_report(y_test, y_pred, target_names=le.classes_)
print(report)

# Confusion matrix heatmap, labelled with the original species names.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    ax=ax,
)
ax.set(
    xlabel='Predicted',
    ylabel='Actual',
    title=f'Confusion Matrix (Accuracy: {accuracy:.4f})',
)
fig.tight_layout()
fig.savefig("outputs/confusion_matrix.png", dpi=150)
plt.close(fig)
print("✓ Confusion matrix guardada en outputs/confusion_matrix.png")

# Horizontal bar chart of feature importances, sorted ascending so the
# largest bar ends up at the top of the chart.
importance = pd.Series(xgb_model.feature_importances_, index=numeric_cols).sort_values()
fig, ax = plt.subplots(figsize=(8, 5))
importance.plot.barh(ax=ax, color='steelblue')
ax.set(title="Feature Importance (XGBoost)", xlabel="Importance")
fig.tight_layout()
fig.savefig("outputs/feature_importance.png", dpi=150)
plt.close(fig)
print("✓ Feature importance guardada en outputs/feature_importance.png")


# ------------------------------------------------------------
# 6. Persist model and metadata
# ------------------------------------------------------------
print("\n" + "=" * 60)
print("6. GUARDAR MODELO Y METADATA")
print("=" * 60)

joblib.dump(xgb_model, "model.joblib")
print("✓ Modelo guardado en model.joblib")

# The encoder is saved next to the model so integer predictions can be
# mapped back to species names at inference time.
joblib.dump(le, "label_encoder.joblib")
print("✓ Label encoder guardado en label_encoder.joblib")

# Read the hyperparameters back from the trained estimator instead of
# repeating literals, so the metadata cannot drift from the actual model.
trained_params = xgb_model.get_params()
reported_params = (
    "n_estimators",
    "max_depth",
    "learning_rate",
    "subsample",
    "colsample_bytree",
)

model_info = {
    "framework": "xgboost",
    "model_type": "XGBClassifier",
    "dataset": "sims22/irisflowerdatasets",
    "task": "multiclass_classification",
    "classes": list(le.classes_),
    "features": numeric_cols,
    "metrics": {
        "test_accuracy": float(accuracy),
        "cv_accuracy_mean": float(cv_scores.mean()),
        "cv_accuracy_std": float(cv_scores.std()),
    },
    "hyperparameters": {name: trained_params[name] for name in reported_params},
    # n_estimators is only an upper bound when early stopping is active;
    # best_iteration is null if the model was trained without it.
    "best_iteration": getattr(xgb_model, "best_iteration", None),
    "training_samples": int(len(X_train)),
    "test_samples": int(len(X_test)),
    # Timezone-aware UTC timestamp: unambiguous regardless of the machine
    # the training ran on.
    "trained_at": datetime.now(timezone.utc).isoformat(),
}

with open("model_info.json", "w", encoding="utf-8") as f:
    json.dump(model_info, f, indent=2)
print("✓ Metadata guardada en model_info.json")

print("\n" + "=" * 60)
print("PIPELINE COMPLETADO")
print(f"Test Accuracy: {accuracy:.4f}")
print(f"CV Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
print("=" * 60)