File size: 6,468 Bytes
e17f3ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
"""
AutoGluon Wrapper
=================

Sklearn-compatible wrapper for AutoGluon Tabular.

AutoGluon is an AutoML framework that automatically
trains and ensembles multiple models.

Author: UW MSIM Team
Date: November 2025
"""

import time
import logging
from typing import Optional, Union
import numpy as np
import pandas as pd
import tempfile
import shutil

from .base_wrapper import BaseModelWrapper

logger = logging.getLogger(__name__)


class AutoGluonWrapper(BaseModelWrapper):
    """
    Sklearn-style wrapper around AutoGluon's ``TabularPredictor``.

    The trained predictor is stored in a temporary directory created with
    ``tempfile.mkdtemp``. That directory is removed when the wrapper is
    re-fitted, when a fit attempt fails, or when the wrapper is garbage
    collected.

    Parameters
    ----------
    task_type : str, default='classification'
        Task type: 'classification' or 'regression'
    time_limit : int, default=300
        Time limit for training in seconds
    preset : str, default='medium_quality'
        Preset: 'best_quality', 'high_quality', 'good_quality', 'medium_quality'
    eval_metric : str, optional
        Evaluation metric forwarded to ``TabularPredictor``; ``None`` leaves
        the choice to AutoGluon
    random_state : int, default=42
        Random seed. Forwarded to the base wrapper; this class does not pass
        it to AutoGluon itself.
    """

    def __init__(
        self,
        task_type: str = 'classification',
        time_limit: int = 300,
        preset: str = 'medium_quality',
        eval_metric: Optional[str] = None,
        random_state: int = 42
    ):
        super().__init__(task_type=task_type, random_state=random_state)
        self.time_limit = time_limit
        self.preset = preset
        self.eval_metric = eval_metric
        # Path of the mkdtemp() directory holding the current predictor, if any.
        self._temp_dir: Optional[str] = None

    @staticmethod
    def _as_frame(X: Union[pd.DataFrame, np.ndarray]) -> pd.DataFrame:
        """Return X unchanged unless it is an ndarray, which is wrapped in a
        DataFrame with columns ``feature_0 .. feature_{n-1}``."""
        if isinstance(X, np.ndarray):
            return pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])
        return X

    def _infer_problem_type(self, y) -> Optional[str]:
        """Map ``task_type`` and the number of distinct labels to an AutoGluon
        ``problem_type``; ``None`` means "let AutoGluon infer it"."""
        if self.task_type == 'regression':
            return 'regression'
        if self.task_type == 'classification':
            n_classes = len(np.unique(y))
            if n_classes == 2:
                return 'binary'
            if n_classes > 2:
                return 'multiclass'
        return None

    def _cleanup_temp_dir(self) -> None:
        """Delete the predictor's temporary directory, if one is recorded."""
        # getattr: __del__ may run on an instance whose __init__ never finished.
        temp_dir = getattr(self, '_temp_dir', None)
        if temp_dir:
            shutil.rmtree(temp_dir, ignore_errors=True)
            self._temp_dir = None

    def fit(self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray]) -> 'AutoGluonWrapper':
        """
        Fit AutoGluon model.

        Wrapper state (``model``, ``is_fitted``, ``fit_time`` and the temporary
        directory) is only updated after training succeeds, so a failed call
        leaves a previously fitted wrapper usable and does not leak a directory.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray, shape (n_samples, n_features)
            Training features
        y : pd.Series or np.ndarray, shape (n_samples,)
            Training target

        Returns
        -------
        self : AutoGluonWrapper
            Fitted model

        Raises
        ------
        ImportError
            If ``autogluon.tabular`` cannot be imported.
        Exception
            Any error raised while preparing the data or training is logged
            and re-raised unchanged.
        """
        self._validate_input(X, y)

        logger.info(f"Fitting AutoGluon ({self.preset}) on {X.shape[0]} samples...")
        start_time = time.perf_counter()

        # Only the import itself is guarded, so that ImportErrors raised deeper
        # inside AutoGluon (e.g. a missing optional backend) are not
        # misreported as "AutoGluon not installed".
        try:
            from autogluon.tabular import TabularPredictor
        except ImportError as err:
            logger.error("AutoGluon not installed")
            raise ImportError("Install AutoGluon with: pip install autogluon.tabular[all]") from err

        previous_dir = self._temp_dir
        new_dir = None
        try:
            X = self._as_frame(X)
            if isinstance(y, np.ndarray):
                y = pd.Series(y, name='target')

            # Combine X and y for AutoGluon. ``.values`` is assigned
            # positionally, ignoring any index mismatch between X and y.
            train_data = X.copy()
            if 'target' in train_data.columns:
                logger.warning(
                    "Input features already contain a 'target' column; "
                    "it will be overwritten by the label values."
                )
            train_data['target'] = y.values

            new_dir = tempfile.mkdtemp(prefix='autogluon_')

            predictor = TabularPredictor(
                label='target',
                problem_type=self._infer_problem_type(y),
                eval_metric=self.eval_metric,
                path=new_dir,
                verbosity=2
            )
            predictor.fit(
                train_data=train_data,
                time_limit=self.time_limit,
                presets=self.preset
            )
        except Exception as e:
            logger.error(f"Error fitting AutoGluon: {e}")
            # Do not leave a half-written model directory behind.
            if new_dir is not None:
                shutil.rmtree(new_dir, ignore_errors=True)
            raise

        # Training succeeded: commit the new state, then drop the old model dir.
        self.model = predictor
        self._temp_dir = new_dir
        self.is_fitted = True
        self.fit_time = time.perf_counter() - start_time
        if previous_dir:
            shutil.rmtree(previous_dir, ignore_errors=True)

        # The leaderboard is only used for logging; a failure here must not
        # invalidate a model that trained successfully.
        best_model = 'unknown'
        try:
            leaderboard = self.model.leaderboard(silent=True)
            best_model = leaderboard.iloc[0]['model']
        except Exception as e:
            logger.warning(f"Could not read AutoGluon leaderboard: {e}")
        logger.info(f"AutoGluon fitted in {self.fit_time:.2f} seconds. Best model: {best_model}")

        return self

    def predict(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
        """
        Make predictions with AutoGluon.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray, shape (n_samples, n_features)
            Test features

        Returns
        -------
        predictions : np.ndarray, shape (n_samples,)
            Predicted values or class labels

        Raises
        ------
        ValueError
            If the model has not been fitted.
        """
        if not self.is_fitted:
            raise ValueError("Model not fitted. Call fit() first.")

        self._validate_input(X)

        logger.info(f"Predicting on {X.shape[0]} samples with AutoGluon...")
        start_time = time.perf_counter()

        try:
            predictions = self.model.predict(self._as_frame(X)).values
        except Exception as e:
            logger.error(f"Error during prediction: {e}")
            raise

        self.predict_time = time.perf_counter() - start_time
        logger.info(f"Predictions complete in {self.predict_time:.2f} seconds")

        return predictions

    def _predict_proba_impl(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
        """
        Predict class probabilities with AutoGluon.

        Parameters
        ----------
        X : pd.DataFrame or np.ndarray, shape (n_samples, n_features)
            Test features

        Returns
        -------
        probabilities : np.ndarray, shape (n_samples, n_classes)
            Class probabilities
        """
        return self.model.predict_proba(self._as_frame(X)).values

    def get_params(self, deep: bool = True) -> dict:
        """Get parameters for this estimator (base parameters plus
        ``time_limit``, ``preset`` and ``eval_metric``)."""
        params = super().get_params(deep)
        params.update({
            'time_limit': self.time_limit,
            'preset': self.preset,
            'eval_metric': self.eval_metric
        })
        return params

    def __del__(self):
        """Clean up temporary directory on deletion."""
        try:
            self._cleanup_temp_dir()
        except Exception:
            # Finalizers can run during interpreter shutdown, when module
            # globals such as ``shutil`` may already be torn down; there is
            # nothing useful to do with an error at that point.
            pass