Spaces:
Sleeping
Sleeping
#!/usr/bin/env python3
"""
Helper module to import AdaBoost classes without running module-level code.
This module re-exports the AdaBoostEnsemble and WeightedDecisionTree classes
from classification_adaboost.py, but without triggering the module-level
data loading and training code.
"""
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.tree import DecisionTreeClassifier
from typing import List
class WeightedDecisionTree(DecisionTreeClassifier):
    """Decision tree grown on weighted training errors.

    Thin subclass of ``DecisionTreeClassifier`` that exposes only the
    hyperparameters the AdaBoost ensemble tunes and forwards per-sample
    weights to the underlying tree.
    """

    def __init__(self, max_depth: int = 5, min_samples_split: int = 2,
                 min_samples_leaf: int = 1, random_state: int = 42):
        # Collect the tunable hyperparameters and hand them to the
        # sklearn base class unchanged.
        params = dict(
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=random_state,
        )
        super().__init__(**params)

    def fit(self, X, y, sample_weight=None):
        """Grow the tree on (X, y), weighting each sample by ``sample_weight``."""
        return super().fit(X, y, sample_weight=sample_weight)
class AdaBoostEnsemble(BaseEstimator, ClassifierMixin):
    """
    AdaBoost ensemble of decision trees where each tree is grown based on
    weighted training errors. Weights are updated based on the error of
    previous trees.

    The algorithm:
    1. Initialize equal weights for all training samples
    2. For each tree in the ensemble:
       - Train a decision tree on weighted data
       - Calculate weighted error rate
       - Compute tree weight (alpha)
       - Update sample weights (increase for misclassified, decrease for correct)
       - Normalize weights
    3. Make predictions using weighted voting
    """

    def __init__(
        self,
        n_estimators: int = 50,
        max_depth: int = 5,
        min_samples_split: int = 2,
        min_samples_leaf: int = 1,
        random_state: int = 42
    ):
        # Hyperparameters (stored verbatim per sklearn estimator convention).
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.random_state = random_state
        # Fitted state: populated by fit().
        self.trees: List[WeightedDecisionTree] = []
        self.tree_weights: List[float] = []
        self.n_classes: int = 0
        self.classes_: np.ndarray = None

    def _initialize_weights(self, n_samples: int) -> np.ndarray:
        """Return a uniform weight vector that sums to 1."""
        return np.ones(n_samples) / n_samples

    def _update_weights(
        self,
        weights: np.ndarray,
        y_true: np.ndarray,
        y_pred: np.ndarray,
        alpha: float
    ) -> np.ndarray:
        """
        Update sample weights based on prediction errors.

        Misclassified samples are multiplied by exp(alpha); correctly
        classified samples are left at exp(0) = 1 (relative down-weighting
        happens through the normalization step).
        """
        misclassified = y_true != y_pred
        updated_weights = weights * np.exp(alpha * misclassified.astype(float))
        # Normalize so the weights remain a probability distribution.
        return updated_weights / updated_weights.sum()

    def _compute_weighted_error(
        self,
        weights: np.ndarray,
        y_true: np.ndarray,
        y_pred: np.ndarray
    ) -> float:
        """Compute the weight-averaged misclassification rate in [0, 1]."""
        misclassified = (y_true != y_pred).astype(float)
        return np.sum(weights * misclassified) / np.sum(weights)

    def _compute_alpha(self, error: float) -> float:
        """
        Compute the voting weight (alpha) of a classifier from its error.

        Clamped to +/-10 at the extremes to avoid log(0) / division by zero
        for perfect (error=0) or completely wrong (error=1) classifiers.
        """
        if error <= 0:
            return 10.0  # Very high weight for perfect classifier
        if error >= 1:
            return -10.0  # Very negative weight for completely wrong classifier
        return 0.5 * np.log((1 - error) / error)

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'AdaBoostEnsemble':
        """Fit the AdaBoost ensemble on (X, y) and return self."""
        n_samples, n_features = X.shape
        self.classes_ = np.unique(y)
        self.n_classes = len(self.classes_)
        # BUGFIX: reset fitted state so repeated fit() calls retrain from
        # scratch instead of accumulating trees from previous fits.
        self.trees = []
        self.tree_weights = []
        # Initialize sample weights uniformly.
        weights = self._initialize_weights(n_samples)
        for i in range(self.n_estimators):
            # Train a tree on the current weighted data; vary the seed per
            # round so identical weight vectors don't yield identical trees.
            tree = WeightedDecisionTree(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                random_state=self.random_state + i
            )
            tree.fit(X, y, sample_weight=weights)
            y_pred = tree.predict(X)
            # Weighted error -> tree voting weight (alpha).
            error = self._compute_weighted_error(weights, y, y_pred)
            alpha = self._compute_alpha(error)
            # Re-weight samples: misclassified points gain influence.
            weights = self._update_weights(weights, y, y_pred, alpha)
            self.trees.append(tree)
            self.tree_weights.append(alpha)
            print(f"Tree {i+1}/{self.n_estimators}: Error={error:.4f}, Alpha={alpha:.4f}")
        return self

    def _weighted_votes(self, X: np.ndarray, use_abs: bool) -> np.ndarray:
        """Return an (n_samples, n_classes) matrix of weighted votes.

        Each tree contributes alpha (or |alpha| when ``use_abs``) to the
        column of the class it predicts for each sample.
        """
        n_samples = X.shape[0]
        votes = np.zeros((n_samples, len(self.classes_)))
        for tree, alpha in zip(self.trees, self.tree_weights):
            weight = abs(alpha) if use_abs else alpha
            predictions = tree.predict(X)
            for class_idx, class_label in enumerate(self.classes_):
                votes[:, class_idx] += weight * (predictions == class_label)
        return votes

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict class labels using alpha-weighted majority voting.

        Raises:
            RuntimeError: if called before fit().
        """
        if not self.trees:
            # BUGFIX: fail with a clear message instead of an obscure numpy
            # error (argmax over an empty axis) on an unfitted ensemble.
            raise RuntimeError("AdaBoostEnsemble is not fitted; call fit() first.")
        weighted_votes = self._weighted_votes(X, use_abs=False)
        # Return the class with the highest weighted vote per sample.
        return self.classes_[np.argmax(weighted_votes, axis=1)]

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Predict class probabilities as |alpha|-weighted vote proportions.

        Raises:
            RuntimeError: if called before fit().
        """
        if not self.trees:
            raise RuntimeError("AdaBoostEnsemble is not fitted; call fit() first.")
        weighted_votes = self._weighted_votes(X, use_abs=True)
        total_weight = sum(abs(w) for w in self.tree_weights)
        if total_weight == 0:
            # BUGFIX: all alphas are zero -> avoid 0/0 producing NaNs;
            # fall back to a uniform distribution over the classes.
            return np.full_like(weighted_votes, 1.0 / len(self.classes_))
        # Normalize so each row sums to 1.
        return weighted_votes / total_weight