Source code for alpbench.util.ensemble_constructor

import time
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from alpbench.util.common import fullname


class Ensemble:
    """Ensemble of estimators exposing per-member probability predictions.

    Depending on the fully qualified name of ``estimator`` the ensemble is
    either a list of separate member objects (TabNet, SVC, k-nearest
    neighbours, any unrecognised learner) or a single model whose internal
    parts are queried individually at prediction time (random forest,
    XGBoost, CatBoost, TabPFN).

    Args:
        estimator: The estimator to construct the ensemble of.
        num_estimators: The number of estimators in the ensemble.
        max_neighbors: Upper bound for the number of neighbours; required
            for k-nearest neighbours, otherwise ``None``.

    Attributes:
        estimator: The estimator to construct the ensemble of.
        num_estimators: The number of estimators in the ensemble.
        max_neighbors: Upper bound for the number of neighbours (k-NN only).
        random_states: Random states for the ensemble members.
        estimators_: List containing the ensemble members. It stays empty
            for XGBoost, CatBoost and TabPFN, whose members are addressed
            through ``estimator`` itself.
        learner_fqn: Fully qualified name of the estimator.
        n_classes_: Number of distinct labels seen by ``fit`` (only set
            after ``fit`` has been called).
    """

    _TABPFN_FQN = "tabpfn.scripts.transformer_prediction_interface.TabPFNClassifier"
    _TABNET_FQN = "pytorch_tabnet.tab_model.TabNetClassifier"
    _CATBOOST_FQN = "catboost.core.CatBoostClassifier"
    _XGBOOST_FQN = "xgboost.sklearn.XGBClassifier"
    _RANDOM_FOREST_FQN = "sklearn.ensemble._forest.RandomForestClassifier"
    _SVC_FQN = "sklearn.svm._classes.SVC"
    _KNN_FQN = "sklearn.neighbors._classification.KNeighborsClassifier"

    # Learners that are fitted once through ``self.estimator``.
    _SINGLE_MODEL_FQNS = frozenset((_RANDOM_FOREST_FQN, _XGBOOST_FQN, _CATBOOST_FQN, _TABPFN_FQN))

    # Training sets larger than this are subsampled before fitting TabPFN.
    _TABPFN_MAX_SAMPLES = 1000
    # Argument handed to ``TimeLimitCallback`` for every TabNet member.
    _TABNET_TIME_LIMIT = 60

    def __init__(self, estimator, num_estimators, max_neighbors=None):
        self.estimator = estimator
        self.num_estimators = num_estimators
        self.max_neighbors = max_neighbors
        self.random_states = [np.random.randint(0, 1000) for _ in range(num_estimators)]
        self.estimators_ = None
        self.learner_fqn = fullname(self.estimator)
        self.init()

    def _adopt_member_count(self, count):
        """Use ``count`` as ``num_estimators`` unless the model reports ``None``."""
        if count is not None:
            self.num_estimators = count

    def init(self):
        """Initializes the ensemble members.

        The branches are mutually exclusive: the generic fallback (the same
        estimator repeated ``num_estimators`` times) is only used for
        learners without a dedicated branch, so it can no longer overwrite
        a member list built by one of the specialised branches.

        Returns:
            None

        Raises:
            ValueError: If the learner is a k-nearest-neighbours classifier
                and ``max_neighbors`` was not provided.
        """
        self.estimators_ = []
        fqn = self.learner_fqn

        if fqn == self._TABPFN_FQN:
            from alpbench.util.transformer_prediction_interface_ens import TabPFNClassifierEns as TabPFNEns

            self.estimator = TabPFNEns(N_ensemble_configurations=self.num_estimators)
        elif fqn == self._TABNET_FQN:
            from alpbench.util.pytorch_tabnet.tab_model import TabNetClassifier

            # One member per pre-drawn random state.
            self.estimators_ = [TabNetClassifier(seed=seed, verbose=0) for seed in self.random_states]
        elif fqn == self._CATBOOST_FQN:
            # One ensemble member per tree of the boosted model.
            self._adopt_member_count(self.estimator.tree_count_)
        elif fqn == self._XGBOOST_FQN:
            self._adopt_member_count(self.estimator.n_estimators)
        elif fqn == self._RANDOM_FOREST_FQN:
            # The forest may not expose ``estimators_`` yet; ``fit`` refreshes it.
            self.estimators_ = getattr(self.estimator, "estimators_", [])
        elif fqn == self._SVC_FQN:
            # Members differ only in C, spread evenly over [1, 20].
            self.estimators_ = [
                SVC(kernel="rbf", probability=True, C=c) for c in np.linspace(1, 20, self.num_estimators)
            ]
        elif fqn == self._KNN_FQN:
            if self.max_neighbors is None:
                raise ValueError("max_neighbors must be provided for a KNeighborsClassifier ensemble")
            candidates = np.linspace(1, self.max_neighbors, self.max_neighbors, dtype=int)
            # Members differ in k, drawn (with repetition) from 1..max_neighbors.
            num_neighbors = np.random.choice(candidates, self.num_estimators)
            self.estimators_ = [KNeighborsClassifier(n_neighbors=int(k)) for k in num_neighbors]
        else:
            # NOTE(review): every entry is the very same object, so the
            # members of a generic ensemble are not independent models.
            self.estimators_ = [self.estimator for _ in range(self.num_estimators)]

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Fits the ensemble and sets the attributes of the class.

        Args:
            X: array-like of shape (n_samples, n_features), the training
                input samples.
            y: array-like of shape (n_samples,), the target values (class
                labels) as integers.

        Returns:
            None
        """
        fqn = self.learner_fqn

        if fqn in self._SINGLE_MODEL_FQNS:
            if fqn == self._TABPFN_FQN and len(y) > self._TABPFN_MAX_SAMPLES:
                # Subsample without replacement so the reduced training set
                # holds distinct rows instead of duplicates.
                ids = np.random.choice(len(y), self._TABPFN_MAX_SAMPLES, replace=False)
                self.estimator.fit(X[ids], y[ids])
            else:
                self.estimator.fit(X, y)

            # Keep the bookkeeping in sync with the freshly fitted model.
            if fqn == self._RANDOM_FOREST_FQN:
                self.estimators_ = getattr(self.estimator, "estimators_", self.estimators_)
            elif fqn == self._CATBOOST_FQN:
                self._adopt_member_count(self.estimator.tree_count_)
        elif fqn == self._TABNET_FQN:
            from alpbench.util.TorchUtil import TimeLimitCallback

            for estimator in self.estimators_:
                estimator.fit(X, y, callbacks=[TimeLimitCallback(self._TABNET_TIME_LIMIT)])
        else:
            for estimator in self.estimators_:
                estimator.fit(X, y)

        self.n_classes_ = len(np.unique(y))

    def predict_proba(self, X: np.ndarray, alpha: float = None) -> np.ndarray:
        """Predicts the probabilities of the ensemble members.

        Args:
            X: array-like of shape (n_samples, n_features), the input
                samples.
            alpha: Threshold for the normalized likelihoods of the ensemble
                members. Currently unused; kept so the signature stays
                unchanged for callers.

        Returns:
            Predicted probabilities, array-like of shape
            (n_samples, n_classes, n_estimators).
        """
        fqn = self.learner_fqn
        if fqn == self._TABPFN_FQN:
            # Swap the last two axes of the wrapper's output so that the
            # member axis comes last, like in the other branches.
            return self.estimator.predict_proba(X).transpose(0, 2, 1)

        preds = np.empty((X.shape[0], self.n_classes_, self.num_estimators))
        for i in range(self.num_estimators):
            if fqn == self._XGBOOST_FQN:
                member_proba = self.estimator.predict_proba(X, iteration_range=(i, i + 1))
            elif fqn == self._CATBOOST_FQN:
                member_proba = self.estimator.predict_proba(X, ntree_start=i, ntree_end=i + 1)
            elif fqn == self._RANDOM_FOREST_FQN:
                member_proba = self.estimator.estimators_[i].predict_proba(X)
            else:
                member_proba = self.estimators_[i].predict_proba(X)
            preds[:, :, i] = member_proba
        return preds

    def predict(self, X: np.ndarray, alpha: float = None) -> np.ndarray:
        """Predicts the classes by averaging the members' probabilities.

        Args:
            X: array-like of shape (n_samples, n_features), the input
                samples.
            alpha: Forwarded to ``predict_proba``, which currently ignores
                it.

        Returns:
            Array of shape (n_samples,) holding, for every sample, the
            column index of the class with the highest probability after
            averaging over all ensemble members.
        """
        probas = self.predict_proba(X, alpha)
        return np.argmax(probas.mean(axis=-1), axis=1)