# Source code for alpbench.benchmark.ActiveLearningScenario

import numpy as np
import openml
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

from alpbench.benchmark.ActiveLearningSetting import ActiveLearningSetting



# [docs]  (documentation-link marker left over from the rendered source page)
def create_dataset_split(
    X, y, test_split_seed, test_split_size: float, train_split_seed, train_split_size, train_split_type, factor
):
    """Split the data into labeled, unlabeled and test parts and return the labeled and test indices.

    The data is first split into a training and a test part; the training part is then split into labeled and
    unlabeled data. Both splits are stratified according to the labels. The size of the labeled part is either
    absolute (a fixed number of data points) or relative (a share of the training data). In the absolute case,
    a ``factor`` different from -1 overrides ``train_split_size`` with ``factor * number_of_classes``.

    If the stratified split leaves a class without any labeled instance, one instance of that class is moved
    from the unlabeled data into the labeled data, so the labeled data contains every class at least once.

    Args:
        X (numpy.ndarray): data
        y (numpy.ndarray): labels
        test_split_seed (int): seed for the test split
        test_split_size (float): size of the test data
        train_split_seed (int): seed for the train split
        train_split_size (float): size of the labeled training data
        train_split_type (str): type of the size parameter; "absolute" means a number of data points, any other
            value is treated as a share of the training data
        factor (int): task-dependent factor; only used when ``train_split_type`` is "absolute", -1 disables it

    Returns:
        labeled_indices (list): indices of the labeled data
        test_indices (list): indices of the test data

    Raises:
        AssertionError: if, after the correction step, not every class is represented in the labeled data
    """

    # positions in X/y; they are split alongside the data so the original indices can be returned
    indices = np.arange(0, len(X))

    # split data into train and test and retrieve test_indices to be returned later
    X_train, _, y_train, _, train_indices, test_indices = train_test_split(
        X, y, indices, test_size=test_split_size, random_state=test_split_seed, stratify=y
    )

    # determine the proportion of unlabeled data, also in case the train split is given in terms of an absolute
    # number of labeled data points
    if train_split_type == "absolute":
        if factor != -1:
            # the factor overrides the requested number of labeled points
            train_split_size = factor * len(np.unique(y))
        unlabeled_size = 1 - train_split_size / len(X_train)
    else:
        unlabeled_size = 1 - train_split_size

    # split training data into labeled and unlabeled; only labels and original indices are needed afterwards
    _, _, y_l, y_u, labeled_indices, unlabeled_indices = train_test_split(
        X_train, y_train, train_indices, test_size=unlabeled_size, random_state=train_split_seed, stratify=y_train
    )

    classes = np.unique(y)
    if len(np.unique(y[labeled_indices])) != len(classes):
        # make sure that each class within y is at least once in the labeled data
        for label in classes:
            if label not in y_l:
                candidates = np.where(y_u == label)[0]
                # A local generator seeded per missing class draws the same value as seeding the global
                # numpy RNG would, but leaves the process-wide random state untouched.
                rng = np.random.RandomState(train_split_seed)
                picked = unlabeled_indices[rng.choice(candidates)]
                labeled_indices = np.append(labeled_indices, picked)

    assert len(np.unique(y[labeled_indices])) == len(classes), "Not all classes are represented in the labeled data"

    return labeled_indices.tolist(), test_indices.tolist()




# [docs]  (documentation-link marker left over from the rendered source page)
class ActiveLearningScenario:
    """Active Learning Scenario

    The active learning scenario defines the data and the setting of one active learning setup.  The scenario is
    initialized with the openml id of the dataset, the test split, train split and the seed for reproducibility, the
    setting, and optionally labeled and test indices.

    Args:
        scenario_id (int): id of the scenario in the database
        openml_id (int): id of the dataset on openml
        test_split_seed (int): seed for the test split
        train_split_seed (int): seed for the train split
        seed (int): seed for reproducibility
        setting (ActiveLearningSetting): active learning setting
        labeled_indices (list): indices of the labeled data
        test_indices (list): indices of the test data

    Attributes:
        scenario_id (int): id of the scenario in the database
        openml_id (int): id of the dataset on openml
        test_split_seed (int): seed for the test split
        train_split_seed (int): seed for the train split
        seed (int): seed for reproducibility
        setting (ActiveLearningSetting): active learning setting
        labeled_indices (list): indices of the labeled data
        test_indices (list): indices of the test data
        X (numpy.ndarray): encoded, imputed and de-duplicated feature matrix
        y (numpy.ndarray): integer-encoded labels belonging to the rows of X

    """

    def __init__(
        self,
        scenario_id,
        openml_id,
        test_split_seed,
        train_split_seed,
        seed,
        setting: "ActiveLearningSetting",
        labeled_indices: list = None,
        test_indices: list = None,
    ):
        """Download the dataset, preprocess it and determine the labeled/test split.

        The given indices are used to index the preprocessed (de-duplicated) data. If either ``labeled_indices``
        or ``test_indices`` is missing, both are computed with ``create_dataset_split`` from the setting.
        """
        self.scenario_id = scenario_id
        self.openml_id = openml_id
        self.test_split_seed = test_split_seed
        self.train_split_seed = train_split_seed
        self.seed = seed
        self.labeled_indices = labeled_indices
        self.test_indices = test_indices
        self.setting = setting

        # actual data
        ds = openml.datasets.get_dataset(
            openml_id, download_data=True, download_qualities=True, download_features_meta_data=True
        )
        df = ds.get_data()[0]

        # separate features and the default target column as numpy arrays
        X = np.array(df.drop(columns=[ds.default_target_attribute]).values)
        y = np.array(df[ds.default_target_attribute].values)

        # map non-integer labels to the position of their value among the sorted unique values
        if y.dtype != int:
            y_int = np.zeros(len(y)).astype(int)
            for i, val in enumerate(np.unique(y)):
                y_int[y == val] = i
            y = y_int

        # encode all feature columns ordinally and replace missing values by the column mean
        X = OrdinalEncoder().fit_transform(X)
        X = SimpleImputer(missing_values=np.nan, strategy="mean").fit_transform(X)

        # filter X for duplicates; np.unique returns one index per distinct row, in sorted row order
        _, unique_indices = np.unique(X, axis=0, return_index=True)

        self.X = X[unique_indices]
        self.y = LabelEncoder().fit_transform(y)[unique_indices]

        # both index lists are required; if one is missing, the split is created from scratch
        if test_indices is None or labeled_indices is None:
            self.labeled_indices, self.test_indices = create_dataset_split(
                self.X,
                self.y,
                test_split_seed,
                setting.setting_test_size,
                train_split_seed,
                setting.setting_labeled_train_size,
                setting.setting_train_type,
                setting.factor,
            )

    def get_scenario_id(self):
        """
        Get the scenario id.
        """
        return self.scenario_id

    def get_openml_id(self):
        """
        Get the openml id.
        """
        return self.openml_id

    def get_setting(self):
        """
        Get the setting.
        """
        return self.setting

    def get_seed(self):
        """
        Get the seed.
        """
        return self.seed

    def get_labeled_instances(self):
        """
        Get the indices of the labeled instances.
        """
        return self.labeled_indices

    def get_test_indices(self):
        """
        Get the test indices.
        """
        return self.test_indices

    def get_labeled_train_data(self):
        """
        Get the labeled training data (X and y).
        """
        return self.X[self.labeled_indices], self.y[self.labeled_indices]

    def get_unlabeled_train_data(self):
        """
        Get the unlabeled training data (X and y), i.e., all rows that are neither labeled nor test data.
        """
        # Concatenate explicitly: ``+`` joins lists but adds numpy arrays element-wise, which would mask the
        # wrong rows when the indices were passed in as arrays.
        taken = np.concatenate(
            [np.asarray(self.labeled_indices, dtype=int), np.asarray(self.test_indices, dtype=int)]
        )
        mask = np.ones(len(self.X), dtype=bool)
        mask[taken] = False
        return self.X[mask], self.y[mask]

    def get_test_data(self):
        """
        Get the test data (X and y).
        """
        return self.X[self.test_indices], self.y[self.test_indices]

    def get_data_split(self):
        """
        Get labeled, unlabeled and test data.
        """
        X_l, y_l = self.get_labeled_train_data()
        X_u, y_u = self.get_unlabeled_train_data()
        X_test, y_test = self.get_test_data()
        return X_l, y_l, X_u, y_u, X_test, y_test

    def __repr__(self):
        # leave out the data arrays; filtering (instead of pop) also works when they are not set
        params = {key: value for key, value in self.__dict__.items() if key not in ("X", "y")}
        return "<ActiveLearningScenario> " + str(params)