import numpy as np
import openml
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from alpbench.benchmark.ActiveLearningSetting import ActiveLearningSetting
[docs]
def create_dataset_split(
    X, y, test_split_seed, test_split_size: float, train_split_seed, train_split_size, train_split_type, factor
):
    """Split the data into labeled, unlabeled and test parts and return the labeled and test indices.

    The data is first split (stratified by label) into a training pool and a test set. The training pool is
    then split (again stratified) into labeled and unlabeled data. The size of the labeled part is either
    relative (a share of the training pool) or absolute (a fixed number of data points). For an absolute
    split, a factor different from -1 overrides ``train_split_size`` with ``factor * number_of_classes``.
    Afterwards, for every class that is missing in the labeled data, one instance of that class is moved from
    the unlabeled pool to the labeled data, so the labeled data contains each class at least once.

    The instance that is moved is drawn with a local random generator seeded with ``train_split_seed``; the
    global NumPy random state is not modified.

    Args:
        X (numpy.ndarray): data
        y (numpy.ndarray): labels
        test_split_seed (int): seed for the test split
        test_split_size (float): size of the test data
        train_split_seed (int): seed for the train split
        train_split_size (float): size of the labeled training data
        train_split_type (str): type of the size parameter: ``"absolute"`` for a number of data points, any
            other value for a share of the (training) dataset
        factor (int): task-dependent factor; -1 disables it, and it is only used for absolute splits

    Returns:
        labeled_indices (list): indices of the labeled data
        test_indices (list): indices of the test data

    Raises:
        ValueError: if a class has no instance in the training pool, so it cannot be added to the labeled data
    """
    classes = np.unique(y)
    indices = np.arange(len(X))

    # split data into train and test; the index array is split alongside so positions in X/y can be returned
    X_train, _, y_train, _, train_indices, test_indices = train_test_split(
        X, y, indices, test_size=test_split_size, random_state=test_split_seed, stratify=y
    )

    # determine the proportion of unlabeled data, also in case the train split is given in terms of an absolute
    # number of labeled data points
    if train_split_type == "absolute":
        if factor != -1:
            train_split_size = factor * len(classes)
        unlabeled_size = 1 - train_split_size / len(X_train)
    else:
        unlabeled_size = 1 - train_split_size

    # split the training pool into labeled and unlabeled
    _, _, y_l, y_u, labeled_indices, unlabeled_indices = train_test_split(
        X_train, y_train, train_indices, test_size=unlabeled_size, random_state=train_split_seed, stratify=y_train
    )

    # make sure that each class within y is at least once in the labeled data
    for missing_class in np.setdiff1d(classes, np.unique(y_l)):
        candidates = np.flatnonzero(y_u == missing_class)
        if candidates.size == 0:
            raise ValueError(
                f"Class {missing_class!r} has no instance in the training pool and cannot be added to the "
                "labeled data"
            )
        # a fresh generator per class draws the same value as reseeding the global RNG would, without
        # touching global state
        rng = np.random.RandomState(train_split_seed)
        labeled_indices = np.append(labeled_indices, unlabeled_indices[rng.choice(candidates)])

    assert len(np.unique(y[labeled_indices])) == len(classes), "Not all classes are represented in the labeled data"
    return labeled_indices.tolist(), test_indices.tolist()
[docs]
class ActiveLearningScenario:
    """Active Learning Scenario

    The active learning scenario defines the data and the setting of one active learning setup. The scenario is
    initialized with the openml id of the dataset, the test split, train split and the seed for reproducibility, the
    setting, and optionally labeled and test indices.

    On construction the dataset is downloaded from openml, the features are ordinal-encoded, missing values are
    mean-imputed, and duplicate feature rows are removed. If the labeled or the test indices are not given, both
    are (re)computed with ``create_dataset_split``.

    Args:
        scenario_id (int): id of the scenario in the database
        openml_id (int): id of the dataset on openml
        test_split_seed (int): seed for the test split
        train_split_seed (int): seed for the train split
        seed (int): seed for reproducibility
        setting (ActiveLearningSetting): active learning setting
        labeled_indices (list): indices of the labeled data
        test_indices (list): indices of the test data

    Attributes:
        scenario_id (int): id of the scenario in the database
        openml_id (int): id of the dataset on openml
        test_split_seed (int): seed for the test split
        train_split_seed (int): seed for the train split
        seed (int): seed for reproducibility
        setting (ActiveLearningSetting): active learning setting
        labeled_indices (list): indices of the labeled data
        test_indices (list): indices of the test data
        X (numpy.ndarray): encoded, imputed and de-duplicated feature matrix
        y (numpy.ndarray): label-encoded targets belonging to the rows of X
    """

    def __init__(
        self,
        scenario_id,
        openml_id,
        test_split_seed,
        train_split_seed,
        seed,
        setting: ActiveLearningSetting,
        labeled_indices: list = None,
        test_indices: list = None,
    ):
        self.scenario_id = scenario_id
        self.openml_id = openml_id
        self.test_split_seed = test_split_seed
        self.train_split_seed = train_split_seed
        self.seed = seed
        self.labeled_indices = labeled_indices
        self.test_indices = test_indices
        self.setting = setting

        # actual data
        ds = openml.datasets.get_dataset(
            openml_id, download_data=True, download_qualities=True, download_features_meta_data=True
        )
        df = ds.get_data()[0]

        # separate the features from the target column and prepare both as numpy arrays
        X = np.array(df.drop(columns=[ds.default_target_attribute]).values)
        y = np.array(df[ds.default_target_attribute].values)

        # map non-integer labels to the position of their value in the sorted unique values
        if y.dtype != int:
            y_int = np.zeros(len(y)).astype(int)
            vals = np.unique(y)
            for i, val in enumerate(vals):
                mask = y == val
                y_int[mask] = i
            y = y_int

        # encode every feature column ordinally, then replace missing values by the column mean
        X = OrdinalEncoder().fit_transform(X)
        X = SimpleImputer(missing_values=np.nan, strategy="mean").fit_transform(X)

        # filter X for duplicates; keep the labels of the retained rows
        _, unique_indices = np.unique(X, axis=0, return_index=True)
        self.X = X[unique_indices]
        self.y = LabelEncoder().fit_transform(y)[unique_indices]

        # both index sets are recomputed as soon as one of them is missing
        if test_indices is None or labeled_indices is None:
            self.labeled_indices, self.test_indices = create_dataset_split(
                self.X,
                self.y,
                test_split_seed,
                setting.setting_test_size,
                train_split_seed,
                setting.setting_labeled_train_size,
                setting.setting_train_type,
                setting.factor,
            )

    def get_scenario_id(self):
        """
        Get the scenario id.
        """
        return self.scenario_id

    def get_openml_id(self):
        """
        Get the openml id.
        """
        return self.openml_id

    def get_setting(self):
        """
        Get the setting.
        """
        return self.setting

    def get_seed(self):
        """
        Get the seed.
        """
        return self.seed

    def get_labeled_instances(self):
        """
        Get the indices of the labeled instances.
        """
        return self.labeled_indices

    def get_test_indices(self):
        """
        Get the test indices.
        """
        return self.test_indices

    def get_labeled_train_data(self):
        """
        Get the labeled training data (X and y).
        """
        return self.X[self.labeled_indices], self.y[self.labeled_indices]

    def get_unlabeled_train_data(self):
        """
        Get the unlabeled training data (X and y), i.e., all rows that are neither labeled nor test data.

        The labeled and test indices may be lists, tuples or numpy arrays.
        """
        mask = np.ones(len(self.X), dtype=bool)
        # mask each index set on its own: "+" would add numpy arrays element-wise instead of concatenating them
        mask[np.asarray(self.labeled_indices, dtype=int)] = False
        mask[np.asarray(self.test_indices, dtype=int)] = False
        return self.X[mask], self.y[mask]

    def get_test_data(self):
        """
        Get the test data (X and y).
        """
        return self.X[self.test_indices], self.y[self.test_indices]

    def get_data_split(self):
        """
        Get labeled, unlabeled and test data.
        """
        X_l, y_l = self.get_labeled_train_data()
        X_u, y_u = self.get_unlabeled_train_data()
        X_test, y_test = self.get_test_data()
        return X_l, y_l, X_u, y_u, X_test, y_test

    def __repr__(self):
        # leave out the (potentially large) data arrays; filtering instead of popping also works on instances
        # whose data has not been set
        params = {name: value for name, value in self.__dict__.items() if name not in ("X", "y")}
        return "<ActiveLearningScenario> " + str(params)