Customize resampling target

This example demonstrates how to customize the resampling target to achieve advanced resampling control. This can be easily done by setting the ``target_label`` and ``n_target_samples`` parameters when calling the ``fit()`` method.

Note that this feature only applies to resampling-based ensemble classifiers that are iteratively trained.

This example uses: ``SelfPacedEnsembleClassifier`` (under-sampling-based) and ``SMOTEBoostClassifier`` (over-sampling-based).

# Authors: Zhining Liu <zhining.liu@outlook.com>
# License: MIT
print(__doc__)  # echo the example's module docstring when run as a script

# Import imbalanced_ensemble
import imbalanced_ensemble as imbens

# Import utilities
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from imbalanced_ensemble.ensemble.base import sort_dict_by_key
from collections import Counter

# Import plot utilities
from imbalanced_ensemble.utils._plot import set_ax_border
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('talk')

# Seed shared by data generation, splitting, and the ensemble classifiers
# so the whole example is reproducible.
RANDOM_STATE = 42

# Shared constructor arguments: a single base estimator keeps each fit's
# resampling result easy to inspect.
init_kwargs = dict(
    n_estimators=1,
    random_state=RANDOM_STATE,
)
# Shared fit() arguments: keep the training log terse (no per-round metrics).
fit_kwargs = dict(
    train_verbose=dict(print_metrics=False),
)

# sphinx_gallery_thumbnail_number = -1

Prepare data

Make a toy 3-class imbalanced classification task.

# Generate and split a synthetic dataset
# Generate and split a synthetic dataset
# 3 classes with a 1:3:6 imbalance (weights 0.1/0.3/0.6 over 2000 samples);
# flip_y=0 keeps the labels noise-free so the class counts are exact.
X, y = make_classification(n_classes=3, n_samples=2000, class_sep=2,
    weights=[0.1, 0.3, 0.6], n_informative=3, n_redundant=1, flip_y=0,
    n_features=20, n_clusters_per_class=2, random_state=RANDOM_STATE)
# Stratified 50/50 split preserves the 100/300/600 class ratio in both halves.
X_train, X_valid, y_train, y_valid = train_test_split(X, y,
    test_size=0.5, stratify=y, random_state=RANDOM_STATE)

# Print class distribution
print('Training dataset distribution    %s' % sort_dict_by_key(Counter(y_train)))
print('Validation dataset distribution  %s' % sort_dict_by_key(Counter(y_valid)))

Out:

Training dataset distribution    {0: 100, 1: 300, 2: 600}
Validation dataset distribution  {0: 100, 1: 300, 2: 600}

Implement some plot utilities

# Shared y-axis limit so every distribution plot is directly comparable
# (slightly above the largest class size of 600).
ylim = (0, 630)

# Collects {plot title: class distribution} for the final comparison figure.
all_distribution = dict()

def plot_class_distribution(distr:dict, xlabel:str='Class Label',
                            ylabel:str='Number of samples', **kwargs):
    """Bar-plot a {class label: sample count} dict on a seaborn axis.

    Labels are sorted in descending order before plotting; extra keyword
    arguments (e.g. ``ax``, ``palette``) are forwarded to ``sns.barplot``.
    Returns the matplotlib axis the bars were drawn on.
    """
    ordered = dict(sorted(distr.items(), key=lambda item: item[0], reverse=True))
    labels = list(ordered.keys())
    counts = list(ordered.values())
    ax = sns.barplot(x=labels, y=counts, order=labels, **kwargs)
    set_ax_border(ax)
    ax.grid(axis='y', alpha=0.5, ls='-.')
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    return ax

def plot_class_distribution_comparison(clf,
                                       title1='Original imbalanced class distribution',
                                       title2='After resampling', figsize=(12, 6)):
    """Draw a fitted classifier's class distribution before and after
    resampling side by side (``origin_distr_`` left, ``target_distr_`` right)."""
    fig, axes = plt.subplots(1, 2, figsize=figsize)
    panels = [(clf.origin_distr_, title1), (clf.target_distr_, title2)]
    for axis, (distribution, title) in zip(axes, panels):
        plot_class_distribution(distribution, ax=axis)
        axis.set(ylim=ylim, title=title)
    fig.tight_layout()

Default under-sampling

By default, under-sampling-based ensemble methods will consider the smallest class as the minority class (class 0 with 100 samples). All other classes (class 1 and 2) will be considered as majority classes and will be under-sampled until the number of samples is equalized.

Take SelfPacedEnsembleClassifier as example

spe_clf = imbens.ensemble.SelfPacedEnsembleClassifier(**init_kwargs)

Train with the default under-sampling setting

# Fit with no resampling target: every class is under-sampled down to the
# size of the smallest class (class 0, 100 samples — see the table below).
spe_clf.fit(X_train, y_train, **fit_kwargs)

# origin_distr_ / target_distr_ hold the class counts before / after resampling
all_distribution['Before under-sampling'] = spe_clf.origin_distr_
resampling_type='After default under-sampling'
all_distribution[resampling_type] = spe_clf.target_distr_
plot_class_distribution_comparison(spe_clf, title2=resampling_type)
Original imbalanced class distribution, After default under-sampling

Out:

┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃             ┃                          ┃
┃ #Estimators ┃    Class Distribution    ┃
┃             ┃                          ┃
┣━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━┫
┃      1      ┃ {0: 100, 1: 100, 2: 100} ┃
┣━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━┫
┃    final    ┃ {0: 100, 1: 100, 2: 100} ┃
┗━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━━━━━━┛

Specify the class targeted by the under-sampling

Set parameter ``target_label``: int All other classes that have more samples than the target class will be considered as majority classes. They will be under-sampled until the number of samples is equalized. The remaining minority classes (if any) will stay unchanged.

# Refit with class 1 as the explicit under-sampling target: classes larger
# than class 1 shrink to its size (300); smaller classes are left alone.
spe_clf.fit(X_train, y_train,
            target_label=1,  # target class 1
            **fit_kwargs)

label = 'After under-sampling (target class 1)'
all_distribution[label] = spe_clf.target_distr_
plot_class_distribution_comparison(spe_clf, title2=label)
Original imbalanced class distribution, After under-sampling (target class 1)

Out:

┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃             ┃                          ┃
┃ #Estimators ┃    Class Distribution    ┃
┃             ┃                          ┃
┣━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━┫
┃      1      ┃ {0: 100, 1: 300, 2: 300} ┃
┣━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━┫
┃    final    ┃ {0: 100, 1: 300, 2: 300} ┃
┗━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━━━━━━┛

Specify the desired number of samples after under-sampling

Set parameter ``n_target_samples``: int or dict. If int, all classes that have more than ``n_target_samples`` samples will be under-sampled until the number of samples is equalized.

# Refit with an integer target: every class above 200 samples is
# under-sampled down to exactly 200; smaller classes stay unchanged.
spe_clf.fit(X_train, y_train,
            n_target_samples=200,  # target number of samples 200
            **fit_kwargs)

label = 'After under-sampling (target number 200)'
all_distribution[label] = spe_clf.target_distr_
plot_class_distribution_comparison(spe_clf, title2=label)
Original imbalanced class distribution, After under-sampling (target number 200)

Out:

┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃             ┃                          ┃
┃ #Estimators ┃    Class Distribution    ┃
┃             ┃                          ┃
┣━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━┫
┃      1      ┃ {0: 100, 1: 200, 2: 200} ┃
┣━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━┫
┃    final    ┃ {0: 100, 1: 200, 2: 200} ┃
┗━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━━━━━━┛

Specify the desired number of samples of each class after under-sampling

Set parameter ``n_target_samples``: int or dict If dict, the keys correspond to the targeted classes. The values correspond to the desired number of samples for each targeted class.

# Refit with a dict target: each class gets its own desired sample count.
per_class_targets = {0: 80, 1: 200, 2: 400}
spe_clf.fit(X_train, y_train,
            n_target_samples=per_class_targets,
            **fit_kwargs)

label = 'After under-sampling \n(target number {0: 80, 1: 200, 2: 400})'
all_distribution[label] = spe_clf.target_distr_
plot_class_distribution_comparison(spe_clf, title2=label)
Original imbalanced class distribution, After under-sampling  (target number {0: 80, 1: 200, 2: 400})

Out:

┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃             ┃                          ┃
┃ #Estimators ┃    Class Distribution    ┃
┃             ┃                          ┃
┣━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━┫
┃      1      ┃ {0: 80, 1: 200, 2: 400}  ┃
┣━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━┫
┃    final    ┃ {0: 80, 1: 200, 2: 400}  ┃
┗━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━━━━━━┛

Over-sampling

By default, over-sampling-based ensemble methods will consider the largest class as the majority class (class 2 with 600 samples). All other classes (class 0 and 1) will be considered as minority classes and will be over-sampled until the number of samples is equalized.

The over-sampling schedule can be customized in the same way as under-sampling.

Take SMOTEBoostClassifier as example

smoteboost_clf = imbens.ensemble.SMOTEBoostClassifier(**init_kwargs)

Train with the default over-sampling setting

# Default over-sampling: with no target given, every class is inflated to
# match the largest class (class 2, 600 samples — see the table below).
smoteboost_clf.fit(X_train, y_train, **fit_kwargs)

label = 'After default over-sampling'
all_distribution['Before over-sampling'] = smoteboost_clf.origin_distr_
all_distribution[label] = smoteboost_clf.target_distr_
plot_class_distribution_comparison(smoteboost_clf, title2=label)
Original imbalanced class distribution, After default over-sampling

Out:

┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃             ┃                          ┃
┃ #Estimators ┃    Class Distribution    ┃
┃             ┃                          ┃
┣━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━┫
┃      1      ┃ {0: 600, 1: 600, 2: 600} ┃
┣━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━┫
┃    final    ┃ {0: 600, 1: 600, 2: 600} ┃
┗━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━━━━━━┛

Specify the class targeted by the over-sampling

# Over-sampling with class 1 as the target: classes smaller than class 1
# are raised to its size (300); larger classes are left untouched.
smoteboost_clf.fit(X_train, y_train,
                   target_label=1,  # target class 1
                   **fit_kwargs)

label = 'After over-sampling (target class 1)'
all_distribution[label] = smoteboost_clf.target_distr_
plot_class_distribution_comparison(smoteboost_clf, title2=label)
Original imbalanced class distribution, After over-sampling (target class 1)

Out:

┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃             ┃                          ┃
┃ #Estimators ┃    Class Distribution    ┃
┃             ┃                          ┃
┣━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━┫
┃      1      ┃ {0: 300, 1: 300, 2: 600} ┃
┣━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━┫
┃    final    ┃ {0: 300, 1: 300, 2: 600} ┃
┗━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━━━━━━┛

Specify the desired number of samples after over-sampling

# Integer target for over-sampling: every class below 400 samples is
# raised to exactly 400; larger classes stay unchanged.
smoteboost_clf.fit(X_train, y_train,
                   n_target_samples=400,  # target number of samples 400
                   **fit_kwargs)

label = 'After over-sampling (target number 400)'
all_distribution[label] = smoteboost_clf.target_distr_
plot_class_distribution_comparison(smoteboost_clf, title2=label)
Original imbalanced class distribution, After over-sampling (target number 400)

Out:

┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃             ┃                          ┃
┃ #Estimators ┃    Class Distribution    ┃
┃             ┃                          ┃
┣━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━┫
┃      1      ┃ {0: 400, 1: 400, 2: 600} ┃
┣━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━┫
┃    final    ┃ {0: 400, 1: 400, 2: 600} ┃
┗━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━━━━━━┛

Specify the desired number of samples of each class after over-sampling

# Dict target for over-sampling: each class gets its own desired count.
per_class_targets = {0: 200, 1: 400, 2: 600}
smoteboost_clf.fit(X_train, y_train,
                   n_target_samples=per_class_targets,
                   **fit_kwargs)

label = 'After over-sampling \n(target number {0: 200, 1: 400, 2: 600})'
all_distribution[label] = smoteboost_clf.target_distr_
plot_class_distribution_comparison(smoteboost_clf, title2=label)
Original imbalanced class distribution, After over-sampling  (target number {0: 200, 1: 400, 2: 600})

Out:

┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃             ┃                          ┃
┃ #Estimators ┃    Class Distribution    ┃
┃             ┃                          ┃
┣━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━┫
┃      1      ┃ {0: 200, 1: 400, 2: 600} ┃
┣━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━┫
┃    final    ┃ {0: 200, 1: 400, 2: 600} ┃
┗━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━━━━━━┛

Visualize different resampling target

# Summarize every recorded distribution in one 2x5 grid: the top row holds
# the under-sampling variants, the bottom row the over-sampling variants.
sns.set_context('notebook')
fig, axes = plt.subplots(2, 5, figsize=(20, 8))
for axis, (name, distribution) in zip(axes.flatten(), all_distribution.items()):
    plot_class_distribution(distribution, ax=axis, palette="Blues_d")
    axis.set(ylim=ylim, title=name)
fig.tight_layout()
Before under-sampling, After default under-sampling, After under-sampling (target class 1), After under-sampling (target number 200), After under-sampling  (target number {0: 80, 1: 200, 2: 400}), Before over-sampling, After default over-sampling, After over-sampling (target class 1), After over-sampling (target number 400), After over-sampling  (target number {0: 200, 1: 400, 2: 600})

Total running time of the script: ( 2 minutes 53.371 seconds)

Estimated memory usage: 36 MB

Gallery generated by Sphinx-Gallery