Train and predict with an ensemble classifier

This example shows the basic usage of an imbalanced_ensemble.ensemble classifier.

This example uses the SelfPacedEnsembleClassifier from imbalanced_ensemble.ensemble.

# Authors: Zhining Liu <zhining.liu@outlook.com>
# License: MIT
print(__doc__)

# Import imbalanced_ensemble
import imbalanced_ensemble as imbens

# Import utilities
from collections import Counter
import sklearn
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from imbalanced_ensemble.ensemble.base import sort_dict_by_key

# Import plot utilities
import matplotlib.pyplot as plt
from imbalanced_ensemble.utils._plot import plot_2Dprojection_and_cardinality

RANDOM_STATE = 42

Prepare & visualize the data

Make a toy 3-class imbalanced classification task and inspect its class distribution.

# Build a synthetic 3-class imbalanced dataset (class weights 0.1/0.3/0.6)
# and split it into equal-sized training and validation halves.
dataset_kwargs = dict(
    n_classes=3, n_samples=2000, class_sep=2,
    weights=[0.1, 0.3, 0.6], n_informative=3, n_redundant=1,
    flip_y=0, n_features=20, n_clusters_per_class=2,
    random_state=RANDOM_STATE,
)
X, y = make_classification(**dataset_kwargs)

# Stratified split keeps the class ratios identical in both halves.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.5, stratify=y, random_state=RANDOM_STATE)

# Project the training data to 2D and show per-class cardinality.
fig = plot_2Dprojection_and_cardinality(X_train, y_train, figsize=(8, 4))
plt.show()

# Report the sorted class distribution of each split.
print('Training dataset distribution    %s' % sort_dict_by_key(Counter(y_train)))
print('Validation dataset distribution  %s' % sort_dict_by_key(Counter(y_valid)))
Figure: the dataset (2D projection by KernelPCA) and its class distribution.

Out:

Training dataset distribution    {0: 100, 1: 300, 2: 600}
Validation dataset distribution  {0: 100, 1: 300, 2: 600}

Using ensemble classifiers in imbalanced_ensemble

Take SelfPacedEnsembleClassifier as an example

# Build a SelfPacedEnsembleClassifier with default settings and train it
# on the imbalanced training split (fit returns the fitted estimator).
clf = imbens.ensemble.SelfPacedEnsembleClassifier(
    random_state=RANDOM_STATE,
).fit(X_train, y_train)

# Predict class probabilities and hard labels on the validation split.
y_pred_proba = clf.predict_proba(X_valid)
y_pred = clf.predict(X_valid)

# Score with balanced accuracy, which weights each class equally.
balanced_acc_score = sklearn.metrics.balanced_accuracy_score(y_valid, y_pred)
print(f'SPE: ensemble of {clf.n_estimators} {clf.base_estimator_}')
print('Validation Balanced Accuracy: {:.3f}'.format(balanced_acc_score))

Out:

SPE: ensemble of 50 DecisionTreeClassifier()
Validation Balanced Accuracy: 0.986

Set the ensemble size

(parameter n_estimators: int)

# Shorthand aliases used by the remaining sections.
from imbalanced_ensemble.ensemble import SelfPacedEnsembleClassifier as SPE
from sklearn.metrics import balanced_accuracy_score

# Train a compact ensemble: n_estimators caps the number of base estimators.
clf = SPE(
    n_estimators=5, # Set ensemble size to 5
    random_state=RANDOM_STATE,
)
clf.fit(X_train, y_train)

# Evaluate on the held-out half.
y_hat = clf.predict(X_valid)
balanced_acc_score = balanced_accuracy_score(y_valid, y_hat)
print(f'SPE: ensemble of {clf.n_estimators} {clf.base_estimator_}')
print('Validation Balanced Accuracy: {:.3f}'.format(balanced_acc_score))

Out:

SPE: ensemble of 5 DecisionTreeClassifier()
Validation Balanced Accuracy: 0.979

Use different base estimator

(parameter base_estimator: estimator object)

from sklearn.svm import SVC

# Swap in a different base learner: an SVM. probability=True enables
# predict_proba on the SVC, per the sklearn SVC documentation.
svm_ensemble = SPE(
    n_estimators=5,
    base_estimator=SVC(probability=True), # Use SVM as the base estimator
    random_state=RANDOM_STATE,
)
clf = svm_ensemble.fit(X_train, y_train)

# Evaluate on the held-out half.
balanced_acc_score = balanced_accuracy_score(y_valid, clf.predict(X_valid))
print(f'SPE: ensemble of {clf.n_estimators} {clf.base_estimator_}')
print('Validation Balanced Accuracy: {:.3f}'.format(balanced_acc_score))

Out:

SPE: ensemble of 5 SVC(probability=True)
Validation Balanced Accuracy: 0.968

Enable training log

(fit() parameter train_verbose: bool, int or dict)

# Refit with a per-iteration training log printed during fit().
clf = SPE(random_state=RANDOM_STATE)
clf.fit(
    X_train, y_train,
    train_verbose=True, # Enable training log
)

Out:

┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃             ┃                          ┃            Data: train             ┃
┃ #Estimators ┃    Class Distribution    ┃               Metric               ┃
┃             ┃                          ┃  acc    balanced_acc   weighted_f1 ┃
┣━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┫
┃      1      ┃ {0: 100, 1: 100, 2: 100} ┃ 0.989      0.991          0.989    ┃
┃      5      ┃ {0: 100, 1: 100, 2: 100} ┃ 1.000      1.000          1.000    ┃
┃     10      ┃ {0: 100, 1: 100, 2: 100} ┃ 1.000      1.000          1.000    ┃
┃     15      ┃ {0: 100, 1: 100, 2: 100} ┃ 1.000      1.000          1.000    ┃
┃     20      ┃ {0: 100, 1: 100, 2: 100} ┃ 1.000      1.000          1.000    ┃
┃     25      ┃ {0: 100, 1: 100, 2: 100} ┃ 1.000      1.000          1.000    ┃
┃     30      ┃ {0: 100, 1: 100, 2: 100} ┃ 1.000      1.000          1.000    ┃
┃     35      ┃ {0: 100, 1: 100, 2: 100} ┃ 1.000      1.000          1.000    ┃
┃     40      ┃ {0: 100, 1: 100, 2: 100} ┃ 1.000      1.000          1.000    ┃
┃     45      ┃ {0: 100, 1: 100, 2: 100} ┃ 1.000      1.000          1.000    ┃
┃     50      ┃ {0: 100, 1: 100, 2: 100} ┃ 1.000      1.000          1.000    ┃
┣━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┫
┃    final    ┃ {0: 100, 1: 100, 2: 100} ┃ 1.000      1.000          1.000    ┃
┗━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛

Total running time of the script: ( 1 minutes 0.081 seconds)

Estimated memory usage: 17 MB

Gallery generated by Sphinx-Gallery