# Source code for silk_ml.imbalanced
import pandas as pd
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
def resample(X, Y, rate=0.9, strategy='hybrid'):
    """Balance a dataset with a sampling-based method.

    Args:
        X (pd.DataFrame): Main dataset with the variables.
        Y (pd.Series): Target variable.
        rate (float): Ratio of the number of samples in the minority class
            over the number of samples in the majority class after
            resampling. It is passed as ``sampling_strategy`` to the
            'hybrid' and 'over_sampling' samplers; the 'under_sampling'
            sampler is built with its defaults and does not use it.
        strategy ('hybrid' | 'over_sampling' | 'under_sampling'): Strategy
            to balance the dataset.

    Returns:
        tuple: ``(X_resampled, Y_resampled)``. ``X_resampled`` is a
        ``pd.DataFrame`` carrying the column labels of ``X``;
        ``Y_resampled`` is the target exactly as returned by the sampler's
        ``fit_resample``.

    Raises:
        KeyError: If ``strategy`` is not one of the supported names.
    """
    # Factories instead of instances: only the requested sampler is built,
    # and an invalid strategy is rejected before any sampler is constructed.
    factories = {
        'hybrid': lambda: SMOTEENN(sampling_strategy=rate),
        'over_sampling': lambda: SMOTE(sampling_strategy=rate),
        'under_sampling': lambda: EditedNearestNeighbours(),
    }
    if strategy not in factories:
        # KeyError is kept (rather than ValueError) so callers that already
        # catch the original lookup failure keep working.
        raise KeyError(
            'Unknown strategy {!r}; expected one of {}'.format(
                strategy, sorted(factories)
            )
        )

    sampler = factories[strategy]()
    cols = X.columns
    X_r, Y_r = sampler.fit_resample(X, Y)
    # Re-wrap so the result is a DataFrame labelled with the input columns.
    return pd.DataFrame(data=X_r, columns=cols), Y_r