Source code for silk_ml.classification
import pandas as pd
from .features import features_metrics
from .plots import plot_corr, plot_mainfold, plot_roc_cross_val
from .train import cross_validation
from .imbalanced import resample
class Classifier:
    """ General tasks for classification and data analysis

    Args:
        target (str or None): Categorical variable to classify
        filename (str or None): Name with path for reading a csv file
        target_name (str or None): Target name for reports

    Attributes:
        data (pd.DataFrame or None): Full table as loaded by `read_csv`;
            `None` until data has been loaded
        X (pd.DataFrame or None): `data` without the target column
        Y (pd.Series or None): Target column of `data`
        target_name (str or None): Target name for reports
    """

    def __init__(self, target=None, filename=None, target_name=None):
        pd.set_option('display.max_columns', None)
        # Initialise the data holders first: the `target` setter reads
        # `self.data`, so it must exist even when no file has been loaded.
        self.data = None
        self.X = None
        self.Y = None
        self._target = target
        self.target_name = target_name
        # Data is only loaded when both pieces of information are given.
        if filename and target:
            self.read_csv(target, filename)

    @property
    def target(self):
        """ str or None: Name of the categorical variable to classify """
        return self._target

    @target.setter
    def target(self, target):
        """ Sets the target variable and, if the data exists, the X and Y
        values are set as well

        Args:
            target (str): Categorical variable to classify
        """
        self._target = target
        if self.data is not None:
            self.Y = self.data[target]
            self.X = self.data.drop(columns=[target])

    def read_csv(self, target, filename):
        """ Reads a CSV file and separates the X and Y variables

        Args:
            target (str): Categorical variable to classify
            filename (str): Name with path for reading a csv file

        Returns:
            tuple: `X`, `Y`, and `data` values, in that order
        """
        self.data = pd.read_csv(filename)
        # Going through the property setter also fills in `X` and `Y`.
        self.target = target
        return self.X, self.Y, self.data

    def standardize(self, normalizer, scaler):
        """ Applies a normalizer and scaler preprocessing steps

        The stored `X` is not modified; the transformed values are returned.

        Args:
            normalizer (Class.fit_transform): Class that centers the data
            scaler (Class.fit_transform): Class that modifies the data
                boundaries

        Returns:
            The output of `scaler.fit_transform` applied to the normalized
            data
        """
        # Transposed so that index `i` addresses the i-th feature.
        # NOTE(review): assumes the normalizer returns a NumPy-like array
        # (positional row indexing) -- confirm against callers.
        normalized = normalizer.fit_transform(self.X).transpose()
        # Check whether any data got lost in the normalization: a feature
        # whose variance collapsed to (almost) zero is restored from the
        # original values.
        for i, column in enumerate(self.X.columns):
            if normalized[i].var() <= 1e-10:
                normalized[i] = self.X[column]
        return scaler.fit_transform(normalized.transpose())

    def features_metrics(self, plot=None):
        """ Checks for each variable the probability of being split

        Args:
            plot ('all' or 'categorical' or 'numerical' or None): Plots the
                variables, showing the difference in the classes

        Returns:
            pd.DataFrame: Table of variables and their classification tests
        """
        # Inside the method body this name resolves to the module-level
        # helper imported from `.features`, not to this method.
        return features_metrics(self.X, self.Y, self.target_name, plot)

    def remove_features(self, features):
        """ Removes features from the X values

        Args:
            features (list(str)): Column names to remove
        """
        self.X = self.X.drop(columns=features)

    def resample(self, rate=0.9, strategy='hybrid'):
        """ Sampling based methods to balance the dataset

        Replaces the stored `X` and `Y` with the resampled values.

        Args:
            rate (float): Ratio of the number of samples in the minority
                class over the number of samples in the majority class
                after resampling
            strategy ('hybrid' or 'over_sampling' or 'under_sampling'):
                Strategy to balance the dataset
        """
        # Resolves to the module-level helper imported from `.imbalanced`.
        self.X, self.Y = resample(self.X, self.Y, rate, strategy)

    def cross_validation(self, models, scores, folds=30):
        """ Validates several models and scores

        Args:
            models (list(tuple)): Models to evaluate
            scores (list(tuple)): Scores to measure the models
            folds (int): Number of folds in a (Stratified)KFold

        Returns:
            Whatever the `cross_validation` helper from `.train` returns
        """
        return cross_validation(self.X, self.Y, models, scores, folds)

    def plot_corr(self, values=True):
        """ Plots the correlation matrix of the full data

        Args:
            values (bool): Shows each of the correlation values
        """
        plot_corr(self.data, values)

    def plot_mainfold(self, method):
        """ Plots the reduced space using a manifold transformation

        Args:
            method (Class.fit_transform): Manifold transformation method
        """
        plot_mainfold(method, self.data, self.target_name)

    def plot_roc_cross_val(self, models):
        """ Plots all the models with their ROC

        Args:
            models (list(tuple)): Models to evaluate
        """
        plot_roc_cross_val(self.X, self.Y, models)