Source code for silk_ml.classification

import pandas as pd

from .features import features_metrics
from .plots import plot_corr, plot_mainfold, plot_roc_cross_val
from .train import cross_validation
from .imbalanced import resample


class Classifier:
    """ General tasks for classification and data analysis

    Args:
        target (str or None): Categorical variable to classify
        filename (str or None): Name with path for reading a csv file
        target_name (str or None): Target name for reports

    Attributes:
        data (pd.DataFrame or None): Full table, ``None`` until it is loaded
        X (pd.DataFrame or None): ``data`` without the target column
        Y (pd.Series or None): Target column of ``data``
        target_name (str or None): Target name forwarded to reports and plots
    """

    # Columns whose variance after normalization falls at or below this
    # threshold are treated as having lost their information.
    _MIN_VARIANCE = 1e-10

    def __init__(self, target=None, filename=None, target_name=None):
        # NOTE: this changes a process-wide pandas display option.
        pd.set_option('display.max_columns', None)
        # Initialise the data attributes up front: the ``target`` setter
        # reads ``self.data`` and must not fail when no file was loaded.
        self.data = None
        self.X = None
        self.Y = None
        self._target = target
        self.target_name = target_name
        if filename and target:
            self.read_csv(target, filename)

    @property
    def target(self):
        """ str or None: Name of the categorical variable to classify """
        return self._target

    @target.setter
    def target(self, target):
        """ Sets the target variable and, if the data exists, the X and Y
        values are set as well

        Args:
            target (str): Categorical variable to classify
        """
        self._target = target
        if self.data is not None:
            self.Y = self.data[target]
            self.X = self.data.drop(columns=[target])

    def read_csv(self, target, filename):
        """ Reads a CSV file and separates the X and Y variables

        Args:
            target (str): Categorical variable to classify
            filename (str): Name with path for reading a csv file

        Returns:
            tuple: `X`, `Y`, and `data` values
        """
        self.data = pd.read_csv(filename)
        # Assigning through the property also refreshes ``X`` and ``Y``.
        self.target = target
        return self.X, self.Y, self.data

    def standardize(self, normalizer, scaler):
        """ Applies a normalizer and scaler preprocessing steps

        Args:
            normalizer (Class.fit_transform): Class that centers the data
            scaler (Class.fit_transform): Class that modifies the data
                boundaries

        Returns:
            The output of ``scaler.fit_transform`` applied to the normalized
            data (``self.X`` itself is not modified)
        """
        # Work on the transpose so that each row is one feature column.
        normalized = normalizer.fit_transform(self.X).transpose()
        # Check if in the normalization any data got lost: a column whose
        # variance collapsed is replaced by its original values.
        for i, column in enumerate(self.X.columns):
            if normalized[i].var() <= self._MIN_VARIANCE:
                normalized[i] = self.X[column]
        return scaler.fit_transform(normalized.transpose())

    def features_metrics(self, plot=None):
        """ Checks for each variable the probability of being split

        Args:
            plot ('all' or 'categorical' or 'numerical' or None): Plots the
                variables, showing the difference in the classes

        Returns:
            pd.DataFrame: Table of variables and their classification tests
        """
        return features_metrics(self.X, self.Y, self.target_name, plot)

    def remove_features(self, features):
        """ Remove features from the X values

        Args:
            features (list(str)): Column's names to remove
        """
        self.X = self.X.drop(columns=features)

    def resample(self, rate=0.9, strategy='hybrid'):
        """ Sampling based methods to balance dataset

        Args:
            rate (float): Ratio of the number of samples in the minority
                class over the number of samples in the majority class
                after resampling
            strategy ('hybrid' or 'over_sampling' or 'under_sampling'):
                Strategy to balance the dataset
        """
        self.X, self.Y = resample(self.X, self.Y, rate, strategy)

    def cross_validation(self, models, scores, folds=30):
        """ Validates several models and scores

        Args:
            models (list(tuple)): Models to evaluate
            scores (list(tuple)): Scores to measure the models
            folds (int): Number of folds in a (Stratified)KFold
        """
        return cross_validation(self.X, self.Y, models, scores, folds)

    def plot_corr(self, values=True):
        """ Plots the correlation matrix

        Args:
            values (bool): Shows each of the correlation values
        """
        plot_corr(self.data, values)

    def plot_mainfold(self, method):
        """ Plots the reduced space using a manifold transformation

        Args:
            method (Class.fit_transform): Manifold transformation method
        """
        plot_mainfold(method, self.data, self.target_name)

    def plot_roc_cross_val(self, models):
        """ Plots all the models with their ROC

        Args:
            models (list(tuple)): Models to evaluate
        """
        plot_roc_cross_val(self.X, self.Y, models)