Source code for silk_ml.features

import pandas as pd
from scipy.stats import chi2_contingency, ttest_ind

from .plots import plot_categorical, plot_numerical


def split_classes(X, Y, label):
    """ Splits one variable of the dataset by the value of the target

    Rows whose target equals 1 form the `positive` group; every other row
    (any target value different from 1) forms the `negative` group.

    Args:
        X (pd.DataFrame): Main dataset with the variables
        Y (pd.Series): Target variable
        label (str): Name of the variable to split

    Returns:
        tuple(pd.Series, pd.Series): The `positive` and `negative` values
            of the requested variable
    """
    is_positive = Y == 1
    is_negative = Y != 1
    return X.loc[is_positive, label], X.loc[is_negative, label]
def features_metrics(X, Y, target_name, plot=None):
    """ Determines the likelihood from each variable of splitting correctly
    the dataset

    A variable with at most two distinct values is treated as categorical
    and tested with `_test_categorical`; any other variable is treated as
    numerical and tested with `_test_numerical`. The reported split
    probability is `100 - p_value * 100`, formatted with four decimals.

    Args:
        X (pd.DataFrame): Main dataset with the variables
        Y (pd.Series): Target variable
        target_name (str or None): Target name for reports
        plot ('all' or 'categorical' or 'numerical' or None): Plots the
            variables, showing the difference in the classes

    Returns:
        pd.DataFrame: Table of variables and their classification tests,
            indexed by variable name, with the columns `cardinality kind`
            and `split probability`
    """
    plot_cat = plot in ('all', 'categorical')
    plot_num = plot in ('all', 'numerical')
    columns = X.columns.tolist()

    # Classify every column exactly once; the result drives both the
    # reported kind and the choice of statistical test below.
    categorical_flags = [len(X[column].unique()) <= 2 for column in columns]

    def p_value_of(column, is_categorical):
        # Dispatch to the test (and plotting flag) matching the column kind
        if is_categorical:
            return _test_categorical(X, Y, column, target_name, plot_cat)
        return _test_numerical(X, Y, column, target_name, plot_num)

    kinds = [
        'categorical' if is_categorical else 'numerical'
        for is_categorical in categorical_flags
    ]
    probabilities = [
        f'{(100 - p_value_of(column, is_categorical) * 100):.4f} %'
        for column, is_categorical in zip(columns, categorical_flags)
    ]

    features = {
        'cardinality kind': kinds,
        'split probability': probabilities,
    }
    return pd.DataFrame(features, index=columns)
def _test_categorical(X, Y, column, target_name, plot_cat):
    """ Runs the p-value test for a categorical variable

    Builds the contingency table of the target against the variable and
    applies `chi2_contingency` to it.

    Args:
        X (pd.DataFrame): Main dataset with the variables
        Y (pd.Series): Target variable
        column (str): Name of the variable to test
        target_name (str or None): Target name for reports
        plot_cat (bool): Plots the current variable

    Returns:
        float: p-value of the variable
    """
    if plot_cat:
        plot_categorical(X, Y, column, target_name)

    contingency = pd.crosstab(Y, X[column], margins=False)
    # The p-value is the second element of the chi2_contingency result
    return chi2_contingency(contingency.values)[1]


def _test_numerical(X, Y, column, target_name, plot_num):
    """ Runs the p-value test for a numerical variable

    Splits the variable into its `positive` and `negative` groups with
    `split_classes` and applies `ttest_ind` to the two groups.

    Args:
        X (pd.DataFrame): Main dataset with the variables
        Y (pd.Series): Target variable
        column (str): Name of the variable to test
        target_name (str or None): Target name for reports
        plot_num (bool): Plots the current variable

    Returns:
        float: p-value of the variable
    """
    positive, negative = split_classes(X, Y, column)

    if plot_num:
        plot_numerical(positive, negative, column, target_name)

    # The p-value is the second element of the ttest_ind result
    return ttest_ind(positive, negative)[1]