# Source code for silk_ml.features

import pandas as pd
from scipy.stats import chi2_contingency, ttest_ind

from .plots import plot_categorical, plot_numerical


def split_classes(X, Y, label):
    """ Splits one variable of the dataset according to the target class

    Rows where the target equals ``1`` make up the `positive` group; every
    other row (any target value different from ``1``) makes up the
    `negative` group.

    Args:
        X (pd.DataFrame): Main dataset with the variables
        Y (pd.Series): Target variable, used as a boolean row mask over `X`
        label (str): Name of the variable to split

    Returns:
        tuple(pd.Series, pd.Series): The `positive` and `negative` data split
    """
    # Select rows and column in one .loc call rather than chaining
    # X.loc[mask][label], which builds an intermediate frame first.
    positive = X.loc[Y == 1, label]
    negative = X.loc[Y != 1, label]
    return positive, negative


def features_metrics(X, Y, target_name, plot=None):
    """ Determines the likelihood from each variable of splitting correctly the dataset

    Every column of `X` is classified as `categorical` (at most two distinct
    values) or `numerical` (anything else). Categorical columns are tested
    with `_test_categorical` and numerical ones with `_test_numerical`; the
    resulting p-value ``p`` is reported as ``100 - p * 100`` formatted as a
    percentage string with four decimals.

    Args:
        X (pd.DataFrame): Main dataset with the variables
        Y (pd.Series): Target variable
        target_name (str or None): Target name for reports
        plot ('all' or 'categorical' or 'numerical' or None): Plots the
            variables, showing the difference in the classes

    Returns:
        pd.DataFrame: Table indexed by variable name with the columns
            `cardinality kind` and `split probability`
    """
    plot_cat = plot in ('all', 'categorical')
    plot_num = plot in ('all', 'numerical')

    columns = X.columns.tolist()

    # Decide the kind of every column once and reuse the answer both for the
    # report and for choosing the statistical test.
    categorical_flags = [len(X[column].unique()) <= 2 for column in columns]

    def p_value_for(column, categorical):
        # Dispatch to the test (and plot switch) matching the column kind
        if categorical:
            return _test_categorical(X, Y, column, target_name, plot_cat)
        return _test_numerical(X, Y, column, target_name, plot_num)

    features = {
        'cardinality kind': [
            'categorical' if categorical else 'numerical'
            for categorical in categorical_flags
        ],
        'split probability': [
            f'{(100 - p_value_for(column, categorical) * 100):.4f} %'
            for column, categorical in zip(columns, categorical_flags)
        ],
    }
    return pd.DataFrame(features, index=columns)


def _test_categorical(X, Y, column, target_name, plot_cat):
    """ Runs the p-value test for the current variable
    
    Args:
        X (pd.DataFrame): Main dataset with the variables
        Y (pd.Series): Target variable
        column (str): Name of the variable to test
        target_name (str or None): Target name for reports
        plot_cat (bool): Plots the current variable
    
    Returns:
        float: p-value of the variables
    """
    if plot_cat:
        plot_categorical(X, Y, column, target_name)
    cont_table = pd.crosstab(Y, X[column], margins=False)
    test = chi2_contingency(cont_table.values)
    return test[1]


def _test_numerical(X, Y, column, target_name, plot_num):
    """ Runs the two-sample t-test between the classes of a variable

    The variable is divided with `split_classes` into its `positive` and
    `negative` groups, which are then compared with `ttest_ind`.

    Args:
        X (pd.DataFrame): Main dataset with the variables
        Y (pd.Series): Target variable
        column (str): Name of the variable to test
        target_name (str or None): Target name for reports
        plot_num (bool): Plots the current variable before testing it

    Returns:
        float: p-value of the t-test
    """
    group_pos, group_neg = split_classes(X, Y, column)
    if plot_num:
        plot_numerical(group_pos, group_neg, column, target_name)
    return ttest_ind(group_pos, group_neg).pvalue