# -*- coding: utf-8 -*-
"""
@author : Zhaoqing Liu
@email : Zhaoqing.Liu-1@student.uts.edu.au
"""
import math
from abc import ABCMeta, abstractmethod
import numpy as np
# =============================================================================
# Functions for Classification
# =============================================================================
# # For non-fuzzy decision trees
# def calculate_entropy(y):
#     """
#     Calculate the entropy of y.
#     """
#     entropy = 0
#
#     log2 = lambda x: math.log(x) / math.log(2)
#
#     unique_labels = np.unique(y)
#     for label in unique_labels:
#         count = len(y[y == label])
#         p = count / len(y)
#         entropy += -p * log2(p)
#
#     return entropy
#
#
# # For non-fuzzy decision trees
# def calculate_gini_index(y):
#     """
#     Calculate the Gini index of y.
#     """
#     diffsum = 0
#     for i, yi in enumerate(y[:-1], 1):
#         diffsum += np.sum(np.abs(yi - y[i:]))
#     return diffsum / (len(y) ** 2 * np.mean(y))
#
#
# # For non-fuzzy decision trees
# def calculate_gini(y):
#     """
#     Calculate the Gini impurity of y.
#     """
#     # Implementation based on the 1st Formula:
#     # diff = 0
#     # unique_labels = np.unique(y)
#     # for label in unique_labels:
#     #     count = len(y[y == label])
#     #     p = count / len(y)
#     #     diff += p * p
#     #
#     # return 1 - diff
#
#     # Implementation based on the 2nd Formula:
#     gini = 0
#     unique_labels = np.unique(y)
#     for label in unique_labels:
#         count = len(y[y == label])
#         p = count / len(y)
#         gini += p * (1 - p)
#
#     return gini


# For fuzzy decision trees
def calculate_entropy(y, dm=None):
    """
    Calculate the entropy of y.

    Parameters
    ----------
    y : array-like of shape (n_samples,)
        Class labels.
    dm : array-like, default=None
        Membership degrees aligned row-wise with y. When given, label
        probabilities are weighted by total membership instead of counts.
    """
    entropy = 0
    log2 = lambda x: math.log(x) / math.log(2)
    unique_labels = np.unique(y)
    for label in unique_labels:
        if dm is not None:
            # Weight the label's probability by the total membership
            # degree of the samples that carry this label.
            sum_sub_dm = np.sum(dm[np.where(y == label)[0], :])
            p = sum_sub_dm / np.sum(dm)
            entropy += -p * log2(p)
        else:
            count = len(y[y == label])
            p = count / len(y)
            entropy += -p * log2(p)
    return entropy
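
# A minimal usage sketch (illustrative, not from the original module). The
# crisp call counts labels; the fuzzy call weights each label by the
# membership degrees in `dm` (made-up values below):
#
#   y = np.array([0, 0, 1, 1])
#   calculate_entropy(y)                           # -> 1.0 (balanced classes)
#   dm = np.array([[0.9], [0.8], [0.1], [0.2]])
#   calculate_entropy(y, dm)                       # -> ~0.61 (skewed by membership)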


# For fuzzy decision trees
def calculate_gini(y, dm=None):
    """
    Calculate the Gini impurity of y.

    Parameters
    ----------
    y : array-like of shape (n_samples,)
        Class labels.
    dm : array-like, default=None
        Membership degrees aligned row-wise with y. When given, label
        probabilities are weighted by total membership instead of counts.
    """
    # Implementation based on the 1st Formula, 1 - sum(p^2):
    # diff = 0
    # unique_labels = np.unique(y)
    # for label in unique_labels:
    #     count = len(y[y == label])
    #     p = count / len(y)
    #     diff += p * p
    #
    # return 1 - diff

    # Implementation based on the 2nd Formula, sum(p * (1 - p)):
    gini = 0
    unique_labels = np.unique(y)
    for label in unique_labels:
        if dm is not None:
            sum_sub_dm = np.sum(dm[np.where(y == label)[0], :])
            p = sum_sub_dm / np.sum(dm)
            gini += p * (1 - p)
        else:
            count = len(y[y == label])
            p = count / len(y)
            gini += p * (1 - p)
    return gini
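
# Illustrative sketch with the same made-up inputs as above: Gini impurity
# peaks at 0.5 for two balanced classes and drops as the membership degrees
# concentrate on one label:
#
#   y = np.array([0, 0, 1, 1])
#   calculate_gini(y)                              # -> 0.5
#   dm = np.array([[0.9], [0.8], [0.1], [0.2]])
#   calculate_gini(y, dm)                          # -> ~0.255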


def calculate_impurity_gain(y, sub_y_1, sub_y_2, criterion_func,
                            p_subset_true_dm=None, p_subset_false_dm=None):
    """
    Calculate the impurity gain, which is equal to the impurity of y
    minus the weighted impurities of sub_y_1 and sub_y_2.
    """
    impurity = criterion_func(y)
    if p_subset_true_dm is not None and p_subset_false_dm is not None:
        # Fuzzy case: the last column of each subset holds the labels and
        # the preceding columns hold the membership degrees.
        information_gain = (impurity
                            - p_subset_true_dm * criterion_func(sub_y_1[:, -1], sub_y_1[:, :-1])
                            - p_subset_false_dm * criterion_func(sub_y_2[:, -1], sub_y_2[:, :-1]))
    else:
        p_1 = len(sub_y_1) / len(y)
        p_2 = len(sub_y_2) / len(y)
        information_gain = impurity - (p_1 * criterion_func(sub_y_1)) - (p_2 * criterion_func(sub_y_2))
    return information_gain
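
# Illustrative sketch (crisp case): a perfect split removes all impurity,
# so the gain equals the parent impurity:
#
#   y = np.array([0, 0, 1, 1])
#   sub_y_1, sub_y_2 = np.array([0, 0]), np.array([1, 1])
#   calculate_impurity_gain(y, sub_y_1, sub_y_2, calculate_entropy)   # -> 1.0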


def calculate_impurity_gain_ratio(y, sub_y_1, sub_y_2, X_sub, criterion_func,
                                  p_subset_true_dm=None, p_subset_false_dm=None):
    """
    Calculate the impurity gain ratio, i.e. the impurity gain of the split
    normalised by the intrinsic value of the splitting feature.
    """
    information_gain = calculate_impurity_gain(
        y=y, sub_y_1=sub_y_1, sub_y_2=sub_y_2, criterion_func=criterion_func,
        p_subset_true_dm=p_subset_true_dm, p_subset_false_dm=p_subset_false_dm)
    intrinsic_value = criterion_func(X_sub)
    information_gain_ratio = information_gain / intrinsic_value
    return information_gain_ratio
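
# Illustrative sketch (crisp case): the gain from the previous example is
# normalised by the entropy of the splitting feature's values, here a
# made-up binary column X_sub:
#
#   y = np.array([0, 0, 1, 1])
#   X_sub = np.array([0, 0, 1, 1])
#   calculate_impurity_gain_ratio(y, np.array([0, 0]), np.array([1, 1]),
#                                 X_sub, calculate_entropy)            # -> 1.0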


def calculate_value_by_majority_vote(y):
    """
    Calculate the value by majority vote.

    Attention
    ---------
    Used in classification decision trees.
    """
    majority_value = None
    max_count = 0
    unique_labels = np.unique(y)
    for label in unique_labels:
        count = len(y[y == label])
        if count > max_count:
            majority_value = label
            max_count = count
    return majority_value
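
# Illustrative sketch: ties are broken in favour of the label encountered
# first, because np.unique() returns labels in ascending order and the
# comparison above is strict:
#
#   calculate_value_by_majority_vote(np.array([0, 1, 1, 2]))   # -> 1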
# =============================================================================
# Functions for Regression
# =============================================================================
def calculate_mse(y_true, y_pred):
    """
    Calculate the Mean Squared Error between y_true and y_pred.
    """
    mse = np.mean(np.power(y_true - y_pred, 2))
    return mse


def calculate_mae(y_true, y_pred):
    """
    Calculate the Mean Absolute Error between y_true and y_pred.
    """
    mae = np.mean(np.abs(y_true - y_pred))
    return mae
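
# Illustrative sketch with made-up predictions:
#
#   y_true = np.array([1.0, 2.0, 3.0])
#   y_pred = np.array([1.0, 3.0, 5.0])
#   calculate_mse(y_true, y_pred)   # -> ~1.67, i.e. (0 + 1 + 4) / 3
#   calculate_mae(y_true, y_pred)   # -> 1.0, i.e. (0 + 1 + 2) / 3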


def calculate_variance(y):
    """
    Calculate the variance of y, one value per column (label).
    """
    mean = np.ones(np.shape(y)) * y.mean(0)
    n_samples = np.shape(y)[0]
    # .T transposes the matrix; the diagonal of (y - mean).T.dot(y - mean)
    # holds the per-column sums of squared deviations.
    variance = (1 / n_samples) * np.diag((y - mean).T.dot(y - mean))
    return variance


def calculate_standard_deviation(y):
    """
    Calculate the standard deviation of y.
    """
    std_dev = np.sqrt(calculate_variance(y))
    return std_dev
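
# Illustrative sketch: y is a column matrix, so the result is a length-1
# array holding the population variance (ddof=0) of that column:
#
#   y = np.array([[1.0], [2.0], [3.0]])
#   calculate_variance(y)             # -> ~array([0.6667])
#   calculate_standard_deviation(y)   # -> ~array([0.8165])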


def calculate_variance_reduction(y, sub_y_1, sub_y_2, criterion_func,
                                 p_subset_true_dm=None, p_subset_false_dm=None):
    """
    Calculate the variance reduction, which is equal to the variance of y
    minus the weighted variances of sub_y_1 and sub_y_2.
    """
    var = criterion_func(y)
    # The last column of each subset holds the target values.
    var_1 = criterion_func(np.expand_dims(sub_y_1[:, -1], axis=1))
    var_2 = criterion_func(np.expand_dims(sub_y_2[:, -1], axis=1))
    if p_subset_true_dm is not None and p_subset_false_dm is not None:
        p_1 = p_subset_true_dm
        p_2 = p_subset_false_dm
    else:
        p_1 = len(sub_y_1) / len(y)
        p_2 = len(sub_y_2) / len(y)
    # Calculate the variance reduction.
    variance_reduction = var - (p_1 * var_1 + p_2 * var_2)
    return sum(variance_reduction)
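
# Illustrative sketch (crisp case): splitting the made-up targets at the
# midpoint removes most of the variance. The subsets carry the target in
# their last column, matching the slicing above:
#
#   y = np.array([[1.0], [2.0], [3.0], [4.0]])
#   sub_y_1 = np.array([[1.0], [2.0]])
#   sub_y_2 = np.array([[3.0], [4.0]])
#   calculate_variance_reduction(y, sub_y_1, sub_y_2, calculate_variance)  # -> 1.0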


def calculate_mean_value(y):
    """
    Calculate the mean of y.

    Parameters
    ----------
    y : array-like of shape (n_samples, n_labels)

    Returns
    -------
    value : array-like with the shape reduced by one dimension,
        at least a 0-d float number
        The mean values.
    """
    value = np.mean(y, axis=0)
    # np.mean collapses the sample axis; guard against a 0-d result
    # (which has no len()) when y is one-dimensional.
    if np.ndim(value) == 0:
        return float(value)
    return value if len(value) > 1 else value[0]
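
# Illustrative sketch: multi-label input keeps one mean per label, while a
# single-label column collapses to a scalar:
#
#   calculate_mean_value(np.array([[1.0, 2.0], [3.0, 4.0]]))   # -> array([2., 3.])
#   calculate_mean_value(np.array([[1.0], [3.0]]))             # -> 2.0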
# =============================================================================
# Statistical functions
# =============================================================================
def calculate_proba(y):
    """
    Calculate the probability of each unique label in y.

    Attention
    ---------
    Before counting, the labels are reordered from smallest to largest
    (np.unique returns them sorted), so the returned probabilities
    follow that order.

    Parameters
    ----------
    y : array-like of shape (n_samples,)
    """
    prob_list = []
    label_values = np.unique(y)
    for label in label_values:
        prob_list.append(np.sum(y == label) / np.shape(y)[0])
    # If the number of dimensions of y is greater than 1, the following
    # np.bincount-based method may raise
    # "ValueError: object too deep for desired array".
    # if len(np.shape(y)) > 1:
    #     y = np.squeeze(y)
    # dist = np.bincount(y)
    # for count in dist:
    #     print(count / np.shape(y)[0])
    return prob_list
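
# Illustrative sketch: the probabilities follow the ascending label order
# returned by np.unique():
#
#   calculate_proba(np.array([2, 0, 0, 1]))   # -> [0.5, 0.25, 0.25]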
# =============================================================================
# Loss functions
# =============================================================================
class LossFunction(metaclass=ABCMeta):
    """
    Base loss function class that encapsulates all base functions to be
    inherited by all derived function classes.

    Warnings
    --------
    This class should not be used directly.
    Use derived classes instead.
    """

    @abstractmethod
    def loss(self, y, y_pred):
        pass

    @abstractmethod
    def gradient(self, y, y_pred):
        pass


class LeastSquaresFunction(LossFunction):
    """
    Loss function class used in a gradient boosting regressor
    (Friedman et al., 1998; Friedman, 2001).
    """

    def loss(self, y, y_pred):
        """The loss function is a least-squares equation: L(y, F) = (y - F) ^ 2 / 2."""
        return 0.5 * np.power((y - y_pred), 2)

    def gradient(self, y, y_pred):
        # The gradient of the least-squares loss with respect to F is
        # -(y - F); its negation is the residual fitted at each stage.
        return -(y - y_pred)
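
# Illustrative sketch: the negative gradient of this loss is the residual
# y - y_pred, which a gradient boosting regressor fits at each stage:
#
#   lsf = LeastSquaresFunction()
#   lsf.loss(np.array([3.0]), np.array([1.0]))       # -> array([2.])
#   -lsf.gradient(np.array([3.0]), np.array([1.0]))  # -> array([2.]) (residual)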


class SoftLeastSquaresFunction(LossFunction):
    """
    Loss function class used in a gradient boosting classifier
    (Friedman et al., 1998; Friedman, 2001).
    """

    def loss(self, y, y_pred):
        """
        The loss function (the least-squares equation
        L(y, F) = (y - F) ^ 2 / 2) is not applicable in classification.
        """
        pass

    def gradient(self, y, proba):
        # The pseudo-residual used as the next boosting target: the
        # difference between the labels and the predicted probabilities.
        return y - proba
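
# Illustrative sketch (assuming y is encoded so it is comparable with the
# predicted probabilities, e.g. one-hot labels):
#
#   slf = SoftLeastSquaresFunction()
#   slf.gradient(np.array([1.0, 0.0]), np.array([0.7, 0.3]))   # -> array([ 0.3, -0.3])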
# =============================================================================
# Functions for Bagging Ensembles
# =============================================================================
def majority_vote(y_preds):
    """
    Get the final classification result by the majority voting method.

    Parameters
    ----------
    y_preds : array-like of shape (n_samples, n_estimators)
        NB: The input array needs to be of integer dtype, otherwise a
        TypeError is raised.

    Returns
    -------
    array-like of shape (n_samples,)
    """
    y_pred = []
    for y_p in y_preds:
        # np.bincount() counts the occurrences of each value in an array
        # of non-negative ints; argmax() returns the index of the maximum
        # count, i.e. the most frequent prediction.
        y_pred.append(np.bincount(y_p.astype("int")).argmax())
    return y_pred
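
# Illustrative sketch: each row collects one sample's predictions from all
# estimators, and the most frequent label wins:
#
#   majority_vote(np.array([[0, 1, 1], [2, 2, 0]]))   # -> [1, 2]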


def mean_value(y_preds):
    """
    Get the final regression result by the averaging method.

    Parameters
    ----------
    y_preds : array-like of shape (n_samples, n_estimators, n_labels)

    Returns
    -------
    y_pred : array-like of shape (n_samples, n_labels) reduced by one
        dimension, at least array-like of shape (n_samples,)
    """
    y_pred = []
    for y_p in y_preds:
        y_pred.append(calculate_mean_value(y_p))
    return np.array(y_pred)
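
# Illustrative sketch: with a single label per prediction, the per-sample
# averages collapse to scalars (see calculate_mean_value above):
#
#   y_preds = np.array([[[1.0], [2.0], [3.0]],
#                       [[4.0], [5.0], [6.0]]])
#   mean_value(y_preds)   # -> array([2., 5.])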