# -*- coding: utf-8 -*-
"""
@author : Zhaoqing Liu
@email : Zhaoqing.Liu-1@student.uts.edu.au
"""
import math
from abc import ABCMeta, abstractmethod
import numpy as np
# =============================================================================
# Functions for Classification
# =============================================================================
# # For non-fuzzy decision trees
# def calculate_entropy(y):
#     """
#     Calculate the entropy of y.
#     """
#     entropy = 0
#
#     log2 = lambda x: math.log(x) / math.log(2)
#
#     unique_labels = np.unique(y)
#     for label in unique_labels:
#         count = len(y[y == label])
#         p = count / len(y)
#         entropy += -p * log2(p)
#
#     return entropy
#
#
# # For non-fuzzy decision trees
# def calculate_gini_index(y):
#     """
#     Calculate the Gini index of y.
#     """
#     diffsum = 0
#     for i, yi in enumerate(y[:-1], 1):
#         diffsum += np.sum(np.abs(yi - y[i:]))
#     return diffsum / (len(y) ** 2 * np.mean(y))
#
#
# # For non-fuzzy decision trees
# def calculate_gini(y):
#     """
#     Calculate the Gini impurity of y.
#     """
#     # Implementation based on the 1st Formula:
#     # diff = 0
#     # unique_labels = np.unique(y)
#     # for label in unique_labels:
#     #     count = len(y[y == label])
#     #     p = count / len(y)
#     #     diff += p * p
#     #
#     # return 1 - diff
#
#     # Implementation based on the 2nd Formula:
#     gini = 0
#     unique_labels = np.unique(y)
#     for label in unique_labels:
#         count = len(y[y == label])
#         p = count / len(y)
#         gini += p * (1 - p)
#
#     return gini


# For fuzzy decision trees
def calculate_entropy(y, dm=None):
    """
    Calculate the entropy of y.

    Parameters
    ----------
    y : array-like of shape (n_samples,)
        Class labels.
    dm : array-like, default=None
        Membership degrees aligned row-wise with y. When given, label
        probabilities are weighted by total membership instead of counts.
    """
    entropy = 0
    log2 = lambda x: math.log(x) / math.log(2)
    unique_labels = np.unique(y)
    for label in unique_labels:
        if dm is not None:
            # Weight the label's probability by the total membership
            # degree of the samples that carry this label.
            sum_sub_dm = np.sum(dm[np.where(y == label)[0], :])
            p = sum_sub_dm / np.sum(dm)
            entropy += -p * log2(p)
        else:
            count = len(y[y == label])
            p = count / len(y)
            entropy += -p * log2(p)
    return entropy
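
# A minimal usage sketch (illustrative, not from the original module). The
# crisp call counts labels; the fuzzy call weights each label by the
# membership degrees in `dm` (made-up values below):
#
#   y = np.array([0, 0, 1, 1])
#   calculate_entropy(y)                           # -> 1.0 (balanced classes)
#   dm = np.array([[0.9], [0.8], [0.1], [0.2]])
#   calculate_entropy(y, dm)                       # -> ~0.61 (skewed by membership)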


# For fuzzy decision trees
def calculate_gini(y, dm=None):
    """
    Calculate the Gini impurity of y.

    Parameters
    ----------
    y : array-like of shape (n_samples,)
        Class labels.
    dm : array-like, default=None
        Membership degrees aligned row-wise with y. When given, label
        probabilities are weighted by total membership instead of counts.
    """
    # Implementation based on the 1st Formula, 1 - sum(p^2):
    # diff = 0
    # unique_labels = np.unique(y)
    # for label in unique_labels:
    #     count = len(y[y == label])
    #     p = count / len(y)
    #     diff += p * p
    #
    # return 1 - diff

    # Implementation based on the 2nd Formula, sum(p * (1 - p)):
    gini = 0
    unique_labels = np.unique(y)
    for label in unique_labels:
        if dm is not None:
            sum_sub_dm = np.sum(dm[np.where(y == label)[0], :])
            p = sum_sub_dm / np.sum(dm)
            gini += p * (1 - p)
        else:
            count = len(y[y == label])
            p = count / len(y)
            gini += p * (1 - p)
    return gini
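
# Illustrative sketch with the same made-up inputs as above: Gini impurity
# peaks at 0.5 for two balanced classes and drops as the membership degrees
# concentrate on one label:
#
#   y = np.array([0, 0, 1, 1])
#   calculate_gini(y)                              # -> 0.5
#   dm = np.array([[0.9], [0.8], [0.1], [0.2]])
#   calculate_gini(y, dm)                          # -> ~0.255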


def calculate_impurity_gain(y, sub_y_1, sub_y_2, criterion_func,
                            p_subset_true_dm=None, p_subset_false_dm=None):
    """
    Calculate the impurity gain, which is equal to the impurity of y
    minus the weighted impurities of sub_y_1 and sub_y_2.
    """
    impurity = criterion_func(y)
    if p_subset_true_dm is not None and p_subset_false_dm is not None:
        # Fuzzy case: the last column of each subset holds the labels and
        # the preceding columns hold the membership degrees.
        information_gain = (impurity
                            - p_subset_true_dm * criterion_func(sub_y_1[:, -1], sub_y_1[:, :-1])
                            - p_subset_false_dm * criterion_func(sub_y_2[:, -1], sub_y_2[:, :-1]))
    else:
        p_1 = len(sub_y_1) / len(y)
        p_2 = len(sub_y_2) / len(y)
        information_gain = impurity - (p_1 * criterion_func(sub_y_1)) - (p_2 * criterion_func(sub_y_2))
    return information_gain
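
# Illustrative sketch (crisp case): a perfect split removes all impurity,
# so the gain equals the parent impurity:
#
#   y = np.array([0, 0, 1, 1])
#   sub_y_1, sub_y_2 = np.array([0, 0]), np.array([1, 1])
#   calculate_impurity_gain(y, sub_y_1, sub_y_2, calculate_entropy)   # -> 1.0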


def calculate_impurity_gain_ratio(y, sub_y_1, sub_y_2, X_sub, criterion_func,
                                  p_subset_true_dm=None, p_subset_false_dm=None):
    """
    Calculate the impurity gain ratio, i.e. the impurity gain of the split
    normalised by the intrinsic value of the splitting feature.
    """
    information_gain = calculate_impurity_gain(
        y=y, sub_y_1=sub_y_1, sub_y_2=sub_y_2, criterion_func=criterion_func,
        p_subset_true_dm=p_subset_true_dm, p_subset_false_dm=p_subset_false_dm)
    intrinsic_value = criterion_func(X_sub)
    information_gain_ratio = information_gain / intrinsic_value
    return information_gain_ratio
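
# Illustrative sketch (crisp case): the gain from the previous example is
# normalised by the entropy of the splitting feature's values, here a
# made-up binary column X_sub:
#
#   y = np.array([0, 0, 1, 1])
#   X_sub = np.array([0, 0, 1, 1])
#   calculate_impurity_gain_ratio(y, np.array([0, 0]), np.array([1, 1]),
#                                 X_sub, calculate_entropy)            # -> 1.0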


def calculate_value_by_majority_vote(y):
    """
    Calculate the value by majority vote.

    Attention
    ---------
    Used in classification decision trees.
    """
    majority_value = None
    max_count = 0
    unique_labels = np.unique(y)
    for label in unique_labels:
        count = len(y[y == label])
        if count > max_count:
            majority_value = label
            max_count = count
    return majority_value
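
# Illustrative sketch: ties are broken in favour of the label encountered
# first, because np.unique() returns labels in ascending order and the
# comparison above is strict:
#
#   calculate_value_by_majority_vote(np.array([0, 1, 1, 2]))   # -> 1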
# =============================================================================
# Functions for Regression
# =============================================================================
def calculate_mse(y_true, y_pred):
    """
    Calculate the Mean Squared Error between y_true and y_pred.
    """
    mse = np.mean(np.power(y_true - y_pred, 2))
    return mse


def calculate_mae(y_true, y_pred):
    """
    Calculate the Mean Absolute Error between y_true and y_pred.
    """
    mae = np.mean(np.abs(y_true - y_pred))
    return mae
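
# Illustrative sketch with made-up predictions:
#
#   y_true = np.array([1.0, 2.0, 3.0])
#   y_pred = np.array([1.0, 3.0, 5.0])
#   calculate_mse(y_true, y_pred)   # -> ~1.67, i.e. (0 + 1 + 4) / 3
#   calculate_mae(y_true, y_pred)   # -> 1.0, i.e. (0 + 1 + 2) / 3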


def calculate_variance(y):
    """
    Calculate the variance of y, one value per column (label).
    """
    mean = np.ones(np.shape(y)) * y.mean(0)
    n_samples = np.shape(y)[0]
    # .T transposes the matrix; the diagonal of (y - mean).T.dot(y - mean)
    # holds the per-column sums of squared deviations.
    variance = (1 / n_samples) * np.diag((y - mean).T.dot(y - mean))
    return variance


def calculate_standard_deviation(y):
    """
    Calculate the standard deviation of y.
    """
    std_dev = np.sqrt(calculate_variance(y))
    return std_dev
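
# Illustrative sketch: y is a column matrix, so the result is a length-1
# array holding the population variance (ddof=0) of that column:
#
#   y = np.array([[1.0], [2.0], [3.0]])
#   calculate_variance(y)             # -> ~array([0.6667])
#   calculate_standard_deviation(y)   # -> ~array([0.8165])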


def calculate_variance_reduction(y, sub_y_1, sub_y_2, criterion_func,
                                 p_subset_true_dm=None, p_subset_false_dm=None):
    """
    Calculate the variance reduction, which is equal to the variance of y
    minus the weighted variances of sub_y_1 and sub_y_2.
    """
    var = criterion_func(y)
    # The last column of each subset holds the target values.
    var_1 = criterion_func(np.expand_dims(sub_y_1[:, -1], axis=1))
    var_2 = criterion_func(np.expand_dims(sub_y_2[:, -1], axis=1))
    if p_subset_true_dm is not None and p_subset_false_dm is not None:
        p_1 = p_subset_true_dm
        p_2 = p_subset_false_dm
    else:
        p_1 = len(sub_y_1) / len(y)
        p_2 = len(sub_y_2) / len(y)
    # Calculate the variance reduction.
    variance_reduction = var - (p_1 * var_1 + p_2 * var_2)
    return sum(variance_reduction)
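
# Illustrative sketch (crisp case): splitting the made-up targets at the
# midpoint removes most of the variance. The subsets carry the target in
# their last column, matching the slicing above:
#
#   y = np.array([[1.0], [2.0], [3.0], [4.0]])
#   sub_y_1 = np.array([[1.0], [2.0]])
#   sub_y_2 = np.array([[3.0], [4.0]])
#   calculate_variance_reduction(y, sub_y_1, sub_y_2, calculate_variance)  # -> 1.0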


def calculate_mean_value(y):
    """
    Calculate the mean of y.

    Parameters
    ----------
    y : array-like of shape (n_samples, n_labels)

    Returns
    -------
    value : array-like with the shape reduced by one dimension,
        at least a 0-d float number
        The mean values.
    """
    value = np.mean(y, axis=0)
    # np.mean collapses the sample axis; guard against a 0-d result
    # (which has no len()) when y is one-dimensional.
    if np.ndim(value) == 0:
        return float(value)
    return value if len(value) > 1 else value[0]
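
# Illustrative sketch: multi-label input keeps one mean per label, while a
# single-label column collapses to a scalar:
#
#   calculate_mean_value(np.array([[1.0, 2.0], [3.0, 4.0]]))   # -> array([2., 3.])
#   calculate_mean_value(np.array([[1.0], [3.0]]))             # -> 2.0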
# =============================================================================
# Statistical functions
# =============================================================================
def calculate_proba(y):
    """
    Calculate the probability of each unique label in y.

    Attention
    ---------
    Before counting, the labels are reordered from smallest to largest
    (np.unique returns them sorted), so the returned probabilities
    follow that order.

    Parameters
    ----------
    y : array-like of shape (n_samples,)
    """
    prob_list = []
    label_values = np.unique(y)
    for label in label_values:
        prob_list.append(np.sum(y == label) / np.shape(y)[0])
    # If the number of dimensions of y is greater than 1, the following
    # np.bincount-based method may raise
    # "ValueError: object too deep for desired array".
    # if len(np.shape(y)) > 1:
    #     y = np.squeeze(y)
    # dist = np.bincount(y)
    # for count in dist:
    #     print(count / np.shape(y)[0])
    return prob_list
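
# Illustrative sketch: the probabilities follow the ascending label order
# returned by np.unique():
#
#   calculate_proba(np.array([2, 0, 0, 1]))   # -> [0.5, 0.25, 0.25]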
# =============================================================================
# Loss functions
# =============================================================================
class LossFunction(metaclass=ABCMeta):
    """
    Base loss function class that encapsulates all base functions to be
    inherited by all derived function classes.

    Warnings
    --------
    This class should not be used directly.
    Use derived classes instead.
    """

    @abstractmethod
    def loss(self, y, y_pred):
        pass

    @abstractmethod
    def gradient(self, y, y_pred):
        pass


class LeastSquaresFunction(LossFunction):
    """
    Loss function class used in a gradient boosting regressor
    (Friedman et al., 1998; Friedman, 2001).
    """

    def loss(self, y, y_pred):
        """The loss function is a least-squares equation: L(y, F) = (y - F) ^ 2 / 2."""
        return 0.5 * np.power((y - y_pred), 2)

    def gradient(self, y, y_pred):
        # The gradient of the least-squares loss with respect to F is
        # -(y - F); its negation is the residual fitted at each stage.
        return -(y - y_pred)
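
# Illustrative sketch: the negative gradient of this loss is the residual
# y - y_pred, which a gradient boosting regressor fits at each stage:
#
#   lsf = LeastSquaresFunction()
#   lsf.loss(np.array([3.0]), np.array([1.0]))       # -> array([2.])
#   -lsf.gradient(np.array([3.0]), np.array([1.0]))  # -> array([2.]) (residual)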


class SoftLeastSquaresFunction(LossFunction):
    """
    Loss function class used in a gradient boosting classifier
    (Friedman et al., 1998; Friedman, 2001).
    """

    def loss(self, y, y_pred):
        """
        The loss function (the least-squares equation
        L(y, F) = (y - F) ^ 2 / 2) is not applicable in classification.
        """
        pass

    def gradient(self, y, proba):
        # The pseudo-residual used as the next boosting target: the
        # difference between the labels and the predicted probabilities.
        return y - proba
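
# Illustrative sketch (assuming y is encoded so it is comparable with the
# predicted probabilities, e.g. one-hot labels):
#
#   slf = SoftLeastSquaresFunction()
#   slf.gradient(np.array([1.0, 0.0]), np.array([0.7, 0.3]))   # -> array([ 0.3, -0.3])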
# =============================================================================
# Functions for Bagging Ensembles
# =============================================================================
def majority_vote(y_preds):
    """
    Get the final classification result by the majority voting method.

    Parameters
    ----------
    y_preds : array-like of shape (n_samples, n_estimators)
        NB: The input array needs to be of integer dtype, otherwise a
        TypeError is raised.

    Returns
    -------
    array-like of shape (n_samples,)
    """
    y_pred = []
    for y_p in y_preds:
        # np.bincount() counts the occurrences of each value in an array
        # of non-negative ints; argmax() returns the index of the maximum
        # count, i.e. the most frequent prediction.
        y_pred.append(np.bincount(y_p.astype("int")).argmax())
    return y_pred
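
# Illustrative sketch: each row collects one sample's predictions from all
# estimators, and the most frequent label wins:
#
#   majority_vote(np.array([[0, 1, 1], [2, 2, 0]]))   # -> [1, 2]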


def mean_value(y_preds):
    """
    Get the final regression result by the averaging method.

    Parameters
    ----------
    y_preds : array-like of shape (n_samples, n_estimators, n_labels)

    Returns
    -------
    y_pred : array-like of shape (n_samples, n_labels) reduced by one
        dimension, at least array-like of shape (n_samples,)
    """
    y_pred = []
    for y_p in y_preds:
        y_pred.append(calculate_mean_value(y_p))
    return np.array(y_pred)
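
# Illustrative sketch: with a single label per prediction, the per-sample
# averages collapse to scalars (see calculate_mean_value above):
#
#   y_preds = np.array([[[1.0], [2.0], [3.0]],
#                       [[4.0], [5.0], [6.0]]])
#   mean_value(y_preds)   # -> array([2., 5.])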