Source code for traval.binary_classifier

import numpy as np
import pandas as pd


class BinaryClassifier:
    """Class for calculating binary classification statistics."""

    stats_abbreviations = {
        "tp": "true positives",
        "fp": "false positives",
        "fn": "false negatives",
        "tn": "true negatives",
        "sensitivity": "sensitivity",
        "tpr": "true positive rate",
        "fnr": "false negative rate",
        "specificity": "specificity",
        "tnr": "true negative rate",
        "fpr": "false positive rate",
        "ppv": "positive predictive value",
        "npv": "negative predictive value",
        "fdr": "false discovery rate",
        "for": "false omission rate",
        "acc": "accuracy",
        "prev": "prevalence",
        "informedness": "informedness",
        "mcc": "matthews correlation coefficient",
    }

    def __init__(self, tp, fp, tn, fn):
        """Initialize class for calculating binary classification statistics.

        Parameters
        ----------
        tp : int
            number of True Positives (TP)
        fp : int
            number of False Positives (FP)
        tn : int
            number of True Negatives (TN)
        fn : int
            number of False Negatives (FN)
        """
        self.n_obs = tp + fp + tn + fn
        self.true_positives = self.tp = tp
        self.false_positives = self.fp = fp
        self.true_negatives = self.tn = tn
        self.false_negatives = self.fn = fn
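    # Illustrative usage sketch (the counts below are made up): the classifier
    # can be built directly from the four confusion-matrix counts.
    #
    #     bc = BinaryClassifier(tp=10, fp=2, tn=100, fn=5)
    #     bc.n_obs  # 117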
    @classmethod
    def from_series_comparison_relative(cls, comparison):
        """Binary Classification object from SeriesComparisonRelative object.

        Parameters
        ----------
        comparison : traval.SeriesComparisonRelative
            object comparing two timeseries with base timeseries

        Returns
        -------
        BinaryClassifier
            object for calculating binary classification statistics
        """
        n_true_positives = comparison.idx_r_flagged_in_both.size  # hit
        n_false_positives = comparison.idx_r_flagged_in_s1.size  # false alarm
        n_true_negatives = comparison.idx_r_kept_in_both.size  # correct rejections
        n_false_negatives = comparison.idx_r_flagged_in_s2.size  # miss
        return cls(n_true_positives, n_false_positives,
                   n_true_negatives, n_false_negatives)
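    # Usage sketch, assuming `comparison` is an existing
    # traval.SeriesComparisonRelative instance built elsewhere; the classmethod
    # above only reads the sizes of its index attributes.
    #
    #     bc = BinaryClassifier.from_series_comparison_relative(comparison)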
    @classmethod
    def from_confusion_matrix(cls, cmat):
        """Create BinaryClassifier from confusion matrix.

        Note
        ----
        Confusion Matrix must be passed as an np.array or pd.DataFrame
        corresponding to: [[TP, FN], [FP, TN]], like the one returned by
        `BinaryClassifier.confusion_matrix`

        Parameters
        ----------
        cmat : np.array or pd.DataFrame
            a 2x2 dataset with structure [[TP, FN], [FP, TN]]

        Returns
        -------
        BinaryClassifier
            BinaryClassifier object based on values in confusion matrix.

        See also
        --------
        BinaryClassifier.confusion_matrix : for explanation (of abbreviations)
        """
        if isinstance(cmat, pd.DataFrame):
            [tp, fn], [fp, tn] = cmat.values
        elif isinstance(cmat, np.ndarray):
            [tp, fn], [fp, tn] = cmat
        else:
            raise TypeError("Cannot parse confusion matrix of type: "
                            f"{type(cmat)}")
        return cls(tp, fp, tn, fn)
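    # Illustrative sketch (counts are made up): the matrix must be laid out as
    # [[TP, FN], [FP, TN]], as described in the Note above.
    #
    #     cmat = np.array([[10, 5],     # [TP, FN]
    #                      [2, 100]])   # [FP, TN]
    #     bc = BinaryClassifier.from_confusion_matrix(cmat)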
    def __add__(self, other):
        """Add two BinaryClassifier objects.

        Parameters
        ----------
        other : traval.BinaryClassifier
            other BinaryClassifier object

        Returns
        -------
        bc : BinaryClassifier
            new BinaryClassifier object containing sum of two objects
        """
        if isinstance(other, self.__class__):
            tp = self.true_positives + other.true_positives
            fp = self.false_positives + other.false_positives
            tn = self.true_negatives + other.true_negatives
            fn = self.false_negatives + other.false_negatives
        else:
            raise TypeError("other must be BinaryClassifier object!")
        return BinaryClassifier(tp, fp, tn, fn)
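    # Illustrative sketch (counts are made up): adding two BinaryClassifier
    # objects pools their counts, e.g. to aggregate results over several
    # series before computing statistics.
    #
    #     bc_total = BinaryClassifier(10, 2, 100, 5) + BinaryClassifier(3, 1, 50, 2)
    #     bc_total.tp  # 13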
    def confusion_matrix(self, as_array=False):
        """Calculate confusion matrix.

        Confusion matrix shows the performance of the algorithm given a
        certain truth. An abstract example of the confusion matrix:

                                |     Algorithm     |
                                |-------------------|
                                |  error  | correct |
                ------|---------|---------|---------|
                      |  error  |   TP    |   FN    |
                Truth |---------|---------|---------|
                      | correct |   FP    |   TN    |
                ------|---------|---------|---------|

        where:

        - TP: True Positives = errors correctly detected by algorithm
        - TN: True Negatives = correct values correctly not flagged by algorithm
        - FP: False Positives = correct values marked as errors by algorithm
        - FN: False Negatives = errors not detected by algorithm

        Parameters
        ----------
        as_array : bool, optional
            return data as array instead of DataFrame, by default False

        Returns
        -------
        data : pd.DataFrame or np.array
            confusion matrix
        """
        # create array with data
        data = np.zeros((2, 2), dtype=int)
        # true positives = errors correctly identified
        data[0, 0] = self.true_positives
        # true negatives = correct observations correctly left alone
        data[1, 1] = self.true_negatives
        # false negatives = seen as correct by algorithm but
        # are errors according to 'truth'
        data[0, 1] = self.false_negatives
        # false positives = identified as errors by algorithm but
        # are correct according to 'truth'
        data[1, 0] = self.false_positives

        if as_array:
            return data
        else:
            # create columns and index
            columns = pd.MultiIndex.from_product([["Algorithm"],
                                                  ["error", "correct"]])
            index = pd.MultiIndex.from_product([['"Truth"'],
                                                ["error", "correct"]])
            cmat = pd.DataFrame(index=index, columns=columns,
                                data=data, dtype=int)
            return cmat
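    # Illustrative sketch (counts are made up): the DataFrame version labels
    # the rows '"Truth"' and the columns 'Algorithm', matching the abstract
    # table in the docstring above.
    #
    #     bc = BinaryClassifier(10, 2, 100, 5)
    #     bc.confusion_matrix()               # 2x2 pd.DataFrame
    #     bc.confusion_matrix(as_array=True)  # np.array([[10, 5], [2, 100]])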
    @property
    def matthews_correlation_coefficient(self):
        """Matthews correlation coefficient (MCC).

        The MCC is in essence a correlation coefficient between the observed
        and predicted binary classifications; it returns a value between −1
        and +1. A coefficient of +1 represents a perfect prediction, 0 no
        better than random prediction and −1 indicates total disagreement
        between prediction and observation.

        Returns
        -------
        phi : float
            the Matthews correlation coefficient

        See also
        --------
        mcc : convenience method for calculating MCC
        """
        # avoid warning when dividing by 0,
        # returns NaN which is what we want
        with np.errstate(invalid='ignore'):
            phi = ((self.tp * self.tn - self.fp * self.fn) /
                   np.sqrt(float((self.tp + self.fp) *
                                 (self.tp + self.fn) *
                                 (self.tn + self.fp) *
                                 (self.tn + self.fn))))
        return phi

    @property
    def mcc(self):
        """Convenience method for calculating Matthews correlation coefficient.

        Returns
        -------
        phi : float
            the Matthews correlation coefficient

        See also
        --------
        matthews_correlation_coefficient : more information about the statistic
        """
        return self.matthews_correlation_coefficient

    @property
    def sensitivity(self):
        """Sensitivity or True Positive Rate.

        Statistic describing the ratio of true positives identified,
        which also says something about the avoidance of false negatives.

        Sensitivity = TP / (TP + FN)

        where
        - TP : True Positives
        - FN : False Negatives
        """
        tp = self.true_positives
        fn = self.false_negatives
        if tp + fn > 0:
            return tp / (tp + fn)
        else:
            return np.nan

    @property
    def specificity(self):
        """Specificity or True Negative Rate.

        Statistic describing the ratio of true negatives identified,
        which also says something about the avoidance of false positives.

        Specificity = TN / (TN + FP)

        where
        - TN : True Negatives
        - FP : False Positives
        """
        tn = self.true_negatives
        fp = self.false_positives
        if tn + fp > 0:
            return tn / (tn + fp)
        else:
            return np.nan

    @property
    def true_positive_rate(self):
        """True Positive Rate.

        Synonym for sensitivity. See sensitivity for description.
        """
        return self.sensitivity

    @property
    def true_negative_rate(self):
        """True Negative Rate.

        Synonym for specificity. See specificity for description.
        """
        return self.specificity

    @property
    def false_positive_rate(self):
        """False Positive Rate = (1 - specificity).

        FPR = FP / (FP + TN)

        where
        - FP : False Positives
        - TN : True Negatives
        """
        fp = self.false_positives
        tn = self.true_negatives
        if (fp + tn) > 0:
            return fp / (fp + tn)
        else:
            return np.nan

    @property
    def false_negative_rate(self):
        """False Negative Rate = (1 - sensitivity).

        FNR = FN / (FN + TP)

        where
        - FN : False Negatives
        - TP : True Positives
        """
        fn = self.false_negatives
        tp = self.true_positives
        if fn + tp > 0:
            return fn / (fn + tp)
        else:
            return np.nan

    @property
    def informedness(self):
        """Informedness statistic (a.k.a. Youden's J statistic).

        Measure of diagnostic performance that has a zero value when a
        diagnostic test gives the same proportion of positive results for
        groups with and without a condition, i.e. the test is useless.
        A value of 1 indicates that there are no false positives or false
        negatives, i.e. the test is perfect.

        Calculated as:

        informedness = specificity + sensitivity - 1
        """
        return self.specificity + self.sensitivity - 1.

    @property
    def accuracy(self):
        """Accuracy of binary classification.

        ACC = (TP + TN) / (TP + FP + FN + TN)

        where
        - TP : True Positives
        - TN : True Negatives
        - FP : False Positives
        - FN : False Negatives
        """
        acc = (self.tp + self.tn) / (self.tp + self.fp + self.fn + self.tn)
        return acc

    @property
    def prevalence(self):
        """Prevalence of true errors in total population.

        Prevalence = (TP + FN) / (TP + FP + FN + TN)

        where
        - TP : True Positives
        - TN : True Negatives
        - FP : False Positives
        - FN : False Negatives
        """
        prev = (self.tp + self.fn) / (self.tp + self.fp + self.fn + self.tn)
        return prev

    @property
    def positive_predictive_value(self):
        """Positive predictive value (a.k.a. precision).

        PPV = TP / (TP + FP)

        where
        - TP : True Positives
        - FP : False Positives
        """
        ppv = self.tp / (self.tp + self.fp)
        return ppv

    @property
    def negative_predictive_value(self):
        """Negative predictive value.

        NPV = TN / (TN + FN)

        where
        - TN : True Negatives
        - FN : False Negatives
        """
        if (self.tn + self.fn) > 0:
            npv = self.tn / (self.tn + self.fn)
        else:
            npv = np.nan
        return npv

    @property
    def false_discovery_rate(self):
        """False discovery rate.

        FDR = 1 - PPV = FP / (FP + TP)

        where
        - TP : True Positives
        - FP : False Positives
        """
        fdr = self.fp / (self.fp + self.tp)
        return fdr

    @property
    def false_omission_rate(self):
        """False omission rate.

        FOR = 1 - NPV = FN / (TN + FN)

        where
        - TN : True Negatives
        - FN : False Negatives
        """
        if (self.fn + self.tn) > 0:
            for_ = self.fn / (self.fn + self.tn)
        else:
            for_ = np.nan
        return for_
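    # Worked example (counts are made up: TP=10, FP=2, TN=100, FN=5), tracing
    # the formulas in the docstrings above:
    #
    #     sensitivity  = 10 / (10 + 5)     ~ 0.667
    #     specificity  = 100 / (100 + 2)   ~ 0.980
    #     informedness = 0.980 + 0.667 - 1 ~ 0.647
    #     accuracy     = (10 + 100) / 117  ~ 0.940
    #     mcc          = (10 * 100 - 2 * 5) / sqrt(12 * 15 * 102 * 105) ~ 0.713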
    def get_all_statistics(self, use_abbreviations=True):
        """Get all statistics in pandas.Series.

        Parameters
        ----------
        use_abbreviations : bool, optional
            whether to use abbreviations or full names for index,
            by default True

        Returns
        -------
        s : pandas.Series
            series containing all statistics
        """
        sdict = {}
        for k, v in self.stats_abbreviations.items():
            if use_abbreviations:
                key = k
            else:
                key = v
            sdict[key] = getattr(self, "_".join(v.split()))

        s = pd.Series(sdict)
        return s
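# Minimal end-to-end sketch (counts are made up). `get_all_statistics`
# resolves each full name in `stats_abbreviations` to the matching attribute
# or property via getattr.
#
#     bc = BinaryClassifier(tp=10, fp=2, tn=100, fn=5)
#     bc.get_all_statistics()                         # index: 'tp', 'fp', ..., 'mcc'
#     bc.get_all_statistics(use_abbreviations=False)  # index: full statistic names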