Source code for interpretdl.evaluate_models.localization

import numpy as np
from sklearn.metrics import roc_auc_score, average_precision_score


def compute_scores_thresholding(gt, exp, threshold):
    ret = np.max(exp) * threshold
    binary_exp_array = exp > ret

    TP = (binary_exp_array * gt).sum()
    predict_pos = binary_exp_array.sum()
    actual_pos = gt.sum()

    precision = TP / predict_pos
    recall = TP / actual_pos
    f1_score = (2 * precision * recall) / (precision + recall + 1e-6)

    return precision, recall, f1_score


def comptue_score_general(gt, exp):
    auc_score = roc_auc_score(gt.flatten(), exp.flatten())
    ap_score = average_precision_score(gt.flatten(), exp.flatten())
    return auc_score, ap_score


[docs]class PointGame(): """Pointing Game Evaluation Method. This evaluator assumes that the explanation result should align with the visual objects. Based on this idea, the evaluation is to compute the alignment between the bounding box or semantic segmentation with the explanations. PointGame computes the alignment to the bounding box. PointGameSegmentation computes the alignment to the semantic segmentation. More details can be found in the original paper: https://arxiv.org/abs/1608.00507. Note that the bounding box of annotations is required for the evaluation. This method does not need models. For API compatibility, we implement it within the same functions as other evaluators. """ def __init__(self): pass
[docs] def evaluate(self, bbox: tuple, exp_array: np.ndarray, threshold=0.25) -> dict: """ Since the explanation is actually a ranking order, PointGame computes two categories of measures. One is based on thresholding. Here, ``threshold`` * max(``exp_array``) is used as the threshold. Based on this, precision, recall and F1 score are computed, *w.r.t.* ``bbox``. Another measure does not depend on the threshold. Here, the ROC AUC score and the Average Precision (both of them are imported from :py:mod:`sklearn.metrics`) are computed. Args: bbox (tuple): A tuple of four integers: (x1, y1, x2, y2), where (x1, y1) is the coordinates of the top-left point *w.r.t.* width and height respectively; (x2, y2) is the coordinates of the bottom-right point *w.r.t.* width and height respectively; exp_array (np.ndarray): the explanation result from an interpretation algorithm. threshold (float, optional): threshold for computing precision, recall and F1 score. Defaults to ``0.25``. Returns: dict: A dict containing ``precision``, ``recall``, ``f1_score`` and ``auc_score``, ``ap_score``, where the first three depend on the threshold and the last two do not. """ gt = np.zeros_like(exp_array, dtype=np.uint8) x1, y1, x2, y2 = bbox gt[y1:y2, x1:x2] = 1 # depends on the threshold precision, recall, f1_score = compute_scores_thresholding(gt, exp_array, threshold) r = {'precision': precision, 'recall': recall, 'f1_score': f1_score} # independ of threshold auc_score, ap_score = comptue_score_general(gt, exp_array) r.update({'auc_score': auc_score, 'ap_score': ap_score}) return r
[docs]class PointGameSegmentation(): """Pointing Game Evaluation Method using Segmentation. This evaluator assumes that the explanation result should align with the visual objects. Based on this idea, the evaluation is to compute the alignment between the bounding box or semantic segmentation with the explanations. PointGame computes the alignment to the bounding box. PointGameSegmentation computes the alignment to the semantic segmentation. More details can be found in the original paper: https://arxiv.org/abs/1608.00507. Note that the semantic segmentation is required for the evaluation. This method does not need models. For API compatibility, we implement it within the same functions as other evaluators. """ def __init__(self): pass
[docs] def evaluate(self, seg_gt: np.ndarray, exp_array: np.ndarray, threshold=0.25) -> dict: """ Since the explanation is actually a ranking order, PointGameSegmentation computes two categories of measures. One is based on thresholding. Here, ``threshold`` * max(``exp_array``) is used as the threshold. Based on this, precision, recall and F1 score are computed, *w.r.t.* ``seg_gt``. Another measure does not depend on the threshold. Here, the ROC AUC score and the Average Precision (both of them are imported from :py:mod:`sklearn.metrics`) are computed. Args: seg_gt (np.ndarray): binary values are supported only currently. exp_array (np.ndarray): the explanation result from an interpretation algorithm. threshold (float, optional): threshold for computing precision, recall and F1 score. Defaults to ``0.25``. Returns: dict: A dict containing ``precision``, ``recall``, ``f1_score`` and ``auc_score``, ``ap_score``, where the first three depend on the threshold and the last two do not. """ gt = seg_gt # depends on the threshold precision, recall, f1_score = compute_scores_thresholding(gt, exp_array, threshold) r = {'precision': precision, 'recall': recall, 'f1_score': f1_score} # independ of threshold auc_score, ap_score = comptue_score_general(gt, exp_array) r.update({'auc_score': auc_score, 'ap_score': ap_score}) return r