# Source code for interpretdl.evaluate_models.localization

import numpy as np
from sklearn.metrics import roc_auc_score, average_precision_score


def compute_scores_thresholding(gt, exp, threshold):
    """Compute precision, recall and F1 of a thresholded explanation.

    The explanation is binarized by keeping the entries strictly greater than
    ``threshold * max(exp)``; the resulting mask is then compared against
    ``gt``.

    Args:
        gt: ground-truth mask (array-like), same shape as ``exp``. It is
            multiplied with the binarized explanation and summed, so it is
            expected to hold 0/1 values.
        exp: explanation values (array-like).
        threshold (float): fraction of ``max(exp)`` used as the cutoff.

    Returns:
        tuple: ``(precision, recall, f1_score)``. ``precision`` is ``0.0``
        when no entry of ``exp`` exceeds the cutoff, and ``recall`` is ``0.0``
        when ``gt`` has no positive entry, instead of the NaN that a 0/0
        division would produce.
    """
    gt = np.asarray(gt)
    exp = np.asarray(exp)

    cutoff = np.max(exp) * threshold
    binary_exp_array = exp > cutoff

    TP = (binary_exp_array * gt).sum()
    predict_pos = binary_exp_array.sum()
    actual_pos = gt.sum()

    # Guard both ratios: an all-constant explanation selects nothing
    # (predict_pos == 0) and an empty mask has no positives (actual_pos == 0).
    precision = TP / predict_pos if predict_pos > 0 else 0.0
    recall = TP / actual_pos if actual_pos > 0 else 0.0
    # The small epsilon keeps F1 finite when precision and recall are both 0.
    f1_score = (2 * precision * recall) / (precision + recall + 1e-6)

    return precision, recall, f1_score


def comptue_score_general(gt, exp):
    """Compute the threshold-free scores of an explanation.

    Both arrays are flattened and passed to the ``sklearn.metrics`` scorers,
    with ``gt`` as the labels and ``exp`` as the scores.

    Args:
        gt: ground-truth array providing ``flatten()``.
        exp: explanation array providing ``flatten()``.

    Returns:
        tuple: ``(auc_score, ap_score)``, i.e. the results of
        ``roc_auc_score`` and ``average_precision_score``.
    """
    labels = gt.flatten()
    scores = exp.flatten()

    roc_auc = roc_auc_score(labels, scores)
    avg_precision = average_precision_score(labels, scores)
    return roc_auc, avg_precision


class PointGame():
    """Pointing Game Evaluation Method.

    This evaluator assumes that the explanation result should align with the visual objects. Based on this idea, the
    evaluation is to compute the alignment between the bounding box or semantic segmentation with the explanations.

    PointGame computes the alignment to the bounding box. PointGameSegmentation computes the alignment to the
    semantic segmentation.

    More details can be found in the original paper:
    https://arxiv.org/abs/1608.00507.

    Note that the bounding box of annotations is required for the evaluation. This method does not need models. For API
    compatibility, we implement it within the same functions as other evaluators.
    """

    def __init__(self):
        # No state is needed; the evaluator works purely on its arguments.
        pass

    def evaluate(self, bbox: tuple, exp_array: np.ndarray, threshold: float = 0.25) -> dict:
        """
        Since the explanation is actually a ranking order, PointGame computes two categories of measures. One is based
        on thresholding. Here, ``threshold`` * max(``exp_array``) is used as the threshold. Based on this, precision,
        recall and F1 score are computed, *w.r.t.* ``bbox``. Another measure does not depend on the threshold. Here,
        the ROC AUC score and the Average Precision (both of them are imported from :py:mod:`sklearn.metrics`) are
        computed.

        Args:
            bbox (tuple): A tuple of four integers: (x1, y1, x2, y2), where (x1, y1) is the coordinates of the top-left
                point *w.r.t.* width and height respectively; (x2, y2) is the coordinates of the bottom-right point
                *w.r.t.* width and height respectively. The box is applied as the slice ``[y1:y2, x1:x2]``, so the
                bottom-right point itself is excluded.
            exp_array (np.ndarray): the explanation result from an interpretation algorithm.
            threshold (float, optional): threshold for computing precision, recall and F1 score. Defaults to ``0.25``.

        Returns:
            dict: A dict containing ``precision``, ``recall``, ``f1_score`` and ``auc_score``, ``ap_score``, where the
            first three depend on the threshold and the last two do not.
        """

        # Rasterize the bounding box into a binary mask shaped like the explanation.
        gt = np.zeros_like(exp_array, dtype=np.uint8)
        x1, y1, x2, y2 = bbox
        gt[y1:y2, x1:x2] = 1

        # Measures that depend on the threshold.
        precision, recall, f1_score = compute_scores_thresholding(gt, exp_array, threshold)
        r = {'precision': precision, 'recall': recall, 'f1_score': f1_score}

        # Measures that are independent of the threshold.
        auc_score, ap_score = comptue_score_general(gt, exp_array)
        r.update({'auc_score': auc_score, 'ap_score': ap_score})

        return r


class PointGameSegmentation():
    """Pointing Game Evaluation Method using Segmentation.

    This evaluator assumes that the explanation result should align with the visual objects. Based on this idea, the
    evaluation is to compute the alignment between the bounding box or semantic segmentation with the explanations.

    PointGame computes the alignment to the bounding box. PointGameSegmentation computes the alignment to the
    semantic segmentation.

    More details can be found in the original paper:
    https://arxiv.org/abs/1608.00507.

    Note that the semantic segmentation is required for the evaluation. This method does not need models. For API
    compatibility, we implement it within the same functions as other evaluators.
    """

    def __init__(self):
        # No state is needed; the evaluator works purely on its arguments.
        pass

    def evaluate(self, seg_gt: np.ndarray, exp_array: np.ndarray, threshold: float = 0.25) -> dict:
        """
        Since the explanation is actually a ranking order, PointGameSegmentation computes two categories of measures.
        One is based on thresholding. Here, ``threshold`` * max(``exp_array``) is used as the threshold. Based on this,
        precision, recall and F1 score are computed, *w.r.t.* ``seg_gt``. Another measure does not depend on the
        threshold. Here, the ROC AUC score and the Average Precision (both of them are imported from
        :py:mod:`sklearn.metrics`) are computed.

        Args:
            seg_gt (np.ndarray): the segmentation ground truth; only binary values are currently supported.
            exp_array (np.ndarray): the explanation result from an interpretation algorithm.
            threshold (float, optional): threshold for computing precision, recall and F1 score. Defaults to ``0.25``.

        Returns:
            dict: A dict containing ``precision``, ``recall``, ``f1_score`` and ``auc_score``, ``ap_score``, where the
            first three depend on the threshold and the last two do not.
        """
        # The segmentation mask is used directly as the ground truth.
        # Measures that depend on the threshold.
        precision, recall, f1_score = compute_scores_thresholding(seg_gt, exp_array, threshold)
        r = {'precision': precision, 'recall': recall, 'f1_score': f1_score}

        # Measures that are independent of the threshold.
        auc_score, ap_score = comptue_score_general(seg_gt, exp_array)
        r.update({'auc_score': auc_score, 'ap_score': ap_score})

        return r