Source code for ferret.evaluators.plausibility_measures

import numpy as np
from sklearn.metrics import auc, precision_recall_curve

from ..explainers.explanation import ExplanationWithRationale
from . import BaseEvaluator, EvaluationMetricFamily
from .evaluation import EvaluationMetricOutput
from .utils_from_soft_to_discrete import (
    get_discrete_explanation_topK,
    parse_evaluator_args,
)



[docs]
class AUPRC_PlausibilityEvaluation(BaseEvaluator):
    """Plausibility metric: Area Under the Precision-Recall Curve (AUPRC).

    Compares the soft (real-valued) token scores of an explanation with the
    human rationale through the precision-recall curve (metric from ERASER).
    """

    NAME = "AUPRC_soft_plausibility"
    SHORT_NAME = "auprc_plau"
    LOWER_IS_BETTER = False
    MAX_VALUE = 1.0
    MIN_VALUE = 0.0
    METRIC_FAMILY = EvaluationMetricFamily.PLAUSIBILITY

    def _compute_auprc_soft_scoring(self, true_rationale, soft_scores):
        """Return the area under the precision-recall curve.

        Args:
            true_rationale: ground-truth rationale labels, one per token.
            soft_scores: explanation scores, one per token.

        Returns:
            The value of ``auc(recall, precision)`` computed on the output of
            scikit-learn's ``precision_recall_curve``.
        """
        precision, recall, _ = precision_recall_curve(true_rationale, soft_scores)
        return auc(recall, precision)

    def compute_evaluation(
        self,
        explanation_with_rationale: ExplanationWithRationale,
        target=1,
        **evaluation_args
    ):
        """Evaluate an explanation on the Area Under the Precision-Recall Curve (AUPRC) Plausibility metric.

        Args:
            explanation_with_rationale (ExplanationWithRationale): the explanation
                to evaluate, carrying both the scores and the human rationale
            target: not used by this metric
            evaluation_args (dict): additional evaluation args, decoded by
                ``parse_evaluator_args``

        Returns:
            EvaluationMetricOutput : the AUPRC Plausibility score of the
            explanation, or None when the input is not an
            ExplanationWithRationale
        """
        # Plausibility - Area Under the Precision-Recall curve (AUPRC) - ERASER
        if not isinstance(explanation_with_rationale, ExplanationWithRationale):
            return None

        remove_first_last, only_pos, _, _ = parse_evaluator_args(evaluation_args)

        score_explanation = explanation_with_rationale.scores
        human_rationale = explanation_with_rationale.rationale

        if remove_first_last:
            # The rationale always loses its first and last entries; the scores
            # are trimmed only when the first token is the tokenizer's CLS token.
            # NOTE(review): presumably the scores may already exclude special
            # tokens in the other case - confirm against the explainers.
            human_rationale = human_rationale[1:-1]
            if self.tokenizer.cls_token == explanation_with_rationale.tokens[0]:
                score_explanation = score_explanation[1:-1]

        if only_pos:
            # Only positive terms of explanations: non-positive scores become 0.
            # https://github.com/hate-alert/HateXplain/blob/daa7955afbe796b00e79817f16621469a38820e0/testing_with_lime.py#L276
            score_explanation = [v if v > 0 else 0 for v in score_explanation]

        auprc_soft_plausibility = self._compute_auprc_soft_scoring(
            human_rationale, score_explanation
        )
        return EvaluationMetricOutput(self, auprc_soft_plausibility)




[docs]
class Tokenf1_PlausibilityEvaluation(BaseEvaluator):
    """Plausibility metric: token-level F1 on hard (discrete) rationales.

    The soft scores of an explanation are discretized with
    ``get_discrete_explanation_topK`` and compared with the human rationale.
    """

    NAME = "token_f1_hard_plausibility"
    SHORT_NAME = "token_f1_plau"
    METRIC_FAMILY = EvaluationMetricFamily.PLAUSIBILITY
    LOWER_IS_BETTER = False
    MIN_VALUE = 0.0
    MAX_VALUE = 1.0

    def _instance_tp_pos_pred_pos(self, true_expl, pred_expl):
        """Count true positives, positives and predicted positives of one instance.

        Args:
            true_expl: ground-truth mask, one entry per token
                (e.g. "i hate you" --> [0, 1, 1]).
            pred_expl: predicted mask with the same length.

        Returns:
            Tuple ``(tp, pos, pred_pos)``: size of the overlap, number of
            ground-truth positives, number of predicted positives.
        """
        true_expl = np.array(true_expl)
        pred_expl = np.array(pred_expl)

        assert true_expl.shape[0] == pred_expl.shape[0]

        tp = (true_expl & pred_expl).sum()
        pos = true_expl.sum()
        pred_pos = pred_expl.sum()

        # Alternative, in case the rationales are represented by positional ids
        # (e.g. "i hate you" --> [1, 2]): turn both into sets and use
        # len(true & pred), len(true), len(pred).
        return tp, pos, pred_pos

    def _precision_recall_fmeasure(self, tp, positive, pred_positive):
        """Return ``(precision, recall, f1)`` from raw counts.

        A zero denominator yields 0.0 instead of NaN / ZeroDivisionError,
        matching the ERASER reference implementation.
        """
        precision = tp / pred_positive if pred_positive > 0 else 0.0
        recall = tp / positive if positive > 0 else 0.0
        fmeasure = self._f1(precision, recall)
        return precision, recall, fmeasure

    def _f1(self, _p, _r):
        """Return the harmonic mean of precision and recall (0 if either is 0)."""
        if _p == 0 or _r == 0:
            return 0.0
        return 2 * _p * _r / (_p + _r)

    def _score_hard_rationale_predictions_dataset(self, list_true_expl, list_pred_expl):
        """Computes instance micro/macro averaged F1s over a list of instances.

        ERASER: https://github.com/jayded/eraserbenchmark/blob/36467f1662812cbd4fbdd66879946cd7338e08ec/rationale_benchmark/metrics.py#L168

        Each explanation is provided as one hot encoding --> True if the word is
        in the explanation, False otherwise (I hate you --> [0, 1, 1]),
        one for each instance.

        Returns:
            dict with keys "micro" and "macro", each a dict with keys
            "p", "r" and "f1".
        """
        tot_tp, tot_pos, tot_pred_pos = 0, 0, 0
        macro_prec_sum, macro_rec_sum, macro_f1_sum = 0, 0, 0

        for true_expl, pred_expl in zip(list_true_expl, list_pred_expl):
            tp, pos, pred_pos = self._instance_tp_pos_pred_pos(true_expl, pred_expl)

            instance_prec, instance_rec, instance_f1 = self._precision_recall_fmeasure(
                tp, pos, pred_pos
            )

            # Update for macro computation
            macro_prec_sum += instance_prec
            macro_rec_sum += instance_rec
            macro_f1_sum += instance_f1

            # Update for micro computation
            tot_tp += tp
            tot_pos += pos
            tot_pred_pos += pred_pos

        # Macro computation: average of the per-instance values
        n_explanations = len(list_true_expl)
        macro = {
            "p": macro_prec_sum / n_explanations,
            "r": macro_rec_sum / n_explanations,
            "f1": macro_f1_sum / n_explanations,
        }

        # Micro computation: values derived from the pooled counts
        micro_prec, micro_rec, micro_f1 = self._precision_recall_fmeasure(
            tot_tp, tot_pos, tot_pred_pos
        )
        micro = {"p": micro_prec, "r": micro_rec, "f1": micro_f1}

        return {"micro": micro, "macro": macro}

    def _score_hard_rationale_predictions_accumulate(self, true_expl, pred_expl):
        """Computes the per-instance terms needed for micro/macro averaged F1s.

        ERASER: https://github.com/jayded/eraserbenchmark/blob/36467f1662812cbd4fbdd66879946cd7338e08ec/rationale_benchmark/metrics.py#L168

        Each explanation is provided as one hot encoding --> True if the word is
        in the explanation, False otherwise (I hate you --> [0, 1, 1]).

        Returns:
            Tuple ``(precision, recall, f1, tp, pos, pred_pos)`` of the instance.
        """
        # Raw counts: summed across instances for the micro computation
        tp, pos, pred_pos = self._instance_tp_pos_pred_pos(true_expl, pred_expl)

        # Per-instance values: averaged across instances for the macro computation
        instance_prec, instance_rec, instance_f1 = self._precision_recall_fmeasure(
            tp, pos, pred_pos
        )
        return instance_prec, instance_rec, instance_f1, tp, pos, pred_pos

    def compute_evaluation(
        self,
        explanation_with_rationale: ExplanationWithRationale,
        target=1,
        **evaluation_args
    ):
        """Evaluate an explanation on the Token-f1 score Plausibility metric.

        Args:
            explanation_with_rationale (ExplanationWithRationale): the explanation
                to evaluate, carrying both the scores and the human rationale
            target: not used by this metric
            evaluation_args (dict): additional evaluation args, decoded by
                ``parse_evaluator_args``. When ``accumulate_result`` is truthy
                the output value is the array
                ``[tp, pos, pred_pos, precision, recall, f1]`` (the layout that
                ``aggregate_score`` unpacks) instead of the F1 alone.

        Returns:
            EvaluationMetricOutput : the Token-f1 Plausibility score of the
            explanation, or None when the input is not an
            ExplanationWithRationale
        """
        if not isinstance(explanation_with_rationale, ExplanationWithRationale):
            return None

        # Token F1 - hard rationale predictions. token-level F1 scores
        remove_first_last, only_pos, _, top_k_hard_rationale = parse_evaluator_args(
            evaluation_args
        )
        accumulate_result = evaluation_args.get("accumulate_result", False)

        score_explanation = explanation_with_rationale.scores
        human_rationale = explanation_with_rationale.rationale

        if remove_first_last:
            # The rationale always loses its first and last entries; the scores
            # are trimmed only when the first token is the tokenizer's CLS token.
            human_rationale = human_rationale[1:-1]
            if self.tokenizer.cls_token == explanation_with_rationale.tokens[0]:
                score_explanation = score_explanation[1:-1]

        topk_score_explanations = get_discrete_explanation_topK(
            score_explanation, top_k_hard_rationale, only_pos=only_pos
        )

        if topk_score_explanations is None:
            # No discrete explanation available: return default scores, using
            # the same container type as the regular path.
            default_value = np.zeros(6) if accumulate_result else 0
            return EvaluationMetricOutput(self, default_value)

        tp, pos, pred_pos = self._instance_tp_pos_pred_pos(
            human_rationale, topk_score_explanations
        )
        instance_prec, instance_rec, instance_f1_micro = self._precision_recall_fmeasure(
            tp, pos, pred_pos
        )

        if accumulate_result:
            output_score = np.array(
                [tp, pos, pred_pos, instance_prec, instance_rec, instance_f1_micro]
            )
            # Pass the evaluator itself, like every other output built in this module.
            return EvaluationMetricOutput(self, output_score)

        return EvaluationMetricOutput(self, instance_f1_micro)

    def aggregate_score(self, score, total, **aggregation_args):
        """Turn accumulated per-instance results into a single F1 value.

        Args:
            score: six accumulated values laid out as
                ``[tp, pos, pred_pos, precision, recall, f1]``.
            total: divisor of the macro average
                (presumably the number of accumulated instances - TODO confirm).
            aggregation_args: ``average`` selects "macro" (default) or "micro".

        Returns:
            The macro- or micro-averaged F1.

        Raises:
            ValueError: if ``average`` is neither "macro" nor "micro".
        """
        average = aggregation_args.get("average", "macro")
        (
            total_tp,
            total_pos,
            total_pred_pos,
            macro_prec_sum,
            macro_rec_sum,
            macro_f1_sum,
        ) = tuple(score)

        if average == "macro":
            # Macro computation: mean of the per-instance F1 values
            return macro_f1_sum / total

        if average == "micro":
            # Micro computation: F1 of the pooled counts
            _, _, micro_f1 = self._precision_recall_fmeasure(
                total_tp, total_pos, total_pred_pos
            )
            return micro_f1

        raise ValueError(
            f"Unknown average '{average}': expected 'macro' or 'micro'."
        )




[docs]
class TokenIOU_PlausibilityEvaluation(BaseEvaluator):
    """Plausibility metric: token-level Intersection Over Union (IOU).

    The soft scores of an explanation are discretized with
    ``get_discrete_explanation_topK`` and compared with the human rationale.
    """

    NAME = "token_IOU_hard_plausibility"
    SHORT_NAME = "token_iou_plau"
    METRIC_FAMILY = EvaluationMetricFamily.PLAUSIBILITY
    LOWER_IS_BETTER = False
    MIN_VALUE = 0.0
    MAX_VALUE = 1.0

    def _token_iou(self, true_expl, pred_expl):
        """Return the token-level IOU of two masks.

        From ERASER: we define IOU on a token level: for two spans, it is the
        size of the overlap of the tokens they cover divided by the size of
        their union.

        Args:
            true_expl: ground-truth mask, one entry per token.
            pred_expl: predicted mask with the same length.

        Returns:
            Overlap size divided by union size, or 0 when the union is empty.
        """
        # asarray accepts any sequence (list, tuple, ...) and leaves an existing
        # array untouched, so every supported input exposes `.shape`.
        true_expl = np.asarray(true_expl)
        pred_expl = np.asarray(pred_expl)

        assert true_expl.shape[0] == pred_expl.shape[0]

        num = (true_expl & pred_expl).sum()
        denom = (true_expl | pred_expl).sum()

        return 0 if denom == 0 else num / denom

    def compute_evaluation(
        self,
        explanation_with_rationale: ExplanationWithRationale,
        target=1,
        **evaluation_args
    ):
        """Evaluate an explanation on the Intersection Over Union (IOU) Plausibility metric.

        From ERASER: 'We define IOU on a token level: for two spans, it is the
        size of the overlap of the tokens they cover divided by the size of
        their union.'

        The human rationale is a one hot encoding; the soft scores of the
        explanation (one float per token) are first turned into a discrete
        explanation.

        Args:
            explanation_with_rationale (ExplanationWithRationale): the explanation
                to evaluate, carrying both the scores and the human rationale
            target: not used by this metric
            evaluation_args (dict): additional evaluation args, decoded by
                ``parse_evaluator_args``

        Returns:
            EvaluationMetricOutput : the IOU Plausibility score of the
            explanation, or None when the input is not an
            ExplanationWithRationale
        """
        if not isinstance(explanation_with_rationale, ExplanationWithRationale):
            return None

        remove_first_last, only_pos, _, top_k_hard_rationale = parse_evaluator_args(
            evaluation_args
        )

        score_explanation = explanation_with_rationale.scores
        human_rationale = explanation_with_rationale.rationale

        if remove_first_last:
            # The rationale always loses its first and last entries; the scores
            # are trimmed only when the first token is the tokenizer's CLS token.
            human_rationale = human_rationale[1:-1]
            if self.tokenizer.cls_token == explanation_with_rationale.tokens[0]:
                score_explanation = score_explanation[1:-1]

        topk_score_explanations = get_discrete_explanation_topK(
            score_explanation, top_k_hard_rationale, only_pos=only_pos
        )
        if topk_score_explanations is None:
            # No discrete explanation available: return default score
            return EvaluationMetricOutput(self, 0)

        token_iou = self._token_iou(human_rationale, topk_score_explanations)
        return EvaluationMetricOutput(self, token_iou)