Source code for ferret.datasets
"""Datasets API"""
from abc import ABC, abstractmethod
from typing import List
# Identifiers of the dataset splits; each constant's value is its own name.
# TEST_SET is the default ``split_type`` of the BaseDataset accessors.
TRAIN_SET, VALIDATION_SET, TEST_SET = (
    "TRAIN_SET",
    "VALIDATION_SET",
    "TEST_SET",
)
[docs]
class BaseDataset(ABC):
    """Abstract base class for datasets that are paired with a tokenizer.

    Concrete subclasses have to provide the ``NAME`` and
    ``avg_rationale_size`` properties as well as ``get_instance`` and the
    ``_get_*`` accessors. The only concrete helper defined here,
    ``get_true_rationale_from_words_to_tokens``, expands word-level rationale
    flags to token level using the stored tokenizer.
    """

    @property
    @abstractmethod
    def NAME(self):
        """Abstract property; every concrete dataset must define it."""

    @property
    @abstractmethod
    def avg_rationale_size(self):
        """Abstract property whose base implementation returns 5.

        Because it is declared abstract, subclasses still have to override
        it; the base body only supplies a default value.
        """
        # Default value
        return 5

    def __init__(self, tokenizer):
        """Store ``tokenizer`` on the instance.

        The tokenizer's ``encode`` method is called by
        ``get_true_rationale_from_words_to_tokens``.
        """
        self.tokenizer = tokenizer

    @abstractmethod
    def get_instance(self, idx: int, split_type: str = TEST_SET):
        """Abstract accessor for entry ``idx`` of ``split_type``.

        ``split_type`` defaults to ``TEST_SET``. The returned value is
        defined by the subclass.
        """

    @abstractmethod
    def _get_item(self, idx: int, split_type: str = TEST_SET):
        """Abstract hook taking ``idx`` and ``split_type`` (default ``TEST_SET``).

        Judging by the name, meant to fetch the raw item; subclasses decide.
        """

    @abstractmethod
    def _get_text(self, idx, split_type: str = TEST_SET):
        """Abstract hook taking ``idx`` and ``split_type`` (default ``TEST_SET``).

        Judging by the name, meant to fetch the item's text; subclasses decide.
        """

    @abstractmethod
    def _get_rationale(self, idx, split_type: str = TEST_SET):
        """Abstract hook taking ``idx`` and ``split_type`` (default ``TEST_SET``).

        Judging by the name, meant to fetch the item's rationale; subclasses
        decide.
        """

    @abstractmethod
    def _get_ground_truth(self, idx, split_type: str = TEST_SET):
        """Abstract hook taking ``idx`` and ``split_type`` (default ``TEST_SET``).

        Judging by the name, meant to fetch the item's label; subclasses decide.
        """

    def get_true_rationale_from_words_to_tokens(
        self, word_based_tokens: List[str], words_based_rationales: List[int]
    ) -> List[int]:
        """Expand word-level rationale flags to token level.

        Importance is typically annotated per word rather than per token, so
        each word is encoded on its own with ``self.tokenizer.encode`` and the
        word's flag is repeated once for every token id that is kept.

        Args:
            word_based_tokens: The words of the text.
            words_based_rationales: One flag per word (0 or 1) telling
                whether the word belongs to the rationale.

        Returns:
            A flat list holding, for every kept token of every word in order,
            the flag of the word the token came from.

        Note:
            The two inputs are paired with ``zip``, so surplus entries of the
            longer one are ignored.
        """
        return [
            word_flag
            for word, word_flag in zip(word_based_tokens, words_based_rationales)
            # The first and last ids of each encoding are discarded --
            # presumably the special tokens the tokenizer wraps around its
            # input; confirm for the tokenizer in use.
            for _ in self.tokenizer.encode(word)[1:-1]
        ]