# Tests for autoprompt.utils: label encoding, trigger templatizing, and batch collation.
| from unittest import TestCase | |
| import torch | |
| from torch.utils.data import DataLoader | |
| from transformers import AutoConfig, AutoTokenizer | |
| import autoprompt.utils as utils | |
class TestEncodeLabel(TestCase):
    """Checks that utils.encode_label maps token strings to id tensors."""

    def setUp(self):
        self._tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

    def test_single_token(self):
        # A single token string should yield a 1x1 tensor of its id.
        expected = torch.tensor([self._tokenizer.convert_tokens_to_ids(['the'])])
        encoded = utils.encode_label(self._tokenizer, 'the')
        assert torch.equal(encoded, expected)

    def test_multiple_tokens(self):
        # A list of tokens should yield one row containing all of their ids.
        tokens = ['a', 'the']
        expected = torch.tensor([self._tokenizer.convert_tokens_to_ids(tokens)])
        encoded = utils.encode_label(self._tokenizer, tokens)
        assert torch.equal(encoded, expected)
class TestTriggerTemplatizer(TestCase):
    """Tests utils.TriggerTemplatizer output (labels, custom masks, [P] handling)
    for both BERT- and RoBERTa-style tokenizers.
    """

    def setUp(self):
        self.default_template = '[T] [T] {arbitrary} [T] {fields} [P]'
        self.default_config = AutoConfig.from_pretrained('bert-base-cased')
        self.default_tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
        utils.add_task_specific_tokens(self.default_tokenizer)
        self.default_instance = {
            'arbitrary': 'does this',
            'fields': 'work',
            'label': 'and'
        }

    def _check_templatizer(self, config, tokenizer, expect_token_type_ids):
        """Templatize the default instance and verify the label id, model input
        keys, trigger/predict masks, and [MASK] substitution for [P].

        The two public tests only differ in the model family and in whether
        `token_type_ids` is expected, so the shared assertions live here.
        """
        templatizer = utils.TriggerTemplatizer(
            self.default_template,
            config,
            tokenizer,
            add_special_tokens=False
        )
        model_inputs, label = templatizer(self.default_instance)
        # Label should be mapped to its token id
        expected_label = torch.tensor(
            [tokenizer.convert_tokens_to_ids([self.default_instance['label']])]
        )
        assert torch.equal(expected_label, label)
        # Output is expected to have the standard model input keys
        assert 'input_ids' in model_inputs
        assert 'attention_mask' in model_inputs
        if expect_token_type_ids:
            assert 'token_type_ids' in model_inputs
        # Test that the custom masks match our expectations: the template has
        # three [T] trigger slots and one [P] predict slot over 7 tokens.
        expected_trigger_mask = torch.tensor(
            [[True, True, False, False, True, False, False]]
        )
        assert torch.equal(expected_trigger_mask, model_inputs['trigger_mask'])
        expected_predict_mask = torch.tensor(
            [[False, False, False, False, False, False, True]]
        )
        assert torch.equal(expected_predict_mask, model_inputs['predict_mask'])
        # Lastly, ensure [P] is replaced by a [MASK] token
        input_ids = model_inputs['input_ids']
        predict_mask = model_inputs['predict_mask']
        predict_token_id = input_ids[predict_mask].squeeze().item()
        assert predict_token_id == tokenizer.mask_token_id

    def test_bert(self):
        # BERT inputs additionally carry token_type_ids.
        self._check_templatizer(
            self.default_config,
            self.default_tokenizer,
            expect_token_type_ids=True,
        )

    def test_roberta(self):
        # RoBERTa has no token_type_ids; everything else matches BERT.
        config = AutoConfig.from_pretrained('roberta-base')
        tokenizer = AutoTokenizer.from_pretrained('roberta-base')
        utils.add_task_specific_tokens(tokenizer)
        self._check_templatizer(config, tokenizer, expect_token_type_ids=False)
class TestCollator(TestCase):
    """Checks that utils.Collator pads and batches templatized instances."""

    def test_collator(self):
        template = '[T] [T] {arbitrary} [T] {fields} [P]'
        tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
        config = AutoConfig.from_pretrained('bert-base-cased')
        utils.add_task_specific_tokens(tokenizer)
        templatizer = utils.TriggerTemplatizer(
            template,
            config,
            tokenizer,
            add_special_tokens=False
        )
        collator = utils.Collator(pad_token_id=tokenizer.pad_token_id)

        # Two instances of different lengths, so padding is exercised.
        instances = [
            {'arbitrary': 'a', 'fields': 'the', 'label': 'hot'},
            {'arbitrary': 'a a', 'fields': 'the the', 'label': 'cold'}
        ]
        features = [templatizer(instance) for instance in instances]
        batch_iter = iter(DataLoader(
            features,
            batch_size=2,
            shuffle=False,
            collate_fn=collator
        ))
        model_inputs, labels = next(batch_iter)

        # Check results match our expectations
        expected_labels = torch.tensor([
            tokenizer.encode('hot', add_special_tokens=False, add_prefix_space=True),
            tokenizer.encode('cold', add_special_tokens=False, add_prefix_space=True),
        ])
        assert torch.equal(expected_labels, labels)

        # Trigger positions shift with the length of each instance's fields.
        expected_trigger_mask = torch.tensor([
            [True, True, False, True, False, False, False, False],
            [True, True, False, False, True, False, False, False],
        ])
        assert torch.equal(expected_trigger_mask, model_inputs['trigger_mask'])

        expected_predict_mask = torch.tensor([
            [False, False, False, False, False, True, False, False],
            [False, False, False, False, False, False, False, True],
        ])
        assert torch.equal(expected_predict_mask, model_inputs['predict_mask'])