Mixtral-QLoRA-test / tests /test_data_collator_completion_only.py

Upload folder using huggingface_hub

fa4458a about 2 years ago

4.24 kB

	# Copyright 2023 The HuggingFace Team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	import unittest

	import torch
	from transformers import AutoTokenizer

	from trl import DataCollatorForCompletionOnlyLM


	class DataCollatorForCompletionOnlyLMTester(unittest.TestCase):
	def test_data_collator_finds_response_template_llama2_tokenizer(self):
	# this should ideally be tested with meta-llama/Llama-2-7b-hf
	self.tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/dummy-GPT2-correct-vocab")
	self.instruction = """### System: You are a helpful assistant.

	### User: How much is 2+2?

	### Assistant: 2+2 equals 4"""
	self.instruction_template = "\n### User:"
	self.response_template = "\n### Assistant:"

	# GPT2Tokenizer: [198, 21017, 11787, 25] -> [11787, 25]
	# Llama2Tokenizer: [29871, 13, 2277, 29937, 4911, 29901] -> [2277, 29937, 4911, 29901]
	self.tokenized_instruction_w_context = self.tokenizer.encode(
	self.instruction_template, add_special_tokens=False
	)[2:]

	# GPT2Tokenizer: [198, 21017, 15286, 25] -> [15286, 25]
	# Llama2Tokenizer: [29871, 13, 2277, 29937, 4007, 22137, 29901] -> [2277, 29937, 4007, 22137, 29901]
	self.tokenized_response_w_context = self.tokenizer.encode(self.response_template, add_special_tokens=False)[2:]

	# Plain check on string
	self.assertIn(self.response_template, self.instruction)
	self.tokenized_instruction = self.tokenizer.encode(self.instruction, add_special_tokens=False)

	# Test the fix for #598
	# Pass already tokenized (w context) and truncated response_template so token_ids are like in the instruction + response
	self.collator = DataCollatorForCompletionOnlyLM(self.tokenized_response_w_context, tokenizer=self.tokenizer)
	self.collator.torch_call([self.tokenized_instruction])

	# Test for PR #749
	# Pass already tokenized (w context) instruction and response both so token_ids are like in the instruction + response
	self.collator = DataCollatorForCompletionOnlyLM(
	self.tokenized_response_w_context, self.tokenized_instruction_w_context, tokenizer=self.tokenizer
	)
	self.collator.torch_call([self.tokenized_instruction])

	def test_data_collator_handling_of_long_sequences(self):
	self.tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/dummy-GPT2-correct-vocab")
	self.instruction = """### System: You are a helpful assistant.

	### User: How much is 2+2? I'm asking because I'm not sure. And I'm not sure because I'm not good at math.
	"""
	self.response_template = "\n### Assistant:"
	# check DataCollatorForCompletionOnlyLM using response template only
	self.tokenized_instruction = self.tokenizer.encode(self.instruction, add_special_tokens=False)
	self.collator = DataCollatorForCompletionOnlyLM(self.response_template, tokenizer=self.tokenizer)
	encoded_instance = self.collator.torch_call([self.tokenized_instruction])
	result = torch.all(encoded_instance["labels"] == -100)
	self.assertTrue(result, "Not all values in the tensor are -100.")

	# check DataCollatorForCompletionOnlyLM using response template and instruction template
	self.instruction_template = "\n### User:"
	self.collator = DataCollatorForCompletionOnlyLM(
	self.response_template, self.instruction_template, tokenizer=self.tokenizer
	)
	encoded_instance = self.collator.torch_call([self.tokenized_instruction])
	result = torch.all(encoded_instance["labels"] == -100)
	self.assertTrue(result, "Not all values in the tensor are -100.")