| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | import json |
| | import os |
| | import sys |
| | import tempfile |
| | import unittest |
| | from pathlib import Path |
| | from shutil import copyfile |
| |
|
| | from huggingface_hub import snapshot_download, upload_folder |
| |
|
| | import transformers |
| | from transformers import ( |
| | CONFIG_MAPPING, |
| | FEATURE_EXTRACTOR_MAPPING, |
| | MODEL_FOR_AUDIO_TOKENIZATION_MAPPING, |
| | PROCESSOR_MAPPING, |
| | TOKENIZER_MAPPING, |
| | AutoConfig, |
| | AutoFeatureExtractor, |
| | AutoProcessor, |
| | AutoTokenizer, |
| | BaseVideoProcessor, |
| | BertTokenizer, |
| | CLIPImageProcessorFast, |
| | FeatureExtractionMixin, |
| | ImageProcessingMixin, |
| | LlamaTokenizer, |
| | LlavaOnevisionVideoProcessor, |
| | LlavaProcessor, |
| | ProcessorMixin, |
| | SiglipImageProcessor, |
| | SiglipImageProcessorFast, |
| | Wav2Vec2Config, |
| | Wav2Vec2FeatureExtractor, |
| | Wav2Vec2Processor, |
| | ) |
| | from transformers.models.auto.feature_extraction_auto import get_feature_extractor_config |
| | from transformers.models.auto.image_processing_auto import get_image_processor_config |
| | from transformers.models.auto.video_processing_auto import get_video_processor_config |
| | from transformers.testing_utils import TOKEN, TemporaryHubRepo, get_tests_dir, is_staging_test |
| | from transformers.tokenization_python import TOKENIZER_CONFIG_FILE |
| | from transformers.utils import ( |
| | FEATURE_EXTRACTOR_NAME, |
| | PROCESSOR_NAME, |
| | ) |
| |
|
| |
|
| | sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils")) |
| |
|
| | from test_module.custom_configuration import CustomConfig |
| | from test_module.custom_feature_extraction import CustomFeatureExtractor |
| | from test_module.custom_processing import CustomProcessor |
| | from test_module.custom_tokenization import CustomTokenizer |
| |
|
| |
|
# Paths to the shared test fixtures used throughout this module.
SAMPLE_PROCESSOR_CONFIG = get_tests_dir("fixtures/dummy_feature_extractor_config.json")
SAMPLE_VOCAB_LLAMA = get_tests_dir("fixtures/test_sentencepiece.model")
SAMPLE_VOCAB = get_tests_dir("fixtures/vocab.json")
SAMPLE_CONFIG = get_tests_dir("fixtures/config.json")
SAMPLE_PROCESSOR_CONFIG_DIR = get_tests_dir("fixtures")
| |
|
| |
|
class AutoFeatureExtractorTest(unittest.TestCase):
    """Tests for AutoProcessor resolution, registration, and (de)serialization."""

    # Minimal WordPiece-style vocabulary used when building custom tokenizers on the fly.
    vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "bla", "blou"]

    def setUp(self):
        # Make the remote-code confirmation prompt time out immediately so tests
        # that omit `trust_remote_code` fail fast instead of waiting for input.
        transformers.dynamic_module_utils.TIME_OUT_REMOTE_CODE = 0
| |
|
| | def test_processor_from_model_shortcut(self): |
| | processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") |
| | self.assertIsInstance(processor, Wav2Vec2Processor) |
| |
|
| | def test_processor_from_local_directory_from_repo(self): |
| | with tempfile.TemporaryDirectory() as tmpdirname: |
| | model_config = Wav2Vec2Config() |
| | processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") |
| |
|
| | |
| | model_config.save_pretrained(tmpdirname) |
| | processor.save_pretrained(tmpdirname) |
| |
|
| | processor = AutoProcessor.from_pretrained(tmpdirname) |
| |
|
| | self.assertIsInstance(processor, Wav2Vec2Processor) |
| |
|
| | def test_processor_from_local_subfolder_from_repo(self): |
| | with tempfile.TemporaryDirectory() as tmpdirname: |
| | processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") |
| | processor.save_pretrained(f"{tmpdirname}/processor_subfolder") |
| |
|
| | processor = Wav2Vec2Processor.from_pretrained(tmpdirname, subfolder="processor_subfolder") |
| |
|
| | self.assertIsInstance(processor, Wav2Vec2Processor) |
| |
|
| | def test_processor_from_local_directory_from_extractor_config(self): |
| | with tempfile.TemporaryDirectory() as tmpdirname: |
| | |
| | copyfile(SAMPLE_PROCESSOR_CONFIG, os.path.join(tmpdirname, FEATURE_EXTRACTOR_NAME)) |
| | copyfile(SAMPLE_VOCAB, os.path.join(tmpdirname, "vocab.json")) |
| | copyfile(SAMPLE_CONFIG, os.path.join(tmpdirname, "config.json")) |
| |
|
| | processor = AutoProcessor.from_pretrained(tmpdirname) |
| |
|
| | self.assertIsInstance(processor, Wav2Vec2Processor) |
| |
|
    def test_subcomponent_get_config_dict_saved_as_nested_config(self):
        """
        Tests that we can get config dict of a subcomponents of a processor,
        even if they were saved as nested dict in `processor_config.json`
        """
        # Feature extractor saved as part of a Wav2Vec2 processor.
        with tempfile.TemporaryDirectory() as tmpdirname:
            processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
            processor.save_pretrained(tmpdirname)

            # Loading via the standalone helper...
            config_dict_1 = get_feature_extractor_config(tmpdirname)
            feature_extractor_1 = Wav2Vec2FeatureExtractor(**config_dict_1)
            self.assertIsInstance(feature_extractor_1, Wav2Vec2FeatureExtractor)

            # ...and via the mixin classmethod must yield the same dict.
            config_dict_2, _ = FeatureExtractionMixin.get_feature_extractor_dict(tmpdirname)
            feature_extractor_2 = Wav2Vec2FeatureExtractor(**config_dict_2)
            self.assertIsInstance(feature_extractor_2, Wav2Vec2FeatureExtractor)
            self.assertEqual(config_dict_1, config_dict_2)

        # Same checks for the image and video processors of a multimodal processor.
        with tempfile.TemporaryDirectory() as tmpdirname:
            processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
            processor.save_pretrained(tmpdirname)

            config_dict_1 = get_image_processor_config(tmpdirname)
            image_processor_1 = SiglipImageProcessor(**config_dict_1)
            self.assertIsInstance(image_processor_1, SiglipImageProcessor)

            config_dict_2, _ = ImageProcessingMixin.get_image_processor_dict(tmpdirname)
            image_processor_2 = SiglipImageProcessor(**config_dict_2)
            self.assertIsInstance(image_processor_2, SiglipImageProcessor)
            self.assertEqual(config_dict_1, config_dict_2)

            config_dict_1 = get_video_processor_config(tmpdirname)
            video_processor_1 = LlavaOnevisionVideoProcessor(**config_dict_1)
            self.assertIsInstance(video_processor_1, LlavaOnevisionVideoProcessor)

            config_dict_2, _ = BaseVideoProcessor.get_video_processor_dict(tmpdirname)
            video_processor_2 = LlavaOnevisionVideoProcessor(**config_dict_2)
            self.assertIsInstance(video_processor_2, LlavaOnevisionVideoProcessor)
            self.assertEqual(config_dict_1, config_dict_2)
| |
|
    def test_processor_from_processor_class(self):
        """The ``processor_class`` entry in the processor config alone resolves the class."""
        with tempfile.TemporaryDirectory() as tmpdirname:
            feature_extractor = Wav2Vec2FeatureExtractor()
            tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")

            processor = Wav2Vec2Processor(feature_extractor, tokenizer)

            # Save in a new local folder.
            processor.save_pretrained(tmpdirname)

            if not os.path.isfile(os.path.join(tmpdirname, PROCESSOR_NAME)):
                # Create a minimal processor config by hand so this test's premise holds.
                config_dict = {"processor_class": "Wav2Vec2Processor"}
                with open(os.path.join(tmpdirname, PROCESSOR_NAME), "w") as fp:
                    json.dump(config_dict, fp)

            # Drop `processor_class` from the tokenizer config so that only the
            # processor config carries the class information.
            with open(os.path.join(tmpdirname, TOKENIZER_CONFIG_FILE)) as f:
                config_dict = json.load(f)
            config_dict.pop("processor_class")

            with open(os.path.join(tmpdirname, TOKENIZER_CONFIG_FILE), "w") as f:
                f.write(json.dumps(config_dict))

            processor = AutoProcessor.from_pretrained(tmpdirname)

            self.assertIsInstance(processor, Wav2Vec2Processor)
| |
|
    def test_processor_from_tokenizer_processor_class(self):
        """Without ``processor_class`` in the processor config, the tokenizer config's entry is used."""
        with tempfile.TemporaryDirectory() as tmpdirname:
            feature_extractor = Wav2Vec2FeatureExtractor()
            tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")

            processor = Wav2Vec2Processor(feature_extractor, tokenizer)

            # Save in a new local folder.
            processor.save_pretrained(tmpdirname)

            # Remove `processor_class` from the processor config so resolution has to
            # fall back to the class recorded in the tokenizer config.
            with open(os.path.join(tmpdirname, PROCESSOR_NAME)) as f:
                config_dict = json.load(f)
            config_dict.pop("processor_class")
            with open(os.path.join(tmpdirname, PROCESSOR_NAME), "w") as f:
                f.write(json.dumps(config_dict))

            processor = AutoProcessor.from_pretrained(tmpdirname)

            self.assertIsInstance(processor, Wav2Vec2Processor)
| |
|
| | def test_processor_from_local_directory_from_model_config(self): |
| | with tempfile.TemporaryDirectory() as tmpdirname: |
| | model_config = Wav2Vec2Config(processor_class="Wav2Vec2Processor") |
| | model_config.save_pretrained(tmpdirname) |
| | |
| | copyfile(SAMPLE_VOCAB, os.path.join(tmpdirname, "vocab.json")) |
| | |
| | with open(os.path.join(tmpdirname, FEATURE_EXTRACTOR_NAME), "w") as f: |
| | f.write("{}") |
| |
|
| | processor = AutoProcessor.from_pretrained(tmpdirname) |
| |
|
| | self.assertIsInstance(processor, Wav2Vec2Processor) |
| |
|
    def test_from_pretrained_dynamic_processor(self):
        """Dynamic (remote-code) processors only load when ``trust_remote_code=True``."""
        # Without trust_remote_code, loading must fail (the confirmation prompt
        # times out immediately — see setUp).
        with self.assertRaises(ValueError):
            processor = AutoProcessor.from_pretrained("hf-internal-testing/test_dynamic_processor_updated")
        # Explicitly disabling remote code must fail too.
        with self.assertRaises(ValueError):
            processor = AutoProcessor.from_pretrained(
                "hf-internal-testing/test_dynamic_processor_updated", trust_remote_code=False
            )

        processor = AutoProcessor.from_pretrained(
            "hf-internal-testing/test_dynamic_processor_updated", trust_remote_code=True
        )
        self.assertTrue(processor.special_attribute_present)
        self.assertEqual(processor.__class__.__name__, "NewProcessor")

        # The subcomponents come from the remote code as well.
        feature_extractor = processor.feature_extractor
        self.assertTrue(feature_extractor.special_attribute_present)
        self.assertEqual(feature_extractor.__class__.__name__, "NewFeatureExtractor")

        tokenizer = processor.tokenizer
        self.assertTrue(tokenizer.special_attribute_present)
        self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")

        # NOTE(review): `use_fast=False` is passed yet the expected class is
        # "NewTokenizerFast" — confirm this is intentional (e.g. the repo only
        # ships a fast tokenizer) rather than a leftover from an earlier
        # expectation of "NewTokenizer".
        new_processor = AutoProcessor.from_pretrained(
            "hf-internal-testing/test_dynamic_processor", trust_remote_code=True, use_fast=False
        )
        new_tokenizer = new_processor.tokenizer
        self.assertTrue(new_tokenizer.special_attribute_present)
        self.assertEqual(new_tokenizer.__class__.__name__, "NewTokenizerFast")
| |
|
    def test_new_processor_registration(self):
        """Custom config/extractor/tokenizer/processor classes can be registered and round-tripped."""
        try:
            AutoConfig.register("custom", CustomConfig)
            AutoFeatureExtractor.register(CustomConfig, CustomFeatureExtractor)
            AutoTokenizer.register(CustomConfig, slow_tokenizer_class=CustomTokenizer)
            AutoProcessor.register(CustomConfig, CustomProcessor)
            # Registering over a class already mapped by the library must raise.
            with self.assertRaises(ValueError):
                AutoProcessor.register(Wav2Vec2Config, Wav2Vec2Processor)

            # Once registered, the custom classes behave like built-in ones.
            feature_extractor = CustomFeatureExtractor.from_pretrained(SAMPLE_PROCESSOR_CONFIG_DIR)

            with tempfile.TemporaryDirectory() as tmp_dir:
                vocab_file = os.path.join(tmp_dir, "vocab.txt")
                with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
                    vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
                tokenizer = CustomTokenizer(vocab_file)

            processor = CustomProcessor(feature_extractor, tokenizer)

            with tempfile.TemporaryDirectory() as tmp_dir:
                processor.save_pretrained(tmp_dir)
                new_processor = AutoProcessor.from_pretrained(tmp_dir)
                self.assertIsInstance(new_processor, CustomProcessor)

        finally:
            # Undo the registrations so other tests see pristine auto mappings.
            if "custom" in CONFIG_MAPPING._extra_content:
                del CONFIG_MAPPING._extra_content["custom"]
            if CustomConfig in FEATURE_EXTRACTOR_MAPPING._extra_content:
                del FEATURE_EXTRACTOR_MAPPING._extra_content[CustomConfig]
            if CustomConfig in TOKENIZER_MAPPING._extra_content:
                del TOKENIZER_MAPPING._extra_content[CustomConfig]
            if CustomConfig in PROCESSOR_MAPPING._extra_content:
                del PROCESSOR_MAPPING._extra_content[CustomConfig]
            if CustomConfig in MODEL_FOR_AUDIO_TOKENIZATION_MAPPING._extra_content:
                del MODEL_FOR_AUDIO_TOKENIZATION_MAPPING._extra_content[CustomConfig]
| |
|
    def test_from_pretrained_dynamic_processor_conflict(self):
        """Locally registered classes win over remote code unless ``trust_remote_code=True``."""

        class NewFeatureExtractor(Wav2Vec2FeatureExtractor):
            # Distinguishes the local class (False) from the remote one (True).
            special_attribute_present = False

        class NewTokenizer(BertTokenizer):
            special_attribute_present = False

        class NewProcessor(ProcessorMixin):
            special_attribute_present = False

            def __init__(self, feature_extractor, tokenizer):
                super().__init__(feature_extractor, tokenizer)

        try:
            AutoConfig.register("custom", CustomConfig)
            AutoFeatureExtractor.register(CustomConfig, NewFeatureExtractor)
            AutoTokenizer.register(CustomConfig, slow_tokenizer_class=NewTokenizer)
            AutoProcessor.register(CustomConfig, NewProcessor)
            # Default (remote code unset): the local classes are used.
            processor = AutoProcessor.from_pretrained("hf-internal-testing/test_dynamic_processor_updated")
            self.assertEqual(processor.__class__.__name__, "NewProcessor")
            self.assertFalse(processor.special_attribute_present)
            self.assertFalse(processor.feature_extractor.special_attribute_present)
            self.assertFalse(processor.tokenizer.special_attribute_present)

            # Remote code explicitly disabled: still the local classes.
            processor = AutoProcessor.from_pretrained(
                "hf-internal-testing/test_dynamic_processor_updated", trust_remote_code=False
            )
            self.assertEqual(processor.__class__.__name__, "NewProcessor")
            self.assertFalse(processor.special_attribute_present)
            self.assertFalse(processor.feature_extractor.special_attribute_present)
            self.assertFalse(processor.tokenizer.special_attribute_present)

            # Remote code enabled: the hub classes (with the special attribute) win.
            processor = AutoProcessor.from_pretrained(
                "hf-internal-testing/test_dynamic_processor_updated", trust_remote_code=True
            )
            self.assertEqual(processor.__class__.__name__, "NewProcessor")
            self.assertTrue(processor.special_attribute_present)
            self.assertTrue(processor.feature_extractor.special_attribute_present)
            self.assertTrue(processor.tokenizer.special_attribute_present)

        finally:
            # Undo the registrations so other tests see pristine auto mappings.
            if "custom" in CONFIG_MAPPING._extra_content:
                del CONFIG_MAPPING._extra_content["custom"]
            if CustomConfig in FEATURE_EXTRACTOR_MAPPING._extra_content:
                del FEATURE_EXTRACTOR_MAPPING._extra_content[CustomConfig]
            if CustomConfig in TOKENIZER_MAPPING._extra_content:
                del TOKENIZER_MAPPING._extra_content[CustomConfig]
            if CustomConfig in PROCESSOR_MAPPING._extra_content:
                del PROCESSOR_MAPPING._extra_content[CustomConfig]
            if CustomConfig in MODEL_FOR_AUDIO_TOKENIZATION_MAPPING._extra_content:
                del MODEL_FOR_AUDIO_TOKENIZATION_MAPPING._extra_content[CustomConfig]
| |
|
    def test_from_pretrained_dynamic_processor_with_extra_attributes(self):
        """Extra kwargs passed to ``from_pretrained`` are forwarded to the processor ``__init__``."""

        class NewFeatureExtractor(Wav2Vec2FeatureExtractor):
            pass

        class NewTokenizer(BertTokenizer):
            pass

        class NewProcessor(ProcessorMixin):
            def __init__(self, feature_extractor, tokenizer, processor_attr_1=1, processor_attr_2=True):
                super().__init__(feature_extractor, tokenizer)

                # Extra attributes configurable at load time.
                self.processor_attr_1 = processor_attr_1
                self.processor_attr_2 = processor_attr_2

        try:
            AutoConfig.register("custom", CustomConfig)
            AutoFeatureExtractor.register(CustomConfig, NewFeatureExtractor)
            AutoTokenizer.register(CustomConfig, slow_tokenizer_class=NewTokenizer)
            AutoProcessor.register(CustomConfig, NewProcessor)
            # Override one extra attribute at load time; the other keeps its default.
            processor = AutoProcessor.from_pretrained(
                "hf-internal-testing/test_dynamic_processor_updated", processor_attr_2=False
            )
            self.assertEqual(processor.__class__.__name__, "NewProcessor")
            self.assertEqual(processor.processor_attr_1, 1)
            self.assertEqual(processor.processor_attr_2, False)
        finally:
            # Undo the registrations so other tests see pristine auto mappings.
            if "custom" in CONFIG_MAPPING._extra_content:
                del CONFIG_MAPPING._extra_content["custom"]
            if CustomConfig in FEATURE_EXTRACTOR_MAPPING._extra_content:
                del FEATURE_EXTRACTOR_MAPPING._extra_content[CustomConfig]
            if CustomConfig in TOKENIZER_MAPPING._extra_content:
                del TOKENIZER_MAPPING._extra_content[CustomConfig]
            if CustomConfig in PROCESSOR_MAPPING._extra_content:
                del PROCESSOR_MAPPING._extra_content[CustomConfig]
            if CustomConfig in MODEL_FOR_AUDIO_TOKENIZATION_MAPPING._extra_content:
                del MODEL_FOR_AUDIO_TOKENIZATION_MAPPING._extra_content[CustomConfig]
| |
|
    def test_dynamic_processor_with_specific_dynamic_subcomponents(self):
        """A locally registered processor class resolves even when the repo ships dynamic subcomponents."""

        class NewFeatureExtractor(Wav2Vec2FeatureExtractor):
            pass

        class NewTokenizer(BertTokenizer):
            pass

        class NewProcessor(ProcessorMixin):
            def __init__(self, feature_extractor, tokenizer):
                super().__init__(feature_extractor, tokenizer)

        try:
            AutoConfig.register("custom", CustomConfig)
            AutoFeatureExtractor.register(CustomConfig, NewFeatureExtractor)
            AutoTokenizer.register(CustomConfig, slow_tokenizer_class=NewTokenizer)
            AutoProcessor.register(CustomConfig, NewProcessor)
            # Resolution should pick the registered local processor class.
            processor = AutoProcessor.from_pretrained(
                "hf-internal-testing/test_dynamic_processor_updated",
            )
            self.assertEqual(processor.__class__.__name__, "NewProcessor")
        finally:
            # Undo the registrations so other tests see pristine auto mappings.
            if "custom" in CONFIG_MAPPING._extra_content:
                del CONFIG_MAPPING._extra_content["custom"]
            if CustomConfig in FEATURE_EXTRACTOR_MAPPING._extra_content:
                del FEATURE_EXTRACTOR_MAPPING._extra_content[CustomConfig]
            if CustomConfig in TOKENIZER_MAPPING._extra_content:
                del TOKENIZER_MAPPING._extra_content[CustomConfig]
            if CustomConfig in PROCESSOR_MAPPING._extra_content:
                del PROCESSOR_MAPPING._extra_content[CustomConfig]
            if CustomConfig in MODEL_FOR_AUDIO_TOKENIZATION_MAPPING._extra_content:
                del MODEL_FOR_AUDIO_TOKENIZATION_MAPPING._extra_content[CustomConfig]
| |
|
| | def test_auto_processor_creates_tokenizer(self): |
| | processor = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-bert") |
| | self.assertEqual(processor.__class__.__name__, "BertTokenizer") |
| |
|
| | def test_auto_processor_creates_image_processor(self): |
| | processor = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-convnext") |
| | self.assertEqual(processor.__class__.__name__, "ConvNextImageProcessor") |
| |
|
| | def test_auto_processor_save_load(self): |
| | processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf") |
| | with tempfile.TemporaryDirectory() as tmp_dir: |
| | processor.save_pretrained(tmp_dir) |
| | second_processor = AutoProcessor.from_pretrained(tmp_dir) |
| | self.assertEqual(second_processor.__class__.__name__, processor.__class__.__name__) |
| |
|
    def test_processor_with_multiple_tokenizers_save_load(self):
        """Test that processors with multiple tokenizers save and load correctly."""

        class DualTokenizerProcessor(ProcessorMixin):
            """A processor with two tokenizers and an image processor."""

            def __init__(self, tokenizer, decoder_tokenizer, image_processor):
                super().__init__(tokenizer, decoder_tokenizer, image_processor)

        # Build the three subcomponents.
        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertForMaskedLM")
        decoder_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
        image_processor = SiglipImageProcessor()

        processor = DualTokenizerProcessor(
            tokenizer=tokenizer,
            decoder_tokenizer=decoder_tokenizer,
            image_processor=image_processor,
        )

        with tempfile.TemporaryDirectory() as tmp_dir:
            processor.save_pretrained(tmp_dir)

            # The primary tokenizer is saved at the root, the secondary one in its own subfolder.
            self.assertTrue(os.path.exists(os.path.join(tmp_dir, "tokenizer_config.json")))
            self.assertTrue(os.path.isdir(os.path.join(tmp_dir, "decoder_tokenizer")))
            self.assertTrue(os.path.exists(os.path.join(tmp_dir, "decoder_tokenizer", "tokenizer_config.json")))

            # Tokenizers are not serialized into the processor config; the image processor is.
            with open(os.path.join(tmp_dir, "processor_config.json")) as f:
                processor_config = json.load(f)
            self.assertIn("image_processor", processor_config)
            self.assertNotIn("tokenizer", processor_config)
            self.assertNotIn("decoder_tokenizer", processor_config)

            loaded_processor = DualTokenizerProcessor.from_pretrained(tmp_dir)

            # All three subcomponents are restored...
            self.assertTrue(hasattr(loaded_processor, "tokenizer"))
            self.assertTrue(hasattr(loaded_processor, "decoder_tokenizer"))
            self.assertTrue(hasattr(loaded_processor, "image_processor"))

            # ...the tokenizers with their vocabularies intact...
            self.assertEqual(loaded_processor.tokenizer.vocab_size, tokenizer.vocab_size)
            self.assertEqual(loaded_processor.decoder_tokenizer.vocab_size, decoder_tokenizer.vocab_size)

            # ...and the image processor with its configuration intact.
            self.assertEqual(loaded_processor.image_processor.size, image_processor.size)
| |
|
    def test_processor_with_multiple_image_processors_save_load(self):
        """Test that processors with multiple image processors save and load correctly."""

        class DualImageProcessorProcessor(ProcessorMixin):
            """A processor with two image processors and a tokenizer."""

            def __init__(self, tokenizer, image_processor, encoder_image_processor):
                super().__init__(tokenizer, image_processor, encoder_image_processor)

        # Build the subcomponents with distinct sizes so we can tell them apart after reload.
        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertForMaskedLM")
        image_processor = SiglipImageProcessorFast(size={"height": 224, "width": 224})
        encoder_image_processor = CLIPImageProcessorFast(size={"height": 384, "width": 384})

        processor = DualImageProcessorProcessor(
            tokenizer=tokenizer,
            image_processor=image_processor,
            encoder_image_processor=encoder_image_processor,
        )

        with tempfile.TemporaryDirectory() as tmp_dir:
            processor.save_pretrained(tmp_dir)

            # Both image processors are nested in the processor config; the tokenizer is not.
            with open(os.path.join(tmp_dir, "processor_config.json")) as f:
                processor_config = json.load(f)
            self.assertIn("image_processor", processor_config)
            self.assertIn("encoder_image_processor", processor_config)
            self.assertNotIn("tokenizer", processor_config)

            # Each nested entry records its concrete image-processor class...
            self.assertIn("image_processor_type", processor_config["image_processor"])
            self.assertIn("image_processor_type", processor_config["encoder_image_processor"])
            self.assertEqual(processor_config["image_processor"]["image_processor_type"], "SiglipImageProcessorFast")
            self.assertEqual(
                processor_config["encoder_image_processor"]["image_processor_type"], "CLIPImageProcessorFast"
            )

            # ...and its own configuration.
            self.assertEqual(processor_config["image_processor"]["size"], {"height": 224, "width": 224})
            self.assertEqual(processor_config["encoder_image_processor"]["size"], {"height": 384, "width": 384})

            loaded_processor = DualImageProcessorProcessor.from_pretrained(tmp_dir)

            # All three subcomponents are restored...
            self.assertTrue(hasattr(loaded_processor, "tokenizer"))
            self.assertTrue(hasattr(loaded_processor, "image_processor"))
            self.assertTrue(hasattr(loaded_processor, "encoder_image_processor"))

            # ...the tokenizer with its vocabulary intact...
            self.assertEqual(loaded_processor.tokenizer.vocab_size, tokenizer.vocab_size)

            # ...each image processor with its own size...
            self.assertEqual(loaded_processor.image_processor.size, {"height": 224, "width": 224})
            self.assertEqual(loaded_processor.encoder_image_processor.size, {"height": 384, "width": 384})

            # ...and as the correct concrete classes.
            self.assertIsInstance(loaded_processor.image_processor, SiglipImageProcessorFast)
            self.assertIsInstance(loaded_processor.encoder_image_processor, CLIPImageProcessorFast)
| |
|
| |
|
@is_staging_test
class ProcessorPushToHubTester(unittest.TestCase):
    """Staging-endpoint tests for pushing processors to the Hub."""

    # Minimal vocabulary used when building a custom tokenizer on the fly.
    vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "bla", "blou"]

    @classmethod
    def setUpClass(cls):
        # Staging-endpoint auth token, shared by every test in this class.
        cls._token = TOKEN
| |
|
| | def test_push_to_hub_via_save_pretrained(self): |
| | with TemporaryHubRepo(token=self._token) as tmp_repo: |
| | processor = Wav2Vec2Processor.from_pretrained(SAMPLE_PROCESSOR_CONFIG_DIR) |
| | |
| | with tempfile.TemporaryDirectory() as tmp_dir: |
| | processor.save_pretrained(tmp_dir, repo_id=tmp_repo.repo_id, push_to_hub=True, token=self._token) |
| |
|
| | new_processor = Wav2Vec2Processor.from_pretrained(tmp_repo.repo_id) |
| | for k, v in processor.feature_extractor.__dict__.items(): |
| | self.assertEqual(v, getattr(new_processor.feature_extractor, k)) |
| | self.assertDictEqual(new_processor.tokenizer.get_vocab(), processor.tokenizer.get_vocab()) |
| |
|
| | def test_push_to_hub_in_organization_via_save_pretrained(self): |
| | with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo: |
| | processor = Wav2Vec2Processor.from_pretrained(SAMPLE_PROCESSOR_CONFIG_DIR) |
| | |
| | with tempfile.TemporaryDirectory() as tmp_dir: |
| | processor.save_pretrained( |
| | tmp_dir, |
| | repo_id=tmp_repo.repo_id, |
| | push_to_hub=True, |
| | token=self._token, |
| | ) |
| |
|
| | new_processor = Wav2Vec2Processor.from_pretrained(tmp_repo.repo_id) |
| | for k, v in processor.feature_extractor.__dict__.items(): |
| | self.assertEqual(v, getattr(new_processor.feature_extractor, k)) |
| | self.assertDictEqual(new_processor.tokenizer.get_vocab(), processor.tokenizer.get_vocab()) |
| |
|
    def test_push_to_hub_dynamic_processor(self):
        """Processors registered for auto-class export push their custom code files alongside configs."""
        with TemporaryHubRepo(token=self._token) as tmp_repo:
            CustomFeatureExtractor.register_for_auto_class()
            CustomTokenizer.register_for_auto_class()
            CustomProcessor.register_for_auto_class()

            feature_extractor = CustomFeatureExtractor.from_pretrained(SAMPLE_PROCESSOR_CONFIG_DIR)

            with tempfile.TemporaryDirectory() as tmp_dir:
                vocab_file = os.path.join(tmp_dir, "vocab.txt")
                with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
                    vocab_writer.write("".join([x + "\n" for x in self.vocab_tokens]))
                tokenizer = CustomTokenizer(vocab_file)

            processor = CustomProcessor(feature_extractor, tokenizer)

            with tempfile.TemporaryDirectory() as tmp_dir:
                # NOTE(review): return value unused — presumably this just checks the
                # repo exists / warms the cache; confirm it is still needed.
                snapshot_download(tmp_repo.repo_id, token=self._token)
                processor.save_pretrained(tmp_dir)

                # Saving added the proper auto_map field to the feature extractor config.
                self.assertDictEqual(
                    processor.feature_extractor.auto_map,
                    {
                        "AutoFeatureExtractor": "custom_feature_extraction.CustomFeatureExtractor",
                        "AutoProcessor": "custom_processing.CustomProcessor",
                    },
                )

                # Saving added the proper auto_map field to the tokenizer config.
                with open(os.path.join(tmp_dir, "tokenizer_config.json")) as f:
                    tokenizer_config = json.load(f)
                self.assertDictEqual(
                    tokenizer_config["auto_map"],
                    {
                        "AutoTokenizer": ["custom_tokenization.CustomTokenizer", None],
                        "AutoProcessor": "custom_processing.CustomProcessor",
                    },
                )

                # The code of the custom classes was copied next to the configs.
                self.assertTrue(os.path.isfile(os.path.join(tmp_dir, "custom_feature_extraction.py")))
                self.assertTrue(os.path.isfile(os.path.join(tmp_dir, "custom_tokenization.py")))
                self.assertTrue(os.path.isfile(os.path.join(tmp_dir, "custom_processing.py")))

                upload_folder(repo_id=tmp_repo.repo_id, folder_path=tmp_dir, token=self._token)

            new_processor = AutoProcessor.from_pretrained(tmp_repo.repo_id, trust_remote_code=True)
            # Compare by name: the reloaded class comes from a dynamic module, so an
            # isinstance check against the local CustomProcessor would fail.
            self.assertEqual(new_processor.__class__.__name__, "CustomProcessor")
| |
|
    def test_push_to_hub_with_chat_templates(self):
        """Chat templates survive a push/reload round trip, both as a string and as a dict."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            tokenizer = LlamaTokenizer.from_pretrained(SAMPLE_VOCAB_LLAMA)
            image_processor = SiglipImageProcessor()
            chat_template = "default dummy template for testing purposes only"
            processor = LlavaProcessor(
                tokenizer=tokenizer, image_processor=image_processor, chat_template=chat_template
            )
            self.assertEqual(processor.chat_template, chat_template)
            with TemporaryHubRepo(token=self._token) as tmp_repo:
                processor.save_pretrained(tmp_dir, repo_id=tmp_repo.repo_id, token=self._token, push_to_hub=True)
                reloaded_processor = LlavaProcessor.from_pretrained(tmp_repo.repo_id)
                self.assertEqual(processor.chat_template, reloaded_processor.chat_template)
                # After reload, the processor's template must match its tokenizer's.
                self.assertEqual(reloaded_processor.chat_template, reloaded_processor.tokenizer.chat_template)

            with TemporaryHubRepo(token=self._token) as tmp_repo:
                # A dict of named templates round-trips the same way.
                processor.chat_template = {"default": "a", "secondary": "b"}
                processor.save_pretrained(tmp_dir, repo_id=tmp_repo.repo_id, token=self._token, push_to_hub=True)
                reloaded_processor = LlavaProcessor.from_pretrained(tmp_repo.repo_id)
                self.assertEqual(processor.chat_template, reloaded_processor.chat_template)
                # Dict templates must also propagate to the tokenizer on reload.
                self.assertEqual(reloaded_processor.chat_template, reloaded_processor.tokenizer.chat_template)
| |
|