Wi11Chan committed on
Commit
5c65cea
·
1 Parent(s): c95b30b

Upload 3 files

Browse files
feature_extraction_vit.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2021 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Feature extractor class for ViT."""
16
+
17
+ import warnings
18
+
19
+ from transformers.utils import logging
20
+ from .image_processing_vit import ViTImageProcessor
21
+
22
+
23
+ logger = logging.get_logger(__name__)
24
+
25
+
26
class ViTFeatureExtractor(ViTImageProcessor):
    r"""
    Deprecated alias of [`ViTImageProcessor`], kept for backward compatibility.

    Instantiating this class emits a `FutureWarning`; in every other respect it
    behaves exactly like `ViTImageProcessor`.
    """

    def __init__(self, *args, **kwargs) -> None:
        # Emit the deprecation notice before delegating so callers see it even
        # if the parent constructor raises.
        warnings.warn(
            "The class ViTFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please"
            " use ViTImageProcessor instead.",
            FutureWarning,
        )
        super().__init__(*args, **kwargs)
image_processing_vit.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Image processor class for ViT."""
16
+
17
+ from typing import Dict, List, Optional, Union
18
+
19
+ import numpy as np
20
+
21
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
22
+ from transformers.image_transforms import normalize, rescale, resize, to_channel_dimension_format
23
+ from transformers.image_utils import (
24
+ IMAGENET_STANDARD_MEAN,
25
+ IMAGENET_STANDARD_STD,
26
+ ChannelDimension,
27
+ ImageInput,
28
+ PILImageResampling,
29
+ make_list_of_images,
30
+ to_numpy_array,
31
+ valid_images,
32
+ )
33
+ from transformers.utils import TensorType, logging
34
+
35
+
36
+ logger = logging.get_logger(__name__)
37
+
38
+
39
class ViTImageProcessor(BaseImageProcessor):
    r"""
    Image processor for the Vision Transformer (ViT) family of models.

    Prepares images for the model by (optionally) resizing them to a fixed
    `(height, width)`, rescaling the pixel values, and normalizing with a mean
    and standard deviation. Each step is configured at construction time and
    may be overridden per call in [`~ViTImageProcessor.preprocess`].

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize images to `(size["height"], size["width"])`. Can
            be overridden by the `do_resize` argument of `preprocess`.
        size (`dict`, *optional*, defaults to `{"height": 224, "width": 224}`):
            Target size of the output image. Can be overridden by the `size`
            argument of `preprocess`.
        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
            Resampling filter used when resizing. Can be overridden by the
            `resample` argument of `preprocess`.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to multiply pixel values by `rescale_factor`. Can be
            overridden by the `do_rescale` argument of `preprocess`.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Factor applied when rescaling. Can be overridden by the
            `rescale_factor` argument of `preprocess`.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize with `image_mean` / `image_std`. Can be
            overridden by the `do_normalize` argument of `preprocess`.
        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
            Mean (single float, or one value per channel) used when
            normalizing. Can be overridden by the `image_mean` argument of
            `preprocess`.
        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
            Standard deviation (single float, or one value per channel) used
            when normalizing. Can be overridden by the `image_std` argument of
            `preprocess`.
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_resize: bool = True,
        size: Optional[Dict[str, int]] = None,
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        if size is None:
            size = {"height": 224, "width": 224}
        # Normalize `size` into the canonical {"height": ..., "width": ...} form.
        self.size = get_size_dict(size)
        self.do_resize = do_resize
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = IMAGENET_STANDARD_MEAN if image_mean is None else image_mean
        self.image_std = IMAGENET_STANDARD_STD if image_std is None else image_std

    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize `image` to `(size["height"], size["width"])`.

        Args:
            image (`np.ndarray`):
                The image to resize.
            size (`Dict[str, int]`):
                Target size in the format `{"height": int, "width": int}`.
            resample:
                `PILImageResampling` filter applied while resizing, e.g.
                `PILImageResampling.BILINEAR`.
            data_format (`ChannelDimension` or `str`, *optional*):
                Channel layout of the output image. If unset, the layout of the
                input image is kept. One of:
                - `"channels_first"` or `ChannelDimension.FIRST`: (num_channels, height, width).
                - `"channels_last"` or `ChannelDimension.LAST`: (height, width, num_channels).

        Returns:
            `np.ndarray`: The resized image.
        """
        size = get_size_dict(size)
        if "height" not in size or "width" not in size:
            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
        output_size = (size["height"], size["width"])
        return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)

    def rescale(
        self, image: np.ndarray, scale: float, data_format: Optional[Union[str, ChannelDimension]] = None, **kwargs
    ) -> np.ndarray:
        """
        Rescale an image by a scale factor: `image = image * scale`.

        Args:
            image (`np.ndarray`):
                The image to rescale.
            scale (`float`):
                Factor to multiply the pixel values by.
            data_format (`str` or `ChannelDimension`, *optional*):
                Channel layout of the output image. If unset, the layout of the
                input image is kept. One of:
                - `"channels_first"` or `ChannelDimension.FIRST`: (num_channels, height, width).
                - `"channels_last"` or `ChannelDimension.LAST`: (height, width, num_channels).

        Returns:
            `np.ndarray`: The rescaled image.
        """
        return rescale(image, scale=scale, data_format=data_format, **kwargs)

    def normalize(
        self,
        image: np.ndarray,
        mean: Union[float, List[float]],
        std: Union[float, List[float]],
        data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Normalize an image: `image = (image - mean) / std`.

        Args:
            image (`np.ndarray`):
                The image to normalize.
            mean (`float` or `List[float]`):
                Mean subtracted from the image.
            std (`float` or `List[float]`):
                Standard deviation the image is divided by.
            data_format (`str` or `ChannelDimension`, *optional*):
                Channel layout of the output image. If unset, the layout of the
                input image is kept. One of:
                - `"channels_first"` or `ChannelDimension.FIRST`: (num_channels, height, width).
                - `"channels_last"` or `ChannelDimension.LAST`: (height, width, num_channels).

        Returns:
            `np.ndarray`: The normalized image.
        """
        return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs)

    def preprocess(
        self,
        images: ImageInput,
        do_resize: Optional[bool] = None,
        size: Dict[str, int] = None,
        resample: PILImageResampling = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
        **kwargs,
    ):
        """
        Preprocess an image or batch of images.

        Args:
            images (`ImageInput`):
                The image(s) to preprocess.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Target size after resizing, in the format
                `{"height": h, "width": w}`.
            resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`):
                Resampling filter used when resizing, e.g.
                `PILImageResampling.BILINEAR`. Only has an effect when
                `do_resize` is `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the pixel values to [0 - 1].
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Factor applied when `do_rescale` is `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Mean used when `do_normalize` is `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Standard deviation used when `do_normalize` is `True`.
            return_tensors (`str` or `TensorType`, *optional*):
                Type of the returned tensors. One of:
                - Unset: a list of `np.ndarray`.
                - `TensorType.TENSORFLOW` or `'tf'`: a batch of type `tf.Tensor`.
                - `TensorType.PYTORCH` or `'pt'`: a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: a batch of type `np.ndarray`.
                - `TensorType.JAX` or `'jax'`: a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                Channel layout of the output image. One of:
                - `"channels_first"` or `ChannelDimension.FIRST`: (num_channels, height, width).
                - `"channels_last"` or `ChannelDimension.LAST`: (height, width, num_channels).
                - Unset: keep the layout of the input image.
        """
        # Per-call arguments take precedence over the instance configuration.
        if do_resize is None:
            do_resize = self.do_resize
        if do_rescale is None:
            do_rescale = self.do_rescale
        if do_normalize is None:
            do_normalize = self.do_normalize
        if resample is None:
            resample = self.resample
        if rescale_factor is None:
            rescale_factor = self.rescale_factor
        if image_mean is None:
            image_mean = self.image_mean
        if image_std is None:
            image_std = self.image_std

        if size is None:
            size = self.size
        size_dict = get_size_dict(size)

        images = make_list_of_images(images)

        if not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        if do_resize and size is None:
            raise ValueError("Size must be specified if do_resize is True.")

        if do_rescale and rescale_factor is None:
            raise ValueError("Rescale factor must be specified if do_rescale is True.")

        # Every transform below operates on numpy arrays.
        processed = [to_numpy_array(image) for image in images]

        if do_resize:
            processed = [self.resize(image=img, size=size_dict, resample=resample) for img in processed]

        if do_rescale:
            processed = [self.rescale(image=img, scale=rescale_factor) for img in processed]

        if do_normalize:
            processed = [self.normalize(image=img, mean=image_mean, std=image_std) for img in processed]

        processed = [to_channel_dimension_format(img, data_format) for img in processed]

        return BatchFeature(data={"pixel_values": processed}, tensor_type=return_tensors)
preprocessor_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "do_resize": true,
4
+ "feature_extractor_type": "ViTFeatureExtractor",
5
+ "image_mean": [
6
+ 0.485,
7
+ 0.456,
8
+ 0.406
9
+ ],
10
+ "image_std": [
11
+ 0.229,
12
+ 0.224,
13
+ 0.225
14
+ ],
15
+ "reduce_labels": true,
16
+ "resample": 2,
17
+ "size": 512
18
+ }