shunk031 committed on
Commit
6592b89
·
verified ·
1 Parent(s): 56d2b5b

Upload processing_longclip.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. processing_longclip.py +137 -12
processing_longclip.py CHANGED
@@ -1,20 +1,145 @@
1
- from typing import Union
 
2
 
3
- from transformers import CLIPProcessor, CLIPTokenizer, CLIPTokenizerFast
 
 
4
 
 
5
 
6
- class LongCLIPProcessor(CLIPProcessor):
7
- tokenizer: Union[CLIPTokenizer, CLIPTokenizerFast]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  def __call__(
10
- self, text=None, short_text=None, images=None, return_tensors=None, **kwargs
 
 
 
 
 
 
 
11
  ):
12
- encoding = super().__call__(text, images, return_tensors, **kwargs)
13
- if short_text is not None:
14
- short_text_encoding = self.tokenizer(
15
- short_text, return_tensors=return_tensors, **kwargs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  )
17
- encoding["short_input_ids"] = short_text_encoding.input_ids
18
- encoding["short_attention_mask"] = short_text_encoding.attention_mask
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
- return encoding
 
 
 
 
 
 
1
+ """
2
+ LongCLIP processor for preprocessing images and text.
3
 
4
+ This module provides a processor that combines image and text preprocessing
5
+ for LongCLIP models.
6
+ """
7
 
8
+ from typing import List, Optional, Union
9
 
10
+ from transformers import CLIPImageProcessor, CLIPTokenizer
11
+ from transformers.processing_utils import ProcessorMixin
12
+
13
+
14
class LongCLIPProcessor(ProcessorMixin):
    """
    Processor for LongCLIP that combines image and text preprocessing.

    This processor wraps a CLIPImageProcessor and a CLIPTokenizer to provide
    a unified interface for preprocessing inputs for LongCLIP models.

    Args:
        image_processor (CLIPImageProcessor): Image processor for preprocessing images.
        tokenizer (CLIPTokenizer): Tokenizer for preprocessing text.

    Attributes:
        attributes (List[str]): Names of the wrapped components.
        image_processor_class (str): Name of the image processor class.
        tokenizer_class (str): Name of the tokenizer class.

    Example:
        ```python
        >>> from long_clip_hf import LongCLIPProcessor
        >>> from transformers import CLIPImageProcessor, CLIPTokenizer
        >>> from PIL import Image
        >>>
        >>> # Initialize processor
        >>> image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = LongCLIPProcessor(image_processor=image_processor, tokenizer=tokenizer)
        >>>
        >>> # Process inputs
        >>> image = Image.open("path/to/image.jpg")
        >>> text = "a photo of a cat"
        >>> inputs = processor(text=text, images=image, return_tensors="pt", padding=True, max_length=248)
        >>>
        >>> # inputs contains 'input_ids', 'attention_mask' and 'pixel_values'
        ```
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "CLIPImageProcessor"
    tokenizer_class = "CLIPTokenizer"

    def __init__(
        self,
        image_processor: Optional[CLIPImageProcessor] = None,
        tokenizer: Optional[CLIPTokenizer] = None,
        **kwargs,
    ):
        """
        Store the image processor and tokenizer on the processor.

        Args:
            image_processor (CLIPImageProcessor): Required despite the ``None`` default.
            tokenizer (CLIPTokenizer): Required despite the ``None`` default.
            **kwargs: Accepted for call-site compatibility; currently ignored.

        Raises:
            ValueError: If ``image_processor`` or ``tokenizer`` is ``None``.
        """
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)

    def __call__(
        self,
        text: Union[str, List[str], None] = None,
        images=None,
        return_tensors: Optional[str] = "pt",
        padding: Union[bool, str] = True,
        max_length: Optional[int] = 248,
        truncation: Optional[bool] = True,
        **kwargs,
    ):
        """
        Preprocess text and images for a LongCLIP model.

        Args:
            text (str, List[str], optional): Text or list of texts to tokenize.
            images: Image or list of images handed to the image processor.
            return_tensors (str, optional): Tensor type requested from both the
                tokenizer and the image processor. Defaults to ``"pt"``.
            padding (bool or str, optional): Padding strategy for the tokenizer.
                Defaults to True.
            max_length (int, optional): Maximum sequence length for the tokenizer.
                Defaults to 248.
            truncation (bool, optional): Whether the tokenizer truncates.
                Defaults to True.
            **kwargs: Extra keyword arguments forwarded to the tokenizer only;
                the image processor does not receive them.

        Returns:
            BatchEncoding: Mapping holding the tokenizer outputs (when ``text``
            is given) merged with the image processor outputs (when ``images``
            is given). It is empty when neither input is provided.
        """
        # Imported locally so this fix does not depend on the file-level imports.
        from transformers import BatchEncoding

        image_inputs = {}
        if images is not None:
            image_inputs = self.image_processor(
                images,
                return_tensors=return_tensors,
            )

        if text is None:
            # Wrap instead of returning a bare dict so the documented return
            # type holds; no tensor_type is passed, so values are left as-is.
            return BatchEncoding(data=dict(image_inputs))

        encoding = self.tokenizer(
            text,
            return_tensors=return_tensors,
            padding=padding,
            max_length=max_length,
            truncation=truncation,
            **kwargs,
        )
        # Merge image features into the tokenizer output rather than flattening
        # both into a plain dict, which would drop the tokenizer's result type.
        encoding.update(image_inputs)
        return encoding

    def batch_decode(self, *args, **kwargs):
        """
        Decode batches of token IDs back to text.

        All arguments are forwarded to the tokenizer's ``batch_decode`` method.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        Decode token IDs back to text.

        All arguments are forwarded to the tokenizer's ``decode`` method.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        """
        Get the names of model inputs.

        Returns:
            List[str]: Tokenizer input names followed by image processor input
            names, with duplicates removed and first-seen order preserved.
        """
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        # dict.fromkeys de-duplicates while keeping insertion order.
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))