ktrk115 committed on
Commit
0bdb3ea
·
verified ·
1 Parent(s): 599f7fa

Upload processor

Browse files
added_tokens.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "<begin_of_image>": 49152,
3
+ "<end_of_image>": 49153,
4
+ "<image_sep>": 49154,
5
+ "<image_token>": 49155
6
+ }
image_processing_vqmodel.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ from PIL import Image
4
+ from transformers.image_processing_utils import BaseImageProcessor
5
+ from transformers.utils import logging
6
+
7
+ logger = logging.get_logger(__name__)
8
+
9
+
10
class VQModelImageProcessor(BaseImageProcessor):  # type: ignore
    """Convert PIL images to and from the tensors exchanged with the VQ model.

    ``preprocess`` resizes an image to a ``size`` x ``size`` square and turns
    it into a channel-first ``float32`` tensor computed as ``pixel / 127.5 - 1``.
    ``postprocess`` applies the inverse mapping and resizes the result.
    """

    def __init__(
        self,
        size: int = 256,
        convert_rgb: bool = False,
        resample: Image.Resampling = Image.Resampling.LANCZOS,
        **kwargs: dict,
    ) -> None:
        """Store the processing options.

        Args:
            size: Edge length of the square the images are resized to.
            convert_rgb: If true, composite the image onto a white background
                and emit RGB instead of RGBA.
            resample: PIL resampling filter used for every resize.
            **kwargs: Accepted but not used by this constructor.
        """
        self.size = size
        self.convert_rgb = convert_rgb
        self.resample = resample

    def __call__(self, image: Image.Image) -> dict:
        """Shorthand for :meth:`preprocess`."""
        return self.preprocess(image)

    def preprocess(self, image: Image.Image) -> dict:
        """Resize ``image`` and convert it to a tensor.

        Returns:
            A dict with the tensor under ``"image"`` and the size of the input
            image (before resizing) under ``"width"`` and ``"height"``.
        """
        original_width, original_height = image.size
        side = self.size
        rgba = image.resize((side, side), resample=self.resample).convert("RGBA")

        if not self.convert_rgb:
            prepared = rgba
        else:
            # Flatten the transparency by pasting onto a white canvas,
            # using the alpha band as the paste mask.
            *_, alpha = rgba.split()
            prepared = Image.new("RGB", rgba.size, (255, 255, 255))
            prepared.paste(rgba, mask=alpha)

        result = {"image": self.to_tensor(prepared)}
        result["width"] = original_width
        result["height"] = original_height
        return result

    def to_tensor(self, image: Image.Image) -> torch.Tensor:
        """Return ``image`` as a channel-first float32 tensor (``pixel / 127.5 - 1``)."""
        scaled = np.array(image) / 127.5 - 1.0
        channel_first = np.transpose(scaled, (2, 0, 1)).astype(np.float32)
        return torch.as_tensor(channel_first)

    def postprocess(
        self,
        x: torch.Tensor,
        width: int | None = None,
        height: int | None = None,
    ) -> Image.Image:
        """Convert a channel-first tensor back to a PIL image.

        Args:
            x: Tensor in the value range produced by :meth:`to_tensor`.
            width: Output width; falls back to ``self.size`` when falsy.
            height: Output height; falls back to ``self.size`` when falsy.
        """
        channel_last = x.detach().cpu().numpy().transpose(1, 2, 0)
        # Undo the scaling and clamp to the valid 8-bit range.
        pixels = np.clip((channel_last + 1.0) * 127.5, 0, 255).astype(np.uint8)
        picture = Image.fromarray(pixels)

        target = (width or self.size, height or self.size)
        return picture.resize(target, resample=self.resample)
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoImageProcessor": "image_processing_vqmodel.VQModelImageProcessor",
4
+ "AutoProcessor": "processing_markupdm.MarkupDMProcessor"
5
+ },
6
+ "convert_rgb": false,
7
+ "image_processor_type": "VQModelImageProcessor",
8
+ "processor_class": "MarkupDMProcessor",
9
+ "resample": 1,
10
+ "size": 256
11
+ }
processing_markupdm.py ADDED
@@ -0,0 +1,486 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Processor class for MarkupDM."""
2
+
3
+ import math
4
+ import re
5
+ import shutil
6
+ import subprocess
7
+ import tempfile
8
+ from pathlib import Path
9
+
10
+ import numpy as np
11
+ import torch
12
+ from cr_renderer.fonts import FontManager
13
+ from PIL import Image, ImageDraw
14
+ from svg import Style as SVGStyle
15
+ from transformers import (
16
+ ImageProcessingMixin,
17
+ PreTrainedModel,
18
+ PreTrainedTokenizerBase,
19
+ ProcessorMixin,
20
+ )
21
+ from transformers.utils import logging
22
+
23
+ logger = logging.get_logger(__name__)
24
+
25
+ MAXIMUM_DECODE_IMAGE_SIZE = 4096
26
+ IMG_FORMAT = "{:03d}.png"
27
+ FONT_FORMAT = "{:03d}.ttf"
28
+
29
+
30
+ class MarkupDMProcessor(ProcessorMixin): # type: ignore
31
+ attributes = ["tokenizer", "image_processor"]
32
+
33
+ # The superclass checks if the tokenizer is a subclass of `PreTrainedTokenizerBase`
34
+ tokenizer_class = "AutoTokenizer"
35
+ tokenizer: PreTrainedTokenizerBase
36
+
37
+ # and the image_processor is a subclass of `ImageProcessingMixin`.
38
+ image_processor_class = "AutoImageProcessor"
39
+ image_processor: ImageProcessingMixin
40
+
41
def __init__(
    self,
    tokenizer: PreTrainedTokenizerBase,
    image_processor: ImageProcessingMixin,
):
    """Set up the processor from a tokenizer and an image processor.

    The tokenizer is extended with the image special tokens when
    ``<begin_of_image>`` is not yet among its additional special tokens.
    """
    super().__init__(tokenizer, image_processor)

    # Register the image special tokens only once.
    already_extended = "<begin_of_image>" in tokenizer.additional_special_tokens
    if not already_extended:
        self.extend_base_tokenizer(self.tokenizer)

    # Pattern for the "<begin_of_image>W<image_sep>H<image_sep>" header of
    # a decoded image, and patterns for the width/height of the <svg> tag.
    begin_tag = "<begin_of_image>"
    sep_tag = "<image_sep>"
    self.re_img_size = re.compile(rf"{begin_tag}(\d+){sep_tag}(\d+){sep_tag}")
    self.re_svg_width = re.compile(r'<svg[^>]*\bwidth="(\d+)"[^>]*>')
    self.re_svg_height = re.compile(r'<svg[^>]*\bheight="(\d+)"[^>]*>')

    # Created on demand by `set_font_manager`.
    self.font_manager = None
61
+
62
def extend_base_tokenizer(self, tokenizer: PreTrainedTokenizerBase) -> None:
    """Add the image special tokens to ``tokenizer`` in place.

    Also switches off ``clean_up_tokenization_spaces`` on the tokenizer.
    """
    logger.info("Extending tokenizer...")
    tokenizer.clean_up_tokenization_spaces = False

    # Markers that delimit an image and separate its size fields.
    image_tokens = [
        "<begin_of_image>",
        "<end_of_image>",
        "<image_sep>",
        "<image_token>",
    ]
    logger.info(f"Add special tokens: {image_tokens}")

    # Keep the special tokens the tokenizer already has.
    special_tokens = {"additional_special_tokens": image_tokens}
    tokenizer.add_special_tokens(
        special_tokens,
        replace_additional_special_tokens=False,
    )
78
+
79
def __call__(
    self,
    svg: str | None = None,
    images: list[Image.Image] | None = None,
    filenames: list[str] | None = None,
    vision_model: PreTrainedModel | None = None,
) -> dict:
    """Tokenize an SVG document together with the images it references.

    Args:
        svg: SVG markup; image references are looked up by filename.
        images: Images referenced by the SVG. A single image (or ``None``)
            is accepted in place of a list.
        filenames: Names under which ``images`` appear in ``svg``, in the
            same order as ``images``. May be omitted when no images are
            given.
        vision_model: Model used to encode the images; required when
            images are given.

    Returns:
        The dict produced by ``tokenize_example``.
    """
    # Normalize a single image / None to a list.
    if not isinstance(images, list):
        images = [images]  # type: ignore

    if len(images) > 0 and images[0] is not None:
        output = self.preprocess_images(images)
        output = self.encode_images(output, vision_model)
    else:
        output = {"width": [], "height": [], "image_ids": []}
        # Without images there is nothing to map filenames to, so do not
        # force the caller to pass an (unused) filename list.
        if filenames is None:
            filenames = []

    # Process the entire example
    output.update({"svg": svg, "filenames": filenames})
    return self.tokenize_example(output)
101
+
102
def preprocess_images(self, images: list[Image.Image]) -> dict:
    """Run the image processor on every image and batch the results.

    Returns:
        A dict whose ``"image"`` entry is the stacked tensor of all
        processed images and whose ``"width"`` / ``"height"`` entries are
        lists with one value per image.
    """
    assert images is not None, "Images must be provided."
    collected: dict = {"image": [], "width": [], "height": []}

    # Gather the per-image outputs field by field.
    for processed in map(self.image_processor, images):
        for field, value in processed.items():
            collected[field].append(value)

    # Turn the list of image tensors into a single batch tensor.
    collected["image"] = torch.stack(collected["image"])
    return collected
115
+
116
def encode_images(self, example: dict, vision_model: PreTrainedModel) -> dict:
    """Replace the image batch of ``example`` with its code indices.

    If ``example`` holds raw ``"images"`` but no ``"width"`` yet, the images
    are preprocessed first. The ``"image"`` entry is removed from the dict
    and an ``"image_ids"`` list (one flattened tensor per image, moved to
    the CPU) is added.
    """
    needs_preprocessing = "images" in example and "width" not in example
    if needs_preprocessing:
        example = self.preprocess_images(example["images"])

    assert vision_model is not None, "Vision model must be provided."
    batch = example.pop("image").to(vision_model.device)
    with torch.inference_mode():
        # Only the innermost third element of the encoder output is used.
        _, _, (_, _, codes) = vision_model.model.encode(batch)

    per_image_codes = codes.view(batch.size(0), -1).cpu()
    example["image_ids"] = list(per_image_codes)
    return example
127
+
128
def tokenize_example(self, example: dict) -> dict:
    """Convert an encoded example into model inputs.

    Every occurrence of an image filename in the SVG text is replaced by the
    token sequence ``<begin_of_image> W <image_sep> H <image_sep> codes
    <end_of_image>``, where the image codes are shifted by the tokenizer
    length. The sequence is wrapped in ``<bos>`` ... ``<eos>`` (``<eos>`` is
    used as ``<bos>`` when the tokenizer has no ``bos_token_id``).

    Args:
        example: Dict with the keys ``svg``, ``filenames``, ``width``,
            ``height`` and ``image_ids``.

    Returns:
        A dict with ``input_ids``, the boolean ``image_mask`` marking image
        code positions, and ``image_pos_ids`` holding the index of each
        image code within its image (zero elsewhere).

    Raises:
        AssertionError: If one of the required keys is missing or ``None``.
    """
    # Validate the input example
    for key in ["svg", "filenames", "width", "height", "image_ids"]:
        msg = f"Missing key: {key}."
        if key in ["width", "height", "image_ids"]:
            msg += " Images must be encoded first using `encode_images`."
        assert example.get(key, None) is not None, msg

    tokenizer = self.tokenizer
    # The tokenizer length is the offset of the image codes; compute it once.
    vocab_size = len(tokenizer)
    bos_id = tokenizer.bos_token_id
    eos_id = tokenizer.eos_token_id
    bos_id = bos_id if bos_id is not None else eos_id
    boi_id = tokenizer.convert_tokens_to_ids("<begin_of_image>")
    eoi_id = tokenizer.convert_tokens_to_ids("<end_of_image>")
    img_sep_id = tokenizer.convert_tokens_to_ids("<image_sep>")

    # Tokenize images and build a mapping from image filenames to tokens
    name2token = {}
    for filename, image_ids, width, height in zip(
        example["filenames"],
        example["image_ids"],
        example["width"],
        example["height"],
    ):
        if not filename:
            # An empty name matches at offset 0 without consuming any text,
            # which would keep the scan below from ever making progress.
            continue

        name2token[filename] = [
            boi_id,
            *tokenizer.encode(str(width)),
            img_sep_id,
            *tokenizer.encode(str(height)),
            img_sep_id,
            *(image_ids + vocab_size).tolist(),
            eoi_id,
        ]

    # Tokenize SVG
    # TODO: remove bos_id as it seems to be not necessary in modern practice
    tokens = [bos_id]
    svg = example["svg"]
    while svg:
        # Find the earliest occurrence of any image filename
        start, end = len(svg), len(svg)
        for name in name2token:
            _start = svg.find(name)
            if -1 < _start < start:
                start = _start
                end = start + len(name)

        # Tokenize the text before the image filename
        tokens += tokenizer.encode(svg[:start])

        # Append the tokenized image
        if start < end:
            tokens += name2token[svg[start:end]]

        # Continue with the text after the filename
        svg = svg[end:]

    tokens.append(eos_id)

    # Format output data
    input_ids = torch.tensor(tokens)
    image_mask = input_ids >= vocab_size

    # Compute image position ids
    image_pos_ids = torch.zeros_like(input_ids)
    if len(example["image_ids"]) > 0:
        length = example["image_ids"][0].size(0)
        # Count the image codes with a tensor reduction instead of
        # iterating over the mask element by element.
        num_images = int(image_mask.sum().item()) // length
        image_pos_ids[image_mask] = torch.arange(length).repeat(num_images)

    return {
        "input_ids": input_ids,
        "image_mask": image_mask,
        "image_pos_ids": image_pos_ids,
    }
210
+
211
def decode(
    self,
    tokens: torch.Tensor | np.ndarray,
    vision_model: PreTrainedModel | None = None,
) -> dict:
    """Decode a 1D token sequence into SVG text and images.

    Each ``<begin_of_image>`` ... ``<end_of_image>`` span is decoded with
    ``decode_image`` and replaced in the text by a generated filename. The
    returned text is the part between ``<bos>`` and ``<eos>`` (``<eos>`` is
    used as ``<bos>`` when the tokenizer has no ``bos_token``).

    Args:
        tokens: 1D sequence of token ids, not in FIM format.
        vision_model: Model passed on to ``decode_image``.

    Returns:
        A dict with the keys ``svg``, ``images`` and ``filenames``.

    Raises:
        AssertionError: If the tokens contain FIM tokens, are not 1D, or an
            ``<end_of_image>`` precedes the first ``<begin_of_image>``.
    """
    tokenizer = self.tokenizer
    bos = tokenizer.bos_token
    eos = tokenizer.eos_token
    bos = bos if bos is not None else eos

    # Validate the input tokens
    msg = "Should be reverted from FIM format before decoding."
    for fim_type in ["prefix", "middle", "suffix"]:
        token_id = tokenizer.convert_tokens_to_ids(f"<fim_{fim_type}>")
        if token_id is None:
            # Fall back to the alternative spelling of the FIM tokens.
            token_id = tokenizer.convert_tokens_to_ids(f"<|fim_{fim_type}|>")
        assert token_id is not None, f"{fim_type} token not found"
        assert token_id not in tokens, msg

    tokens = torch.asarray(tokens).detach().cpu()
    assert tokens.ndim == 1, "Tokens must be 1D."
    boi_id = tokenizer.convert_tokens_to_ids("<begin_of_image>")
    eoi_id = tokenizer.convert_tokens_to_ids("<end_of_image>")

    # Decode tokens
    svg = ""
    images: list = []
    filenames: list = []
    while len(tokens) > 0:
        # Locate the next image span; an unterminated image runs to the end
        boi_idx = torch.where(tokens == boi_id)[0]
        eoi_idx = torch.where(tokens == eoi_id)[0]
        if boi_idx.size(0) > 0:
            start = int(boi_idx[0].item())
            end = int(eoi_idx[0].item()) + 1 if eoi_idx.size(0) > 0 else len(tokens)
            assert start < end, "Invalid image tokens."
        else:
            start, end = len(tokens), len(tokens)

        # Decode the tokens before the image tokens
        svg += tokenizer.decode(tokens[:start])

        # Decode the image tokens
        if start < end:
            # Extract image size; fall back to the processor size when the
            # size header cannot be parsed
            image_tokens = tokens[start:end]
            image_text = tokenizer.decode(image_tokens)
            matched = self.re_img_size.match(image_text)
            if matched is not None:
                width, height = map(int, matched.groups())
            else:
                width = self.image_processor.size
                height = self.image_processor.size

            # Decode tokens to PIL image; image codes are the ids at or
            # above the tokenizer length
            image_mask = image_tokens >= len(tokenizer)
            image_ids = image_tokens[image_mask] - len(tokenizer)
            image = self.decode_image(vision_model, image_ids, width, height)
            filename = IMG_FORMAT.format(len(images))
            svg += filename

            images.append(image)
            filenames.append(filename)

        # Update the remaining tokens
        tokens = tokens[end:]

    # Remove consecutive <bos> and <eos>
    svg = re.sub(rf"({re.escape(bos)})+", bos, svg)
    svg = re.sub(rf"({re.escape(eos)})+", eos, svg)

    # Extract the text between <bos> and <eos>
    i_bos = svg.find(bos)
    if i_bos > -1:
        svg = svg[i_bos + len(bos) :]
    # The text has already been cut after <bos>, so <eos> must be searched
    # from the start of the remaining text; an offset taken from the uncut
    # text would skip over an <eos> located near the beginning.
    i_eos = svg.find(eos)
    if i_eos > -1:
        svg = svg[:i_eos]

    return {"svg": svg, "images": images, "filenames": filenames}
289
+
290
def decode_image(
    self,
    vision_model: PreTrainedModel | None = None,
    image_ids: torch.Tensor | np.ndarray | None = None,
    width: int | None = None,
    height: int | None = None,
    dummy_color: tuple[int, int, int, int] = (200,) * 4,
    pad_value: int = 0,
) -> Image.Image:
    """Reconstruct a PIL image from image code indices.

    Args:
        vision_model: Model providing the codebook and the decoder.
        image_ids: 1D code indices; padded with ``pad_value`` when shorter
            than the full latent grid.
        width: Output width; defaults to the image processor size.
        height: Output height; defaults to the image processor size.
        dummy_color: Fill color of the placeholder image that is returned
            when both ``vision_model`` and ``image_ids`` are ``None``.
        pad_value: Code index used to pad incomplete code sequences.

    Returns:
        The decoded image, resized to the (size-limited) width and height.
    """
    # Resolve the output size and keep it within the decode limit.
    width, height = self.compute_safe_image_size(
        width or self.image_processor.size,
        height or self.image_processor.size,
    )

    # NOTE(review): a missing vision model combined with image ids falls
    # through to the assertion below - confirm that this is intended.
    if vision_model is None and image_ids is None:
        return Image.new("RGBA", (width, height), dummy_color)

    assert vision_model is not None, "Vision model must be provided."

    # Number of codes that make up a complete latent grid.
    base_size = self.image_processor.size
    downscale = 2 ** (vision_model.model.encoder.num_resolutions - 1)
    grid = base_size // downscale
    expected_length = grid**2

    # Fill up incomplete code sequences so that the grid can be decoded.
    codes = torch.asarray(image_ids, device=vision_model.device)
    code_length = codes.shape[0]  # type: ignore
    is_partial = code_length < expected_length
    if is_partial:
        filler = torch.full((expected_length - code_length,), pad_value).to(codes)
        codes = torch.cat([codes, filler])

    with torch.inference_mode():
        latents = vision_model.model.quantize.get_codebook_entry(
            codes, (1, grid, grid, -1)
        )
        recon = vision_model.model.decode(latents)[0].float()

    img = self.image_processor.postprocess(recon, base_size, base_size)

    # Paint over the region that was decoded from padding only.
    if is_partial:
        img = self.mask_padded_area(img, code_length, downscale)

    # Finally scale to the requested size.
    resized = img.resize((width, height), resample=self.image_processor.resample)
    return resized  # type: ignore
342
+
343
def compute_safe_image_size(self, width: int, height: int) -> tuple[int, int]:
    """Shrink a size so that its longer edge fits the decode limit.

    Sizes whose longer edge does not exceed ``MAXIMUM_DECODE_IMAGE_SIZE``
    are returned unchanged. Larger sizes are scaled by a common factor, with
    each edge kept between 1 and the limit.
    """
    longest = max(width, height)
    if not (MAXIMUM_DECODE_IMAGE_SIZE < longest):
        return width, height

    ratio = MAXIMUM_DECODE_IMAGE_SIZE / longest
    scaled = [
        min(max(int(edge * ratio), 1), MAXIMUM_DECODE_IMAGE_SIZE)
        for edge in (width, height)
    ]
    return scaled[0], scaled[1]
350
+
351
def mask_padded_area(
    self,
    img: Image.Image,
    code_length: int,
    scale_factor: int,
    fill: tuple[int, int, int, int] = (200, 200, 200, 255),
) -> Image.Image:
    """Paint over the image area that lies beyond the first ``code_length`` codes.

    The codes are laid out row by row, each covering a ``scale_factor``
    pixel square. The covered region is the rest of the row in which the
    codes end plus all rows below it. ``img`` is drawn on in place and
    returned.
    """
    canvas = ImageDraw.Draw(img, mode="RGBA")
    width, height = img.size

    # Position of the first missing code in the code grid.
    codes_per_row = math.ceil(width / scale_factor)
    row, col = divmod(code_length, codes_per_row)

    left = col * scale_factor
    top = row * scale_factor
    bottom = (row + 1) * scale_factor

    # L-shaped outline: remainder of the current row and everything below.
    outline = [
        (left, top),
        (width, top),
        (width, height),
        (0, height),
        (0, bottom),
        (left, bottom),
    ]
    canvas.polygon(outline, fill=fill)
    return img
375
+
376
def set_font_manager(self, fonts_path: str | None = None) -> None:
    """Create the font manager used for rendering from ``fonts_path``."""
    manager = FontManager(fonts_path)
    self.font_manager = manager
378
+
379
def render_preprocess(self, example: dict, out_dir: str | Path) -> None:
    """Write the fonts used by the SVG to ``out_dir`` and embed a style tag.

    For every distinct (family, weight, style) combination found in the
    ``<text>`` tags, the font is looked up in the font manager, saved to
    ``out_dir`` and declared with an ``@font-face`` rule. The resulting
    style tag is inserted right after the opening ``<svg>`` tag and the
    result is stored back into ``example["svg"]``.

    Raises:
        AssertionError: If no font manager is set or the SVG tag is missing.
    """
    msg = "Font manager is not set. Call `set_font_manager` first."
    assert self.font_manager is not None, msg

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    svg = example["svg"]

    # Construct the style tag
    seen = set()
    css_parts = ["text{dominant-baseline:text-before-edge}"]
    # Font style name by (is_bold, is_italic)
    style_names = {
        (False, False): "regular",
        (True, False): "bold",
        (False, True): "italic",
        (True, True): "bolditalic",
    }
    for index, text_tag in enumerate(re.findall("<text[^>]*>", svg)):
        family_match = re.search('font-family="([^"]*)"', text_tag)
        if family_match is None:
            logger.warning(f"Font family not found in {text_tag}")
            continue

        # Parse font attributes
        font_family = family_match.group(1)
        is_bold = 'font-weight="bold"' in text_tag
        is_italic = 'font-style="italic"' in text_tag
        font_weight = "bold" if is_bold else "regular"
        font_style = style_names[(is_bold, is_italic)]

        # Each font variant is exported only once.
        variant = (font_family, font_weight, font_style)
        if variant in seen:
            continue

        font_bytes = self.font_manager.lookup(
            font_family=font_family,
            font_weight=font_weight,
            font_style=font_style,
        )

        # @font-face rule pointing at the exported font file
        font_path = FONT_FORMAT.format(index)
        css_parts.append(
            "@font-face{"
            f"font-family:'{font_family}';"
            f"font-weight:{font_weight};"
            f"font-style:{font_style};"
            f"src:url('{font_path}');"
            "}"
        )

        # Save font
        (out_dir / font_path).write_bytes(font_bytes)
        seen.add(variant)

    # Insert the style tag right after the opening <svg> tag
    svg_tag = re.search("<svg[^>]*>", svg)
    assert svg_tag is not None, "SVG tag not found"
    insert_at = svg_tag.end()
    style = SVGStyle(text="".join(css_parts))
    example["svg"] = svg[:insert_at] + style.as_str() + svg[insert_at:]
435
+
436
def render(self, example: dict, save_dir: str | Path | None = None) -> Image.Image:
    """Render the SVG of ``example`` to an image with headless Chrome.

    The SVG (with embedded fonts, see ``render_preprocess``), its images and
    an HTML wrapper are written to a temporary directory, of which a
    screenshot is taken. ``example["svg"]`` is modified by the preprocessing
    step.

    Args:
        example: Dict with the keys ``svg``, ``images`` and ``filenames``.
        save_dir: If given, the temporary directory is copied there.

    Returns:
        The screenshot, resized to the width and height of the SVG tag.

    Raises:
        AssertionError: If the SVG tag has no integer width or height.
        subprocess.CalledProcessError: If Chrome exits with an error.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        self.render_preprocess(example, tmp_dir)

        # Read the canvas size from the <svg> tag
        dims = []
        for pattern, label in (
            (self.re_svg_width, "Width"),
            (self.re_svg_height, "Height"),
        ):
            found = pattern.search(example["svg"])
            assert found is not None, f"{label} not found in SVG."
            dims.append(int(found.group(1)))
        width, height = dims

        # Wrap the SVG into a minimal HTML page and save it
        page = (
            '<!DOCTYPE html><html><body style="margin: 0px">'
            f"{example['svg']}</body></html>"
        )
        Path(f"{tmp_dir}/index.html").write_text(page, encoding="utf-8")

        # Save the images next to the page
        for picture, filename in zip(example["images"], example["filenames"]):
            target = f"{tmp_dir}/{filename}"
            Path(target).parent.mkdir(parents=True, exist_ok=True)
            picture.save(target)

        # Take a screenshot of the page
        chrome_flags = [
            "--headless",
            "--disable-web-security",
            "--allow-running-insecure-content",
            "--no-sandbox",
            "--disable-infobars",
            "--hide-scrollbars",
            "--disable-dev-shm-usage",
            "--no-zygote",
            f"--window-size={width},{height}",
            f"--screenshot={tmp_dir}/screenshot.png",
        ]
        command = ["google-chrome", *chrome_flags, f"{tmp_dir}/index.html"]
        subprocess.run(command, check=True, stderr=subprocess.DEVNULL)

        # Load the screenshot and bring it to the exact SVG size
        screenshot = Image.open(f"{tmp_dir}/screenshot.png")
        out = screenshot.resize(
            (width, height), resample=Image.Resampling.LANCZOS
        )  # type: ignore

        # Keep the intermediate files if requested
        if save_dir is not None:
            shutil.copytree(tmp_dir, save_dir, dirs_exist_ok=True)

    return out
processor_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_markupdm.MarkupDMProcessor"
4
+ },
5
+ "processor_class": "MarkupDMProcessor"
6
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|endoftext|>",
4
+ "<fim_prefix>",
5
+ "<fim_middle>",
6
+ "<fim_suffix>",
7
+ "<fim_pad>",
8
+ "<filename>",
9
+ "<gh_stars>",
10
+ "<issue_start>",
11
+ "<issue_comment>",
12
+ "<issue_closed>",
13
+ "<jupyter_start>",
14
+ "<jupyter_text>",
15
+ "<jupyter_code>",
16
+ "<jupyter_output>",
17
+ "<empty_output>",
18
+ "<commit_before>",
19
+ "<commit_msg>",
20
+ "<commit_after>",
21
+ "<reponame>",
22
+ "<begin_of_image>",
23
+ "<end_of_image>",
24
+ "<image_sep>",
25
+ "<image_token>"
26
+ ],
27
+ "bos_token": {
28
+ "content": "<|endoftext|>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false
33
+ },
34
+ "eos_token": {
35
+ "content": "<|endoftext|>",
36
+ "lstrip": false,
37
+ "normalized": false,
38
+ "rstrip": false,
39
+ "single_word": false
40
+ },
41
+ "unk_token": {
42
+ "content": "<|endoftext|>",
43
+ "lstrip": false,
44
+ "normalized": false,
45
+ "rstrip": false,
46
+ "single_word": false
47
+ }
48
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<fim_prefix>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "<fim_middle>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<fim_suffix>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "<fim_pad>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "5": {
45
+ "content": "<filename>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "6": {
53
+ "content": "<gh_stars>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "7": {
61
+ "content": "<issue_start>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "8": {
69
+ "content": "<issue_comment>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "9": {
77
+ "content": "<issue_closed>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "10": {
85
+ "content": "<jupyter_start>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "11": {
93
+ "content": "<jupyter_text>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "12": {
101
+ "content": "<jupyter_code>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "13": {
109
+ "content": "<jupyter_output>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "14": {
117
+ "content": "<empty_output>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "15": {
125
+ "content": "<commit_before>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "16": {
133
+ "content": "<commit_msg>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "17": {
141
+ "content": "<commit_after>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": true
147
+ },
148
+ "18": {
149
+ "content": "<reponame>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": true
155
+ },
156
+ "49152": {
157
+ "content": "<begin_of_image>",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": true
163
+ },
164
+ "49153": {
165
+ "content": "<end_of_image>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "49154": {
173
+ "content": "<image_sep>",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": true
179
+ },
180
+ "49155": {
181
+ "content": "<image_token>",
182
+ "lstrip": false,
183
+ "normalized": false,
184
+ "rstrip": false,
185
+ "single_word": false,
186
+ "special": true
187
+ }
188
+ },
189
+ "additional_special_tokens": [
190
+ "<|endoftext|>",
191
+ "<fim_prefix>",
192
+ "<fim_middle>",
193
+ "<fim_suffix>",
194
+ "<fim_pad>",
195
+ "<filename>",
196
+ "<gh_stars>",
197
+ "<issue_start>",
198
+ "<issue_comment>",
199
+ "<issue_closed>",
200
+ "<jupyter_start>",
201
+ "<jupyter_text>",
202
+ "<jupyter_code>",
203
+ "<jupyter_output>",
204
+ "<empty_output>",
205
+ "<commit_before>",
206
+ "<commit_msg>",
207
+ "<commit_after>",
208
+ "<reponame>",
209
+ "<begin_of_image>",
210
+ "<end_of_image>",
211
+ "<image_sep>",
212
+ "<image_token>"
213
+ ],
214
+ "auto_map": {
215
+ "AutoProcessor": "processing_markupdm.MarkupDMProcessor"
216
+ },
217
+ "bos_token": "<|endoftext|>",
218
+ "clean_up_tokenization_spaces": false,
219
+ "eos_token": "<|endoftext|>",
220
+ "extra_special_tokens": {},
221
+ "model_max_length": 8192,
222
+ "processor_class": "MarkupDMProcessor",
223
+ "tokenizer_class": "GPT2Tokenizer",
224
+ "unk_token": "<|endoftext|>",
225
+ "vocab_size": 49152
226
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff