Upload processor

Browse files

Files changed (10) hide show

added_tokens.json +6 -0
image_processing_vqmodel.py +65 -0
merges.txt +0 -0
preprocessor_config.json +11 -0
processing_markupdm.py +486 -0
processor_config.json +6 -0
special_tokens_map.json +48 -0
tokenizer.json +0 -0
tokenizer_config.json +226 -0
vocab.json +0 -0

added_tokens.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "<begin_of_image>": 49152,
+  "<end_of_image>": 49153,
+  "<image_sep>": 49154,
+  "<image_token>": 49155
+}

image_processing_vqmodel.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import numpy as np
+import torch
+from PIL import Image
+from transformers.image_processing_utils import BaseImageProcessor
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+class VQModelImageProcessor(BaseImageProcessor):  # type: ignore
+    def __init__(
+        self,
+        size: int = 256,
+        convert_rgb: bool = False,
+        resample: Image.Resampling = Image.Resampling.LANCZOS,
+        **kwargs: dict,
+    ) -> None:
+        self.size = size
+        self.convert_rgb = convert_rgb
+        self.resample = resample
+    def __call__(self, image: Image.Image) -> dict:
+        return self.preprocess(image)
+    def preprocess(self, image: Image.Image) -> dict:
+        width, height = image.size
+        size = (self.size, self.size)
+        image = image.resize(size, resample=self.resample)
+        image = image.convert("RGBA")
+        if self.convert_rgb:
+            # Paste RGBA image on white background
+            image_new = Image.new("RGB", image.size, (255, 255, 255))
+            image_new.paste(image, mask=image.split()[3])
+            image = image_new
+        return {
+            "image": self.to_tensor(image),
+            "width": width,
+            "height": height,
+        }
+    def to_tensor(self, image: Image.Image) -> torch.Tensor:
+        x = np.array(image) / 127.5 - 1.0
+        x = x.transpose(2, 0, 1).astype(np.float32)
+        return torch.as_tensor(x)
+    def postprocess(
+        self,
+        x: torch.Tensor,
+        width: int | None = None,
+        height: int | None = None,
+    ) -> Image.Image:
+        x_np = x.detach().cpu().numpy()
+        x_np = x_np.transpose(1, 2, 0)
+        x_np = (x_np + 1.0) * 127.5
+        x_np = np.clip(x_np, 0, 255).astype(np.uint8)
+        image = Image.fromarray(x_np)
+        # Resize image
+        width = width or self.size
+        height = height or self.size
+        image = image.resize((width, height), resample=self.resample)
+        return image

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "auto_map": {
+    "AutoImageProcessor": "image_processing_vqmodel.VQModelImageProcessor",
+    "AutoProcessor": "processing_markupdm.MarkupDMProcessor"
+  },
+  "convert_rgb": false,
+  "image_processor_type": "VQModelImageProcessor",
+  "processor_class": "MarkupDMProcessor",
+  "resample": 1,
+  "size": 256
+}

processing_markupdm.py ADDED Viewed

	@@ -0,0 +1,486 @@

+"""Processor class for MarkupDM."""
+import math
+import re
+import shutil
+import subprocess
+import tempfile
+from pathlib import Path
+import numpy as np
+import torch
+from cr_renderer.fonts import FontManager
+from PIL import Image, ImageDraw
+from svg import Style as SVGStyle
+from transformers import (
+    ImageProcessingMixin,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
+    ProcessorMixin,
+)
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+# Upper bound on the long edge of a decoded image (see `compute_safe_image_size`).
+MAXIMUM_DECODE_IMAGE_SIZE = 4096
+# Filename pattern for decoded images, filled with the image index in `decode`.
+IMG_FORMAT = "{:03d}.png"
+# Filename pattern for the font files written by `render_preprocess`.
+FONT_FORMAT = "{:03d}.ttf"
+class MarkupDMProcessor(ProcessorMixin):  # type: ignore
+    attributes = ["tokenizer", "image_processor"]
+    # The superclass checks if the tokenizer is a subclass of `PreTrainedTokenizerBase`
+    tokenizer_class = "AutoTokenizer"
+    tokenizer: PreTrainedTokenizerBase
+    # and the image_processor is a subclass of `ImageProcessingMixin`.
+    image_processor_class = "AutoImageProcessor"
+    image_processor: ImageProcessingMixin
+    def __init__(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        image_processor: ImageProcessingMixin,
+    ):
+        super().__init__(tokenizer, image_processor)
+        # Extend the tokenizer if it has not been extended yet.
+        if "<begin_of_image>" not in tokenizer.additional_special_tokens:
+            self.extend_base_tokenizer(self.tokenizer)
+        # Regular expressions
+        boi = "<begin_of_image>"
+        img_sep = "<image_sep>"
+        self.re_img_size = re.compile(rf"{boi}(\d+){img_sep}(\d+){img_sep}")
+        self.re_svg_width = re.compile(r'<svg[^>]*\bwidth="(\d+)"[^>]*>')
+        self.re_svg_height = re.compile(r'<svg[^>]*\bheight="(\d+)"[^>]*>')
+        # Font manager
+        self.font_manager = None
+    def extend_base_tokenizer(self, tokenizer: PreTrainedTokenizerBase) -> None:
+        logger.info("Extending tokenizer...")
+        tokenizer.clean_up_tokenization_spaces = False
+        # Add special tokens
+        additional_special_tokens = [
+            "<begin_of_image>",
+            "<end_of_image>",
+            "<image_sep>",
+            "<image_token>",
+        ]
+        logger.info(f"Add special tokens: {additional_special_tokens}")
+        tokenizer.add_special_tokens(
+            {"additional_special_tokens": additional_special_tokens},
+            replace_additional_special_tokens=False,
+        )
+    def __call__(
+        self,
+        svg: str | None = None,
+        images: list[Image.Image] | None = None,
+        filenames: list[str] | None = None,
+        vision_model: PreTrainedModel | None = None,
+    ) -> dict:
+        # Process images
+        if not isinstance(images, list):
+            images = [images]  # type: ignore
+        if len(images) > 0 and images[0] is not None:
+            output = self.preprocess_images(images)
+            output = self.encode_images(output, vision_model)
+        else:
+            output = {"width": [], "height": [], "image_ids": []}
+        # Process the entire example
+        output.update({"svg": svg, "filenames": filenames})
+        output = self.tokenize_example(output)
+        return output
+    def preprocess_images(self, images: list[Image.Image]) -> dict:
+        assert images is not None, "Images must be provided."
+        output: dict = {"image": [], "width": [], "height": []}
+        for image in images:
+            processed = self.image_processor(image)
+            for key, value in processed.items():
+                output[key].append(value)
+        # Stack tensors
+        output["image"] = torch.stack(output["image"])
+        return output
+    def encode_images(self, example: dict, vision_model: PreTrainedModel) -> dict:
+        if "images" in example and "width" not in example:
+            example = self.preprocess_images(example["images"])
+        assert vision_model is not None, "Vision model must be provided."
+        image = example.pop("image").to(vision_model.device)
+        with torch.inference_mode():
+            _, _, (_, _, image_ids) = vision_model.model.encode(image)
+        example["image_ids"] = list(image_ids.view(image.size(0), -1).cpu())
+        return example
+    def tokenize_example(self, example: dict) -> dict:
+        # Validate the input example
+        for key in ["svg", "filenames", "width", "height", "image_ids"]:
+            msg = f"Missing key: {key}."
+            if key in ["width", "height", "image_ids"]:
+                msg += " Images must be encoded first using `encode_images`."
+            assert example.get(key, None) is not None, msg
+        tokenizer = self.tokenizer
+        bos_id = tokenizer.bos_token_id
+        eos_id = tokenizer.eos_token_id
+        bos_id = bos_id if bos_id is not None else eos_id
+        boi_id = tokenizer.convert_tokens_to_ids("<begin_of_image>")
+        eoi_id = tokenizer.convert_tokens_to_ids("<end_of_image>")
+        img_sep_id = tokenizer.convert_tokens_to_ids("<image_sep>")
+        # Tokenize images and build a mapping from image filenames to tokens
+        name2token = {}
+        for filename, image_ids, width, height in zip(
+            example["filenames"],
+            example["image_ids"],
+            example["width"],
+            example["height"],
+        ):
+            _image_ids = (image_ids + len(tokenizer)).tolist()
+            W_tokens = tokenizer.encode(str(width))
+            H_tokens = tokenizer.encode(str(height))
+            # Image tokens
+            image_tokens = [
+                boi_id,
+                *W_tokens,
+                img_sep_id,
+                *H_tokens,
+                img_sep_id,
+                *_image_ids,
+                eoi_id,
+            ]
+            name2token[filename] = image_tokens
+        # Tokenize SVG
+        # TODO: remove bos_id as it seems to be not necessary in modern practice
+        tokens = [bos_id]
+        svg = example["svg"]
+        while svg:
+            # Find the start position of the next image filename
+            start, end = len(svg), len(svg)
+            for name in name2token.keys():
+                _start = svg.find(name)
+                if -1 < _start and _start < start:
+                    start = _start
+                    end = start + len(name)
+            # Tokenize the text before the image filename
+            tokens += tokenizer.encode(svg[:start])
+            # Append the tokenized image
+            if start < end:
+                tokens += name2token[svg[start:end]]
+            # Update the remaining text
+            svg = svg[end:]
+        tokens.append(eos_id)
+        # Format output data
+        input_ids = torch.tensor(tokens)
+        image_mask = input_ids >= len(tokenizer)
+        # Compute image position ids
+        image_pos_ids = torch.zeros_like(input_ids)
+        if len(example["image_ids"]) > 0:
+            length = example["image_ids"][0].size(0)
+            num_images = sum(image_mask) // length
+            image_pos_ids[image_mask] = torch.arange(length).repeat(num_images)
+        return {
+            "input_ids": input_ids,
+            "image_mask": image_mask,
+            "image_pos_ids": image_pos_ids,
+        }
+    def decode(
+        self,
+        tokens: torch.Tensor | np.ndarray,
+        vision_model: PreTrainedModel | None = None,
+    ) -> dict:
+        tokenizer = self.tokenizer
+        bos = tokenizer.bos_token
+        eos = tokenizer.eos_token
+        bos = bos if bos is not None else eos
+        # Validate the input tokens
+        msg = "Should be reverted from FIM format before decoding."
+        for fim_type in ["prefix", "middle", "suffix"]:
+            token_id = tokenizer.convert_tokens_to_ids(f"<fim_{fim_type}>")
+            if token_id is None:
+                token_id = tokenizer.convert_tokens_to_ids(f"<|fim_{fim_type}|>")
+            assert token_id is not None, f"{fim_type} token not found"
+            assert token_id not in tokens, msg
+        tokens = torch.asarray(tokens).detach().cpu()
+        assert tokens.ndim == 1, "Tokens must be 1D."
+        boi_id = tokenizer.convert_tokens_to_ids("<begin_of_image>")
+        eoi_id = tokenizer.convert_tokens_to_ids("<end_of_image>")
+        # Decode tokens
+        svg = ""
+        images: list = []
+        filenames: list = []
+        while len(tokens) > 0:
+            # Find the start position of the next image filename
+            boi_idx = torch.where(tokens == boi_id)[0]
+            eoi_idx = torch.where(tokens == eoi_id)[0]
+            if boi_idx.size(0) > 0:
+                start = int(boi_idx[0].item())
+                end = int(eoi_idx[0].item()) + 1 if eoi_idx.size(0) > 0 else len(tokens)
+                assert start < end, "Invalid image tokens."
+            else:
+                start, end = len(tokens), len(tokens)
+            # Decode the tokens before the image tokens
+            svg += tokenizer.decode(tokens[:start])
+            # Decode the image tokens
+            if start < end:
+                # Extract image size
+                image_tokens = tokens[start:end]
+                image_text = tokenizer.decode(image_tokens)
+                matched = self.re_img_size.match(image_text)
+                if matched is not None:
+                    width, height = map(int, matched.groups())
+                else:
+                    width = self.image_processor.size
+                    height = self.image_processor.size
+                # Decode tokens to PIL image
+                image_mask = image_tokens >= len(tokenizer)
+                image_ids = image_tokens[image_mask] - len(tokenizer)
+                image = self.decode_image(vision_model, image_ids, width, height)
+                filename = IMG_FORMAT.format(len(images))
+                svg += filename
+                images.append(image)
+                filenames.append(filename)
+            # Update the remaining tokens
+            tokens = tokens[end:]
+        # Remove consecutive <bos> and <eos>
+        svg = re.sub(rf"({re.escape(bos)})+", bos, svg)
+        svg = re.sub(rf"({re.escape(eos)})+", eos, svg)
+        # Extract the text between <bos> and <eos>
+        i_bos = svg.find(bos)
+        svg = svg[i_bos + len(bos) :] if i_bos > -1 else svg
+        i_eos = svg.find(eos, i_bos + 1)
+        svg = svg[:i_eos] if i_eos > -1 else svg
+        return {"svg": svg, "images": images, "filenames": filenames}
+    def decode_image(
+        self,
+        vision_model: PreTrainedModel | None = None,
+        image_ids: torch.Tensor | np.ndarray | None = None,
+        width: int | None = None,
+        height: int | None = None,
+        dummy_color: tuple[int, int, int, int] = (200,) * 4,
+        pad_value: int = 0,
+    ) -> Image.Image:
+        # Prepare image size
+        width = width or self.image_processor.size
+        height = height or self.image_processor.size
+        width, height = self.compute_safe_image_size(width, height)
+        if vision_model is None and image_ids is None:
+            # Return a dummy image
+            return Image.new("RGBA", (width, height), dummy_color)
+        # Compute required length
+        assert vision_model is not None, "Vision model must be provided."
+        scale_factor = 2 ** (vision_model.model.encoder.num_resolutions - 1)
+        latent_size = self.image_processor.size // scale_factor
+        required_length = latent_size**2
+        # Pad image ids if necessary
+        image_ids = torch.asarray(image_ids, device=vision_model.device)
+        code_length = image_ids.shape[0]  # type: ignore
+        if code_length < required_length:
+            pad_size = required_length - code_length
+            pad = torch.full((pad_size,), pad_value).to(image_ids)
+            image_ids = torch.cat([image_ids, pad])
+        # Decode image
+        with torch.inference_mode():
+            codebook_entry = vision_model.model.quantize.get_codebook_entry(
+                image_ids, (1, latent_size, latent_size, -1)
+            )
+            recon = vision_model.model.decode(codebook_entry)[0].float()
+        # Postprocess image
+        img = self.image_processor.postprocess(
+            recon, self.image_processor.size, self.image_processor.size
+        )
+        # Mask the padded area
+        if code_length < required_length:
+            img = self.mask_padded_area(img, code_length, scale_factor)
+        # Resize the image to the original size
+        img = img.resize((width, height), resample=self.image_processor.resample)
+        return img  # type: ignore
+    def compute_safe_image_size(self, width: int, height: int) -> tuple[int, int]:
+        long_edge = max(width, height)
+        if MAXIMUM_DECODE_IMAGE_SIZE < long_edge:
+            scale = MAXIMUM_DECODE_IMAGE_SIZE / long_edge
+            width = min(max(int(width * scale), 1), MAXIMUM_DECODE_IMAGE_SIZE)
+            height = min(max(int(height * scale), 1), MAXIMUM_DECODE_IMAGE_SIZE)
+        return width, height
+    def mask_padded_area(
+        self,
+        img: Image.Image,
+        code_length: int,
+        scale_factor: int,
+        fill: tuple[int, int, int, int] = (200, 200, 200, 255),
+    ) -> Image.Image:
+        draw = ImageDraw.Draw(img, mode="RGBA")
+        width, height = img.size
+        zw = math.ceil(width / scale_factor)
+        cw = code_length % zw
+        ch = code_length // zw
+        draw.polygon(
+            [
+                (cw * scale_factor, ch * scale_factor),
+                (width, ch * scale_factor),
+                (width, height),
+                (0, height),
+                (0, (ch + 1) * scale_factor),
+                (cw * scale_factor, (ch + 1) * scale_factor),
+            ],
+            fill=fill,
+        )
+        return img
+    def set_font_manager(self, fonts_path: str | None = None) -> None:
+        self.font_manager = FontManager(fonts_path)
+    def render_preprocess(self, example: dict, out_dir: str | Path) -> None:
+        msg = "Font manager is not set. Call `set_font_manager` first."
+        assert self.font_manager is not None, msg
+        out_dir = Path(out_dir)
+        out_dir.mkdir(parents=True, exist_ok=True)
+        svg = example["svg"]
+        # Costruct style tag
+        found = set()
+        style_text = "text{dominant-baseline:text-before-edge}"
+        for i, text_str in enumerate(re.findall("<text[^>]*>", svg)):
+            matched = re.search('font-family="([^"]*)"', text_str)
+            if matched is None:
+                logger.warning(f"Font family not found in {text_str}")
+                continue
+            # Parse font attributes
+            font_family = matched.group(1)
+            is_bold = 'font-weight="bold"' in text_str
+            is_italic = 'font-style="italic"' in text_str
+            font_weight = "bold" if is_bold else "regular"
+            if is_italic:
+                font_style = "bolditalic" if is_bold else "italic"
+            else:
+                font_style = font_weight
+            key = (font_family, font_weight, font_style)
+            if key in found:
+                continue
+            font_bytes = self.font_manager.lookup(
+                font_family=font_family,
+                font_weight=font_weight,
+                font_style=font_style,
+            )
+            # @font-face
+            font_path = FONT_FORMAT.format(i)
+            font_face = "@font-face{"
+            font_face += f"font-family:'{font_family}';"
+            font_face += f"font-weight:{font_weight};"
+            font_face += f"font-style:{font_style};"
+            font_face += f"src:url('{font_path}');"
+            font_face += "}"
+            style_text += font_face
+            # Save font
+            Path(f"{out_dir}/{font_path}").write_bytes(font_bytes)
+            found.add(key)
+        # Insert style tag
+        matched = re.search("<svg[^>]*>", svg)
+        assert matched is not None, "SVG tag not found"
+        i = matched.span()[1]
+        style = SVGStyle(text=style_text)
+        example["svg"] = svg[:i] + style.as_str() + svg[i:]
+    def render(self, example: dict, save_dir: str | Path | None = None) -> Image.Image:
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            self.render_preprocess(example, tmp_dir)
+            # Parse the SVG size
+            matched = self.re_svg_width.search(example["svg"])
+            assert matched is not None, "Width not found in SVG."
+            width = int(matched.group(1))
+            matched = self.re_svg_height.search(example["svg"])
+            assert matched is not None, "Height not found in SVG."
+            height = int(matched.group(1))
+            # Convert SVG to HTML
+            html = '<!DOCTYPE html><html><body style="margin: 0px">'
+            html += f"{example['svg']}</body></html>"
+            # Save HTML
+            Path(f"{tmp_dir}/index.html").write_text(html, encoding="utf-8")
+            # Save images
+            for img, filename in zip(example["images"], example["filenames"]):
+                Path(f"{tmp_dir}/{filename}").parent.mkdir(parents=True, exist_ok=True)
+                img.save(f"{tmp_dir}/{filename}")
+            # Take screenshot
+            command = [
+                "google-chrome",
+                "--headless",
+                "--disable-web-security",
+                "--allow-running-insecure-content",
+                "--no-sandbox",
+                "--disable-infobars",
+                "--hide-scrollbars",
+                "--disable-dev-shm-usage",
+                "--no-zygote",
+                f"--window-size={width},{height}",
+                f"--screenshot={tmp_dir}/screenshot.png",
+                f"{tmp_dir}/index.html",
+            ]
+            subprocess.run(command, check=True, stderr=subprocess.DEVNULL)
+            # Load the screenshot as PIL image
+            out = Image.open(f"{tmp_dir}/screenshot.png")
+            size = (width, height)
+            out = out.resize(size, resample=Image.Resampling.LANCZOS)  # type: ignore
+            # Copy the result if save_dir is specified
+            if save_dir is not None:
+                shutil.copytree(tmp_dir, save_dir, dirs_exist_ok=True)
+        return out

processor_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "auto_map": {
+    "AutoProcessor": "processing_markupdm.MarkupDMProcessor"
+  },
+  "processor_class": "MarkupDMProcessor"
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<fim_prefix>",
+    "<fim_middle>",
+    "<fim_suffix>",
+    "<fim_pad>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<empty_output>",
+    "<commit_before>",
+    "<commit_msg>",
+    "<commit_after>",
+    "<reponame>",
+    "<begin_of_image>",
+    "<end_of_image>",
+    "<image_sep>",
+    "<image_token>"
+  ],
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,226 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<fim_prefix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<fim_middle>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<fim_suffix>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<fim_pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<commit_before>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<commit_msg>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "17": {
+      "content": "<commit_after>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "18": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49152": {
+      "content": "<begin_of_image>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49153": {
+      "content": "<end_of_image>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49154": {
+      "content": "<image_sep>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49155": {
+      "content": "<image_token>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<fim_prefix>",
+    "<fim_middle>",
+    "<fim_suffix>",
+    "<fim_pad>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<empty_output>",
+    "<commit_before>",
+    "<commit_msg>",
+    "<commit_after>",
+    "<reponame>",
+    "<begin_of_image>",
+    "<end_of_image>",
+    "<image_sep>",
+    "<image_token>"
+  ],
+  "auto_map": {
+    "AutoProcessor": "processing_markupdm.MarkupDMProcessor"
+  },
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "processor_class": "MarkupDMProcessor",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>",
+  "vocab_size": 49152
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff