2ira committed on
Commit
6880c8a
·
verified ·
1 Parent(s): a2e2e6c

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-30000/generation_config.json +7 -0
  2. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-30000/image_processing_evabyte.py +204 -0
  3. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-30000/model.safetensors.index.json +450 -0
  4. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-30000/modeling_evabyte.py +912 -0
  5. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-30000/multibyte_decoding_evabyte.py +881 -0
  6. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-30000/preprocessor_config.json +18 -0
  7. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-30000/processing_evabyte.py +287 -0
  8. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-30000/processor_config.json +6 -0
  9. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-30000/special_tokens_map.json +98 -0
  10. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-30000/tokenization_evabyte.py +246 -0
  11. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-30000/tokenizer_config.json +596 -0
  12. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/README.md +105 -0
  13. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/config.json +48 -0
  14. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/configuration_evabyte.py +99 -0
  15. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/eva.py +424 -0
  16. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/eva_agg_kernel.py +1766 -0
  17. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/eva_cache.py +761 -0
  18. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/eva_prep_kv_kernel.py +1017 -0
  19. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/eva_pt_ref.py +420 -0
  20. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/generation_config.json +7 -0
  21. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/image_processing_evabyte.py +204 -0
  22. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/model.safetensors.index.json +450 -0
  23. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/modeling_evabyte.py +912 -0
  24. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/multibyte_decoding_evabyte.py +881 -0
  25. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/preprocessor_config.json +18 -0
  26. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/processing_evabyte.py +287 -0
  27. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/processor_config.json +6 -0
  28. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/special_tokens_map.json +98 -0
  29. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/tokenization_evabyte.py +246 -0
  30. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/tokenizer_config.json +596 -0
  31. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/README.md +105 -0
  32. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/config.json +48 -0
  33. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/configuration_evabyte.py +99 -0
  34. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/eva.py +424 -0
  35. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/eva_agg_kernel.py +1766 -0
  36. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/eva_cache.py +761 -0
  37. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/eva_prep_kv_kernel.py +1017 -0
  38. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/eva_pt_ref.py +420 -0
  39. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/generation_config.json +7 -0
  40. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/image_processing_evabyte.py +204 -0
  41. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/model.safetensors.index.json +450 -0
  42. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/modeling_evabyte.py +912 -0
  43. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/multibyte_decoding_evabyte.py +881 -0
  44. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/preprocessor_config.json +18 -0
  45. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/processing_evabyte.py +287 -0
  46. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/processor_config.json +6 -0
  47. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/special_tokens_map.json +98 -0
  48. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/tokenization_evabyte.py +246 -0
  49. ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/tokenizer_config.json +596 -0
  50. ckpts/ocpython_14b_bsz-2m_seq16k_docmask_multipredc2r8_90dynamic-10raw_transsentinel_minsize0ent98line16ow16pack_100B_2m_new_2_step-10000/README.md +105 -0
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-30000/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.47.1"
7
+ }
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-30000/image_processing_evabyte.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ """Image processor class for EvaByte."""
3
+
4
+ from typing import Dict, List, Optional, Union, Tuple
5
+
6
+ import io
7
+ from transformers.image_processing_utils import BaseImageProcessor
8
+ from transformers.image_utils import (
9
+ ImageInput,
10
+ PILImageResampling,
11
+ valid_images,
12
+ validate_preprocess_arguments,
13
+ )
14
+ from PIL import Image
15
+
16
def _get_qtable_bytes():
    """Return canned JPEG quantization-table byte streams keyed by quality level.

    Each value is a minimal JPEG byte sequence: it starts with the SOI marker
    (``\\xff\\xd8``), contains two DQT segments (``\\xff\\xdb``) holding the
    quantization tables, and ends with the EOI marker (``\\xff\\xd9``).
    ``EvaByteImageProcessor.jpeg_merge_qtables`` splices the DQT segments from
    one of these streams into a table-less JPEG payload.

    NOTE(review): the table bytes presumably match what PIL emits for each
    ``quality`` setting — not verified here.
    """
    return {
        5: b'\xff\xd8\xff\xdb\x00C\x00\xa0nx\x8cxd\xa0\x8c\x82\x8c\xb4\xaa\xa0\xbe\xf0\xff\xff\xf0\xdc\xdc\xf0\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xdb\x00C\x01\xa0\xb4\xb4\xf0\xd2\xf0\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xd9',
        10: b'\xff\xd8\xff\xdb\x00C\x00P7<F<2PFAFZUP_x\xc8\x82xnnx\xf5\xaf\xb9\x91\xc8\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xdb\x00C\x01PZZxix\xeb\x82\x82\xeb\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xd9',
        15: b'\xff\xd8\xff\xdb\x00C\x005%(/(!5/+/<95?P\x85WPIIP\xa3u{a\x85\xc1\xaa\xcb\xc8\xbe\xaa\xba\xb7\xd5\xf0\xff\xff\xd5\xe2\xff\xe6\xb7\xba\xff\xff\xff\xff\xff\xff\xff\xff\xff\xce\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xdb\x00C\x015<<PFP\x9dWW\x9d\xff\xdc\xba\xdc\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xd9',
        20: b'\xff\xd8\xff\xdb\x00C\x00(\x1c\x1e#\x1e\x19(#!#-+(0<dA<77<{X]Id\x91\x80\x99\x96\x8f\x80\x8c\x8a\xa0\xb4\xe6\xc3\xa0\xaa\xda\xad\x8a\x8c\xc8\xff\xcb\xda\xee\xf5\xff\xff\xff\x9b\xc1\xff\xff\xff\xfa\xff\xe6\xfd\xff\xf8\xff\xdb\x00C\x01(--<5<vAAv\xf8\xa5\x8c\xa5\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xff\xd9',
        25: b'\xff\xd8\xff\xdb\x00C\x00 \x16\x18\x1c\x18\x14 \x1c\x1a\x1c$" &0P40,,0bFJ:Ptfzxrfpn\x80\x90\xb8\x9c\x80\x88\xae\x8anp\xa0\xda\xa2\xae\xbe\xc4\xce\xd0\xce|\x9a\xe2\xf2\xe0\xc8\xf0\xb8\xca\xce\xc6\xff\xdb\x00C\x01 $$0*0^44^\xc6\x84p\x84\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xff\xd9',
        30: b'\xff\xd8\xff\xdb\x00C\x00\x1b\x12\x14\x17\x14\x11\x1b\x17\x16\x17\x1e\x1c\x1b (B+(%%(Q:=0B`Ued_U][jx\x99\x81jq\x90s[]\x85\xb5\x86\x90\x9e\xa3\xab\xad\xabg\x80\xbc\xc9\xba\xa6\xc7\x99\xa8\xab\xa4\xff\xdb\x00C\x01\x1b\x1e\x1e(#(N++N\xa4n]n\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xff\xd9',
        50: b'\xff\xd8\xff\xdb\x00C\x00\x10\x0b\x0c\x0e\x0c\n\x10\x0e\r\x0e\x12\x11\x10\x13\x18(\x1a\x18\x16\x16\x181#%\x1d(:3=<9387@H\\N@DWE78PmQW_bghg>Mqypdx\\egc\xff\xdb\x00C\x01\x10\x12\x12\x18\x15\x18/\x1a\x1a/cB8Bcccccccccccccccccccccccccccccccccccccccccccccccccc\xff\xd9',
        75: b'\xff\xd8\xff\xdb\x00C\x00\x08\x06\x06\x07\x06\x05\x08\x07\x07\x07\t\t\x08\n\x0c\x14\r\x0c\x0b\x0b\x0c\x19\x12\x13\x0f\x14\x1d\x1a\x1f\x1e\x1d\x1a\x1c\x1c $.\' ",#\x1c\x1c(7),01444\x1f\'9=82<.342\xff\xdb\x00C\x01\x08\t\t\x0c\x0b\x0c\x18\r\r\x182!\x1c!22222222222222222222222222222222222222222222222222\xff\xd9',
        95: b'\xff\xd8\xff\xdb\x00C\x00\x02\x01\x01\x01\x01\x01\x02\x01\x01\x01\x02\x02\x02\x02\x02\x04\x03\x02\x02\x02\x02\x05\x04\x04\x03\x04\x06\x05\x06\x06\x06\x05\x06\x06\x06\x07\t\x08\x06\x07\t\x07\x06\x06\x08\x0b\x08\t\n\n\n\n\n\x06\x08\x0b\x0c\x0b\n\x0c\t\n\n\n\xff\xdb\x00C\x01\x02\x02\x02\x02\x02\x02\x05\x03\x03\x05\n\x07\x06\x07\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\xff\xd9',
        100: b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\xff\xdb\x00C\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\xff\xd9',
    }
29
+
30
+
31
+ def _resize_if_exceeding_max_len(
32
+ width: int, height: int, min_len: Optional[int] = 16, max_len: Optional[int] = None
33
+ ) -> Tuple[int, int]:
34
+ """
35
+ Get the output size of the image after resizing given a dictionary specifying the max and min sizes.
36
+
37
+ Args:
38
+ height (`int`):
39
+ Height of the input image.
40
+ width (`int`):
41
+ Width of the input image.
42
+ max_len (`Dict[str, int]`, *optional*, defaults to the maximum size of the image):
43
+ Defines the maximum dimensions of the image.
44
+
45
+ Returns:
46
+ The output size of the image after resizing.
47
+ """
48
+ max_len = max(height, width) if max_len is None else max_len
49
+ aspect_ratio = width / height
50
+
51
+ if width >= height and width > max_len:
52
+ width = max_len
53
+ height = int(width / aspect_ratio)
54
+ if height % 2 != 0:
55
+ height += 1
56
+ elif height > width and height > max_len:
57
+ height = max_len
58
+ width = int(height * aspect_ratio)
59
+ if width % 2 != 0:
60
+ width += 1
61
+
62
+ # Avoid resizing to a size smaller than 1
63
+ height = max(height, min_len)
64
+ width = max(width, min_len)
65
+ return width, height
66
+
67
class EvaByteImageProcessor(BaseImageProcessor):
    """Image processor for EvaByte.

    Unlike most HF image processors, it does not produce pixel tensors: images
    are (optionally) converted to RGB and resized, then re-encoded as JPEG and
    returned as raw byte strings for the byte-level model to consume.
    """

    # No tensor inputs are produced, hence no declared model input names.
    model_input_names = []

    def __init__(
        self,
        do_resize: bool = True,
        resample: PILImageResampling = PILImageResampling.LANCZOS,
        size: Optional[Dict[str, int]] = None,
        do_convert_rgb: bool = True,
        jpeg_quality: int = 25,
        jpeg_subsampling: str = "4:2:0",
        jpeg_streamtype: int = 2,
        jpeg_restart_marker_blocks: int = 1,
        **kwargs,
    ) -> None:
        """Store default preprocessing options.

        Args:
            do_resize: Whether to resize images during ``preprocess``.
            resample: PIL resampling filter used when resizing.
            size: Target size spec; either ``{"longest_edge": n}`` or
                ``{"width": w, "height": h}``. Defaults to ``{"longest_edge": 384}``.
            do_convert_rgb: Whether to convert images to RGB first.
            jpeg_quality: JPEG ``quality`` passed to ``PIL.Image.save``; must be
                one of the keys of ``_get_qtable_bytes()`` for
                ``jpeg_merge_qtables`` to work.
            jpeg_subsampling: JPEG chroma subsampling mode (e.g. "4:2:0").
            jpeg_streamtype: PIL JPEG-plugin ``streamtype`` option.
                NOTE(review): 2 presumably emits an abbreviated stream without
                quantization tables — confirm against the PIL JPEG plugin.
            jpeg_restart_marker_blocks: PIL JPEG-plugin restart-marker interval.
        """
        super().__init__(**kwargs)
        self.do_resize = do_resize
        self.resample = resample
        self.size = size if size is not None else {"longest_edge": 384}
        self.do_convert_rgb = do_convert_rgb
        self.jpeg_quality = jpeg_quality
        self.jpeg_subsampling = jpeg_subsampling
        self.jpeg_streamtype = jpeg_streamtype
        self.jpeg_restart_marker_blocks = jpeg_restart_marker_blocks

    def jpeg_encode(
        self,
        image,
        jpeg_quality,
        jpeg_subsampling,
        jpeg_streamtype,
        jpeg_restart_marker_blocks,
    ):
        """Encode a PIL image as JPEG with the given options and return the bytes."""
        with io.BytesIO() as output:
            image.save(
                output,
                format="JPEG",
                quality=jpeg_quality,
                subsampling=jpeg_subsampling,
                streamtype=jpeg_streamtype,
                restart_marker_blocks=jpeg_restart_marker_blocks
            )
            jpeg_bytes = output.getvalue()
        return jpeg_bytes

    def jpeg_merge_qtables(
        self,
        image_bytes,
        jpeg_quality=None,
    ):
        """Splice the canned quantization tables for ``jpeg_quality`` into a JPEG stream.

        Keeps the first 2 bytes of ``image_bytes`` (the SOI marker), inserts the
        canned table stream with its own SOI and EOI markers stripped
        (``qtable_bytes[2:-2]``), then appends the rest of ``image_bytes``.
        """
        if jpeg_quality is None:
            jpeg_quality = self.jpeg_quality
        # Raises KeyError if jpeg_quality is not one of the supported levels.
        qtable_bytes = _get_qtable_bytes()[jpeg_quality]
        return image_bytes[:2] + qtable_bytes[2:-2] + image_bytes[2:]

    def resize(
        self,
        image: Image.Image,
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.LANCZOS,
    ) -> Image.Image:
        """Resize ``image`` according to ``size``.

        ``size`` must contain either ``"longest_edge"`` (aspect-preserving cap
        via ``_resize_if_exceeding_max_len``) or explicit ``"width"``/``"height"``.

        Raises:
            ValueError: If ``size`` contains neither key combination.
        """
        if "longest_edge" in size:
            width, height = image.size
            # Find the output size, when rescaling the longest edge to max_len and preserving the aspect ratio
            width, height = _resize_if_exceeding_max_len(width, height, max_len=size["longest_edge"])
            size = (width, height)
        elif "width" in size and "height" in size:
            size = (size["width"], size["height"])
        else:
            raise ValueError("size must be a dictionary with key 'longest_edge' or 'height' and 'width'.")
        resized_image = image.resize(size, resample=resample)
        return resized_image

    def preprocess(
        self,
        images: ImageInput,
        do_resize: Optional[bool] = None,
        resample: Optional[PILImageResampling] = None,
        size: Optional[Dict[str, int]] = None,
        do_convert_rgb: Optional[bool] = None,
        jpeg_quality: Optional[int] = None,
        jpeg_subsampling: Optional[str] = None,
        jpeg_streamtype: Optional[int] = None,
        jpeg_restart_marker_blocks: Optional[int] = None,
    ):
        """Convert, resize and JPEG-encode a batch of images.

        Args:
            images: A nested list of images (list of lists of PIL images —
                the comprehensions below iterate two levels deep).
            do_resize / resample / size / do_convert_rgb / jpeg_*:
                Per-call overrides; ``None`` falls back to the instance defaults.

        Returns:
            A nested list (same structure as ``images``) of JPEG byte strings.

        Raises:
            ValueError: If ``images`` contains unsupported image types.
        """
        # Resolve per-call overrides against instance defaults.
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        resample = resample if resample is not None else self.resample
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

        jpeg_quality = jpeg_quality if jpeg_quality is not None else self.jpeg_quality
        jpeg_subsampling = jpeg_subsampling if jpeg_subsampling is not None else self.jpeg_subsampling
        jpeg_streamtype = jpeg_streamtype if jpeg_streamtype is not None else self.jpeg_streamtype
        jpeg_restart_marker_blocks = jpeg_restart_marker_blocks if jpeg_restart_marker_blocks is not None else self.jpeg_restart_marker_blocks

        if images is not None and not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        validate_preprocess_arguments(
            do_resize=do_resize,
            size=size,
            resample=resample,
        )
        images_list = images
        if do_convert_rgb:
            images_list = [
                [
                    image.convert("RGB") for image in images
                ]
                for images in images_list
            ]

        if do_resize:
            images_list = [
                [
                    self.resize(image=image, size=size, resample=resample)
                    for image in images
                ]
                for images in images_list
            ]

        # Encode every image to JPEG bytes, preserving the nested structure.
        jpeg_bytes = [
            [
                self.jpeg_encode(
                    image,
                    jpeg_quality,
                    jpeg_subsampling,
                    jpeg_streamtype,
                    jpeg_restart_marker_blocks
                ) for image in images
            ]
            for images in images_list
        ]
        return jpeg_bytes
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-30000/model.safetensors.index.json ADDED
@@ -0,0 +1,450 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 57058938880
4
+ },
5
+ "weight_map": {
6
+ "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
7
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
8
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
9
+ "model.layers.1.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
10
+ "model.layers.1.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
11
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
12
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
13
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
14
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
15
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
16
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
17
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
18
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
19
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
20
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
21
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
22
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
23
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
24
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
25
+ "model.layers.13.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
26
+ "model.layers.13.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
27
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
28
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
29
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
30
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
31
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
32
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
33
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
34
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
35
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
36
+ "model.layers.21.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
37
+ "model.layers.21.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
38
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
39
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
40
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
41
+ "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
42
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
43
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
44
+ "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
45
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors",
46
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
47
+ "model.layers.29.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
48
+ "model.layers.29.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
49
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
50
+ "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
51
+ "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
52
+ "model.layers.32.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
53
+ "model.layers.32.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
54
+ "model.layers.34.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
55
+ "model.layers.36.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
56
+ "model.layers.36.input_layernorm.weight": "model-00003-of-00003.safetensors",
57
+ "model.layers.36.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
58
+ "model.layers.37.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
59
+ "model.layers.37.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
60
+ "model.layers.37.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
61
+ "model.layers.37.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
62
+ "model.layers.39.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
63
+ "model.layers.2.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
64
+ "model.layers.26.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
65
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
66
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
67
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
68
+ "model.layers.3.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
69
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
70
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
71
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
72
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
73
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
74
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
75
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
76
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
77
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
78
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
79
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
80
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
81
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
82
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
83
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
84
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
85
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
86
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
87
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
88
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
89
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
90
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
91
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
92
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
93
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
94
+ "model.layers.27.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
95
+ "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
96
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
97
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
98
+ "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
99
+ "model.layers.33.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
100
+ "model.layers.33.input_layernorm.weight": "model-00003-of-00003.safetensors",
101
+ "model.layers.33.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
102
+ "model.layers.34.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
103
+ "model.layers.34.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
104
+ "model.layers.36.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
105
+ "model.layers.37.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
106
+ "model.layers.37.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
107
+ "model.layers.39.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
108
+ "model.layers.3.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
109
+ "model.layers.27.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
110
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
111
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
112
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
113
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
114
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
115
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
116
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
117
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
118
+ "model.layers.4.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
119
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
120
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
121
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
122
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
123
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
124
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
125
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
126
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
127
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
128
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
129
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
130
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
131
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
132
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
133
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
134
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
135
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
136
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
137
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
138
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
139
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
140
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
141
+ "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
142
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
143
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
144
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
145
+ "model.layers.28.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
146
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
147
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors",
148
+ "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
149
+ "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
150
+ "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
151
+ "model.layers.32.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
152
+ "model.layers.33.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
153
+ "model.layers.33.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
154
+ "model.layers.35.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
155
+ "model.layers.37.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
156
+ "model.layers.37.input_layernorm.weight": "model-00003-of-00003.safetensors",
157
+ "model.layers.37.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
158
+ "model.layers.38.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
159
+ "model.layers.38.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
160
+ "model.layers.4.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
161
+ "model.layers.28.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
162
+ "model.layers.5.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
163
+ "model.layers.0.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
164
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
165
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
166
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
167
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
168
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
169
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
170
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
171
+ "model.layers.9.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
172
+ "model.layers.9.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
173
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
174
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
175
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
176
+ "model.layers.12.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
177
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
178
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
179
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
180
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
181
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
182
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
183
+ "model.layers.17.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
184
+ "model.layers.17.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
185
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
186
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
187
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
188
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
189
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
190
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
191
+ "model.layers.23.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
192
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
193
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
194
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
195
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
196
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
197
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
198
+ "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
199
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
200
+ "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
201
+ "model.layers.32.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
202
+ "model.layers.32.input_layernorm.weight": "model-00003-of-00003.safetensors",
203
+ "model.layers.32.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
204
+ "model.layers.33.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
205
+ "model.layers.33.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
206
+ "model.layers.33.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
207
+ "model.layers.33.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
208
+ "model.layers.35.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
209
+ "model.layers.36.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
210
+ "model.layers.36.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
211
+ "model.layers.38.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
212
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
213
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
214
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
215
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
216
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
217
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
218
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
219
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
220
+ "model.layers.5.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
221
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
222
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
223
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
224
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
225
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
226
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
227
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
228
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
229
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
230
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
231
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
232
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
233
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
234
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
235
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
236
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
237
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
238
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
239
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
240
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
241
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
242
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
243
+ "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
244
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
245
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors",
246
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
247
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
248
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
249
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
250
+ "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
251
+ "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
252
+ "model.layers.32.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
253
+ "model.layers.34.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
254
+ "model.layers.34.input_layernorm.weight": "model-00003-of-00003.safetensors",
255
+ "model.layers.34.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
256
+ "model.layers.35.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
257
+ "model.layers.35.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
258
+ "model.layers.37.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
259
+ "model.layers.38.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
260
+ "model.layers.38.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
261
+ "model.layers.6.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
262
+ "model.layers.30.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
263
+ "model.layers.6.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
264
+ "model.layers.30.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
265
+ "model.layers.7.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
266
+ "model.layers.31.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
267
+ "model.layers.7.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
268
+ "model.layers.31.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
269
+ "model.layers.8.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
270
+ "model.layers.32.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
271
+ "model.layers.2.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
272
+ "model.layers.14.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
273
+ "model.layers.14.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
274
+ "model.layers.22.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
275
+ "model.layers.22.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
276
+ "model.layers.38.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
277
+ "model.layers.38.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
278
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
279
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
280
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
281
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
282
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
283
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
284
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
285
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
286
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
287
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
288
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
289
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
290
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
291
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
292
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
293
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
294
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
295
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
296
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
297
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
298
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
299
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
300
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
301
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
302
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
303
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
304
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
305
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
306
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
307
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
308
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors",
309
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
310
+ "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
311
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
312
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
313
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
314
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
315
+ "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
316
+ "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
317
+ "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
318
+ "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
319
+ "model.layers.32.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
320
+ "model.layers.32.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
321
+ "model.layers.34.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
322
+ "model.layers.35.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
323
+ "model.layers.35.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
324
+ "model.layers.37.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
325
+ "model.layers.39.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
326
+ "model.layers.39.input_layernorm.weight": "model-00003-of-00003.safetensors",
327
+ "model.layers.39.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
328
+ "model.norm.weight": "model-00003-of-00003.safetensors",
329
+ "lm_head.weight": "model-00003-of-00003.safetensors",
330
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
331
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
332
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
333
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
334
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
335
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
336
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
337
+ "model.layers.8.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
338
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
339
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
340
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
341
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
342
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
343
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
344
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
345
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
346
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
347
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
348
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
349
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
350
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
351
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
352
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
353
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
354
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
355
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
356
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
357
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
358
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
359
+ "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
360
+ "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
361
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
362
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
363
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
364
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
365
+ "model.layers.32.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
366
+ "model.layers.33.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
367
+ "model.layers.34.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
368
+ "model.layers.34.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
369
+ "model.layers.36.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
370
+ "model.layers.38.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
371
+ "model.layers.38.input_layernorm.weight": "model-00003-of-00003.safetensors",
372
+ "model.layers.38.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
373
+ "model.layers.39.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
374
+ "model.layers.39.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
375
+ "model.layers.10.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
376
+ "model.layers.34.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
377
+ "model.layers.10.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
378
+ "model.layers.34.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
379
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
380
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
381
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
382
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
383
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
384
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
385
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
386
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
387
+ "model.layers.11.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
388
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
389
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors",
390
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
391
+ "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
392
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
393
+ "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
394
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
395
+ "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
396
+ "model.layers.33.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
397
+ "model.layers.35.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
398
+ "model.layers.35.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
399
+ "model.layers.35.input_layernorm.weight": "model-00003-of-00003.safetensors",
400
+ "model.layers.35.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
401
+ "model.layers.36.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
402
+ "model.layers.36.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
403
+ "model.layers.38.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
404
+ "model.layers.39.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
405
+ "model.layers.39.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
406
+ "model.layers.16.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
407
+ "model.layers.16.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
408
+ "model.layers.24.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
409
+ "model.layers.24.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
410
+ "model.layers.11.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
411
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
412
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
413
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
414
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
415
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
416
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
417
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
418
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
419
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
420
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
421
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
422
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
423
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
424
+ "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
425
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
426
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
427
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
428
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
429
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
430
+ "model.layers.35.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
431
+ "model.layers.12.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
432
+ "model.layers.36.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
433
+ "model.layers.36.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
434
+ "model.layers.0.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
435
+ "model.layers.15.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
436
+ "model.layers.20.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
437
+ "model.layers.20.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
438
+ "model.layers.25.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
439
+ "model.layers.25.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
440
+ "model.layers.15.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
441
+ "model.layers.39.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
442
+ "model.layers.39.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
443
+ "model.layers.18.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
444
+ "model.layers.18.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
445
+ "model.layers.23.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
446
+ "model.layers.19.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
447
+ "model.layers.19.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
448
+ "model.layers.26.self_attn.adaptive_phi": "model-00003-of-00003.safetensors"
449
+ }
450
+ }
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-30000/modeling_evabyte.py ADDED
@@ -0,0 +1,912 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional, Tuple, Union
2
+ import math
3
+ import torch
4
+ import torch.nn.functional as F
5
+ import torch.utils.checkpoint
6
+ from torch import nn
7
+ from torch.nn import CrossEntropyLoss
8
+ from transformers.activations import ACT2FN
9
+ from transformers.cache_utils import Cache
10
+ from transformers.modeling_outputs import (
11
+ BaseModelOutputWithPast,
12
+ CausalLMOutputWithPast,
13
+ )
14
+ from transformers.modeling_utils import PreTrainedModel
15
+
16
+ from .configuration_evabyte import EvaByteConfig
17
+ from .multibyte_decoding_evabyte import MultiByteDecodingMixin
18
+ try:
19
+ import triton
20
+ USE_TRITON_IMPL = True
21
+ from .eva import EvaAttention
22
+ from .eva_agg_kernel import triton_eva_agg_fwd
23
+ from .eva_prep_kv_kernel import triton_eva_prep_kv_fwd
24
+ except ImportError:
25
+ USE_TRITON_IMPL = False
26
+ print("WARNING: triton is not installed, using fallback EVA which might be slow and throw errors")
27
+ from .eva_pt_ref import EvaAttention
28
+ from .eva_cache import EvaCache, EvaStaticCacheForTriton
29
+
30
+ MASK_MIN_VALUE = -10e10
31
+
32
def prepare_eva_attention_mask(
    seq_len,
    device,
    chunk_size,
    window_size,
    use_cache=False,
    cache=None
):
    """
    Prepare attention masks for EVA.

    Builds two boolean masks (True = masked-out position):
      - ``chunk_causal_mask``: a causal mask over chunk summaries, shaped
        ``[1, 1, n, num_chunks]`` where ``n`` covers the (padded) sequence,
        with same-window chunk blocks fully masked (those positions are
        handled by the local window attention instead).
      - ``window_causal_mask``: a strictly-upper-triangular causal mask of
        shape ``[1, 1, 1, window_size, window_size]`` for intra-window
        attention.

    When ``use_cache`` is True, ``cache.get_seq_length()`` gives the number of
    already-processed tokens and the chunk mask is sliced down to the rows
    corresponding to the new ``seq_len`` positions only.
    """
    chunk_causal_mask = None
    window_causal_mask = None
    if use_cache:
        cached_seq_len = cache.get_seq_length()
        total_seq_len = seq_len + cached_seq_len
        # cached_seq_len will be 0 during prefilling
        # padded_seq_len = chunk_size * math.ceil(total_seq_len / chunk_size)
        # NOTE(review): padding is rounded up to a multiple of window_size
        # (not chunk_size); presumably window_size is a multiple of chunk_size
        # so num_chunks stays integral — confirm against the config invariants.
        padded_seq_len = window_size * math.ceil(total_seq_len / window_size)
        num_chunks = padded_seq_len // chunk_size
    else:
        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        # Without a cache the sequence must already be aligned to both
        # chunk and window boundaries.
        assert seq_len % chunk_size == 0
        num_chunks = seq_len // chunk_size

        assert seq_len % window_size == 0

    # create causal mask
    ################################
    # generate chunked causal masks
    ################################
    # [b, h, j, c, c]
    chunks_per_window = window_size // chunk_size
    if num_chunks >= chunks_per_window:
        # Upper-triangular (including diagonal) over chunk indices:
        # a position may only attend to summaries of strictly earlier chunks.
        chunk_causal_mask = torch.ones(
            (chunk_size, num_chunks, num_chunks),
            device=device,
            dtype=torch.bool
        ).triu(0)

        # Group chunks into windows ("blocks") so the diagonal window can be
        # masked out wholesale.
        num_blocks = num_chunks // chunks_per_window
        chunk_causal_mask = chunk_causal_mask.reshape(
            chunk_size,
            num_blocks,
            chunks_per_window,
            num_blocks,
            chunks_per_window
        ).transpose(-2, -3)

        # [1, num_blocks, num_blocks, 1, 1] selector for the block diagonal.
        block_diag_zero = (
            torch.eye(num_blocks, device=device, dtype=torch.bool)
            .unsqueeze(-1)
            .unsqueeze(-1)
            .unsqueeze(0)
        )

        # Set diagonal blocks to zero
        # NOTE(review): despite the comment above, this fills the diagonal
        # blocks with True, i.e. MASKS them out — consistent with True
        # meaning "disallowed" here (intra-window chunks are covered by the
        # window attention path instead); confirm the masking convention.
        chunk_causal_mask = chunk_causal_mask.masked_fill(block_diag_zero, True)

        # Reshape back to original size
        chunk_causal_mask = (
            chunk_causal_mask
            .transpose(-2, -3)
            .reshape(chunk_size, num_chunks, num_chunks)
            .transpose(-2, -3)
            .reshape(chunk_size * num_chunks, num_chunks)
            .unsqueeze(0)
            .unsqueeze(0)
        )
    else:
        # Fewer chunks than one window: plain chunk-level causal mask,
        # no block-diagonal carve-out needed.
        chunk_causal_mask = torch.ones(
            (1, 1, chunk_size, num_chunks, num_chunks),
            device=device,
            dtype=torch.bool,
        ).triu(0).transpose(-2, -3) # [1, 1, c, j, c]
        chunk_causal_mask = chunk_causal_mask.reshape(
            1, 1, chunk_size * num_chunks, num_chunks
        ) # [1, 1, n, c]

    if use_cache:
        # Keep only the mask rows for the newly appended positions.
        chunk_causal_mask = chunk_causal_mask[..., cached_seq_len : cached_seq_len + seq_len, :]

    # Strictly causal mask within each window (diagonal allowed: triu(1)).
    window_causal_mask = torch.ones(
        (1, 1, 1, window_size, window_size),
        device=device
    ).triu(1).to(torch.bool)
    return (chunk_causal_mask, window_causal_mask)
120
+
121
def pad_to_multiple(tensor, multiple, dim=-2, value=0, create_mask=False, left_padding=False):
    """Pad ``tensor`` along ``dim`` so that its length is a multiple of ``multiple``.

    Args:
        tensor: input tensor; dim 0 is assumed to be the batch dimension
            when ``create_mask`` is requested.
        multiple: target alignment for the size of ``dim``.
        dim: dimension to pad; must be negative (indexed from the end).
        value: fill value for the padded positions.
        create_mask: if True, also return a ``[batch, padded_len]`` boolean
            mask that is False at padded positions.
        left_padding: if True, pad at the front of ``dim`` instead of the end.

    Returns:
        The padded tensor, or ``(padded_tensor, padding_mask)`` when
        ``create_mask`` is True. If the length is already aligned, the input
        tensor is returned unchanged (mask all-True).
    """
    assert dim < 0  # only accept ``dim`` index in a reverse manner
    seqlen = int(tensor.shape[dim])
    # Exact integer arithmetic: amount needed to reach the next multiple.
    # (The previous float-division + is_integer() check is avoided.)
    remainder = -seqlen % multiple
    if remainder == 0:
        if create_mask:
            return tensor, torch.ones(size=(tensor.shape[0], tensor.shape[dim]), dtype=torch.bool, device=tensor.device)
        else:
            return tensor
    # F.pad takes (before, after) pairs starting from the LAST dimension, so
    # skip over the trailing dims with zero pads until we reach ``dim``.
    pad_offset = (0,) * (-1 - dim) * 2
    if left_padding:
        padded_res = F.pad(tensor, (*pad_offset, remainder, 0), value=value)
    else:
        padded_res = F.pad(tensor, (*pad_offset, 0, remainder), value=value)
    if create_mask:
        # assume dim 0 is the batch size
        padding_mask = torch.ones(size=(padded_res.shape[0], padded_res.shape[dim]), dtype=torch.bool, device=padded_res.device)
        if left_padding:
            padding_mask[:, :remainder] = False
        else:
            padding_mask[:, -remainder:] = False
        return padded_res, padding_mask
    else:
        return padded_res
146
+
147
class EvaByteRMSNorm(nn.Module):
    """Root-mean-square layer norm with an optional unit offset on the scale.

    When ``config.norm_add_unit_offset`` is set, the learned weight starts at
    zero and the effective scale is ``(1 + weight)``; otherwise the weight
    starts at one and is used directly. Normalization is computed in float32
    (``fp32_ln``) and the result is cast back to the input dtype.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.fp32_ln = True
        self.variance_epsilon = config.rms_norm_eps
        self.add_unit_offset = config.norm_add_unit_offset
        init_fn = torch.zeros if self.add_unit_offset else torch.ones
        self.weight = nn.Parameter(init_fn(config.hidden_size))

    def forward(self, hidden_states):
        compute_dtype = torch.float32 if self.fp32_ln else torch.bfloat16
        x = hidden_states.to(compute_dtype)

        # Normalize by the root-mean-square of the last dimension.
        inv_rms = torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.variance_epsilon)
        x = x * inv_rms

        scale = (1 + self.weight) if self.add_unit_offset else self.weight
        return (scale * x).type_as(hidden_states)
168
+
169
class EvaByteRotaryEmbedding(torch.nn.Module):
    """Rotary position embedding with a lazily grown cos/sin cache.

    The cache is precomputed for ``max_position_embeddings`` positions and is
    rebuilt on the fly whenever a longer sequence is requested.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        exponents = torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim
        self.register_buffer("inv_freq", 1.0 / (self.base ** exponents), persistent=False)

        self._set_cos_sin_cache(
            seq_len=max_position_embeddings,
            device=self.inv_freq.device,
            dtype=torch.get_default_dtype(),
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        # Rebuild the angle tables for positions [0, seq_len).
        self.max_seq_len_cached = seq_len
        positions = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)

        freqs = torch.outer(positions, self.inv_freq)
        # Duplicate along the feature dim so cos/sin cover the full head size.
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        # x: [bs, num_attention_heads, seq_len, head_size]
        if seq_len > self.max_seq_len_cached:
            # Grow the cache to cover the requested length.
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        if seq_len < self.max_seq_len_cached:
            cos_slice = self.cos_cached[:seq_len]
            sin_slice = self.sin_cached[:seq_len]
        else:
            cos_slice = self.cos_cached
            sin_slice = self.sin_cached

        return (
            cos_slice.to(dtype=x.dtype),
            sin_slice.to(dtype=x.dtype),
        )
213
+
214
+
215
+
216
class EvaByteLinearScalingRotaryEmbedding(EvaByteRotaryEmbedding):
    """EvaByteRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        # Must be set before super().__init__, which builds the initial cache.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """Rebuild the cos/sin tables with positions compressed by `scaling_factor`."""
        self.max_seq_len_cached = seq_len
        positions = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype)
        positions = positions / self.scaling_factor

        angles = torch.outer(positions, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        table = torch.cat((angles, angles), dim=-1)
        self.register_buffer("cos_cached", table.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", table.sin().to(dtype), persistent=False)
232
+
233
+
234
class EvaByteDynamicNTKScalingRotaryEmbedding(EvaByteRotaryEmbedding):
    """EvaByteRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        # Must be set before super().__init__, which builds the initial cache.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """Rebuild the cos/sin tables, rescaling the RoPE base whenever the
        requested length exceeds the trained context window."""
        self.max_seq_len_cached = seq_len

        if seq_len > self.max_position_embeddings:
            # Stretch the base so frequencies interpolate instead of extrapolate.
            stretch = (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
            base = self.base * stretch ** (self.dim / (self.dim - 2))
            inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
            self.register_buffer("inv_freq", inv_freq, persistent=False)

        positions = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)

        angles = torch.outer(positions, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        table = torch.cat((angles, angles), dim=-1)
        self.register_buffer("cos_cached", table.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", table.sin().to(dtype), persistent=False)
256
+
257
+
258
class EvaByteMLP(nn.Module):
    """Gated feed-forward block: down_proj(act(gate_proj(x)) * up_proj(x))."""
    def __init__(self, config, layer_idx: int = None):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        # Activation is resolved by name from the transformers ACT2FN registry.
        self.act_fn = ACT2FN[config.hidden_act]
        self.layer_idx = layer_idx
        self.config = config

    def forward(self, x):
        """Apply the gated MLP to `x` (shape-preserving on the hidden dim)."""
        gated = self.act_fn(self.gate_proj(x)) * self.up_proj(x)
        return self.down_proj(gated)
273
+
274
class EvaByteDecoderLayer(nn.Module):
    """One EvaByte transformer block: pre-norm attention followed by a
    pre-norm MLP, each wrapped in a residual connection. When
    `config.fp32_skip_add` is set, the residual accumulation runs in float32
    before being cast back to the sub-layer output dtype."""

    def __init__(self, config: EvaByteConfig, layer_idx: int = None):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.self_attn = EvaAttention(config=config, layer_idx=layer_idx)
        self.mlp = EvaByteMLP(config, layer_idx=layer_idx)
        self.input_layernorm = EvaByteRMSNorm(config)
        self.post_attention_layernorm = EvaByteRMSNorm(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cos: Optional[torch.Tensor] = None,
        sin: Optional[torch.Tensor] = None,
        multibyte_decoding: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """Returns (hidden_states[, attn_weights][, present_key_value])."""
        # --- attention sub-block (pre-norm + residual) ---
        skip = hidden_states.float() if self.config.fp32_skip_add else hidden_states
        attn_out, attn_weights, present_key_value = self.self_attn(
            hidden_states=self.input_layernorm(hidden_states),
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cos=cos,
            sin=sin,
            multibyte_decoding=multibyte_decoding,
        )
        hidden_states = (skip + attn_out).to(attn_out.dtype)

        # --- feed-forward sub-block (pre-norm + residual) ---
        skip = hidden_states.float() if self.config.fp32_skip_add else hidden_states
        mlp_out = self.mlp(self.post_attention_layernorm(hidden_states))
        hidden_states = (skip + mlp_out).to(mlp_out.dtype)

        outputs = (hidden_states, )
        if output_attentions:
            outputs += (attn_weights, )
        if use_cache:
            outputs += (present_key_value, )
        return outputs
330
+
331
class EvaBytePreTrainedModel(PreTrainedModel):
    """Base class hooking EvaByte modules into the HF PreTrainedModel machinery
    (weight init, checkpointing support, device-placement hints)."""
    config_class = EvaByteConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["EvaByteDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"

    def _init_weights(self, module):
        """Initialize Linear/Embedding weights from N(0, initializer_range);
        zero biases and the padding-index embedding row."""
        std = getattr(self.config, "initializer_range", 0.02)
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()

    def _set_gradient_checkpointing(self, module, value=False):
        """Toggle gradient checkpointing on the wrapped EvaByteModel."""
        if isinstance(module, EvaByteModel):
            module.gradient_checkpointing = value
352
+
353
class EvaByteModel(EvaBytePreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`EvaByteDecoderLayer`]

    Args:
        config: EvaByteConfig
    """
    def __init__(self, config: EvaByteConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.max_position_embeddings = self.config.max_position_embeddings

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList([EvaByteDecoderLayer(config, layer_idx=layer_idx) for layer_idx in range(config.num_hidden_layers)])
        self.norm = EvaByteRMSNorm(config)

        self.gradient_checkpointing = False
        # RoPE base frequency (theta); consumed by _init_rope() below.
        self.rope = config.rope_theta
        # Initialize weights and apply final processing
        self.post_init()
        self._init_rope()

    def _init_rope(self):
        """Select the rotary-embedding flavor from config.rope_scaling:
        None -> plain RoPE; otherwise "linear" or "dynamic" (NTK) scaling."""
        if self.config.rope_scaling is None:
            self.rotary_emb = EvaByteRotaryEmbedding(self.head_dim,
                                                     max_position_embeddings=self.max_position_embeddings,
                                                     base=self.rope)
        else:
            scaling_type = self.config.rope_scaling["type"]
            scaling_factor = self.config.rope_scaling["factor"]
            if scaling_type == "linear":
                self.rotary_emb = EvaByteLinearScalingRotaryEmbedding(
                    self.head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope)
            elif scaling_type == "dynamic":
                self.rotary_emb = EvaByteDynamicNTKScalingRotaryEmbedding(
                    self.head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope)
            else:
                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def _helper_padding_mask(
        self,
        padding_mask,
        causal_mask
    ):
        """Symmetrize the padding mask (a position pair is masked if either
        side is padding), then OR with the causal mask. True == masked out."""
        padding_mask = torch.logical_or(padding_mask, padding_mask.transpose(-1, -2))
        return torch.logical_or(padding_mask, causal_mask)

    def _prepare_eva_generation_attn_mask_triton(
        self,
        attention_mask,
        input_ids,
        use_cache,
        past_key_values
    ):
        """Build the EVA attention masks for the Triton kernel path.

        Returns a 3-tuple (cur_s_mask, cur_rf_mask, rfa_chunks_dummy_mask)
        during decoding (cache non-empty), and a 2-tuple (s_mask, rf_mask)
        during prefill. NOTE(review): the two branches return tuples of
        different arity — presumably the downstream attention code
        distinguishes them; confirm against EvaAttention.
        """
        batch_size, seq_len = input_ids.shape
        if use_cache and past_key_values.get_seq_length() > 0:
            # decoding phase
            if past_key_values.rf_mask[0] is not None:
                cur_rf_mask = torch.zeros(
                    (batch_size, 1, seq_len, 1),
                    dtype=past_key_values.rf_mask[0].dtype,
                    device=past_key_values.rf_mask[0].device
                )
            else:
                cur_rf_mask = None

            if past_key_values.s_mask[0] is not None:
                cur_s_mask = torch.zeros(
                    (batch_size, 1, seq_len, 1),
                    dtype=past_key_values.s_mask[0].dtype,
                    device=past_key_values.s_mask[0].device
                )
            else:
                cur_s_mask = None

            seen_tokens = past_key_values.get_seq_length()
            if seen_tokens <= self.config.window_size:
                rfa_chunks_dummy_mask = None
            else:
                if cur_s_mask is not None:
                    chunks_per_window = int(self.config.window_size // self.config.chunk_size)
                    # the ongoing decoding step would be (seen_seq_len + 1)-th token
                    num_windows_seen_so_far = seen_tokens // self.config.window_size
                    rfa_chunks_dummy_mask = torch.zeros(
                        (batch_size, 1, seq_len, num_windows_seen_so_far * chunks_per_window),
                        dtype=past_key_values.s_mask[0].dtype,
                        device=past_key_values.s_mask[0].device
                    )
                else:
                    rfa_chunks_dummy_mask = None
            # rf_mask and cur_mask are 0s because we do not want to mask them
            return (cur_s_mask, cur_rf_mask, rfa_chunks_dummy_mask)

        if attention_mask is not None and torch.any(attention_mask == 0.0):
            # Prefill with real padding: pad the mask up to a multiple of the
            # window size, then build per-window causal+padding masks.
            padded_attention_mask = pad_to_multiple(
                attention_mask,
                self.config.window_size,
                dim=-1,
                value=0,
                create_mask=False,
                left_padding=False
            )
            # convert 0 -> padding to 1 -> padding
            padded_rf_mask = ~padded_attention_mask.unsqueeze(1).unsqueeze(-1).to(torch.bool)  # [b, 1, n, 1]
            # [b, 1, w, j, 1]
            padded_w_attn_mask = padded_rf_mask.reshape(batch_size, 1, -1, self.config.window_size, 1).to(torch.bool)
            # [b, 1, w, j, 1] [b, 1, w, 1, j] -> [b, 1, w, j, j]
            w_padding_mask = torch.logical_or(padded_w_attn_mask, padded_w_attn_mask.transpose(-1, -2))
            w_causal_mask = torch.ones(
                (1, 1, 1, self.config.window_size, self.config.window_size),
                device=input_ids.device
            ).triu(1).to(torch.bool)
            s_mask = torch.logical_or(w_padding_mask, w_causal_mask)
            s_mask = s_mask.reshape(batch_size, 1, -1, self.config.window_size)
            s_mask = s_mask[..., :seq_len, :]
            # negate the attention mask to get the padding mask
            rf_mask = ~attention_mask.unsqueeze(1).unsqueeze(-1).to(torch.bool)  # [b, 1, n, 1]
            return (s_mask, rf_mask)
        else:
            # No padding: kernels can run unmasked.
            return (None, None)

    def _prepare_eva_generation_attn_mask(
        self,
        attention_mask,
        input_ids,
        use_cache,
        past_key_values
    ):
        """Build the EVA attention masks for the PyTorch (non-Triton) path.

        Returns (prev_window_mask, cur_window_mask, chunk_causal_mask, rf_mask).
        During decoding (cache non-empty) the window masks collapse to
        per-step dummies; during prefill they cover complete windows plus an
        optional remainder window.
        """
        batch_size, seq_len = input_ids.shape
        if use_cache and past_key_values.get_seq_length() > 0:
            # decoding phase
            if past_key_values.rf_mask[0] is not None:
                rf_mask = torch.zeros(
                    (batch_size, 1, seq_len, 1),
                    dtype=past_key_values.rf_mask[0].dtype,
                    device=past_key_values.rf_mask[0].device
                )
            else:
                rf_mask = None

            cur_causal_mask = torch.zeros(
                (batch_size, 1, seq_len, 1),
                dtype=torch.bool,
                device=input_ids.device
            )

            chunk_causal_mask = torch.ones(
                (batch_size, 1, seq_len, 1),
                dtype=torch.bool,
                device=input_ids.device
            )
            # chunk_causal_mask are 1s because we will mask them by default and
            # will be unmasked when the current singleton attention is processed over
            return (None, cur_causal_mask, chunk_causal_mask, rf_mask)

        true_num_chunks = seq_len // self.config.chunk_size
        chunk_causal_mask, _ = prepare_eva_attention_mask(
            seq_len,
            input_ids.device,
            self.config.chunk_size,
            self.config.window_size,
            use_cache=use_cache,
            cache=past_key_values
        )
        chunk_causal_mask = chunk_causal_mask[..., :seq_len, :true_num_chunks]
        if attention_mask is not None and torch.any(attention_mask == 0.0):
            # convert 0 -> padding to 1 -> padding
            rf_mask = ~attention_mask.unsqueeze(1).unsqueeze(-1).to(torch.bool)  # [b, 1, n, 1]
        else:
            rf_mask = None

        if seq_len < self.config.window_size:
            # Whole prompt fits in a single (partial) window.
            cur_window_mask = torch.ones(
                (1, 1, seq_len, seq_len),
                device=input_ids.device
            ).triu(1).to(torch.bool)
            if rf_mask is not None:
                cur_window_mask = self._helper_padding_mask(rf_mask, cur_window_mask)
            prev_window_mask = None
        else:
            if seq_len % self.config.window_size == 0:
                # Exact multiple of window_size: only full windows.
                num_windows = seq_len // self.config.window_size
                cur_window_mask = None
                prev_window_mask = torch.ones(
                    (1, 1, num_windows, self.config.window_size, self.config.window_size),
                    device=input_ids.device
                ).triu(1).to(torch.bool)
                if rf_mask is not None:
                    prev_rf_mask = rf_mask.reshape(batch_size, 1, -1, self.config.window_size, 1)
                    prev_window_mask = self._helper_padding_mask(prev_rf_mask, prev_window_mask)
            else:
                # Full windows plus a trailing partial window.
                num_windows = seq_len // self.config.window_size
                remainder_tokens = seq_len % self.config.window_size
                cur_window_mask = torch.ones(
                    (1, 1, remainder_tokens, remainder_tokens),
                    device=input_ids.device
                ).triu(1).to(torch.bool)
                prev_window_mask = torch.ones(
                    (1, 1, num_windows, self.config.window_size, self.config.window_size),
                    device=input_ids.device
                ).triu(1).to(torch.bool)
                if rf_mask is not None:
                    prev_rf_mask, cur_rf_mask = torch.split(rf_mask, [seq_len - remainder_tokens, remainder_tokens], dim=-2)
                    cur_window_mask = self._helper_padding_mask(cur_rf_mask, cur_window_mask)
                    prev_rf_mask = prev_rf_mask.reshape(batch_size, 1, -1, self.config.window_size, 1)
                    prev_window_mask = self._helper_padding_mask(prev_rf_mask, prev_window_mask)

        return (prev_window_mask, cur_window_mask, chunk_causal_mask, rf_mask)

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        multibyte_decoding: Optional[bool] = None,
    ) -> Tuple:
        """Run the decoder stack.

        Builds the EVA masks (inference) or takes them via `attention_mask`
        (training / multibyte decoding), gathers RoPE cos/sin by position,
        applies each decoder layer, and final-norms the result.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (output_hidden_states
                                if output_hidden_states is not None else self.config.output_hidden_states)
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
            )

        if self.gradient_checkpointing and self.training and use_cache:
            # NOTE(review): the message says "Setting `use_cache=False`" but this
            # raises instead of downgrading — consider a warning + override.
            raise ValueError("`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...")

        # NOTE(review): this dereferences input_ids before the embeds-only path;
        # calling with inputs_embeds and no input_ids would fail here.
        batch_size, seq_len = input_ids.shape
        #### Step 0. Hack
        if (not self.training) and (not use_cache) and (not multibyte_decoding):
            # forward-only inference mode.
            # We tweak use_cache to be True to reuse code for generation
            use_cache = True
        device = input_ids.device if input_ids is not None else None
        if position_ids is None:
            position_ids = torch.arange(0, seq_len, device=device, dtype=int).reshape(1, -1).expand(batch_size, -1)

        #### Step 1. Prepare caches if in inference mode
        if use_cache:
            if past_key_values is not None:
                assert isinstance(past_key_values, Cache)
            else:
                # Lazily create the cache matching the active kernel backend.
                if not USE_TRITON_IMPL:
                    past_key_values = EvaCache()
                else:
                    past_key_values = EvaStaticCacheForTriton(
                        input_ids.shape[0],
                        self.config.num_attention_heads,
                        self.config.window_size,
                        self.config.hidden_size // self.config.num_attention_heads,
                        self.config.num_hidden_layers,
                        self.embed_tokens.weight.dtype,
                        self.embed_tokens.weight.device,
                    )

        #### Step 2. Build (or pass through) the attention masks.
        if not multibyte_decoding:
            if use_cache:
                if USE_TRITON_IMPL:
                    causal_mask = self._prepare_eva_generation_attn_mask_triton(
                        attention_mask,
                        input_ids,
                        use_cache,
                        past_key_values
                    )
                else:
                    causal_mask = self._prepare_eva_generation_attn_mask(
                        attention_mask,
                        input_ids,
                        use_cache,
                        past_key_values
                    )
            else:
                assert self.training
                assert seq_len % self.config.window_size == 0, "Training is only tested for sequences that are a multiple of window_size"
                # for training, we need to pass in the attention mask
                # usually calculated by _prepare_training_attn_mask()
                causal_mask = attention_mask
        else:
            # Multibyte (speculative) decoding supplies its own tree mask.
            assert use_cache
            causal_mask = attention_mask

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        max_seq_length = past_seen_tokens + inputs_embeds.shape[1]

        hidden_states = inputs_embeds

        if position_ids is None:
            assert not use_cache, "during decoding we must explicitly pass position_ids to the model call"
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            position_ids = torch.arange(past_seen_tokens, max_seq_length, device=device, dtype=int).reshape(1, -1).expand(batch_size, -1)

        #### Step 3. Gather RoPE tables for the actual positions.
        cos, sin = self.rotary_emb(hidden_states, seq_len=max_seq_length)
        assert len(cos.shape) == 2, f"cos should be of shape (max_seq_len, head_dim), got {cos.shape} instead"
        assert sin.shape == cos.shape, f"sin should be of shape (max_seq_len, head_dim), got {sin.shape} instead"
        assert len(position_ids.shape) == 2, f"position_ids should be of 2D, got {position_ids.shape} instead"
        cos = cos[position_ids, :]
        sin = sin[position_ids, :]
        # Insert a broadcast head dimension: [b, 1, s, head_dim].
        cos = cos.unsqueeze(1)
        sin = sin.unsqueeze(1)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states, )

            if self.gradient_checkpointing and self.training:
                layer_outputs = torch.utils.checkpoint.checkpoint(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cos,
                    sin,
                    multibyte_decoding,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cos=cos,
                    sin=sin,
                    multibyte_decoding=multibyte_decoding,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                # Cache slot index depends on whether attn weights were emitted.
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1], )

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states, )

        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
741
+
742
+
743
class EvaByteForCausalLM(EvaBytePreTrainedModel, MultiByteDecodingMixin):
    """Causal LM head on top of EvaByteModel, with optional multibyte
    prediction heads (config.num_pred_heads > 1) used for speculative
    multibyte decoding via MultiByteDecodingMixin."""
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        EvaBytePreTrainedModel.__init__(self, config)

        self.model = EvaByteModel(config)
        self.vocab_size = config.vocab_size
        # define multibyte prediction heads: one fused Linear emitting
        # num_pred_heads * vocab_size logits per position.
        if hasattr(config, "num_pred_heads") and config.num_pred_heads > 1:
            self.lm_head = nn.Linear(config.hidden_size, config.vocab_size * config.num_pred_heads, bias=False)
        else:
            self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        return_all_pred_logits: Optional[bool] = None,
        multibyte_decoding: Optional[bool] = None) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the decoder and LM head.

        With multiple prediction heads, `return_all_pred_logits=True` returns
        logits of shape [b, s, num_pred_heads, vocab]; otherwise only the
        first head's logits are returned.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (output_hidden_states
                                if output_hidden_states is not None else self.config.output_hidden_states)
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is None:
            assert past_key_values is None

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            multibyte_decoding=multibyte_decoding,
        )

        hidden_states = outputs[0]

        logits = self.lm_head(hidden_states)
        if self.config.fp32_logits:
            logits = logits.float()

        loss = None
        if labels is not None:
            # NOTE(review): reduction="none" — `loss` is a per-token vector,
            # not a scalar; callers are expected to reduce it themselves.
            loss_fct = CrossEntropyLoss(reduction="none")
            if hasattr(self.config, "num_pred_heads") and self.config.num_pred_heads > 1:
                shift_logits = logits.view(logits.shape[0], logits.shape[1], self.config.num_pred_heads, self.config.vocab_size)
                # shift_logits = shift_logits.view(-1, logits.shape[1] * self.config.num_pred_heads, self.config.vocab_size)
                shift_logits = shift_logits.view(-1, self.config.vocab_size)
            else:
                shift_logits = logits.view(-1, self.config.vocab_size)
            # NOTE(review): despite the "shift_" names no shifting happens here —
            # presumably `labels` arrive pre-shifted/aligned; confirm with the
            # training pipeline.
            shift_labels = labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if hasattr(self.config, "num_pred_heads") and self.config.num_pred_heads > 1:
            # Unfold the fused head dimension: [b, s, num_pred_heads, vocab].
            all_pred_logits = logits.reshape(logits.shape[0], logits.shape[1], self.config.num_pred_heads, self.config.vocab_size)

            if return_all_pred_logits:
                logits = all_pred_logits
            else:
                # Default: expose only the primary (next-byte) head.
                logits = all_pred_logits[..., 0, :]

        if not return_dict:
            output = (logits, ) + outputs[1:]
            return (loss, ) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


    def prepare_inputs_for_generation(self,
                                      input_ids,
                                      past_key_values=None,
                                      attention_mask=None,
                                      inputs_embeds=None,
                                      use_cache=True,
                                      **kwargs):
        """Slice inputs down to the not-yet-cached suffix and derive
        position_ids for generation.

        Expected shapes:
        prefill phase:
            input_ids: b x s
            attention_mask: None if no padding or b x s
            position_ids : b x s
        token gen phase:
            input_ids : b x 1
            attention_mask: b x 1 x s
            position_ids: b x 1
        """
        past_length = 0
        if past_key_values is not None:
            assert isinstance(past_key_values, Cache)
            past_length = past_key_values.get_seq_length()

            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
                # Only the tokens not yet reflected in the cache are fed in.
                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length):]
            elif past_length < input_ids.shape[1]:
                input_ids = input_ids[:, past_length:]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # Derive positions from the padding mask (padding positions get 1).
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1]:]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        # must initialize position_ids at each step during GPU inference
        assert position_ids is not None
        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,
            }
        )
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        """Reorder every layer's cached tensors along the batch dim to follow
        beam-search hypothesis reordering."""
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (tuple(
                past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), )
        return reordered_past
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-30000/multibyte_decoding_evabyte.py ADDED
@@ -0,0 +1,881 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # The implementation of multibyte decoding is largely adapted from
3
+ # Medusa decoding: https://github.com/FasterDecoding/Medusa
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from transformers.generation.stopping_criteria import (
7
+ MaxLengthCriteria,
8
+ StoppingCriteriaList,
9
+ )
10
+ from typing import Union, List
11
+ from .eva_cache import EvaStaticCacheForTriton
12
+ from .eva_prep_kv_kernel import triton_eva_prep_kv_fwd
13
+
14
class MultibyteEosTokenCriteria:
    """
    Stopping criteria that halts generation as soon as any configured
    end-of-sequence token appears among the last `new_tokens` generated ids.

    Adapted from
    https://github.com/huggingface/transformers/blob/main/src/transformers/generation/stopping_criteria.py#L446
    By default, it uses the `model.generation_config.eos_token_id`.

    Args:
        eos_token_ids (`Union[int, List[int]]`):
            The id(s) of the *end-of-sequence* token.
    """

    def __init__(self, eos_token_ids: Union[int, List[int]]):
        self.eos_token_ids = [eos_token_ids] if isinstance(eos_token_ids, int) else eos_token_ids

    def __call__(self, input_ids: torch.LongTensor, new_tokens: int) -> bool:
        # Only the freshly generated tail of the sequence is inspected.
        tail = input_ids[:, input_ids.shape[-1] - new_tokens:]
        return any(bool(torch.any(tail == eos_id)) for eos_id in self.eos_token_ids)
40
+
41
def build_tree(spec):
    """
    Expand a per-depth branching spec into a flat list of tree paths.

    `spec[d][i]` gives the number of children of the i-th node at depth d;
    nodes with no entry in the spec get zero children. Each returned node is
    a tuple of child indices from the root; the root itself is excluded.
    Nodes are listed depth by depth, in creation order.
    """
    levels = [[()]]  # depth 0 holds just the root
    for spec_list in spec:
        children = []
        for parent_idx, parent in enumerate(levels[-1]):
            fanout = spec_list[parent_idx] if parent_idx < len(spec_list) else 0
            children.extend(parent + (child,) for child in range(fanout))
        levels.append(children)

    # Flatten level by level, dropping the (empty-tuple) root.
    return [node for level in levels for node in level if node]
62
+
63
# Candidate-tree layouts for multibyte (Medusa-style) speculative decoding.
# Each inner list gives, per node at the previous depth, how many children to
# expand; build_tree() flattens the spec into explicit child-index paths.
evabyte_7b_95 = build_tree(
    [
        [10],
        [10, 8, 2, 2, 1, 1],
        [10, 4, 2, 1, 0, 0, 0, 0, 0, 0, 2, 1, 1, 0, 0, 0, 0, 0, 1],
        [8, 2, 2, 1, 0, 0, 0, 0, 0, 0, 1],
        [6, 2, 1, 1],
        [4, 2, 1, 1],
        [4, 2, 1],
    ]
)
# Smaller tree variant with fewer candidate nodes.
evabyte_7b_31 = build_tree(
    [
        [4],
        [3, 2, 1, 1],
        [3, 2, 1, 1],
        [2, 1, 1],
        [2, 1],
        [2, 1],
        [2, 1],
    ]
)
TOPK = 10  # topk for sparse tree (10 is a placeholder and it is sufficient)
86
+
87
def pad_path(path, length, pad_value=-2):
    """
    Return a new list equal to `path` right-padded with `pad_value` up to
    `length` entries.

    Parameters:
    - path (list): The original list that needs padding.
    - length (int): The desired length of the padded list.
    - pad_value (optional, default=-2): The value to use for padding.

    Returns:
    - list: A new list based on the original path but padded to the desired length.

    Example:
    >>> pad_path([1,2,3], 5)
    [1, 2, 3, -2, -2]

    Note:
        If the given path is already longer than the specified length,
        no padding occurs and an unpadded copy is returned.
    """
    # max() guards against negative padding when path is already long enough.
    extra = max(0, length - len(path))
    return path + [pad_value] * extra
108
+
109
def reset_past_key_values(passed_key_values):
    """
    Reset the `current_length` counters of cached key/value tensors to zero.

    Intended for baseline-model evaluation: every layer's key cache and value
    cache (indices 0 and 1) has its `current_length` zeroed in place, which
    effectively empties the cache without reallocating it.

    Args:
    - passed_key_values (list of torch.Tensor): Contains past hidden states and past attention values for each layer.

    Returns:
    - passed_key_values (list of torch.Tensor): Updated past hidden states and past attention values with reset lengths.
    """
    for layer_kv in passed_key_values:
        for slot in (layer_kv[0], layer_kv[1]):
            slot.current_length.fill_(0)
    return passed_key_values
127
+
128
def get_nucleus_one_token(logit, temperature, top_p):
    """
    Sample one token per row via nucleus (top-p) sampling.

    Args:
        logit (torch.Tensor): 2D logits from a language model output (BxC).
        temperature (float): Softmax temperature; higher values increase
            diversity, lower values make selections more deterministic.
        top_p (float): Cumulative probability threshold; tokens outside the
            top-p nucleus are masked out before sampling.

    Returns:
        torch.Tensor: Indices of the sampled tokens, shape (B, 1).
    """
    # top_p >= 1 means the nucleus is the full vocabulary: plain sampling.
    if top_p >= 1:
        return torch.multinomial(F.softmax(logit / temperature, dim=-1), 1)
    scaled = logit / temperature
    probs = torch.softmax(scaled, dim=-1)
    sorted_probs, sorted_idx = torch.sort(probs, descending=True)
    cumulative = torch.cumsum(sorted_probs, dim=-1)
    # Mark everything past the top-p mass; shift right by one so the first
    # token crossing the threshold is still kept.
    drop_sorted = cumulative > top_p
    drop_sorted[..., 1:] = drop_sorted[..., :-1].clone()
    drop_sorted[..., 0] = 0
    # Map the removal mask from sorted order back to vocabulary order.
    drop = drop_sorted.scatter(dim=1, index=sorted_idx, src=drop_sorted)
    scaled[drop] = float('-inf')
    return torch.multinomial(F.softmax(scaled, dim=-1), 1)
158
+
159
def get_typical_one_token(logit, temperature, posterior_threshold, posterior_alpha):
    """
    Sample one token per row via typical sampling.

    Tokens whose probability falls below the entropy-adaptive threshold
    min(posterior_threshold, exp(-entropy) * posterior_alpha) are masked out
    before sampling.

    Args:
        logit (torch.Tensor): 2D logits from a language model output.
        temperature (float): Softmax temperature controlling randomness.
        posterior_threshold (float): Upper cap on the acceptance threshold.
        posterior_alpha (float): Scale on the entropy-based adaptive threshold.

    Returns:
        torch.Tensor: Indices of the sampled tokens, shape (B, 1).
    """
    scaled = logit / temperature
    probs = torch.softmax(scaled, dim=-1)
    # Shannon entropy per row; the epsilon guards log(0).
    entropy = -(probs * torch.log(probs + 1e-5)).sum(dim=-1)
    threshold = torch.minimum(
        torch.full_like(entropy, posterior_threshold),
        torch.exp(-entropy) * posterior_alpha,
    )
    scaled[probs < threshold.unsqueeze(-1)] = float('-inf')
    return torch.multinomial(F.softmax(scaled, dim=-1), 1)
189
+
190
+
191
+
192
def generate_medusa_buffers(medusa_choices, device="cuda"):
    """
    Generate buffers for the Medusa structure based on the provided choices.

    Parameters:
    - medusa_choices (list): A nested list of tuple paths (as produced by
      `build_tree`) representing the sparse speculation tree.
    - device (str): Device to which the tensors should be moved. Default is "cuda".

    Returns:
    - dict: A dictionary containing buffers related to the Medusa structure:
      "medusa_attn_mask" (1, 1, tree_len, tree_len) tree attention mask,
      "tree_indices" flattening of top-k picks into tree order,
      "medusa_position_ids" (1, tree_len) depth-based positions,
      "retrieve_indices" (num_paths, max_depth+1) root-to-leaf index paths.
    """

    # Sort the medusa_choices based on their lengths and then their values,
    # so all nodes at a given depth are contiguous and deterministically ordered.
    sorted_medusa_choices = sorted(medusa_choices, key=lambda x: (len(x), x))
    # +1 accounts for the root token prepended at index 0.
    medusa_len = len(sorted_medusa_choices) + 1

    # Initialize depth_counts to keep track of how many choices have a particular depth
    depth_counts = [0] * max([len(path) for path in sorted_medusa_choices])
    for path in sorted_medusa_choices:
        depth_counts[len(path) - 1] += 1

    # Create the attention mask for Medusa: each node attends to itself (eye),
    # the root (column 0), and all of its ancestors in the tree.
    medusa_attn_mask = torch.eye(medusa_len, medusa_len)
    medusa_attn_mask[:, 0] = 1
    start = 0
    for i in range(len(depth_counts)):
        for j in range(depth_counts[i]):
            cur_medusa_choice = sorted_medusa_choices[start + j]
            # retrieve ancestor position; depth-1 nodes only see the root
            if len(cur_medusa_choice) == 1:
                continue
            ancestor_idx = []
            for c in range(len(cur_medusa_choice) - 1):
                # +1 shifts past the root at index 0
                ancestor_idx.append(sorted_medusa_choices.index(cur_medusa_choice[:c+1]) + 1)
            medusa_attn_mask[j + start + 1, ancestor_idx] = 1
        start += depth_counts[i]

    # Generate tree indices for the Medusa structure: maps each tree node to
    # its slot in the flattened [base token, head-0 top-K, head-1 top-K, ...]
    # candidate layout (hence the TOPK * depth offset).
    medusa_tree_indices = torch.zeros(medusa_len, dtype=torch.long)
    medusa_tree_indices[0] = 0
    start = 0
    for i in range(len(depth_counts)):
        for j in range(depth_counts[i]):
            cur_medusa_choice = sorted_medusa_choices[start + j]
            medusa_tree_indices[start + j + 1] = cur_medusa_choice[-1] + TOPK * i + 1
        start += depth_counts[i]

    # Generate position IDs for the Medusa structure: position equals depth.
    medusa_position_ids = torch.zeros(medusa_len, dtype=torch.long)
    start = 0
    for i in range(len(depth_counts)):
        medusa_position_ids[start + 1: start + depth_counts[i] + 1] = i + 1
        start += depth_counts[i]

    # Generate retrieval indices for Medusa structure verification: one
    # root-to-leaf index path per leaf (longest-first so shared prefixes are
    # only emitted once via retrieve_paths deduplication).
    retrieve_indices_nest = []
    retrieve_paths = []
    for i in range(len(sorted_medusa_choices)):
        cur_medusa_choice = sorted_medusa_choices[-i-1]
        retrieve_indice = []
        if cur_medusa_choice in retrieve_paths:
            continue
        else:
            for c in range(len(cur_medusa_choice)):
                retrieve_indice.append(sorted_medusa_choices.index(cur_medusa_choice[:c+1]))
                retrieve_paths.append(cur_medusa_choice[:c+1])
        retrieve_indices_nest.append(retrieve_indice)
    max_length = max([len(x) for x in retrieve_indices_nest])
    # Short paths are padded with -2; after the +1 shift below they become -1,
    # which indexes the appended dummy token in generate_candidates.
    retrieve_indices = [pad_path(path, max_length) for path in retrieve_indices_nest]
    retrieve_indices = torch.tensor(retrieve_indices, dtype=torch.long)
    retrieve_indices = retrieve_indices + 1
    # Prepend the root (index 0) to every path.
    retrieve_indices = torch.cat([torch.zeros((retrieve_indices.shape[0], 1), dtype=torch.long), retrieve_indices], dim=1)

    # Aggregate the generated buffers into a dictionary
    medusa_buffers = {
        "medusa_attn_mask": medusa_attn_mask.unsqueeze(0).unsqueeze(0),
        "tree_indices": medusa_tree_indices,
        "medusa_position_ids": medusa_position_ids.unsqueeze(0),
        "retrieve_indices": retrieve_indices,
    }

    # Move the tensors in the dictionary to the specified device
    medusa_buffers = {
        k: v.clone().to(device)
        if isinstance(v, torch.Tensor)
        else torch.tensor(v, device=device)
        for k, v in medusa_buffers.items()
    }
    return medusa_buffers
281
+
282
def generate_candidates(
    medusa_logits,
    logits,
    tree_indices,
    retrieve_indices,
    temperature = 0,
    posterior_threshold=0.3,
    posterior_alpha = 0.09,
    top_p=0.8,
    sampling = 'typical',
    fast = False
):
    """
    Pick the next base token and expand the Medusa heads' top-k predictions
    into the sparse-tree candidate layout.

    Returns:
        tree_candidate_ids (torch.Tensor): (1, tree_len) ids in tree order,
            fed to the model for tree verification.
        unflattened_candidate_ids (torch.Tensor): (num_paths, depth+1) ids
            per root-to-leaf path, used to score/accept candidates.
    """
    # Say we have 3 heads, and the top-4 for each head are:
    # [10, 3, 8, 4]
    # [9, 5, 1, 6]
    # [7, 16, 3, 2]

    # candidates_id = 10
    # The base token is greedy (argmax) unless sampling is requested.
    if temperature == 0 or fast:
        candidates_ids = torch.argmax(logits[:, -1]).unsqueeze(0)
    else:
        if sampling == 'typical':
            candidates_ids = get_typical_one_token(logits[:, -1], temperature, posterior_threshold, posterior_alpha).squeeze(0)
        elif sampling == 'nucleus':
            candidates_ids = get_nucleus_one_token(logits[:, -1], temperature, top_p).squeeze(0)
        else:
            raise NotImplementedError

    # this calculates the top-k medusa logits
    # candidates_medusa_id = [
    #    [9, 5, 1, 6]
    #    [7, 16, 3, 2]
    # ]
    candidates_medusa_ids = torch.topk(medusa_logits[:, 0, -1], TOPK, dim=-1).indices

    # Flatten base token + per-head top-k into one vector:
    # [10, 9, 5, 1, 6, 7, 16, 3, 2]
    candidate_ids = torch.cat([candidates_ids, candidates_medusa_ids.view(-1)], dim=-1)

    # based on the pre-defined tree_indices, select the corresponding candidates
    # if we select top-2 and top-3 for the two heads (we select top-1 for the first head):
    # tree_candidates = [10, 9, 5, 7, 16, 3, 7, 16, 3]
    tree_candidate_ids = candidate_ids[tree_indices]

    # tree_candidate_ids = [10, 9, 5, 7, 16, 3, 7, 16, 3, 0]
    # Sometimes the tree_indices are padded, so we append a zero here
    # so that all padded indices select the appended zero.
    tree_candidate_ids_ext = torch.cat(
        [
            tree_candidate_ids,
            torch.zeros((1), dtype=torch.long, device=tree_candidate_ids.device)
        ],
        dim=0
    )
    # Gather every root-to-leaf path, e.g.
    # [[10, 9, 7], [10, 9, 16], [10, 9, 3], [10, 5, 7], [10, 5, 16], [10, 5, 3]]
    unflattened_candidate_ids = tree_candidate_ids_ext[retrieve_indices]

    # Add the batch dimension expected by the model forward pass.
    tree_candidate_ids = tree_candidate_ids.unsqueeze(0)

    return tree_candidate_ids, unflattened_candidate_ids
341
+
342
def get_nucleus_posterior_mask(logits, candidates, temperature, top_p):
    """
    Build a 0/1 mask marking which candidate tokens match tokens drawn by
    nucleus (top-p) sampling from the model logits.

    Args:
        logits (torch.Tensor): Logits over each candidate path,
            (n_candidates, seq_len, vocab).
        candidates (torch.Tensor): Candidate token ids, (n_candidates, seq_len).
        temperature (float): Softmax temperature.
        top_p (float): Cumulative probability threshold for the nucleus.

    Returns:
        torch.Tensor: Int mask of shape (n_candidates, seq_len - 1).
    """
    # adapted from https://github.com/huggingface/transformers/blob/18a879f47576822aa1a5c49aecb27d89bfa5fa69/examples/run_generation.py#L79

    # Drop the last position (it has no candidate to verify) and scale.
    scaled = logits[:, :-1] / temperature
    n_samples, n_tokens = scaled.shape[0], scaled.shape[1]
    scaled = scaled.view(n_samples * n_tokens, -1)

    # Full-vocabulary nucleus: sample directly, no masking needed.
    if top_p >= 1:
        drawn = torch.multinomial(F.softmax(scaled, dim=-1), 1).view(n_samples, n_tokens)
        return (candidates[:, 1:] == drawn).int()

    probs = F.softmax(scaled, dim=-1)
    sorted_probs, sorted_idx = torch.sort(probs, descending=True)
    cumulative = torch.cumsum(sorted_probs, dim=-1)

    # Removal mask over sorted order, shifted right by one so the first
    # token crossing the top-p boundary is retained.
    drop_sorted = cumulative > top_p
    drop_sorted[..., 1:] = drop_sorted[..., :-1].clone()
    drop_sorted[..., 0] = 0

    # Map the removal mask back to vocabulary order.
    drop = drop_sorted.scatter(dim=1, index=sorted_idx, src=drop_sorted)

    # Remove low-probability tokens and sample from what remains.
    scaled[drop] = float('-inf')
    drawn = torch.multinomial(F.softmax(scaled, dim=-1), 1).view(n_samples, n_tokens)

    # 1 where the candidate agrees with the sampled token.
    return (candidates[:, 1:] == drawn).int()
395
+
396
def get_typical_posterior_mask(logits, candidates, temperature, posterior_threshold, posterior_alpha):
    """
    Build a 0/1 mask marking which candidate tokens match tokens drawn by
    typical sampling from the model logits.

    Args:
        logits (torch.Tensor): Logits over each candidate path,
            (n_candidates, seq_len, vocab).
        candidates (torch.Tensor): Candidate token ids, (n_candidates, seq_len).
        temperature (float): Softmax temperature.
        posterior_threshold (float): Upper cap on the acceptance threshold.
        posterior_alpha (float): Scale on the entropy-based adaptive threshold.

    Returns:
        torch.Tensor: Int mask of shape (n_candidates, seq_len - 1).
    """
    # Drop the last position (no candidate follows it) and flatten positions.
    scaled = logits[:, :-1] / temperature
    n_samples, n_tokens = scaled.shape[0], scaled.shape[1]
    scaled = scaled.view(n_samples * n_tokens, -1)
    probs = F.softmax(scaled, dim=-1)
    # Entropy-adaptive cutoff: min(posterior_threshold, exp(-H) * alpha).
    entropy = -(probs * torch.log(probs + 1e-5)).sum(dim=-1)
    threshold = torch.minimum(
        torch.full_like(entropy, posterior_threshold),
        torch.exp(-entropy) * posterior_alpha,
    )
    scaled[probs < threshold.unsqueeze(-1)] = float('-inf')
    drawn = torch.multinomial(F.softmax(scaled, dim=-1), 1).view(n_samples, n_tokens)
    return (candidates[:, 1:] == drawn).int()
425
+
426
+
427
+
428
def evaluate_posterior(
    logits,
    candidates,
    temperature,
    posterior_threshold=0.3,
    posterior_alpha = 0.09,
    top_p=0.8,
    sampling = 'typical',
    fast = True
):
    """
    Select the best speculative candidate path and how many of its tokens to
    accept.

    Args:
        logits (torch.Tensor): (n_candidates, seq_len, vocab) logits over
            each candidate path.
        candidates (torch.Tensor): (n_candidates, seq_len) candidate ids.
        temperature (float): 0 for greedy verification, >0 for sampled.
        posterior_threshold / posterior_alpha (float): typical-sampling params.
        top_p (float): Nucleus cutoff (used when sampling == 'nucleus').
        sampling (str): 'typical' or 'nucleus'.
        fast (bool): Use the fast thresholded path for typical sampling.

    Returns:
        (best_candidate, accept_length): index (LongTensor scalar) of the
        chosen candidate row and the number of accepted tokens along it.
    """
    # Nothing to verify when there is at most one position.
    if logits.shape[1] <= 1:
        return torch.tensor(0, dtype=torch.long, device=candidates.device), 0

    def _pick(mask):
        # Accept the longest all-accepted prefix across candidates; fall back
        # to candidate 0 when nothing is accepted.
        lengths = torch.cumprod(mask, dim=1).sum(dim=1)
        best_len = lengths.max().item()
        if best_len == 0:
            return torch.tensor(0, dtype=torch.long, device=candidates.device), 0
        return torch.argmax(lengths).to(torch.long), best_len

    if temperature == 0:
        # Greedy verification: a token is accepted iff it equals the argmax
        # prediction at its position.
        mask = (candidates[:, 1:] == torch.argmax(logits[:, :-1], dim=-1)).int()
        return _pick(mask)

    if sampling == 'typical':
        if fast:
            posterior_prob = torch.softmax(logits[:, :-1] / temperature, dim=-1)
            candidates_prob = torch.gather(
                posterior_prob, dim=-1, index=candidates[:, 1:].unsqueeze(-1)
            ).squeeze(-1)
            posterior_entropy = -torch.sum(
                posterior_prob * torch.log(posterior_prob + 1e-5), dim=-1
            )  # torch.sum(torch.log(*)) is faster than torch.prod
            threshold = torch.minimum(
                torch.ones_like(posterior_entropy) * posterior_threshold,
                torch.exp(-posterior_entropy) * posterior_alpha,
            )
            accept_mask = candidates_prob > threshold
            lengths = torch.cumprod(accept_mask, dim=1).sum(dim=1)
            accept_length = lengths.max().item()
            if accept_length == 0:
                # If no candidates are accepted, just choose the first one
                best_candidate = torch.tensor(0, dtype=torch.long, device=candidates.device)
            else:
                ties = torch.where(lengths == accept_length)[0]
                # Break ties by the likelihood of the accepted prefix.
                likelihood = torch.sum(
                    torch.log(candidates_prob[ties, :accept_length]), dim=-1
                )
                best_candidate = ties[torch.argmax(likelihood)]
            return best_candidate, accept_length
        # Exact path: draw tokens with typical sampling and compare.
        mask = get_typical_posterior_mask(
            logits, candidates, temperature, posterior_threshold, posterior_alpha
        )
        return _pick(mask)

    if sampling == 'nucleus':
        assert top_p < 1.0 + 1e-6, "top_p should between 0 and 1"
        mask = get_nucleus_posterior_mask(logits, candidates, temperature, top_p)
        return _pick(mask)

    raise NotImplementedError
511
+
512
def update_inference_inputs(
    input_ids,
    medusa_logits,
    logits,
    candidate_ids,
    best_candidate,
    accept_length,
):
    """
    Append the accepted candidate tokens to `input_ids` and slice out the
    logits at the last accepted position for the next decoding iteration.

    Returns:
        (input_ids, medusa_logits, logits, new_token) where `new_token` is
        the number of tokens added this iteration (accept_length + 1).
    """
    # accept_length + 1 accounts for the base token of the accepted path.
    accepted = candidate_ids[None, best_candidate, : accept_length + 1]
    input_ids = torch.cat([input_ids, accepted], dim=-1)
    # Keep only the logits at the final accepted position.
    logits = logits[None, best_candidate, accept_length : accept_length + 1]
    medusa_logits = medusa_logits[:, None, best_candidate, accept_length : accept_length + 1]
    return input_ids, medusa_logits, logits, accept_length + 1
536
+
537
def split_logits(full_logits):
    """
    Split stacked per-head outputs into (medusa_logits, logits).

    `full_logits` has shape [b, n, heads, vocab_size]; head 0 is the base LM
    head and the remaining heads are Medusa heads, returned with the head
    axis moved to the front ([heads-1, b, n, vocab_size]).
    """
    lm_logits = full_logits[..., 0, :]
    medusa = full_logits[..., 1:, :].permute(2, 0, 1, 3)
    return medusa, lm_logits
542
+
543
class MultiByteDecodingMixin:
    """
    Mixin implementing Medusa-style multi-byte speculative decoding for
    EvaByte models with a windowed EVA attention cache.

    NOTE(review): assumes the host model provides `self.config` (with
    `window_size`, `chunk_size`, `num_hidden_layers`, `num_attention_heads`,
    `hidden_size`), `self.model.layers[*].self_attn` EVA parameters,
    `self.lm_head`, and a `forward` accepting `return_all_pred_logits` /
    `multibyte_decoding` — confirm against the model class.
    """
    def multi_byte_pred_update_cache(
        self,
        past_key_values,
        retrieve_indices,
        best_candidate,
        new_tokens,
    ):
        """
        Compact the per-layer window KV cache after a tree-decoding step:
        keep only the `new_tokens` accepted positions along the best
        candidate path, and when the sliding window fills up, dump the
        completed window into chunk-level RFA summaries.
        """
        prev_window_len = past_key_values.get_past_window_pos(0)
        # Cache slots (within the window buffer) of the accepted tree tokens.
        select_indices = (
            retrieve_indices[best_candidate, : new_tokens] + prev_window_len
        )
        for layer_idx in range(self.config.num_hidden_layers):

            past_key_values.update_past_len(new_tokens, layer_idx)

            past_window_k = past_key_values.past_window_k[layer_idx]
            past_window_v = past_key_values.past_window_v[layer_idx]

            # Gather the accepted positions out of the scattered tree slots...
            tgt_window_k = past_window_k[..., select_indices, :]
            tgt_window_v = past_window_v[..., select_indices, :]

            # ...and write them back contiguously right after the previous
            # window content.
            dst_window_k = past_window_k[..., prev_window_len : prev_window_len + new_tokens, :]
            dst_window_v = past_window_v[..., prev_window_len : prev_window_len + new_tokens, :]

            dst_window_k.copy_(tgt_window_k, non_blocking=True)
            dst_window_v.copy_(tgt_window_v, non_blocking=True)

            new_window_len = prev_window_len + new_tokens
            if new_window_len >= self.config.window_size:
                # One decoding step can never span more than two windows.
                assert new_window_len < 2 * self.config.window_size

                # Snapshot the completed window; summarized into RFA chunks below.
                dump_k = past_window_k[..., :self.config.window_size, :].clone()
                dump_v = past_window_v[..., :self.config.window_size, :].clone()

                # Overflow length that seeds the next window.
                _window_len = new_window_len - self.config.window_size

                if _window_len > 0:
                    # Shift the overflow tokens to the front of the window buffer.
                    new_window_k = past_window_k[..., self.config.window_size : new_window_len, :]
                    new_window_v = past_window_v[..., self.config.window_size : new_window_len, :]

                    _dst_window_k = past_window_k[..., : _window_len, :]
                    _dst_window_v = past_window_v[..., : _window_len, :]

                    _dst_window_k.copy_(new_window_k, non_blocking=True)
                    _dst_window_v.copy_(new_window_v, non_blocking=True)

                past_key_values.past_window_pos[layer_idx] = _window_len
            else:
                # Window not full yet: nothing to dump.
                dump_k = None
                dump_v = None
                past_key_values.past_window_pos[layer_idx] = new_window_len

            if dump_k is not None and dump_v is not None:
                # Summarize the dumped window into chunk-level RFA key/values
                # using this layer's EVA attention parameters.
                rfa_k, rfa_v = triton_eva_prep_kv_fwd(
                    dump_k, dump_v,
                    self.model.layers[layer_idx].self_attn.adaptive_mu_k,
                    self.model.layers[layer_idx].self_attn.adaptive_phi,
                    None,
                    self.model.layers[layer_idx].self_attn.head_dim_scaling,
                    self.model.layers[layer_idx].self_attn.chunk_size
                )
                rfa_k, rfa_v = past_key_values.update_chunk_rfas(
                    rfa_k, rfa_v, layer_idx
                )
        return past_key_values

    def _multi_byte_pred_update_cache_when_prefil_len_eq_window_size(
        self,
        past_key_values,
    ):
        """
        Handle the edge case where the prefill length equals `window_size`
        exactly: force the full window to be dumped into RFA chunk summaries
        and reset the window position to 0 for every layer.
        """
        prev_window_len = past_key_values.get_past_window_pos(0)
        for layer_idx in range(self.config.num_hidden_layers):

            past_window_k = past_key_values.past_window_k[layer_idx]
            past_window_v = past_key_values.past_window_v[layer_idx]

            new_window_len = prev_window_len
            if new_window_len == self.config.window_size:
                dump_k = past_window_k[..., :self.config.window_size, :].clone()
                dump_v = past_window_v[..., :self.config.window_size, :].clone()
                past_key_values.past_window_pos[layer_idx] = 0

                # dump_k/dump_v are always set here; the None-check mirrors
                # multi_byte_pred_update_cache for symmetry.
                if dump_k is not None and dump_v is not None:
                    rfa_k, rfa_v = triton_eva_prep_kv_fwd(
                        dump_k, dump_v,
                        self.model.layers[layer_idx].self_attn.adaptive_mu_k,
                        self.model.layers[layer_idx].self_attn.adaptive_phi,
                        None,
                        self.model.layers[layer_idx].self_attn.head_dim_scaling,
                        self.model.layers[layer_idx].self_attn.chunk_size
                    )
                    rfa_k, rfa_v = past_key_values.update_chunk_rfas(
                        rfa_k, rfa_v, layer_idx
                    )
        return past_key_values

    def multi_byte_pred_update_attn_mask(
        self,
        last_iter_new_tokens,
        tree_candidate_ids,
        past_attn_mask,
        medusa_attn_mask,
        past_key_values,
    ):
        """
        Build the attention mask for one tree-decoding step: an all-ones mask
        over past context (RFA chunks + current window tokens) concatenated
        with the sparse tree mask over the candidate tokens.

        Returns (tree_attn_mask, past_attn_mask) so the past portion can be
        extended incrementally on the next iteration.
        """
        batch_size, tree_candidate_len = tree_candidate_ids.shape
        seen_tokens = past_key_values.get_seq_length()
        # NOTE: past_key_values has been updated so now
        # seen_tokens includes new tokens from the last tree iteration
        assert seen_tokens > 0
        # so one iteration would not cross two windows
        assert last_iter_new_tokens < self.config.window_size

        if past_attn_mask is not None and seen_tokens < self.config.window_size:
            # Still inside the first window: just extend the running mask by
            # the tokens accepted last iteration.
            past_attn_mask = torch.cat(
                [
                    past_attn_mask,
                    torch.ones(
                        [batch_size, 1, tree_candidate_len, last_iter_new_tokens],
                        dtype=torch.bool,
                        device=self.device
                    )
                ],
                dim=-1
            )
        else:
            # we initialize attn mask each time when
            # 1. the model crosses the window boundary, or
            # 2. after prefilling
            chunks_per_window = int(self.config.window_size // self.config.chunk_size)

            # Past context = one RFA summary per chunk of each completed
            # window, plus the raw tokens of the current (partial) window.
            window_tokens = seen_tokens % self.config.window_size
            num_windows_seen_so_far = seen_tokens // self.config.window_size
            attn_mask_len = num_windows_seen_so_far * chunks_per_window + window_tokens
            past_attn_mask = torch.ones(
                (batch_size, 1, tree_candidate_len, attn_mask_len),
                dtype=torch.bool,
                device=self.device
            )

        # note that 1 indicates the position is not masked
        tree_attn_mask = torch.cat(
            [
                past_attn_mask,
                medusa_attn_mask.to(torch.bool)
            ],
            dim=-1
        )
        return tree_attn_mask, past_attn_mask

    @torch.no_grad()
    def multi_byte_generate(
        self,
        input_ids,
        attention_mask=None,
        temperature=0.0,
        max_length=None,
        max_new_tokens=None,
        stopping_criteria=None,
        posterior_threshold=0.09,
        posterior_alpha=0.3,
        top_p=0.8,
        sampling='typical',
        fast=True,
        do_sample=False,
        medusa_choices=None,
        return_acc_lengths=False
    ):
        """
        Generate with Medusa-style multi-byte speculative decoding.

        Each iteration: (1) expand candidate tokens from the Medusa heads
        into a sparse tree, (2) verify the whole tree in one forward pass,
        (3) accept the longest agreeing prefix, (4) compact the KV cache.
        Only batch size 1 and `attention_mask=None` are supported.

        Returns `input_ids` (with accepted tokens appended), plus the list
        of per-iteration acceptance lengths if `return_acc_lengths` is True.
        """
        # Sampling paths cannot use the fast greedy verification.
        if do_sample or temperature > 0.0:
            fast = False

        ### Prepare `max_length` depending on other stopping criteria.
        if max_new_tokens is not None:
            max_length = max_new_tokens + input_ids.shape[-1]
        elif max_new_tokens is None and max_length is None:
            max_length = getattr(self.config, "max_position_embeddings", 32768)

        ### Set up stopping criteria
        eos_stop_criteria = MultibyteEosTokenCriteria(self.generation_config.eos_token_id)
        stop_criteria = StoppingCriteriaList()
        if max_length is not None:
            max_position_embeddings = getattr(self.config, "max_position_embeddings", None)
            stop_criteria.append(
                MaxLengthCriteria(
                    max_length=max_length,
                    max_position_embeddings=max_position_embeddings,
                )
            )
        if stopping_criteria is not None and len(stopping_criteria) > 0:
            stop_criteria.extend(stopping_criteria)

        assert input_ids.shape[0] == 1, "Only support batch size 1 for now"
        assert attention_mask is None, "Only support attention mask None for now"
        # Avoid modifying the input_ids in-place
        input_ids = input_ids.clone()
        position_ids = torch.arange(0, input_ids.shape[1], device=self.device, dtype=int).reshape(1, -1)

        ####################################################
        # 0. initialize the medusa buffers
        ####################################################
        if medusa_choices is None:
            medusa_choices = evabyte_7b_95
        medusa_buffers = generate_medusa_buffers(
            medusa_choices, device=self.device
        )

        past_key_values = EvaStaticCacheForTriton(
            input_ids.shape[0],
            self.config.num_attention_heads,
            # we add 256 to allow tree ids
            self.config.window_size + 256,
            self.config.hidden_size // self.config.num_attention_heads,
            self.config.num_hidden_layers,
            self.lm_head.weight.dtype,
            self.lm_head.weight.device,
        )
        # prefill to get medusa logits and logits
        full_logits, past_key_values = self.forward(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            use_cache=True,
            past_key_values=past_key_values,
            return_all_pred_logits=True,
            multibyte_decoding=False,
        )
        # handles an edge case where the prefill length == window_size
        # we force the previous window to be dumped into RFA chunks
        past_key_values = self._multi_byte_pred_update_cache_when_prefil_len_eq_window_size(
            past_key_values
        )
        medusa_logits, logits = split_logits(full_logits)

        past_attn_mask = None
        last_iter_new_tokens = 0
        # Hard upper bound on iterations; stopping criteria normally fire first.
        max_iters = 32768
        if return_acc_lengths:
            acc_lengths = []
        for _ in range(max_iters):
            ####################################################
            # 1. generate candidate_ids with topk predictions from Medusa heads
            ####################################################
            tree_candidate_ids, unflattened_candidate_ids = generate_candidates(
                medusa_logits,
                logits,
                medusa_buffers["tree_indices"],
                medusa_buffers["retrieve_indices"],
                temperature=temperature,
                posterior_alpha=posterior_alpha,
                posterior_threshold=posterior_threshold,
                top_p=top_p,
                sampling=sampling,
                fast=fast,
            )

            ####################################################
            # 2. Build the medusa attention mask and position ids
            ####################################################
            # NOTE: 1 indicates the position is not masked
            medusa_attn_mask, past_attn_mask = self.multi_byte_pred_update_attn_mask(
                last_iter_new_tokens,
                tree_candidate_ids,
                past_attn_mask,
                medusa_buffers["medusa_attn_mask"],
                past_key_values,
            )
            medusa_position_ids = medusa_buffers["medusa_position_ids"] + input_ids.shape[1]

            ####################################################
            # 3. tree decoding
            ####################################################
            tree_full_logits, past_key_values = self.forward(
                tree_candidate_ids,
                past_key_values=past_key_values,
                attention_mask=medusa_attn_mask,
                position_ids=medusa_position_ids,
                return_all_pred_logits=True,
                multibyte_decoding=True,
            )
            _medusa_logits, _logits = split_logits(tree_full_logits)
            # Re-order tree outputs into per-path (root-to-leaf) layout.
            medusa_logits = _medusa_logits[..., 0, medusa_buffers["retrieve_indices"], :]
            logits = _logits[..., 0, medusa_buffers["retrieve_indices"], :]

            ####################################################
            # 4. candidate selection
            ####################################################
            # if the current iteration, with tree tokens, crosses window
            # boundaries, trim the candidate_ids to be within the window
            # so that those exceeded tokens (which will be inaccurate)
            # will not be considered
            tree_depth = unflattened_candidate_ids.shape[-1]
            if tree_depth + past_key_values.get_past_window_pos(0) > self.config.window_size:
                max_acc_len = self.config.window_size - past_key_values.get_past_window_pos(0)
                _trimmed_unflattened_candidate_ids = unflattened_candidate_ids[:, :max_acc_len]
                _trimmed_logits = logits[:, :max_acc_len]
            else:
                _trimmed_unflattened_candidate_ids = unflattened_candidate_ids
                _trimmed_logits = logits
            best_candidate, accept_length = evaluate_posterior(
                _trimmed_logits,
                _trimmed_unflattened_candidate_ids,
                temperature,
                posterior_threshold,
                posterior_alpha,
                top_p=top_p,
                sampling=sampling,
                fast=fast
            )

            ####################################################
            # 5. update model inputs and caches
            ####################################################
            input_ids, medusa_logits, logits, last_iter_new_tokens = update_inference_inputs(
                input_ids,
                medusa_logits,
                logits,
                unflattened_candidate_ids,
                best_candidate,
                accept_length,
            )

            past_key_values = self.multi_byte_pred_update_cache(
                past_key_values,
                medusa_buffers["retrieve_indices"],
                best_candidate,
                last_iter_new_tokens,
            )

            if return_acc_lengths:
                acc_lengths.append(last_iter_new_tokens)
            # Stop on max length (scores unused, hence None) or on EOS among
            # the tokens accepted this iteration.
            if stop_criteria(input_ids, None) or eos_stop_criteria(input_ids, last_iter_new_tokens):
                if return_acc_lengths:
                    return input_ids, acc_lengths
                else:
                    return input_ids
        if return_acc_lengths:
            return input_ids, acc_lengths
        else:
            return input_ids
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-30000/preprocessor_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoImageProcessor": "image_processing_evabyte.EvaByteImageProcessor",
4
+ "AutoProcessor": "processing_evabyte.EvaByteProcessor"
5
+ },
6
+ "do_convert_rgb": true,
7
+ "do_resize": true,
8
+ "image_processor_type": "EvaByteImageProcessor",
9
+ "jpeg_quality": 25,
10
+ "jpeg_restart_marker_blocks": 1,
11
+ "jpeg_streamtype": 2,
12
+ "jpeg_subsampling": "4:2:0",
13
+ "processor_class": "EvaByteProcessor",
14
+ "resample": 1,
15
+ "size": {
16
+ "longest_edge": 384
17
+ }
18
+ }
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-30000/processing_evabyte.py ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ """
3
+ Processor class for EvaByte.
4
+ """
5
+ import base64
6
+ from io import BytesIO
7
+
8
+ import requests
9
+ import os
10
+ import PIL
11
+ from PIL import Image
12
+
13
+ from typing import List, Optional, Union
14
+
15
+ from transformers.feature_extraction_utils import BatchFeature
16
+ from transformers.image_utils import ImageInput, is_valid_image
17
+ from transformers.processing_utils import ProcessorMixin
18
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
19
+ from transformers.utils import TensorType, to_py_obj
20
+
21
def fetch_image(image: Union[str, "PIL.Image.Image"]) -> Image.Image:
    """Load an image into a `PIL.Image.Image`.

    Accepts, in order of precedence:
      - an already-open `PIL.Image.Image` (returned as-is),
      - an http(s) URL (downloaded with `requests`),
      - a local file path,
      - a `data:image/...;base64,...` data URI,
      - any other string, passed straight to `Image.open`.

    Raises:
        ValueError: if a data URI cannot be decoded, or no strategy applies.
    """
    image_obj = None
    if isinstance(image, Image.Image):
        image_obj = image
    elif image.startswith("http://") or image.startswith("https://"):
        image_obj = Image.open(BytesIO(requests.get(image, timeout=None).content))
    elif os.path.isfile(image):
        image_obj = Image.open(image)
    elif image.startswith("data:image/"):
        image = image.split(",")[1]
        # Try to load as base64
        try:
            b64 = base64.decodebytes(image.encode())
            # BUG FIX: the decoded image must be bound to `image_obj`.
            # The previous code assigned it to `image`, leaving `image_obj`
            # as None, so every *valid* data-URI input fell through to the
            # "Unrecognized image input" error below.
            image_obj = Image.open(BytesIO(b64))
        except Exception as e:
            raise ValueError(
                f"Incorrect image source. Must be a valid URL starting with `http://` or `https://`, a valid path to an image file, or a base64 encoded string. Got {image}. Failed with {e}"
            )
    else:
        image_obj = Image.open(image)
    if image_obj is None:
        raise ValueError(f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}")

    return image_obj
45
+
46
def is_url(val) -> bool:
    """Return True when *val* is a string beginning with "http" (covers http/https URLs)."""
    if not isinstance(val, str):
        return False
    return val.startswith("http")
48
+
49
def is_file(val) -> bool:
    """Return True when *val* is a string naming an existing file on disk."""
    return os.path.isfile(val) if isinstance(val, str) else False
51
+
52
def is_image_or_image_url(elem):
    """Return True for anything usable as an image input: a URL string, a valid
    image object (per transformers' `is_valid_image`), or an existing file path."""
    # Checks run in the same short-circuit order as before: URL, image, file.
    return any(check(elem) for check in (is_url, is_valid_image, is_file))
54
+
55
# Jinja chat template for vision-language conversations. It emits an optional
# system turn, then alternating user/assistant turns; structured content items
# of type "image" are rendered as a literal "<image_placeholder>" marker that
# `EvaByteProcessor.__call__` later replaces with the image's byte tokens.
# NOTE: this is a runtime string consumed by the tokenizer's template engine —
# its contents must not be altered.
vl_chat_template = """
{{- bos_token }}
{%- if messages[0]['role'] == 'system' %}
    {%- set system_message = messages[0]['content'] %}
    {%- set messages = messages[1:] %}
{%- else %}
    {%- set system_message = "" %}
{%- endif %}

{{- '<|start_header_id|>system<|end_header_id|>\n\n' + system_message + '<|eot_id|>'}}

{%- for message in messages %}
    {%- if (message['role'] != 'user') and (message['role'] != 'assistant') %}
        {{- raise_exception('Conversation roles must be user or assistant') }}
    {%- endif %}

    {%- if message['content'] is string %}
        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }}
    {%- else %}
        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }}
        {%- for content in message['content'] %}
            {%- if content['type'] == 'image' %}
                {{- '<image_placeholder>\n' }}
            {%- elif content['type'] == 'text' %}
                {{- content['text'] }}
            {%- endif %}
        {%- endfor %}
        {{- '<|eot_id|>' }}
    {%- endif %}
{%- endfor %}

{%- if add_generation_prompt %}
    {{- '<|start_header_id|>' + 'assistant' + '<|end_header_id|>\n\n' }}
{%- endif %}
"""
90
+
91
class EvaByteProcessor(ProcessorMixin):
    r"""
    Constructs a EvaByte processor which wraps a EvaByte image processor and a EvaByte tokenizer into a single processor.

    [`EvaByteProcessor`] offers all the functionalities of [`EvaByteImageProcessor`] and [`EvaByteTokenizer`]. See the
    [`~EvaByteProcessor.__call__`] and [`~EvaByteProcessor.decode`] for more information.

    Args:
        image_processor ([`EvaByteImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`EvaByteTokenizer`], *optional*):
            The tokenizer is a required input.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)
        # Sentinel token ids that bracket an image byte stream inside the token
        # sequence: <t2v_token> opens an image span, <v2t_token> closes it.
        self.t2v_token_id = self.tokenizer.convert_tokens_to_ids("<t2v_token>")
        self.v2t_token_id = self.tokenizer.convert_tokens_to_ids("<v2t_token>")
        self.image_placeholder = "<image_placeholder>"
        self.vl_chat_template = vl_chat_template

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        images: ImageInput = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        strip_ending_sentinel: bool = False,
        encode_only: bool = False,
        **kwargs
    ) -> Union[BatchFeature, List[List[int]]]:
        """Tokenize text and splice image byte streams into the token sequence.

        Each "<image_placeholder>" occurrence in *text* is replaced by the
        corresponding image's bytes (offset into the byte-token range),
        bracketed by the <t2v_token>/<v2t_token> sentinels.

        Args:
            text: a single string (or a 1-element list — only batch size 1 is
                supported for now), containing one placeholder per image.
            images: raw image bytes, PIL images, URLs or file paths; a single
                item, a list, or a list of lists (one inner list per sample).
                May be None for text-only input.
            return_tensors: passed to `BatchFeature` for tensor conversion.
            strip_ending_sentinel: if True, drop a trailing image sentinel so
                generation can continue inside an image span.
            encode_only: if True, return raw `List[List[int]]` input ids
                instead of a `BatchFeature`.

        Returns:
            A `BatchFeature` with "input_ids"/"attention_mask", or the raw
            id lists when `encode_only=True`.
        """
        # processing pipeline:
        # 1. read images or videos from paths
        # 2. use image_processor to convert images / videos to byte streams
        if images is not None:
            if isinstance(images, bytes):
                # a single pre-encoded image -> one sample with one image
                image_bytes_list = [[images]]
            elif isinstance(images, list) and isinstance(images[0], bytes):
                # a flat list of pre-encoded images -> one sample
                image_bytes_list = [images]
            elif isinstance(images, list) and isinstance(images[0], list) and isinstance(images[0][0], bytes):
                # already nested per-sample
                image_bytes_list = images
            else:
                if is_image_or_image_url(images):
                    images = [[images]]
                elif isinstance(images, list) and is_image_or_image_url(images[0]):
                    images = [images]
                elif (
                    not isinstance(images, list)
                    and not isinstance(images[0], list)
                    and not is_image_or_image_url(images[0][0])
                ):
                    # NOTE(review): this guard can only trigger for unusual
                    # inputs (non-list that still supports indexing) — confirm
                    # against upstream intent before tightening it.
                    raise ValueError(
                        "Invalid input images. Please provide a single image or a list of images or a list of list of images."
                    )
                # Load images if they are URLs
                images = [[fetch_image(im) if is_url(im) or is_file(im) else im for im in sample] for sample in images]
                image_bytes_list = self.image_processor(images=images, **kwargs)
        else:
            # Text-only call: resolved to empty per-sample image lists below.
            image_bytes_list = None

        if not isinstance(text, list):
            text = [text]
        assert len(text) == 1, "Only support batch size 1 for now"
        if image_bytes_list is None:
            # BUG FIX: with images=None, image_bytes_list was never bound and
            # the assert below raised NameError; text-only input now works as
            # long as the text contains no image placeholders.
            image_bytes_list = [[] for _ in text]
        assert len(text) == len(image_bytes_list), "text and image_bytes_list must have the same length"
        # TODO: invoke SequenceFeatureExtractor to get batched inputs

        # 3. tokenize the text and put images / videos byte streams into the placeholders
        # surrounded by special tokens like "<image>" and "</image>"
        batch_input_ids = []
        if not encode_only:
            batch_attention_mask = []
        else:
            batch_attention_mask = None

        for t, image_bytes in zip(text, image_bytes_list):
            text_splits = t.split(self.image_placeholder)
            # n placeholders split the text into n + 1 parts
            if len(text_splits) != len(image_bytes) + 1:
                raise ValueError(
                    f"The number of image tokens should be equal to the number of images, "
                    f"but got {len(text_splits)} and {len(image_bytes) + 1}"
                )

            input_ids = [self.tokenizer.bos_token_id]
            for i, text_part in enumerate(text_splits):
                # each text part must be non-empty because we added markers around placeholders
                split_tokens = self.tokenizer.encode(text_part, add_special_tokens=False)
                input_ids.extend(split_tokens)
                # Add image bytes after each text part except the last one
                if i < len(image_bytes):
                    input_ids.append(self.t2v_token_id)
                    # shift raw byte values past the special-token id range
                    input_ids.extend([b + self.tokenizer.offset for b in image_bytes[i]])
                    input_ids.append(self.v2t_token_id)

            if strip_ending_sentinel and (input_ids[-1] in [self.t2v_token_id, self.v2t_token_id]):
                input_ids = input_ids[:-1]

            batch_input_ids.append(input_ids)
            if not encode_only:
                batch_attention_mask.append([1] * len(input_ids))

        if not encode_only:
            # 4. return batch of features
            inputs = BatchFeature({
                "input_ids": batch_input_ids,
                "attention_mask": batch_attention_mask
            }, tensor_type=return_tensors)
            return inputs
        else:
            return batch_input_ids

    def image_tokens_to_bytes(self, image_token_ids, jpeg_quality=None):
        """Map byte-token ids back to raw bytes and re-attach JPEG quantization tables."""
        image_bytes = bytes([token_id - self.tokenizer.offset for token_id in image_token_ids])
        image_bytes = self.image_processor.jpeg_merge_qtables(image_bytes, jpeg_quality)
        return image_bytes

    def batch_decode(self, sequences, **kwargs):
        """
        This method forwards all its arguments to EvaByteTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        rets = [self.decode(seq, **kwargs) for seq in sequences]
        # transpose [(text, images), ...] into ([texts...], [images...])
        return tuple(map(list, zip(*rets)))

    def decode(self, token_ids, **kwargs):
        """
        Decodes a sequence of input_ids, handling image tokens separately.
        Returns a tuple of (decoded_text, images), where images is a list of bytes.
        """
        if kwargs and "jpeg_quality" in kwargs:
            # pop on a copy so the caller's kwargs dict is not mutated
            kwargs = kwargs.copy()
            jpeg_quality = kwargs.pop("jpeg_quality")
        else:
            jpeg_quality = None

        token_ids = to_py_obj(token_ids)
        # Find indices of t2v_token_id and v2t_token_id
        t2v_indices = [i for i, token_id in enumerate(token_ids) if token_id == self.t2v_token_id]
        v2t_indices = [i for i, token_id in enumerate(token_ids) if token_id == self.v2t_token_id]

        # Check for correct pairing of t2v and v2t tokens
        if len(t2v_indices) != len(v2t_indices):
            raise ValueError("Mismatched number of t2v and v2t tokens in token_ids: {} and {}".format(t2v_indices, v2t_indices))

        # Ensure t2v and v2t tokens are in the correct order
        for t2v_idx, v2t_idx in zip(t2v_indices, v2t_indices):
            if t2v_idx >= v2t_idx:
                raise ValueError("Found t2v_token_id after v2t_token_id in token_ids")

        # Initialize the start index
        images = []
        decoded_text = ""

        start = 0
        # Iterate over pairs of t2v and v2t indices
        for t2v_idx, v2t_idx in zip(t2v_indices, v2t_indices):
            # Decode text tokens before the image
            text_token_ids = token_ids[start:t2v_idx]
            if len(text_token_ids) > 0:
                decoded_text += self.tokenizer.decode(text_token_ids, **kwargs)

            # Insert image placeholder
            decoded_text += self.image_placeholder

            # Extract image tokens and convert them to bytes
            image_token_ids = token_ids[t2v_idx + 1 : v2t_idx]
            image_bytes = self.image_tokens_to_bytes(image_token_ids, jpeg_quality)
            images.append(image_bytes)

            # Update the start index to the token after v2t_token_id
            start = v2t_idx + 1

        # Decode any remaining text tokens after the last image
        if start < len(token_ids):
            text_token_ids = token_ids[start:]
            decoded_text += self.tokenizer.decode(text_token_ids, **kwargs)

        return decoded_text, images

    @property
    def model_input_names(self):
        """Deduplicated union of tokenizer and image-processor input names, order preserved."""
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-30000/processor_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_evabyte.EvaByteProcessor"
4
+ },
5
+ "processor_class": "EvaByteProcessor"
6
+ }
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-30000/special_tokens_map.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<repo_name>",
4
+ "<file_sep>",
5
+ "<t2v_token>",
6
+ "<v2t_token>",
7
+ "<|start_header_id|>",
8
+ "<|end_header_id|>",
9
+ "<|eot_id|>",
10
+ "<extra_id_12>",
11
+ "<extra_id_13>",
12
+ "<extra_id_14>",
13
+ "<extra_id_15>",
14
+ "<extra_id_16>",
15
+ "<extra_id_17>",
16
+ "<extra_id_18>",
17
+ "<extra_id_19>",
18
+ "<extra_id_20>",
19
+ "<extra_id_21>",
20
+ "<extra_id_22>",
21
+ "<extra_id_23>",
22
+ "<extra_id_24>",
23
+ "<extra_id_25>",
24
+ "<extra_id_26>",
25
+ "<extra_id_27>",
26
+ "<extra_id_28>",
27
+ "<extra_id_29>",
28
+ "<extra_id_30>",
29
+ "<extra_id_31>",
30
+ "<extra_id_32>",
31
+ "<extra_id_33>",
32
+ "<extra_id_34>",
33
+ "<extra_id_35>",
34
+ "<extra_id_36>",
35
+ "<extra_id_37>",
36
+ "<extra_id_38>",
37
+ "<extra_id_39>",
38
+ "<extra_id_40>",
39
+ "<extra_id_41>",
40
+ "<extra_id_42>",
41
+ "<extra_id_43>",
42
+ "<extra_id_44>",
43
+ "<extra_id_45>",
44
+ "<extra_id_46>",
45
+ "<extra_id_47>",
46
+ "<extra_id_48>",
47
+ "<extra_id_49>",
48
+ "<extra_id_50>",
49
+ "<extra_id_51>",
50
+ "<extra_id_52>",
51
+ "<extra_id_53>",
52
+ "<extra_id_54>",
53
+ "<extra_id_55>",
54
+ "<extra_id_56>",
55
+ "<extra_id_57>",
56
+ "<extra_id_58>",
57
+ "<extra_id_59>",
58
+ "<extra_id_60>",
59
+ "<extra_id_61>",
60
+ "<extra_id_62>",
61
+ "<extra_id_63>"
62
+ ],
63
+ "bos_token": {
64
+ "content": "<bos>",
65
+ "lstrip": false,
66
+ "normalized": true,
67
+ "rstrip": false,
68
+ "single_word": false
69
+ },
70
+ "eos_token": {
71
+ "content": "<eos>",
72
+ "lstrip": false,
73
+ "normalized": true,
74
+ "rstrip": false,
75
+ "single_word": false
76
+ },
77
+ "pad_token": {
78
+ "content": "<pad>",
79
+ "lstrip": false,
80
+ "normalized": true,
81
+ "rstrip": false,
82
+ "single_word": false
83
+ },
84
+ "sep_token": {
85
+ "content": "<sep>",
86
+ "lstrip": false,
87
+ "normalized": true,
88
+ "rstrip": false,
89
+ "single_word": false
90
+ },
91
+ "unk_token": {
92
+ "content": "<unk>",
93
+ "lstrip": false,
94
+ "normalized": true,
95
+ "rstrip": false,
96
+ "single_word": false
97
+ }
98
+ }
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-30000/tokenization_evabyte.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+
3
+ """ Tokenization class for model EvaByte."""
4
+
5
+
6
+ from typing import List, Optional, Tuple
7
+
8
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
9
+ from transformers.utils import logging
10
+
11
+
12
+ logger = logging.get_logger(__name__)
13
+
14
+
15
# Default text-only Jinja chat template installed on every EvaByteTokenizer
# instance (see `__init__`): optional system turn followed by alternating
# user/assistant turns, each wrapped in the <|start_header_id|>/<|eot_id|>
# markers. NOTE: this is a runtime string consumed by the template engine —
# its contents must not be altered.
chat_template = """
{{- bos_token }}
{%- if messages[0]['role'] == 'system' %}
    {%- set system_message = messages[0]['content'] %}
    {%- set messages = messages[1:] %}
{%- else %}
    {%- set system_message = "" %}
{%- endif %}

{{- '<|start_header_id|>system<|end_header_id|>\n\n' + system_message + '<|eot_id|>'}}

{%- for message in messages %}
    {%- if (message['role'] != 'user') and (message['role'] != 'assistant') %}
        {{- raise_exception('Conversation roles must be user or assistant') }}
    {%- endif %}

    {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }}
{%- endfor %}

{%- if add_generation_prompt %}
    {{- '<|start_header_id|>' + 'assistant' + '<|end_header_id|>\n\n' }}
{%- endif %}
"""
38
+
39
class EvaByteTokenizer(PreTrainedTokenizer):
    """Byte-level tokenizer: each UTF-8 byte maps to `byte_value + offset`,
    where `offset` is the number of special tokens occupying ids 0..offset-1.

    Ids 0-4 are <pad>/<bos>/<eos>/<unk>/<sep>; ids 5.. are the (possibly
    renamed) extra_id tokens; ids offset..offset+255 are the 256 byte values.
    """

    def __init__(
        self,
        bos_token="<bos>",
        eos_token="<eos>",
        unk_token="<unk>",
        sep_token="<sep>",
        pad_token="<pad>",
        extra_ids=59,
        additional_special_tokens=None,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ) -> None:
        num_base_special_tokens = 5
        # Add extra_ids to the special token list
        if extra_ids > 0 and additional_special_tokens is None:
            additional_special_tokens = [f"<extra_id_{i}>" for i in range(num_base_special_tokens, extra_ids + num_base_special_tokens)]
        elif extra_ids > 0 and additional_special_tokens is not None and len(additional_special_tokens) > 0:
            # Check that we have the right number of extra_id special tokens
            extra_tokens = len(set(filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens)))
            if extra_tokens != extra_ids:
                raise ValueError(
                    f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
                    " provided to EvaByteTokenizer. In this case the additional_special_tokens must include the"
                    " extra_ids tokens"
                )
        if additional_special_tokens is None:
            # BUG FIX: with extra_ids == 0 and no explicit special tokens, the
            # override loop below iterated over None and raised TypeError.
            additional_special_tokens = []

        #### override some reserved tokens to support chat template
        for i, token in enumerate(additional_special_tokens):
            if token == "<extra_id_5>":
                token = "<repo_name>"
            elif token == "<extra_id_6>":
                token = "<file_sep>"
            elif token == "<extra_id_7>":
                token = "<t2v_token>"
            elif token == "<extra_id_8>":
                token = "<v2t_token>"
            elif token == "<extra_id_9>":
                token = "<|start_header_id|>"
            elif token == "<extra_id_10>":
                token = "<|end_header_id|>"
            elif token == "<extra_id_11>":
                token = "<|eot_id|>"
            additional_special_tokens[i] = token

        # lstrip and rstrip are set to False because we don't want to strip the whitespace from the special tokens
        # this would be important for the byte tokenizer
        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token

        self._added_tokens_decoder = {
            0: pad_token,
            1: bos_token,
            2: eos_token,
            3: unk_token,  # unk_token is a placeholder
            4: sep_token,
            **{i: AddedToken(t, lstrip=False, rstrip=False) for i, t in enumerate(additional_special_tokens, start=num_base_special_tokens)},
        }
        # byte b is encoded as id b + offset
        self.offset = len(self._added_tokens_decoder)
        self._utf_vocab_size = 2**8  # utf is 8 bits
        self.add_bos_token = True
        self.add_eos_token = False
        super().__init__(
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            extra_ids=0,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )
        self.chat_template = chat_template

    @property
    def vocab_size(self):
        """Size of the byte vocabulary only (special tokens excluded)."""
        return self._utf_vocab_size

    def get_vocab(self):
        """Full token->id map: special tokens plus the 256 byte tokens."""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size + self.offset)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.build_inputs_with_special_tokens
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = bos_token_id + token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        return output

    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.get_special_tokens_mask
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        bos_token_id = [1] if self.add_bos_token else []
        eos_token_id = [1] if self.add_eos_token else []

        if token_ids_1 is None:
            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
        return (
            bos_token_id
            + ([0] * len(token_ids_0))
            + eos_token_id
            + bos_token_id
            + ([0] * len(token_ids_1))
            + eos_token_id
        )

    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.create_token_type_ids_from_sequences
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence | second sequence |
        ```

        if token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)

        if token_ids_1 is not None:
            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)

        return output

    def _tokenize(self, text: str) -> List[str]:
        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
        # one single-character "token" per UTF-8 byte
        tokens = [chr(i) for i in text.encode("utf-8")]
        return tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        # only single-character byte tokens are mappable here; multi-character
        # (special) tokens are resolved by the base class's added-token tables
        if len(token) != 1:
            token_id = None
        else:
            token_id = ord(token) + self.offset

        return token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a byte (str) using the vocab."""
        token = chr(index - self.offset)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of bytes (string) to a single string."""
        bstring = b""
        for token in tokens:
            if token in self.added_tokens_decoder:
                tok_string = self.added_tokens_decoder[token].encode("utf-8")
            elif token in self.added_tokens_encoder:
                tok_string = token.encode("utf-8")
            else:
                tok_string = bytes([ord(token)])
            bstring += tok_string
        # invalid UTF-8 byte runs are silently dropped
        string = bstring.decode("utf-8", errors="ignore")
        return string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        # nothing to persist: the byte vocabulary is implicit
        return ()
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-30000/tokenizer_config.json ADDED
@@ -0,0 +1,596 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<pad>",
5
+ "lstrip": false,
6
+ "normalized": true,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<bos>",
13
+ "lstrip": false,
14
+ "normalized": true,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "<eos>",
21
+ "lstrip": false,
22
+ "normalized": true,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "<sep>",
37
+ "lstrip": false,
38
+ "normalized": true,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "5": {
44
+ "content": "<repo_name>",
45
+ "lstrip": false,
46
+ "normalized": true,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": false
50
+ },
51
+ "6": {
52
+ "content": "<file_sep>",
53
+ "lstrip": false,
54
+ "normalized": true,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": false
58
+ },
59
+ "7": {
60
+ "content": "<t2v_token>",
61
+ "lstrip": false,
62
+ "normalized": true,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": false
66
+ },
67
+ "8": {
68
+ "content": "<v2t_token>",
69
+ "lstrip": false,
70
+ "normalized": true,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": false
74
+ },
75
+ "9": {
76
+ "content": "<|start_header_id|>",
77
+ "lstrip": false,
78
+ "normalized": true,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": false
82
+ },
83
+ "10": {
84
+ "content": "<|end_header_id|>",
85
+ "lstrip": false,
86
+ "normalized": true,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": false
90
+ },
91
+ "11": {
92
+ "content": "<|eot_id|>",
93
+ "lstrip": false,
94
+ "normalized": true,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": false
98
+ },
99
+ "12": {
100
+ "content": "<extra_id_12>",
101
+ "lstrip": false,
102
+ "normalized": true,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": false
106
+ },
107
+ "13": {
108
+ "content": "<extra_id_13>",
109
+ "lstrip": false,
110
+ "normalized": true,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": false
114
+ },
115
+ "14": {
116
+ "content": "<extra_id_14>",
117
+ "lstrip": false,
118
+ "normalized": true,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": false
122
+ },
123
+ "15": {
124
+ "content": "<extra_id_15>",
125
+ "lstrip": false,
126
+ "normalized": true,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": false
130
+ },
131
+ "16": {
132
+ "content": "<extra_id_16>",
133
+ "lstrip": false,
134
+ "normalized": true,
135
+ "rstrip": false,
136
+ "single_word": false,
137
+ "special": false
138
+ },
139
+ "17": {
140
+ "content": "<extra_id_17>",
141
+ "lstrip": false,
142
+ "normalized": true,
143
+ "rstrip": false,
144
+ "single_word": false,
145
+ "special": false
146
+ },
147
+ "18": {
148
+ "content": "<extra_id_18>",
149
+ "lstrip": false,
150
+ "normalized": true,
151
+ "rstrip": false,
152
+ "single_word": false,
153
+ "special": false
154
+ },
155
+ "19": {
156
+ "content": "<extra_id_19>",
157
+ "lstrip": false,
158
+ "normalized": true,
159
+ "rstrip": false,
160
+ "single_word": false,
161
+ "special": false
162
+ },
163
+ "20": {
164
+ "content": "<extra_id_20>",
165
+ "lstrip": false,
166
+ "normalized": true,
167
+ "rstrip": false,
168
+ "single_word": false,
169
+ "special": false
170
+ },
171
+ "21": {
172
+ "content": "<extra_id_21>",
173
+ "lstrip": false,
174
+ "normalized": true,
175
+ "rstrip": false,
176
+ "single_word": false,
177
+ "special": false
178
+ },
179
+ "22": {
180
+ "content": "<extra_id_22>",
181
+ "lstrip": false,
182
+ "normalized": true,
183
+ "rstrip": false,
184
+ "single_word": false,
185
+ "special": false
186
+ },
187
+ "23": {
188
+ "content": "<extra_id_23>",
189
+ "lstrip": false,
190
+ "normalized": true,
191
+ "rstrip": false,
192
+ "single_word": false,
193
+ "special": false
194
+ },
195
+ "24": {
196
+ "content": "<extra_id_24>",
197
+ "lstrip": false,
198
+ "normalized": true,
199
+ "rstrip": false,
200
+ "single_word": false,
201
+ "special": false
202
+ },
203
+ "25": {
204
+ "content": "<extra_id_25>",
205
+ "lstrip": false,
206
+ "normalized": true,
207
+ "rstrip": false,
208
+ "single_word": false,
209
+ "special": false
210
+ },
211
+ "26": {
212
+ "content": "<extra_id_26>",
213
+ "lstrip": false,
214
+ "normalized": true,
215
+ "rstrip": false,
216
+ "single_word": false,
217
+ "special": false
218
+ },
219
+ "27": {
220
+ "content": "<extra_id_27>",
221
+ "lstrip": false,
222
+ "normalized": true,
223
+ "rstrip": false,
224
+ "single_word": false,
225
+ "special": false
226
+ },
227
+ "28": {
228
+ "content": "<extra_id_28>",
229
+ "lstrip": false,
230
+ "normalized": true,
231
+ "rstrip": false,
232
+ "single_word": false,
233
+ "special": false
234
+ },
235
+ "29": {
236
+ "content": "<extra_id_29>",
237
+ "lstrip": false,
238
+ "normalized": true,
239
+ "rstrip": false,
240
+ "single_word": false,
241
+ "special": false
242
+ },
243
+ "30": {
244
+ "content": "<extra_id_30>",
245
+ "lstrip": false,
246
+ "normalized": true,
247
+ "rstrip": false,
248
+ "single_word": false,
249
+ "special": false
250
+ },
251
+ "31": {
252
+ "content": "<extra_id_31>",
253
+ "lstrip": false,
254
+ "normalized": true,
255
+ "rstrip": false,
256
+ "single_word": false,
257
+ "special": false
258
+ },
259
+ "32": {
260
+ "content": "<extra_id_32>",
261
+ "lstrip": false,
262
+ "normalized": true,
263
+ "rstrip": false,
264
+ "single_word": false,
265
+ "special": false
266
+ },
267
+ "33": {
268
+ "content": "<extra_id_33>",
269
+ "lstrip": false,
270
+ "normalized": true,
271
+ "rstrip": false,
272
+ "single_word": false,
273
+ "special": false
274
+ },
275
+ "34": {
276
+ "content": "<extra_id_34>",
277
+ "lstrip": false,
278
+ "normalized": true,
279
+ "rstrip": false,
280
+ "single_word": false,
281
+ "special": false
282
+ },
283
+ "35": {
284
+ "content": "<extra_id_35>",
285
+ "lstrip": false,
286
+ "normalized": true,
287
+ "rstrip": false,
288
+ "single_word": false,
289
+ "special": false
290
+ },
291
+ "36": {
292
+ "content": "<extra_id_36>",
293
+ "lstrip": false,
294
+ "normalized": true,
295
+ "rstrip": false,
296
+ "single_word": false,
297
+ "special": false
298
+ },
299
+ "37": {
300
+ "content": "<extra_id_37>",
301
+ "lstrip": false,
302
+ "normalized": true,
303
+ "rstrip": false,
304
+ "single_word": false,
305
+ "special": false
306
+ },
307
+ "38": {
308
+ "content": "<extra_id_38>",
309
+ "lstrip": false,
310
+ "normalized": true,
311
+ "rstrip": false,
312
+ "single_word": false,
313
+ "special": false
314
+ },
315
+ "39": {
316
+ "content": "<extra_id_39>",
317
+ "lstrip": false,
318
+ "normalized": true,
319
+ "rstrip": false,
320
+ "single_word": false,
321
+ "special": false
322
+ },
323
+ "40": {
324
+ "content": "<extra_id_40>",
325
+ "lstrip": false,
326
+ "normalized": true,
327
+ "rstrip": false,
328
+ "single_word": false,
329
+ "special": false
330
+ },
331
+ "41": {
332
+ "content": "<extra_id_41>",
333
+ "lstrip": false,
334
+ "normalized": true,
335
+ "rstrip": false,
336
+ "single_word": false,
337
+ "special": false
338
+ },
339
+ "42": {
340
+ "content": "<extra_id_42>",
341
+ "lstrip": false,
342
+ "normalized": true,
343
+ "rstrip": false,
344
+ "single_word": false,
345
+ "special": false
346
+ },
347
+ "43": {
348
+ "content": "<extra_id_43>",
349
+ "lstrip": false,
350
+ "normalized": true,
351
+ "rstrip": false,
352
+ "single_word": false,
353
+ "special": false
354
+ },
355
+ "44": {
356
+ "content": "<extra_id_44>",
357
+ "lstrip": false,
358
+ "normalized": true,
359
+ "rstrip": false,
360
+ "single_word": false,
361
+ "special": false
362
+ },
363
+ "45": {
364
+ "content": "<extra_id_45>",
365
+ "lstrip": false,
366
+ "normalized": true,
367
+ "rstrip": false,
368
+ "single_word": false,
369
+ "special": false
370
+ },
371
+ "46": {
372
+ "content": "<extra_id_46>",
373
+ "lstrip": false,
374
+ "normalized": true,
375
+ "rstrip": false,
376
+ "single_word": false,
377
+ "special": false
378
+ },
379
+ "47": {
380
+ "content": "<extra_id_47>",
381
+ "lstrip": false,
382
+ "normalized": true,
383
+ "rstrip": false,
384
+ "single_word": false,
385
+ "special": false
386
+ },
387
+ "48": {
388
+ "content": "<extra_id_48>",
389
+ "lstrip": false,
390
+ "normalized": true,
391
+ "rstrip": false,
392
+ "single_word": false,
393
+ "special": false
394
+ },
395
+ "49": {
396
+ "content": "<extra_id_49>",
397
+ "lstrip": false,
398
+ "normalized": true,
399
+ "rstrip": false,
400
+ "single_word": false,
401
+ "special": false
402
+ },
403
+ "50": {
404
+ "content": "<extra_id_50>",
405
+ "lstrip": false,
406
+ "normalized": true,
407
+ "rstrip": false,
408
+ "single_word": false,
409
+ "special": false
410
+ },
411
+ "51": {
412
+ "content": "<extra_id_51>",
413
+ "lstrip": false,
414
+ "normalized": true,
415
+ "rstrip": false,
416
+ "single_word": false,
417
+ "special": false
418
+ },
419
+ "52": {
420
+ "content": "<extra_id_52>",
421
+ "lstrip": false,
422
+ "normalized": true,
423
+ "rstrip": false,
424
+ "single_word": false,
425
+ "special": false
426
+ },
427
+ "53": {
428
+ "content": "<extra_id_53>",
429
+ "lstrip": false,
430
+ "normalized": true,
431
+ "rstrip": false,
432
+ "single_word": false,
433
+ "special": false
434
+ },
435
+ "54": {
436
+ "content": "<extra_id_54>",
437
+ "lstrip": false,
438
+ "normalized": true,
439
+ "rstrip": false,
440
+ "single_word": false,
441
+ "special": false
442
+ },
443
+ "55": {
444
+ "content": "<extra_id_55>",
445
+ "lstrip": false,
446
+ "normalized": true,
447
+ "rstrip": false,
448
+ "single_word": false,
449
+ "special": false
450
+ },
451
+ "56": {
452
+ "content": "<extra_id_56>",
453
+ "lstrip": false,
454
+ "normalized": true,
455
+ "rstrip": false,
456
+ "single_word": false,
457
+ "special": false
458
+ },
459
+ "57": {
460
+ "content": "<extra_id_57>",
461
+ "lstrip": false,
462
+ "normalized": true,
463
+ "rstrip": false,
464
+ "single_word": false,
465
+ "special": false
466
+ },
467
+ "58": {
468
+ "content": "<extra_id_58>",
469
+ "lstrip": false,
470
+ "normalized": true,
471
+ "rstrip": false,
472
+ "single_word": false,
473
+ "special": false
474
+ },
475
+ "59": {
476
+ "content": "<extra_id_59>",
477
+ "lstrip": false,
478
+ "normalized": true,
479
+ "rstrip": false,
480
+ "single_word": false,
481
+ "special": false
482
+ },
483
+ "60": {
484
+ "content": "<extra_id_60>",
485
+ "lstrip": false,
486
+ "normalized": true,
487
+ "rstrip": false,
488
+ "single_word": false,
489
+ "special": false
490
+ },
491
+ "61": {
492
+ "content": "<extra_id_61>",
493
+ "lstrip": false,
494
+ "normalized": true,
495
+ "rstrip": false,
496
+ "single_word": false,
497
+ "special": false
498
+ },
499
+ "62": {
500
+ "content": "<extra_id_62>",
501
+ "lstrip": false,
502
+ "normalized": true,
503
+ "rstrip": false,
504
+ "single_word": false,
505
+ "special": false
506
+ },
507
+ "63": {
508
+ "content": "<extra_id_63>",
509
+ "lstrip": false,
510
+ "normalized": true,
511
+ "rstrip": false,
512
+ "single_word": false,
513
+ "special": false
514
+ }
515
+ },
516
+ "additional_special_tokens": [
517
+ "<repo_name>",
518
+ "<file_sep>",
519
+ "<t2v_token>",
520
+ "<v2t_token>",
521
+ "<|start_header_id|>",
522
+ "<|end_header_id|>",
523
+ "<|eot_id|>",
524
+ "<extra_id_12>",
525
+ "<extra_id_13>",
526
+ "<extra_id_14>",
527
+ "<extra_id_15>",
528
+ "<extra_id_16>",
529
+ "<extra_id_17>",
530
+ "<extra_id_18>",
531
+ "<extra_id_19>",
532
+ "<extra_id_20>",
533
+ "<extra_id_21>",
534
+ "<extra_id_22>",
535
+ "<extra_id_23>",
536
+ "<extra_id_24>",
537
+ "<extra_id_25>",
538
+ "<extra_id_26>",
539
+ "<extra_id_27>",
540
+ "<extra_id_28>",
541
+ "<extra_id_29>",
542
+ "<extra_id_30>",
543
+ "<extra_id_31>",
544
+ "<extra_id_32>",
545
+ "<extra_id_33>",
546
+ "<extra_id_34>",
547
+ "<extra_id_35>",
548
+ "<extra_id_36>",
549
+ "<extra_id_37>",
550
+ "<extra_id_38>",
551
+ "<extra_id_39>",
552
+ "<extra_id_40>",
553
+ "<extra_id_41>",
554
+ "<extra_id_42>",
555
+ "<extra_id_43>",
556
+ "<extra_id_44>",
557
+ "<extra_id_45>",
558
+ "<extra_id_46>",
559
+ "<extra_id_47>",
560
+ "<extra_id_48>",
561
+ "<extra_id_49>",
562
+ "<extra_id_50>",
563
+ "<extra_id_51>",
564
+ "<extra_id_52>",
565
+ "<extra_id_53>",
566
+ "<extra_id_54>",
567
+ "<extra_id_55>",
568
+ "<extra_id_56>",
569
+ "<extra_id_57>",
570
+ "<extra_id_58>",
571
+ "<extra_id_59>",
572
+ "<extra_id_60>",
573
+ "<extra_id_61>",
574
+ "<extra_id_62>",
575
+ "<extra_id_63>"
576
+ ],
577
+ "auto_map": {
578
+ "AutoProcessor": "processing_evabyte.EvaByteProcessor",
579
+ "AutoTokenizer": [
580
+ "tokenization_evabyte.EvaByteTokenizer",
581
+ null
582
+ ]
583
+ },
584
+ "bos_token": "<bos>",
585
+ "chat_template": "\n{{- bos_token }}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{{- '<|start_header_id|>system<|end_header_id|>\n\n' + system_message + '<|eot_id|>'}}\n\n{%- for message in messages %}\n {%- if (message['role'] != 'user') and (message['role'] != 'assistant') %}\n {{- raise_exception('Conversation roles must be user or assistant') }}\n {%- endif %}\n\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }}\n{%- endfor %}\n\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>' + 'assistant' + '<|end_header_id|>\n\n' }}\n{%- endif %}\n",
586
+ "clean_up_tokenization_spaces": false,
587
+ "eos_token": "<eos>",
588
+ "extra_ids": 0,
589
+ "extra_special_tokens": {},
590
+ "model_max_length": 1000000000000000019884624838656,
591
+ "pad_token": "<pad>",
592
+ "processor_class": "EvaByteProcessor",
593
+ "sep_token": "<sep>",
594
+ "tokenizer_class": "EvaByteTokenizer",
595
+ "unk_token": "<unk>"
596
+ }
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/README.md ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
4
+ # EvaByte Model Card
5
+
6
+ **EvaByte** is a 6.5B **byte-level language model** built upon an improved architecture with multibyte prediction and EVA -- an efficient attention mechanism designed for scalability and performance. Trained on 1.5T bytes spanning natural language text, math, and code, EvaByte demonstrates the viability of efficient byte-level processing at scale -- rivaling top open-source tokenizer-based LMs using 5x less training data, excelling in coding tasks, and decoding up to 2x faster.
7
+
8
+ ## Model Resources
9
+
10
+ - **Repository:** https://github.com/openevabyte/evabyte
11
+ - **Blog:** https://hkunlp.github.io/blog/2025/evabyte and https://sambanova.ai/blog/evabyte-efficient-byte-level-language-models-at-scale
12
+ - **Paper:** Coming soon
13
+
14
+ ## Model Details
15
+
16
+ EvaByte is trained using the performant SambaNova SN30 RDU system with a batch size of 8M bytes and 32K context length. The training process consists of 3 phases: after pre-training on 1.2T bytes (yielding **EvaByte-Phase1**), two independent annealing runs (100B and 200B bytes respectively) are conducted with learning rate linearly decayed from 1e-4 to 0. The resulting checkpoints are merged via model soup (**EvaByte**), which then undergoes supervised fine-tuning (**EvaByte-SFT**).
17
+
18
+ | Stage | Model |
19
+ |:----- |:-----|
20
+ | Base (before annealing) | [EvaByte-Phase1](https://huggingface.co/evabyte/EvaByte-Phase1) |
21
+ | Base | [EvaByte](https://huggingface.co/evabyte/EvaByte) <-- you are here |
22
+ | SFT | [EvaByte-SFT](https://huggingface.co/evabyte/EvaByte-SFT) |
23
+
24
+
25
+ ## Usage
26
+
27
+ **Note:** Make sure to set `trust_remote_code=True` when loading the model (or tokenizer), as our implementation includes custom code.
28
+
29
+ The code snippet below demonstrates EvaByte-6.5B for completion:
30
+
31
+ ```python
32
+ from transformers import AutoTokenizer, AutoModelForCausalLM
33
+ import torch
34
+
35
+ # Load model and tokenizer
36
+ tokenizer = AutoTokenizer.from_pretrained("evabyte/EvaByte", trust_remote_code=True)
37
+ model = AutoModelForCausalLM.from_pretrained("evabyte/EvaByte", torch_dtype=torch.bfloat16, trust_remote_code=True).eval().to("cuda")
38
+
39
+ prompt = "The quick brown fox jumps "
40
+
41
+ # Tokenize input
42
+ # Option 1: standard HF tokenizer interface
43
+ input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
44
+
45
+ # Option 2: Direct UTF-8 byte encoding with offset
46
+ # Note: Each byte is offset by 64 with <bos> prepended.
47
+ input_ids = torch.tensor([[1] + [b + 64 for b in prompt.encode("utf-8")]]).to("cuda")
48
+
49
+ # byte-by-byte generation (default)
50
+ generation_output = model.generate(
51
+ input_ids=input_ids,
52
+ max_new_tokens=32
53
+ )
54
+ # alternatively, use faster multibyte generation
55
+ generation_output = model.multi_byte_generate(
56
+ input_ids=input_ids,
57
+ max_new_tokens=32
58
+ )
59
+
60
+ # Decode and print the output
61
+ response = tokenizer.decode(
62
+ generation_output[0][input_ids.shape[1]:],
63
+ skip_special_tokens=False,
64
+ clean_up_tokenization_spaces=False
65
+ )
66
+ print(response)
67
+ # Sample output:
68
+ # over the lazy dog.\n\nThe quick
69
+ ```
70
+
71
+ ### ⚙️ Generation Modes
72
+
73
+ EvaByte supports two generation interfaces:
74
+ - `model.generate()`: The default generation method compatible with Huggingface `transformers` library. This approach generates one byte at a time and might be slow.
75
+ - `model.multi_byte_generate()`: A faster alternative that generates multiple bytes per step and usually yields the same result as `model.generate()` under greedy decoding, with the implementation adapted from [Medusa](https://github.com/FasterDecoding/Medusa). `model.multi_byte_generate()` supports a subset of arguments in `model.generate()`:
76
+ - `input_ids`: the input byte ids.
77
+ - `temperature`: the temperature for sampling.
78
+ - `max_length`: the maximum length of the generated sequence.
79
+ - `max_new_tokens`: the maximum number of new bytes to generate.
80
+ - `stopping_criteria`: the [stopping criteria](https://huggingface.co/docs/transformers/v4.47.1/en/internal/generation_utils#transformers.StoppingCriteria) for generation.
81
+ - `top_p`: the top-p parameter for sampling.
82
+ - `do_sample`: greedy decoding or sampling.
83
+
84
+ **Notes and Limitations:**
85
+ - `device_map="auto"` is not supported for >2 GPUs.
86
+ - Only batch size of 1 (with `attention_mask=None`) is supported for decoding.
87
+ - `torch_dtype=torch.bfloat16` is required.
88
+ - The multibyte generation `model.multi_byte_generate()` might return extra bytes after the end-of-sequence sentinel, due to the nature of the multibyte decoding. Manual truncation or cleaning may be needed.
89
+
90
+ ## Bias, Risks, and Limitations
91
+ As a pretrained base model, **EvaByte** has not been fine-tuned for chat or instruction following, so users should not expect reliable performance in conversational or instruction-based tasks. Like other base models, it does not incorporate any moderation mechanisms, making it possible to generate potentially harmful or inappropriate content.
92
+
93
+ ## Evaluation
94
+
95
+ For detailed evaluation results, check out our blog post at [SambaNova](https://sambanova.ai/blog/evabyte-efficient-byte-level-language-models-at-scale) or [HKUNLP](https://hkunlp.github.io/blog/2025/evabyte).
96
+
97
+ ## Citation
98
+ ```bibtex
99
+ @misc{evabyte,
100
+ title = {EvaByte: Efficient Byte-level Language Models at Scale},
101
+ url = {https://hkunlp.github.io/blog/2025/evabyte},
102
+ author = {Lin Zheng and Xueliang Zhao and Guangtao Wang and Chen Wu and David Dong and Angela Wang and Mingran Wang and Yun Du and Haige Bo and Amol Sharma and Bo Li and Kejie Zhang and Changran Hu and Urmish Thakker and Lingpeng Kong},
103
+ year = {2025}
104
+ }
105
+ ```
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": null,
3
+ "architectures": [
4
+ "EvaByteForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_class": "eva",
8
+ "attention_dropout": 0.0,
9
+ "auto_map": {
10
+ "AutoConfig": "configuration_evabyte.EvaByteConfig",
11
+ "AutoModelForCausalLM": "modeling_evabyte.EvaByteForCausalLM"
12
+ },
13
+ "bos_token_id": 1,
14
+ "chunk_size": 16,
15
+ "eos_token_id": 2,
16
+ "fp32_ln": true,
17
+ "fp32_logits": true,
18
+ "fp32_skip_add": false,
19
+ "hidden_act": "silu",
20
+ "hidden_size": 5120,
21
+ "init_cutoff_factor": null,
22
+ "init_fn": "v2",
23
+ "init_std": 0.01275,
24
+ "initializer_range": 0.01275,
25
+ "intermediate_size": 16384,
26
+ "lazy_init": true,
27
+ "max_position_embeddings": 16384,
28
+ "max_seq_length": 16384,
29
+ "mixedp_attn": true,
30
+ "model_type": "evabyte",
31
+ "norm_add_unit_offset": true,
32
+ "num_attention_heads": 40,
33
+ "num_chunks": null,
34
+ "num_hidden_layers": 40,
35
+ "num_key_value_heads": 40,
36
+ "num_pred_heads": 1,
37
+ "pad_token_id": 0,
38
+ "return_dict": false,
39
+ "rms_norm_eps": 1e-06,
40
+ "rope_scaling": null,
41
+ "rope_theta": 100000.0,
42
+ "tie_word_embeddings": false,
43
+ "torch_dtype": "bfloat16",
44
+ "transformers_version": "4.47.1",
45
+ "use_cache": true,
46
+ "vocab_size": 320,
47
+ "window_size": 2048
48
+ }
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/configuration_evabyte.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ EvaByte configuration"""
2
+
3
+ from transformers.configuration_utils import PretrainedConfig
4
+
5
+ class EvaByteConfig(PretrainedConfig):
6
+ model_type = "evabyte"
7
+ keys_to_ignore_at_inference = ["past_key_values"]
8
+
9
+ def __init__(
10
+ self,
11
+ vocab_size=320,
12
+ hidden_size=4096,
13
+ intermediate_size=11008,
14
+ num_hidden_layers=32,
15
+ num_attention_heads=32,
16
+ num_key_value_heads=None,
17
+ hidden_act="silu",
18
+ max_position_embeddings=2048,
19
+ initializer_range=0.02,
20
+ rms_norm_eps=1e-6,
21
+ use_cache=True,
22
+ pad_token_id=None,
23
+ bos_token_id=1,
24
+ eos_token_id=2,
25
+ tie_word_embeddings=False,
26
+ rope_theta=10000.0,
27
+ rope_scaling=None,
28
+ attention_bias=False,
29
+ attention_dropout=0.0,
30
+ norm_add_unit_offset=False,
31
+ init_fn="mitchell",
32
+ init_std=0.006,
33
+ init_cutoff_factor=None,
34
+ attention_class="mha",
35
+ window_size=512,
36
+ num_chunks=None,
37
+ chunk_size=256,
38
+ **kwargs,
39
+ ):
40
+ self.vocab_size = vocab_size
41
+ self.max_position_embeddings = max_position_embeddings
42
+ self.hidden_size = hidden_size
43
+ self.intermediate_size = intermediate_size
44
+ self.num_hidden_layers = num_hidden_layers
45
+ self.num_attention_heads = num_attention_heads
46
+
47
+ # for backward compatibility
48
+ if num_key_value_heads is None:
49
+ num_key_value_heads = num_attention_heads
50
+
51
+ self.num_key_value_heads = num_key_value_heads
52
+ self.hidden_act = hidden_act
53
+ self.initializer_range = initializer_range
54
+ self.rms_norm_eps = rms_norm_eps
55
+ self.use_cache = use_cache
56
+ self.rope_theta = rope_theta
57
+ self.rope_scaling = rope_scaling
58
+ self._rope_scaling_validation()
59
+ self.attention_bias = attention_bias
60
+ self.attention_dropout = attention_dropout
61
+
62
+ self.norm_add_unit_offset = norm_add_unit_offset
63
+ self.init_fn = init_fn
64
+ self.init_std = init_std
65
+ self.init_cutoff_factor = init_cutoff_factor
66
+
67
+ # Attention-specific paramters
68
+ self.attention_class = attention_class
69
+ self.window_size = window_size
70
+ self.num_chunks = num_chunks
71
+ self.chunk_size = chunk_size
72
+
73
+ super().__init__(
74
+ pad_token_id=pad_token_id,
75
+ bos_token_id=bos_token_id,
76
+ eos_token_id=eos_token_id,
77
+ tie_word_embeddings=tie_word_embeddings,
78
+ **kwargs,
79
+ )
80
+
81
+ def _rope_scaling_validation(self):
82
+ """
83
+ Validate the `rope_scaling` configuration.
84
+ """
85
+ if self.rope_scaling is None:
86
+ return
87
+
88
+ if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
89
+ raise ValueError(
90
+ "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, " f"got {self.rope_scaling}"
91
+ )
92
+ rope_scaling_type = self.rope_scaling.get("type", None)
93
+ rope_scaling_factor = self.rope_scaling.get("factor", None)
94
+ if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
95
+ raise ValueError(
96
+ f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
97
+ )
98
+ if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
99
+ raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/eva.py ADDED
@@ -0,0 +1,424 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Optional, Tuple, List, Any, Union
2
+ import torch
3
+ from torch import nn
4
+ import torch.nn.functional as F
5
+ from .eva_agg_kernel import eva_agg_func_triton
6
+ from .eva_prep_kv_kernel import eva_prep_kv_func_triton
7
+ try:
8
+ import triton
9
+ USE_TRITON_IMPL = True
10
+ except ImportError:
11
+ USE_TRITON_IMPL = False
12
+ raise ImportError("Triton is not installed. Please install it by running `pip install triton`.")
13
+
14
+ def rotate_half(x: torch.Tensor) -> torch.Tensor:
15
+ """
16
+ Rotates half the hidden dims (last dim) of the input.
17
+ Args:
18
+ x: Rotary embedded tensor
19
+ Return:
20
+ Tensor with half of last dim negated and rotated to the front.
21
+ """
22
+ x1, x2 = x.split(x.shape[-1] // 2, dim=-1)
23
+ return torch.cat((-x2, x1), dim=-1)
24
+
25
+ def apply_rotary_pos_emb(q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor,
26
+ position_ids: torch.Tensor) -> torch.Tensor:
27
+ """
28
+ Apply rotary embedding (cos, sin) to the query and key tensor on the sequence dimension.
29
+
30
+ The legends for dimensions are defined as:
31
+ num_heads: number of attention heads
32
+ current_seq_len: the current batch's sequence length, should be either 1 or max_seq_len
33
+ max_seq_len: the static sequence length, different from current_seq_len in cached inference case where it is always
34
+ maximum lenghth, e.g. the length of static sequence length of KV cache
35
+
36
+
37
+ Args:
38
+ q: Query tensor, of size (batch_size, num_heads, current_seq_len, head_dim)
39
+ k: Key tensor, of size (batch_size, num_key_value_heads, current_seq_len, head_dim)
40
+ cos: Cosine base of rotary embedding, of size (max_seq_len, head_dim)
41
+ sin: Sine base of rotary embedding, of size (max_seq_len, head_dim)
42
+ position_ids: The position indices of the tokens corresponding to the query and key tensors. It has a size of
43
+ (batch_size, current_seq_len).
44
+
45
+ Returns:
46
+ Embedded query and key tensor of same size as input.
47
+
48
+ """
49
+ bs, nheads, cur_seq_len, head_dim = q.shape
50
+ assert len(
51
+ k.shape) == 4, f"k should be of shape (batch_size, num_heads, current_seq_len, head_dim), got {k.shape} instead"
52
+ assert k.shape[0] == bs, f"k has a different batch_size {k.shape[0]} compared to q {bs}"
53
+ assert list(k.shape[2:]) == [cur_seq_len,
54
+ head_dim], f"k has different current_seq_len and/or head_dim compared to q"
55
+ assert cos.shape[3] == head_dim, f"cos should have dim of head dim {head_dim}, got {cos.shape[3]} instead"
56
+ assert list(position_ids.shape) in [[bs, cur_seq_len], [1, cur_seq_len]],\
57
+ f"position_ids should be of shape {[bs, cur_seq_len]} or {[1, cur_seq_len]}, got {position_ids.shape} instead"
58
+
59
+ q_embed = (q * cos) + (rotate_half(q) * sin)
60
+ k_embed = (k * cos) + (rotate_half(k) * sin)
61
+ return q_embed, k_embed
62
+
63
+ class EvaAttention(nn.Module):
64
+ """
65
+ Causal EVA for language modeling.
66
+ """
67
+
68
+ def __init__(self, config, layer_idx: Optional[int] = None):
69
+ super().__init__()
70
+ self.config = config
71
+ self.layer_idx = layer_idx
72
+ self.hidden_size = config.hidden_size
73
+ self.num_heads = config.num_attention_heads
74
+ self.head_dim = self.hidden_size // self.num_heads
75
+ self.head_dim_scaling = self.head_dim ** -0.5
76
+
77
+ self.max_position_embeddings = config.max_position_embeddings
78
+
79
+ if (self.head_dim * self.num_heads) != self.hidden_size:
80
+ raise ValueError(
81
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
82
+ f" and `num_heads`: {self.num_heads})."
83
+ )
84
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
85
+ self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
86
+ self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
87
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
88
+
89
+ self.window_size = config.window_size
90
+
91
+ self.num_chunks = config.num_chunks
92
+ self.chunk_size = config.chunk_size
93
+ if self.chunk_size is not None:
94
+ assert self.window_size >= self.chunk_size and self.window_size % self.chunk_size == 0
95
+ # chunk_size overrides the number of landmarks
96
+ self.num_chunks = None
97
+
98
+ self.chunks_per_window = int(self.window_size // self.chunk_size)
99
+ self.adaptive_phi = nn.Parameter(
100
+ torch.randn(
101
+ 1,
102
+ self.num_heads,
103
+ 1,
104
+ 1,
105
+ self.head_dim
106
+ ).clamp(-1., 1.) * self.head_dim_scaling
107
+ )
108
+ self.adaptive_mu_k = nn.Parameter(
109
+ torch.randn(
110
+ 1,
111
+ self.num_heads,
112
+ 1,
113
+ 1,
114
+ self.head_dim
115
+ ).clamp(-1., 1.) * self.head_dim_scaling
116
+ )
117
+
118
+ def _triton_forward(
119
+ self,
120
+ hidden_states: torch.Tensor,
121
+ attention_mask: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None,
122
+ position_ids: Optional[torch.LongTensor] = None,
123
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
124
+ output_attentions: bool = False,
125
+ use_cache: bool = False,
126
+ cos: Optional[torch.Tensor] = None,
127
+ sin: Optional[torch.Tensor] = None,
128
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
129
+ assert not output_attentions
130
+ bsz, q_len, _ = hidden_states.size()
131
+
132
+ if use_cache:
133
+ if past_key_value is None:
134
+ raise ValueError
135
+ assert isinstance(attention_mask, tuple)
136
+
137
+ # infer the model's running mode
138
+ is_prefilling = use_cache and past_key_value.get_seq_length(self.layer_idx) == 0
139
+ is_decoding = use_cache and past_key_value.get_seq_length(self.layer_idx) > 0
140
+
141
+ if is_prefilling:
142
+ assert len(attention_mask) == 2
143
+ window_mask, intra_chunk_mask = attention_mask
144
+ chunk_mask = None
145
+ elif is_decoding:
146
+ assert len(attention_mask) == 3
147
+ window_mask, intra_chunk_mask, chunk_mask = attention_mask
148
+ else:
149
+ if attention_mask is not None:
150
+ assert isinstance(attention_mask, tuple) and len(attention_mask) == 3
151
+ window_mask, chunk_mask, intra_chunk_mask = attention_mask
152
+ else:
153
+ window_mask, chunk_mask, intra_chunk_mask = None, None, None
154
+
155
+ ############################################
156
+ # compute q, k, v from hidden states
157
+ ############################################
158
+ # [b, h, q_len, d]
159
+ q = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
160
+ # [b, h, kv_len, d]
161
+ k = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
162
+ # [b, h, kv_len, d]
163
+ v = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
164
+
165
+ if use_cache:
166
+ past_key_value.update_past_len(q.shape[-2], self.layer_idx)
167
+
168
+ ############################################
169
+ # apply rotary positional embeddings to q, k
170
+ ############################################
171
+ q, k = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
172
+
173
+ ############################################
174
+ # update and get cached singleton tokens
175
+ # update and cache k and v for calculating chunk-level RFAs
176
+ ############################################
177
+ if use_cache:
178
+ s_k, s_v, dump_k, dump_v = past_key_value.update_singletons_and_chunks(
179
+ k,
180
+ v,
181
+ self.layer_idx,
182
+ self.window_size,
183
+ )
184
+ else:
185
+ s_k, s_v = k, v
186
+ dump_k, dump_v = k, v
187
+
188
+ if use_cache:
189
+ singleton_mask, dump_rf_mask = past_key_value.update_mask(
190
+ s_mask=window_mask,
191
+ rf_mask=intra_chunk_mask,
192
+ layer_idx=self.layer_idx,
193
+ window_size=self.window_size,
194
+ )
195
+ else:
196
+ singleton_mask = window_mask
197
+ dump_rf_mask = intra_chunk_mask
198
+
199
+ if dump_k is not None and dump_v is not None:
200
+ # 1. in prefilling, the input shape is
201
+ # dump_k/dump_v: [b, h, n, d]
202
+ # rfa_k/rfa_v: [b, h, n // c, d]
203
+ # 2. in decoding, the input shape is
204
+ # k/v: [b, h, w, d]
205
+ # rfa_k/rfa_v: [b, h, w//c, d]
206
+ # 3. in forward inference; the seq_len is already divisible
207
+ rfa_k, rfa_v = eva_prep_kv_func_triton(
208
+ dump_k, dump_v,
209
+ self.adaptive_mu_k, self.adaptive_phi,
210
+ dump_rf_mask, self.head_dim_scaling, self.chunk_size
211
+ )
212
+ # rfa_mask = get_rfa_chunk_mask(dump_rf_mask)
213
+ if use_cache:
214
+ rfa_k, rfa_v = past_key_value.update_chunk_rfas(
215
+ rfa_k, rfa_v, self.layer_idx
216
+ )
217
+ elif use_cache:
218
+ # if there are not enough elements within the last chunk,
219
+ # we will only use the cached chunk-level RFAs
220
+ rfa_k, rfa_v = past_key_value.get_chunk_rfas(self.layer_idx)
221
+ else:
222
+ rfa_k, rfa_v = None, None
223
+
224
+ ############################################
225
+ # compute the full attention output
226
+ ############################################
227
+ if is_prefilling:
228
+ # prefilling
229
+ # 1. in prefilling, the input shape is
230
+ # q: [b, h, n, d]
231
+ # k/v: [b, h, n, d]
232
+ # rfa_k/rfa_v: [b, h, n // c, d]
233
+ attn_output = eva_agg_func_triton(
234
+ q, s_k, s_v,
235
+ rfa_k, rfa_v,
236
+ singleton_mask, chunk_mask,
237
+ self.head_dim_scaling, self.window_size, self.chunks_per_window
238
+ )
239
+ elif is_decoding:
240
+ # 2. in decoding, the input shape is
241
+ # q: [b, h, 1, d] or [b, h, z, d] (for multi-byte prediction)
242
+ # k/v: [b, h, 1 + s, d]
243
+ # rfa_k/rfa_v: [b, h, n // c, d]
244
+ if rfa_k is not None and rfa_v is not None:
245
+ # we only take the chunk-level RFAs not in the current window
246
+ seen_seq_len = past_key_value.get_seq_length(self.layer_idx)
247
+ if seen_seq_len <= self.window_size:
248
+ agg_k = s_k
249
+ agg_v = s_v
250
+ attn_mask = singleton_mask
251
+ else:
252
+ # NOTE: we already updated the cache so the length now
253
+ # includes the current token
254
+ # we subtract 1 from seen_seq_len because we want
255
+ # if seen_seq_len = 2048 -> num_windows_seen_so_far = 0
256
+ # if seen_seq_len = 4096 -> num_windows_seen_so_far = 1
257
+ # if seen_seq_len = 4097 -> num_windows_seen_so_far = 2
258
+ # NOTE the cat order should be taken care of;
259
+ # should align with the order based on which
260
+ # the attention mask is constructed
261
+ num_windows_seen_so_far = (seen_seq_len - 1) // self.window_size
262
+ agg_k = torch.cat([s_k, rfa_k[..., :num_windows_seen_so_far * self.chunks_per_window, :]], dim=-2)
263
+ agg_v = torch.cat([s_v, rfa_v[..., :num_windows_seen_so_far * self.chunks_per_window, :]], dim=-2)
264
+ if singleton_mask is not None:
265
+ assert chunk_mask is not None
266
+ attn_mask = torch.cat([singleton_mask, chunk_mask], dim=-1)
267
+ else:
268
+ attn_mask = singleton_mask
269
+ else:
270
+ agg_k = s_k
271
+ agg_v = s_v
272
+ attn_mask = singleton_mask
273
+ attn_output = F.scaled_dot_product_attention(
274
+ q, agg_k, agg_v,
275
+ attn_mask=attn_mask,
276
+ is_causal=False,
277
+ dropout_p=0.0,
278
+ scale=self.head_dim_scaling
279
+ )
280
+ else:
281
+ # 3. in single-forward inference
282
+ attn_output = eva_agg_func_triton(
283
+ q, s_k, s_v,
284
+ rfa_k, rfa_v,
285
+ singleton_mask, chunk_mask,
286
+ self.head_dim_scaling, self.window_size, self.chunks_per_window
287
+ )
288
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
289
+ raise ValueError(
290
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
291
+ f" {attn_output.size()}"
292
+ )
293
+ attn_output = attn_output.transpose(1, 2).reshape(bsz, q_len, self.hidden_size)
294
+ attn_output = self.o_proj(attn_output)
295
+ attn_weights = None
296
+ return attn_output, attn_weights, past_key_value
297
+
298
    def _multibyte_decoding_forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cos: Optional[torch.Tensor] = None,
        sin: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Attention forward pass used during multi-byte (draft) decoding.

        Unlike the regular decoding path, this method does NOT advance the
        cache bookkeeping: it writes the new k/v into the preallocated
        window buffer in place but leaves ``past_window_pos`` untouched, so
        the writes are provisional and can be overwritten/trimmed once the
        drafted bytes are accepted or rejected.

        Args:
            hidden_states: [bsz, q_len, hidden_size] input activations,
                where q_len > 1 covers the drafted byte positions.
            attention_mask: boolean mask (required here); must be laid out
                as [chunk-level RFA slots, window slots] — see the cat
                order below.
            position_ids / cos / sin: rotary embedding inputs.
            past_key_value: EVA cache holding the window k/v buffers and
                chunk-level RFA states (required; must be non-empty).

        Returns:
            (attn_output, None, past_key_value) — attention weights are
            never materialized (``output_attentions`` must be False).
        """
        assert not output_attentions
        bsz, q_len, _ = hidden_states.size()

        if use_cache and past_key_value is None:
            raise ValueError

        assert USE_TRITON_IMPL
        # this path relies on an explicit boolean mask built by the caller
        assert isinstance(attention_mask, torch.Tensor) and attention_mask.dtype == torch.bool

        # multi-byte decoding only runs after at least one token was prefetched
        assert use_cache and past_key_value.get_seq_length(self.layer_idx) > 0

        ############################################
        # compute q, k, v from hidden states
        ############################################
        # [b, h, q_len, d]
        q = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        # [b, h, kv_len, d]
        k = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        # [b, h, kv_len, d]
        v = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)

        ############################################
        # apply rotary positional embeddings to q, k
        ############################################
        q, k = apply_rotary_pos_emb(q, k, cos, sin, position_ids)

        ############################################
        # write k/v into the window buffer (provisionally)
        ############################################
        input_len = k.shape[-2]
        window_pos = past_key_value.past_window_pos[self.layer_idx]
        new_window_pos = window_pos + input_len

        # in-place slot writes; past_window_pos itself is NOT advanced here,
        # so a later call can overwrite these draft positions
        past_key_value.past_window_k[self.layer_idx][:, :, window_pos : new_window_pos, :] = k
        past_key_value.past_window_v[self.layer_idx][:, :, window_pos : new_window_pos, :] = v
        s_k = past_key_value.past_window_k[self.layer_idx][:, :, : new_window_pos, :]
        s_v = past_key_value.past_window_v[self.layer_idx][:, :, : new_window_pos, :]

        rfa_k, rfa_v = past_key_value.get_chunk_rfas(self.layer_idx)

        ############################################
        # compute the full attention output
        ############################################
        # shapes at this point:
        #   q:           [b, h, z, d]  (z = number of drafted bytes)
        #   s_k/s_v:     [b, h, window_len, d]
        #   rfa_k/rfa_v: [b, h, n // c, d]
        if rfa_k is not None and rfa_v is not None:
            # NOTE: the concatenation order (RFA states first, then window
            # k/v) differs from the regular decoding path and must match
            # the layout of the attention_mask built by the caller.
            agg_k = torch.cat([rfa_k, s_k], dim=-2)
            agg_v = torch.cat([rfa_v, s_v], dim=-2)
        else:
            agg_k = s_k
            agg_v = s_v
        attn_output = F.scaled_dot_product_attention(
            q, agg_k, agg_v,
            attn_mask=attention_mask,
            is_causal=False,
            dropout_p=0.0,
            scale=self.head_dim_scaling
        )

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )
        attn_output = attn_output.transpose(1, 2).reshape(bsz, q_len, self.hidden_size)
        attn_output = self.o_proj(attn_output)
        attn_weights = None
        return attn_output, attn_weights, past_key_value
385
+
386
+ def forward(
387
+ self,
388
+ hidden_states: torch.Tensor,
389
+ attention_mask: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None,
390
+ position_ids: Optional[torch.LongTensor] = None,
391
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
392
+ output_attentions: bool = False,
393
+ use_cache: bool = False,
394
+ cos: Optional[torch.Tensor] = None,
395
+ sin: Optional[torch.Tensor] = None,
396
+ multibyte_decoding: Optional[bool] = False,
397
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
398
+ assert not output_attentions
399
+ if use_cache and past_key_value is None:
400
+ raise ValueError
401
+
402
+ assert USE_TRITON_IMPL
403
+ if use_cache and multibyte_decoding:
404
+ return self._multibyte_decoding_forward(
405
+ hidden_states,
406
+ attention_mask=attention_mask,
407
+ position_ids=position_ids,
408
+ past_key_value=past_key_value,
409
+ output_attentions=output_attentions,
410
+ use_cache=use_cache,
411
+ cos=cos,
412
+ sin=sin,
413
+ )
414
+ else:
415
+ return self._triton_forward(
416
+ hidden_states,
417
+ attention_mask=attention_mask,
418
+ position_ids=position_ids,
419
+ past_key_value=past_key_value,
420
+ output_attentions=output_attentions,
421
+ use_cache=use_cache,
422
+ cos=cos,
423
+ sin=sin,
424
+ )
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/eva_agg_kernel.py ADDED
@@ -0,0 +1,1766 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import math
3
+ import torch
4
+ import triton
5
+ import triton.language as tl
6
+
7
@triton.heuristics(
    {
        "EVEN_M": lambda args: args["seqlen_q"] % args["BLOCK_M"] == 0,
        "EVEN_N": lambda args: args["seqlen_k"] % args["BLOCK_N"] == 0,
        "EVEN_W": lambda args: args["WINDOW_SIZE"] % args["BLOCK_N"] == 0,
        "EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"],
    }
)
@triton.jit
def _bwd_eva_agg_kernel_dkdv(
    Q,
    K,
    V,
    WindowMask,
    DO,
    LSE,
    DO_T_O,
    DK,
    DV,
    softmax_scale,
    stride_qb, stride_qh, stride_qm,
    stride_kb, stride_kh, stride_kn,
    stride_vb, stride_vh, stride_vn,
    stride_window_mask_b, stride_window_mask_m,
    stride_do_b, stride_do_h, stride_do_m,
    stride_lse_b, stride_lse_h,
    stride_do_t_o_b, stride_do_t_o_h,
    stride_dk_b, stride_dk_h, stride_dk_n,
    stride_dv_b, stride_dv_h, stride_dv_n,
    nheads,
    seqlen_q,
    seqlen_k,
    headdim,
    WINDOW_SIZE: tl.constexpr,
    MASK_TYPE: tl.constexpr,
    BLOCK_HEADDIM: tl.constexpr,
    EVEN_M: tl.constexpr,
    EVEN_N: tl.constexpr,
    EVEN_W: tl.constexpr,
    EVEN_HEADDIM: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
):
    """Backward kernel computing dK/dV for the singleton (in-window) tokens.

    Grid: (ceil(seqlen_k / BLOCK_N), batch * nheads). Each program owns one
    BLOCK_N slice of keys/values and accumulates its gradient contribution
    over all query rows that can attend to it — i.e. queries from the first
    block containing this KV block up to the end of the enclosing window.

    Inputs follow the FlashAttention backward recipe: LSE holds the
    log-sum-exp of the forward softmax, DO_T_O holds rowwise sum(dO * O).
    MASK_TYPE == 1 means an explicit per-window boolean mask is supplied;
    its convention here is True == masked out (mask padding loads use
    other=1). MASK_TYPE == 0 falls back to plain causal masking.
    """
    # map the flat program id onto (batch, head)
    off_bh = tl.program_id(1)
    off_h = off_bh % nheads
    off_b = off_bh // nheads

    start_n = tl.program_id(0)
    # determine which window the current KV block belongs to
    offs_w = (start_n * BLOCK_N) // WINDOW_SIZE
    offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N)
    offs_m = tl.arange(0, BLOCK_M)
    offs_d = tl.arange(0, BLOCK_HEADDIM)

    # initialize pointers (q/do/lse/do_t_o are offset per iteration below)
    q_ptrs = (
        Q +
        off_b * stride_qb +
        off_h * stride_qh +
        offs_m[:, None] * stride_qm + offs_d[None, :]
    )
    k_ptrs = (
        K +
        off_b * stride_kb +
        off_h * stride_kh +
        offs_n[:, None] * stride_kn + offs_d[None, :]
    )
    v_ptrs = (
        V +
        off_b * stride_vb +
        off_h * stride_vh +
        offs_n[:, None] * stride_vn + offs_d[None, :]
    )
    do_ptrs = (
        DO +
        off_b * stride_do_b +
        off_h * stride_do_h +
        offs_m[:, None] * stride_do_m + offs_d[None, :]
    )
    do_t_o_ptrs = (
        DO_T_O +
        off_b * stride_do_t_o_b +
        off_h * stride_do_t_o_h +
        offs_m[:, None]
    )
    lse_ptrs = (
        LSE +
        off_b * stride_lse_b +
        off_h * stride_lse_h +
        offs_m[:, None]
    )
    if MASK_TYPE == 1:
        # the window mask is indexed by absolute query row and by the
        # key's position *within* its window (offset applied at load time)
        m_ptrs = (
            WindowMask +
            off_b * stride_window_mask_b +
            (offs_m[:, None] * stride_window_mask_m + offs_n[None, :])
        )
    dk_ptrs = (
        DK +
        off_b * stride_dk_b +
        off_h * stride_dk_h +
        offs_n[:, None] * stride_dk_n + offs_d[None, :]
    )
    dv_ptrs = (
        DV +
        off_b * stride_dv_b +
        off_h * stride_dv_h +
        offs_n[:, None] * stride_dv_n + offs_d[None, :]
    )

    # 1. for singletons
    # query rows range from the block containing this KV slice (causality)
    # to the end of its window (singletons are only visible in-window)
    begin_m = ((start_n * BLOCK_N) // BLOCK_M) * BLOCK_M
    end_m = tl.minimum((offs_w + 1) * WINDOW_SIZE, seqlen_q)

    dk = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
    dv = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
    # load this program's K/V tile once; masked lanes contribute zeros
    if EVEN_N & EVEN_M:
        if EVEN_HEADDIM:
            k = tl.load(k_ptrs)
            v = tl.load(v_ptrs)
        else:
            k = tl.load(k_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
            v = tl.load(v_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
    else:
        if EVEN_HEADDIM:
            k = tl.load(k_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0)
            v = tl.load(v_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0)
        else:
            k = tl.load(
                k_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0
            )
            v = tl.load(
                v_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0
            )
    for start_m in range(begin_m, end_m, BLOCK_M):
        start_m = tl.multiple_of(start_m, BLOCK_M)
        # load q, do, and lse
        if EVEN_M & EVEN_N:
            if EVEN_HEADDIM:
                q = tl.load(
                    q_ptrs + start_m * stride_qm
                )
                do = tl.load(
                    do_ptrs + start_m * stride_do_m
                )
            else:
                q = tl.load(
                    q_ptrs + start_m * stride_qm,
                    mask=offs_d[None, :] < headdim,
                    other=0.0
                )
                do = tl.load(
                    do_ptrs + start_m * stride_do_m,
                    mask=offs_d[None, :] < headdim,
                    other=0.0
                )
            do_t_o = tl.load(
                do_t_o_ptrs + start_m
            )
            lse = tl.load(
                lse_ptrs + start_m
            )
        else:
            if EVEN_HEADDIM:
                q = tl.load(
                    q_ptrs + start_m * stride_qm,
                    mask=(start_m + offs_m)[:, None] < seqlen_q,
                    other=0.0
                )
                do = tl.load(
                    do_ptrs + start_m * stride_do_m,
                    mask=(start_m + offs_m)[:, None] < seqlen_q,
                    other=0.0
                )
            else:
                q = tl.load(
                    q_ptrs + start_m * stride_qm,
                    mask=((start_m + offs_m)[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
                    other=0.0
                )
                do = tl.load(
                    do_ptrs + start_m * stride_do_m,
                    mask=((start_m + offs_m)[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
                    other=0.0
                )
            do_t_o = tl.load(
                do_t_o_ptrs + start_m,
                mask=(start_m + offs_m)[:, None] < seqlen_q,
                other=0.0
            )
            lse = tl.load(
                lse_ptrs + start_m,
                mask=(start_m + offs_m)[:, None] < seqlen_q,
                other=0.0
            )
        # fully-masked rows store -inf in LSE; neutralize so exp() below is 0
        lse = tl.where(lse == float("-inf"), 0.0, lse)
        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
        qk += tl.dot(q, tl.trans(k))
        if not EVEN_M:
            qk += tl.where((start_m + offs_m)[:, None] < seqlen_q, 0, float("-inf"))

        if MASK_TYPE == 1:
            if EVEN_M & EVEN_W:
                mask = tl.load(
                    m_ptrs + (start_m * stride_window_mask_m) - (offs_w * WINDOW_SIZE)
                )
            else:
                mask = tl.load(
                    m_ptrs + (start_m * stride_window_mask_m) - (offs_w * WINDOW_SIZE),
                    mask=((start_m + offs_m)[:, None] < seqlen_q)
                    & (((start_m * stride_window_mask_m) - (offs_w * WINDOW_SIZE) + offs_n)[None, :] < WINDOW_SIZE),
                    other=1,
                )
            # Slightly faster to multiply the softmax_scale in the tl.exp below since the compiler
            # can then fuse the mult and add into an fma instruction. But if we have bias we need to
            # to multiply with softmax_scale here.
            # we assume mask already implies the causal masking
            qk = qk * softmax_scale
            qk = tl.where(mask, float("-inf"), qk)
            # recompute forward probabilities from the saved LSE
            p = tl.exp(qk - lse)
        else:
            # no explicit mask: apply plain causal masking
            qk += tl.where((start_m + offs_m)[:, None] >= offs_n[None, :], 0, float("-inf"))
            p = tl.exp(qk * softmax_scale - lse)

        # dp [M, N]
        dp = tl.dot(do, tl.trans(v))
        # p [M, N], dp [M, N], do_t_o [M, 1] -> ds [M, N]
        # (softmax backward: dS = P * (dP - rowsum(dO*O)))
        ds = (p * (dp - do_t_o) * softmax_scale).to(q.dtype)
        # p is fp32 and [M, N], convert to q.dtype
        # do [M, D] -> dv [N, D]
        dv += tl.dot(tl.trans(p.to(do.dtype)), do)
        # dk [N, D]
        dk += tl.dot(tl.trans(ds), q)
    # write back the accumulated fp32 gradients for this KV tile
    if EVEN_N & EVEN_M:
        if EVEN_HEADDIM:
            tl.store(dv_ptrs, dv)
            tl.store(dk_ptrs, dk)
        else:
            tl.store(dv_ptrs, dv, mask=offs_d[None, :] < headdim)
            tl.store(dk_ptrs, dk, mask=offs_d[None, :] < headdim)
    else:
        if EVEN_HEADDIM:
            tl.store(dv_ptrs, dv, mask=offs_n[:, None] < seqlen_k)
            tl.store(dk_ptrs, dk, mask=offs_n[:, None] < seqlen_k)
        else:
            tl.store(dv_ptrs, dv, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim))
            tl.store(dk_ptrs, dk, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim))
+
256
@triton.heuristics(
    {
        "EVEN_M": lambda args: args["seqlen_q"] % args["BLOCK_M"] == 0,
        "EVEN_C": lambda args: args["nchunks"] % args["BLOCK_N"] == 0,
        "EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"],
    }
)
@triton.jit
def _bwd_eva_agg_kernel_drfa_kv(
    Q,
    RFA_K,
    RFA_V,
    ChunkMask,
    DO,
    LSE,
    DO_T_O,
    D_RFA_K,
    D_RFA_V,
    softmax_scale,
    stride_qb, stride_qh, stride_qm,
    stride_rfa_kb, stride_rfa_kh, stride_rfa_kc,
    stride_rfa_vb, stride_rfa_vh, stride_rfa_vc,
    stride_chunk_mask_b, stride_chunk_mask_m,
    stride_do_b, stride_do_h, stride_do_m,
    stride_lse_b, stride_lse_h,
    stride_do_t_o_b, stride_do_t_o_h,
    stride_d_rfa_k_b, stride_d_rfa_k_h, stride_d_rfa_k_c,
    stride_d_rfa_v_b, stride_d_rfa_v_h, stride_d_rfa_v_c,
    nheads,
    seqlen_q,
    nchunks,
    headdim,
    CHUNKS_PER_WINDOW: tl.constexpr,
    WINDOW_SIZE: tl.constexpr,
    MASK_TYPE: tl.constexpr,
    BLOCK_HEADDIM: tl.constexpr,
    EVEN_M: tl.constexpr,
    EVEN_C: tl.constexpr,
    EVEN_HEADDIM: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
):
    """Backward kernel computing gradients of the chunk-level RFA K/V states.

    Grid: (ceil(nchunks / BLOCK_N), batch * nheads). Each program owns one
    BLOCK_N slice of RFA chunks and accumulates the gradient contribution
    from every query row that can attend to it — i.e. all rows strictly
    after the window that produced these chunks (queries only attend to
    RFA states of *past* windows).

    Mirrors _bwd_eva_agg_kernel_dkdv: LSE is the forward log-sum-exp,
    DO_T_O is rowwise sum(dO * O). MASK_TYPE == 1 supplies an explicit
    chunk mask with the True == masked-out convention (padding loads use
    other=1); MASK_TYPE == 0 needs no masking here because all visible
    chunks precede the query rows by construction.
    """
    # map the flat program id onto (batch, head)
    off_bh = tl.program_id(1)
    off_h = off_bh % nheads
    off_b = off_bh // nheads
    start_c = tl.program_id(0)
    # chunk indices covered by this program
    offs_c = start_c * BLOCK_N + tl.arange(0, BLOCK_N)
    # determine which window the current chunk block belongs to
    offs_w = (start_c * BLOCK_N) // CHUNKS_PER_WINDOW
    offs_m = tl.arange(0, BLOCK_M)
    offs_d = tl.arange(0, BLOCK_HEADDIM)

    # initialize pointers (q/do/lse/do_t_o are offset per iteration below)
    q_ptrs = (
        Q +
        off_b * stride_qb +
        off_h * stride_qh +
        (offs_m[:, None] * stride_qm + offs_d[None, :])
    )
    do_ptrs = (
        DO +
        off_b * stride_do_b +
        off_h * stride_do_h +
        (offs_m[:, None] * stride_do_m + offs_d[None, :])
    )
    do_t_o_ptrs = (
        DO_T_O +
        off_b * stride_do_t_o_b +
        off_h * stride_do_t_o_h +
        (offs_m[:, None])
    )
    lse_ptrs = (
        LSE +
        off_b * stride_lse_b +
        off_h * stride_lse_h +
        (offs_m[:, None])
    )
    rfa_k_ptrs = (
        RFA_K +
        off_b * stride_rfa_kb +
        off_h * stride_rfa_kh +
        (offs_c[:, None] * stride_rfa_kc + offs_d[None, :])
    )
    rfa_v_ptrs = (
        RFA_V +
        off_b * stride_rfa_vb +
        off_h * stride_rfa_vh +
        (offs_c[:, None] * stride_rfa_vc + offs_d[None, :])
    )
    if MASK_TYPE == 1:
        rfa_m_ptrs = (
            ChunkMask +
            off_b * stride_chunk_mask_b +
            (offs_m[:, None] * stride_chunk_mask_m + offs_c[None, :])
        )
    d_rfa_k_ptrs = (
        D_RFA_K +
        off_b * stride_d_rfa_k_b +
        off_h * stride_d_rfa_k_h +
        (offs_c[:, None] * stride_d_rfa_k_c + offs_d[None, :])
    )
    d_rfa_v_ptrs = (
        D_RFA_V +
        off_b * stride_d_rfa_v_b +
        off_h * stride_d_rfa_v_h +
        (offs_c[:, None] * stride_d_rfa_v_c + offs_d[None, :])
    )

    d_rfa_k = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
    d_rfa_v = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
    # load this program's RFA K/V tile once; masked lanes contribute zeros
    if EVEN_C & EVEN_M:
        if EVEN_HEADDIM:
            rfa_k = tl.load(rfa_k_ptrs)
            rfa_v = tl.load(rfa_v_ptrs)
        else:
            rfa_k = tl.load(rfa_k_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
            rfa_v = tl.load(rfa_v_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
    else:
        if EVEN_HEADDIM:
            rfa_k = tl.load(rfa_k_ptrs, mask=offs_c[:, None] < nchunks, other=0.0)
            rfa_v = tl.load(rfa_v_ptrs, mask=offs_c[:, None] < nchunks, other=0.0)
        else:
            rfa_k = tl.load(
                rfa_k_ptrs, mask=(offs_c[:, None] < nchunks) & (offs_d[None, :] < headdim), other=0.0
            )
            rfa_v = tl.load(
                rfa_v_ptrs, mask=(offs_c[:, None] < nchunks) & (offs_d[None, :] < headdim), other=0.0
            )
    # query rows start right after this chunk block's window ends
    begin_m = tl.minimum((offs_w + 1) * WINDOW_SIZE, seqlen_q)
    end_m = seqlen_q
    for start_m in range(begin_m, end_m, BLOCK_M):
        start_m = tl.multiple_of(start_m, BLOCK_M)
        # load q, do, and lse
        if EVEN_M:
            if EVEN_HEADDIM:
                q = tl.load(
                    q_ptrs + start_m * stride_qm
                )
                do = tl.load(
                    do_ptrs + start_m * stride_do_m
                )
            else:
                q = tl.load(
                    q_ptrs + start_m * stride_qm,
                    mask=offs_d[None, :] < headdim,
                    other=0.0
                )
                do = tl.load(
                    do_ptrs + start_m * stride_do_m,
                    mask=offs_d[None, :] < headdim,
                    other=0.0
                )
            do_t_o = tl.load(
                do_t_o_ptrs + start_m
            )
            lse = tl.load(
                lse_ptrs + start_m
            )
        else:
            if EVEN_HEADDIM:
                q = tl.load(
                    q_ptrs + start_m * stride_qm,
                    mask=(start_m + offs_m)[:, None] < seqlen_q,
                    other=0.0
                )
                do = tl.load(
                    do_ptrs + start_m * stride_do_m,
                    mask=(start_m + offs_m)[:, None] < seqlen_q,
                    other=0.0
                )
            else:
                q = tl.load(
                    q_ptrs + start_m * stride_qm,
                    mask=((start_m + offs_m)[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
                    other=0.0
                )
                do = tl.load(
                    do_ptrs + start_m * stride_do_m,
                    mask=((start_m + offs_m)[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
                    other=0.0
                )
            do_t_o = tl.load(
                do_t_o_ptrs + start_m,
                mask=(start_m + offs_m)[:, None] < seqlen_q,
                other=0.0
            )
            lse = tl.load(
                lse_ptrs + start_m,
                mask=(start_m + offs_m)[:, None] < seqlen_q,
                other=0.0
            )
        # fully-masked rows store -inf in LSE; neutralize so exp() below is 0
        lse = tl.where(lse == float("-inf"), 0.0, lse)
        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
        qk += tl.dot(q, tl.trans(rfa_k))
        if not EVEN_M:
            qk += tl.where((start_m + offs_m)[:, None] < seqlen_q, 0, float("-inf"))

        if MASK_TYPE == 1:
            if EVEN_M & EVEN_C:
                mask = tl.load(
                    rfa_m_ptrs + (start_m * stride_chunk_mask_m)
                )
            else:
                mask = tl.load(
                    rfa_m_ptrs + (start_m * stride_chunk_mask_m),
                    mask=((start_m + offs_m)[:, None] < seqlen_q)
                    & (offs_c[None, :] < nchunks),
                    other=1,
                )
            # Slightly faster to multiply the softmax_scale in the tl.exp below since the compiler
            # can then fuse the mult and add into an fma instruction. But if we have bias we need to
            # to multiply with softmax_scale here.
            # we assume mask already implies the causal masking
            qk = qk * softmax_scale
            qk = tl.where(mask, float("-inf"), qk)
            # recompute forward probabilities from the saved LSE
            p = tl.exp(qk - lse)
        else:
            # no mask needed: every chunk here strictly precedes the queries
            p = tl.exp(qk * softmax_scale - lse)

        # softmax backward: dS = P * (dP - rowsum(dO*O))
        dp = tl.dot(do, tl.trans(rfa_v))
        ds = (p * (dp - do_t_o) * softmax_scale).to(q.dtype)
        # p is fp32, convert to q.dtype
        d_rfa_v += tl.dot(tl.trans(p.to(do.dtype)), do)
        # move softmax_scale to ds to save computation
        d_rfa_k += tl.dot(tl.trans(ds), q)
    # write back the accumulated fp32 gradients for this chunk tile
    if EVEN_C & EVEN_M:
        if EVEN_HEADDIM:
            tl.store(d_rfa_v_ptrs, d_rfa_v)
            tl.store(d_rfa_k_ptrs, d_rfa_k)
        else:
            tl.store(d_rfa_v_ptrs, d_rfa_v, mask=offs_d[None, :] < headdim)
            tl.store(d_rfa_k_ptrs, d_rfa_k, mask=offs_d[None, :] < headdim)
    else:
        if EVEN_HEADDIM:
            tl.store(d_rfa_v_ptrs, d_rfa_v, mask=offs_c[:, None] < nchunks)
            tl.store(d_rfa_k_ptrs, d_rfa_k, mask=offs_c[:, None] < nchunks)
        else:
            tl.store(d_rfa_v_ptrs, d_rfa_v, mask=(offs_c[:, None] < nchunks) & (offs_d[None, :] < headdim))
            tl.store(d_rfa_k_ptrs, d_rfa_k, mask=(offs_c[:, None] < nchunks) & (offs_d[None, :] < headdim))
+
497
+ @triton.heuristics(
498
+ {
499
+ "EVEN_M": lambda args: args["seqlen_q"] % args["BLOCK_M"] == 0,
500
+ "EVEN_N": lambda args: args["seqlen_k"] % args["BLOCK_N"] == 0,
501
+ "EVEN_C": lambda args: args["nchunks"] % args["BLOCK_N"] == 0,
502
+ "EVEN_W": lambda args: args["WINDOW_SIZE"] % args["BLOCK_N"] == 0,
503
+ "EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"],
504
+ }
505
+ )
506
+ @triton.jit
507
+ def _bwd_eva_agg_kernel_dq(
508
+ Q,
509
+ K,
510
+ V,
511
+ RFA_K,
512
+ RFA_V,
513
+ WindowMask,
514
+ ChunkMask,
515
+ DO,
516
+ LSE,
517
+ DO_T_O,
518
+ DQ,
519
+ softmax_scale,
520
+ stride_qb, stride_qh, stride_qm,
521
+ stride_kb, stride_kh, stride_kn,
522
+ stride_vb, stride_vh, stride_vn,
523
+ stride_rfa_kb, stride_rfa_kh, stride_rfa_kc,
524
+ stride_rfa_vb, stride_rfa_vh, stride_rfa_vc,
525
+ stride_window_mask_b, stride_window_mask_m,
526
+ stride_chunk_mask_b, stride_chunk_mask_m,
527
+ stride_do_b, stride_do_h, stride_do_m,
528
+ stride_lse_b, stride_lse_h,
529
+ stride_do_t_o_b, stride_do_t_o_h,
530
+ stride_dq_b, stride_dq_h, stride_dq_m,
531
+ nheads,
532
+ seqlen_q,
533
+ seqlen_k,
534
+ nchunks,
535
+ headdim,
536
+ CHUNKS_PER_WINDOW: tl.constexpr,
537
+ WINDOW_SIZE: tl.constexpr,
538
+ MASK_TYPE: tl.constexpr,
539
+ EMPTY_RFA_KV: tl.constexpr,
540
+ BLOCK_HEADDIM: tl.constexpr,
541
+ EVEN_M: tl.constexpr,
542
+ EVEN_N: tl.constexpr,
543
+ EVEN_W: tl.constexpr,
544
+ EVEN_C: tl.constexpr,
545
+ EVEN_HEADDIM: tl.constexpr,
546
+ BLOCK_M: tl.constexpr,
547
+ BLOCK_N: tl.constexpr,
548
+ ):
549
+ start_m = tl.program_id(0)
550
+ off_bh = tl.program_id(1)
551
+ off_h = off_bh % nheads
552
+ off_b = off_bh // nheads
553
+ # initialize offsets
554
+ offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
555
+ offs_w = (start_m * BLOCK_M) // WINDOW_SIZE
556
+ offs_n = tl.arange(0, BLOCK_N)
557
+ offs_c = tl.arange(0, BLOCK_N)
558
+ offs_d = tl.arange(0, BLOCK_HEADDIM)
559
+ # TODO: add paratheses or not
560
+ q_ptrs = (
561
+ Q +
562
+ off_b * stride_qb +
563
+ off_h * stride_qh +
564
+ (offs_m[:, None] * stride_qm + offs_d[None, :])
565
+ )
566
+ k_ptrs = (
567
+ K +
568
+ off_b * stride_kb +
569
+ off_h * stride_kh +
570
+ (offs_n[:, None] * stride_kn + offs_d[None, :])
571
+ )
572
+ v_ptrs = (
573
+ V +
574
+ off_b * stride_vb +
575
+ off_h * stride_vh +
576
+ (offs_n[:, None] * stride_vn + offs_d[None, :])
577
+ )
578
+ if EMPTY_RFA_KV == 0:
579
+ rfa_k_ptrs = (
580
+ RFA_K +
581
+ off_b * stride_rfa_kb +
582
+ off_h * stride_rfa_kh +
583
+ (offs_c[:, None] * stride_rfa_kc + offs_d[None, :])
584
+ )
585
+ rfa_v_ptrs = (
586
+ RFA_V +
587
+ off_b * stride_rfa_vb +
588
+ off_h * stride_rfa_vh +
589
+ (offs_c[:, None] * stride_rfa_vc + offs_d[None, :])
590
+ )
591
+ dq_ptrs = (
592
+ DQ +
593
+ off_b * stride_dq_b +
594
+ off_h * stride_dq_h +
595
+ (offs_m[:, None] * stride_dq_m + offs_d[None, :])
596
+ )
597
+ do_ptrs = (
598
+ DO +
599
+ off_b * stride_do_b +
600
+ off_h * stride_do_h +
601
+ (offs_m[:, None] * stride_do_m + offs_d[None, :])
602
+ )
603
+ do_t_o_ptrs = (
604
+ DO_T_O +
605
+ off_b * stride_do_t_o_b +
606
+ off_h * stride_do_t_o_h +
607
+ offs_m[:, None]
608
+ )
609
+ lse_ptrs = (
610
+ LSE +
611
+ off_b * stride_lse_b +
612
+ off_h * stride_lse_h +
613
+ offs_m[:, None]
614
+ )
615
+ ### load q, do, do_t_o, lse ####
616
+ if EVEN_M:
617
+ if EVEN_HEADDIM:
618
+ q = tl.load(
619
+ q_ptrs
620
+ )
621
+ do = tl.load(
622
+ do_ptrs
623
+ )
624
+ else:
625
+ q = tl.load(
626
+ q_ptrs,
627
+ mask=offs_d[None, :] < headdim,
628
+ other=0.0
629
+ )
630
+ do = tl.load(
631
+ do_ptrs,
632
+ mask=offs_d[None, :] < headdim,
633
+ other=0.0
634
+ )
635
+ do_t_o = tl.load(
636
+ do_t_o_ptrs
637
+ )
638
+ lse = tl.load(
639
+ lse_ptrs
640
+ )
641
+ else:
642
+ if EVEN_HEADDIM:
643
+ q = tl.load(
644
+ q_ptrs,
645
+ mask=offs_m[:, None] < seqlen_q,
646
+ other=0.0
647
+ )
648
+ do = tl.load(
649
+ do_ptrs,
650
+ mask=offs_m[:, None] < seqlen_q,
651
+ other=0.0
652
+ )
653
+ else:
654
+ q = tl.load(
655
+ q_ptrs,
656
+ mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
657
+ other=0.0
658
+ )
659
+ do = tl.load(
660
+ do_ptrs,
661
+ mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
662
+ other=0.0
663
+ )
664
+ do_t_o = tl.load(
665
+ do_t_o_ptrs,
666
+ mask=offs_m[:, None] < seqlen_q,
667
+ other=0.0
668
+ )
669
+ lse = tl.load(
670
+ lse_ptrs,
671
+ mask=offs_m[:, None] < seqlen_q,
672
+ other=0.0
673
+ )
674
+ lse = tl.where(lse == float("-inf"), 0.0, lse)
675
+ lse *= 1.4426950408889634 # log2(e)
676
+ qk_scale = softmax_scale
677
+ qk_scale *= 1.4426950408889634 # log2(e)
678
+ if MASK_TYPE == 1:
679
+ window_mask_ptrs = (
680
+ WindowMask +
681
+ off_b * stride_window_mask_b +
682
+ (offs_m[:, None] * stride_window_mask_m + offs_n[None, :])
683
+ )
684
+ if EMPTY_RFA_KV == 0:
685
+ chunk_mask_ptrs = (
686
+ ChunkMask +
687
+ off_b * stride_chunk_mask_b +
688
+ (offs_m[:, None] * stride_chunk_mask_m + offs_c[None, :])
689
+ )
690
+
691
+ dq = tl.zeros([BLOCK_M, BLOCK_HEADDIM], dtype=tl.float32)
692
+ # loop over k, v and update accumulator
693
+ # Iterate over local singletons;
694
+ # so we only iterate over blocks within the current window
695
+ start_idx_n = offs_w * WINDOW_SIZE
696
+ end_idx_n = tl.minimum((start_m + 1) * BLOCK_M, seqlen_k)
697
+ for start_n in range(start_idx_n, end_idx_n, BLOCK_N):
698
+ start_n = tl.multiple_of(start_n, BLOCK_N)
699
+ if EVEN_N & EVEN_M:
700
+ if EVEN_HEADDIM:
701
+ k = tl.load(
702
+ k_ptrs + start_n * stride_kn
703
+ )
704
+ else:
705
+ k = tl.load(
706
+ k_ptrs + start_n * stride_kn,
707
+ mask=offs_d[None, :] < headdim,
708
+ other=0.0
709
+ )
710
+ else:
711
+ if EVEN_HEADDIM:
712
+ k = tl.load(
713
+ k_ptrs + start_n * stride_kn,
714
+ mask=(start_n + offs_n)[:, None] < seqlen_k,
715
+ other=0.0,
716
+ )
717
+ else:
718
+ k = tl.load(
719
+ k_ptrs + start_n * stride_kn,
720
+ mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim),
721
+ other=0.0,
722
+ )
723
+ qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
724
+ qk += tl.dot(q, tl.trans(k))
725
+ # Trying to combine the two masks seem to make the result wrong
726
+ if not EVEN_N: # Need to mask out otherwise the softmax is wrong
727
+ qk += tl.where((start_n + offs_n)[None, :] < seqlen_k, 0, float("-inf"))
728
+
729
+ if MASK_TYPE == 1:
730
+ if EVEN_M & EVEN_W:
731
+ window_mask = tl.load(
732
+ window_mask_ptrs + start_n - start_idx_n
733
+ )
734
+ else:
735
+ window_mask = tl.load(
736
+ window_mask_ptrs + start_n - start_idx_n,
737
+ mask=(offs_m[:, None] < seqlen_q)
738
+ & ((start_n - start_idx_n + offs_n)[None, :] < WINDOW_SIZE),
739
+ other=1,
740
+ )
741
+ # Slightly faster to multiply the softmax_scale in the tl.exp below since the compiler
742
+ # can then fuse the mult and add into an fma instruction. But if we have bias we need to
743
+ # to multiply with softmax_scale here.
744
+ # we assume mask already implies the causal masking
745
+ qk = qk * qk_scale
746
+ qk = tl.where(window_mask, float("-inf"), qk)
747
+ p = tl.exp2(qk - lse)
748
+ else:
749
+ qk += tl.where(offs_m[:, None] >= (start_n + offs_n)[None, :], 0, float("-inf"))
750
+ p = tl.exp2(qk * qk_scale - lse)
751
+
752
+ if EVEN_N & EVEN_M:
753
+ if EVEN_HEADDIM:
754
+ v = tl.load(
755
+ v_ptrs + start_n * stride_vn
756
+ )
757
+ else:
758
+ v = tl.load(
759
+ v_ptrs + start_n * stride_vn,
760
+ mask=offs_d[None, :] < headdim,
761
+ other=0.0
762
+ )
763
+ else:
764
+ if EVEN_HEADDIM:
765
+ v = tl.load(
766
+ v_ptrs + start_n * stride_vn,
767
+ mask=(start_n + offs_n)[:, None] < seqlen_k,
768
+ other=0.0,
769
+ )
770
+ else:
771
+ v = tl.load(
772
+ v_ptrs + start_n * stride_vn,
773
+ mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim),
774
+ other=0.0,
775
+ )
776
+ dp = tl.dot(do, tl.trans(v))
777
+ ds = (p * (dp - do_t_o) * softmax_scale).to(q.dtype)
778
+ dq += tl.dot(ds, k)
779
+
780
+ if EMPTY_RFA_KV == 0:
781
+ # Iterate over RFA chunks
782
+ # we only iterate over chunks before the current local singleton window
783
+ end_idx_c = tl.minimum(offs_w * CHUNKS_PER_WINDOW, nchunks)
784
+ for start_c in range(0, end_idx_c, BLOCK_N):
785
+ start_c = tl.multiple_of(start_c, BLOCK_N)
786
+ # -- compute qk ----
787
+ if EVEN_C & EVEN_M:
788
+ if EVEN_HEADDIM:
789
+ rfa_k = tl.load(
790
+ rfa_k_ptrs + start_c * stride_rfa_kc
791
+ )
792
+ else:
793
+ rfa_k = tl.load(
794
+ rfa_k_ptrs + start_c * stride_rfa_kc,
795
+ mask=offs_d[None, :] < headdim,
796
+ other=0.0
797
+ )
798
+ else:
799
+ if EVEN_HEADDIM:
800
+ rfa_k = tl.load(
801
+ rfa_k_ptrs + start_c * stride_rfa_kc,
802
+ mask=(start_c + offs_c)[:, None] < nchunks,
803
+ other=0.0,
804
+ )
805
+ else:
806
+ rfa_k = tl.load(
807
+ rfa_k_ptrs + start_c * stride_rfa_kc,
808
+ mask=((start_c + offs_c)[:, None] < nchunks) & (offs_d[None, :] < headdim),
809
+ other=0.0,
810
+ )
811
+ qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
812
+ qk += tl.dot(q, tl.trans(rfa_k))
813
+ # Trying to combine the two masks seem to make the result wrong
814
+ if not EVEN_C: # Need to mask out otherwise the softmax is wrong
815
+ qk += tl.where((start_c + offs_c)[None, :] < nchunks, 0, float("-inf"))
816
+
817
+ if MASK_TYPE == 1:
818
+ if EVEN_C & EVEN_M:
819
+ chunk_mask = tl.load(
820
+ chunk_mask_ptrs + start_c
821
+ )
822
+ else:
823
+ chunk_mask = tl.load(
824
+ chunk_mask_ptrs + start_c,
825
+ mask=(offs_m[:, None] < seqlen_q) & ((start_c + offs_c)[None, :] < nchunks),
826
+ other=1,
827
+ )
828
+ # Slightly faster to multiply the softmax_scale in the tl.exp below since the compiler
829
+ # can then fuse the mult and add into an fma instruction. But if we have bias we need to
830
+ # to multiply with softmax_scale here.
831
+ # we assume mask already implies the causal masking
832
+ qk = qk * qk_scale
833
+ qk = tl.where(chunk_mask, float("-inf"), qk)
834
+ p = tl.exp2(qk - lse)
835
+ else:
836
+ p = tl.exp2(qk * qk_scale - lse)
837
+
838
+ if EVEN_C & EVEN_M:
839
+ if EVEN_HEADDIM:
840
+ rfa_v = tl.load(
841
+ rfa_v_ptrs + start_c * stride_rfa_vc
842
+ )
843
+ else:
844
+ rfa_v = tl.load(
845
+ rfa_v_ptrs + start_c * stride_rfa_vc,
846
+ mask=offs_d[None, :] < headdim,
847
+ other=0.0
848
+ )
849
+ else:
850
+ if EVEN_HEADDIM:
851
+ rfa_v = tl.load(
852
+ rfa_v_ptrs + start_c * stride_rfa_vc,
853
+ mask=(start_c + offs_n)[:, None] < nchunks,
854
+ other=0.0,
855
+ )
856
+ else:
857
+ rfa_v = tl.load(
858
+ rfa_v_ptrs + start_c * stride_rfa_vc,
859
+ mask=((start_c + offs_n)[:, None] < nchunks) & (offs_d[None, :] < headdim),
860
+ other=0.0,
861
+ )
862
+ dp = tl.dot(do, tl.trans(rfa_v))
863
+ ds = (p * (dp - do_t_o) * softmax_scale).to(q.dtype)
864
+ dq += tl.dot(ds, rfa_k)
865
+
866
+ start_m = tl.program_id(0)
867
+ offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
868
+ offs_d = tl.arange(0, BLOCK_HEADDIM)
869
+ dq_ptrs = (
870
+ DQ +
871
+ off_b * stride_dq_b +
872
+ off_h * stride_dq_h +
873
+ (offs_m[:, None] * stride_dq_m + offs_d[None, :])
874
+ )
875
+ if EVEN_M:
876
+ if EVEN_HEADDIM:
877
+ tl.store(
878
+ dq_ptrs, dq
879
+ )
880
+ else:
881
+ tl.store(
882
+ dq_ptrs, dq,
883
+ mask=offs_d[None, :] < headdim
884
+ )
885
+ else:
886
+ if EVEN_HEADDIM:
887
+ tl.store(
888
+ dq_ptrs, dq,
889
+ mask=offs_m[:, None] < seqlen_q
890
+ )
891
+ else:
892
+ tl.store(
893
+ dq_ptrs, dq,
894
+ mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim)
895
+ )
896
+
897
# Tuned Triton launch configurations for compute-capability 9.0 (Hopper) GPUs.
# Outer keys are the kernel mode passed to _get_config ("fwd", "bwd_dq",
# "bwd_dkdv", "bwd_drfa_kv"); inner keys are (tensor dtype, head dimension).
# Each value is a 4-tuple (BLOCK_M, BLOCK_N, num_warps, num_stages), matching
# how _get_config's result is unpacked at the kernel launch sites.
_capability_90_config = {
    "fwd": {
        (torch.bfloat16, 64): (128, 128, 4, 3),
        (torch.bfloat16, 128): (128, 128, 8, 3),
        (torch.float32, 64): (128, 64, 8, 3),
        (torch.float32, 128): (64, 32, 4, 3),
    },
    "bwd_dq": {
        (torch.bfloat16, 64): (128, 64, 4, 3),
        (torch.bfloat16, 128): (128, 64, 8, 3),
        (torch.float32, 64): (128, 64, 8, 2),
        (torch.float32, 128): (32, 32, 4, 2),
    },
    "bwd_dkdv": {
        (torch.bfloat16, 64): (128, 64, 4, 2),
        (torch.bfloat16, 128): (128, 64, 8, 2),
        (torch.float32, 64): (128, 64, 8, 2),
        (torch.float32, 128): (32, 32, 4, 1),
    },
    "bwd_drfa_kv": {
        (torch.bfloat16, 64): (128, 64, 4, 2),
        (torch.bfloat16, 128): (128, 64, 8, 2),
        (torch.float32, 64): (128, 64, 8, 2),
        (torch.float32, 128): (32, 32, 4, 1),
    }
}
923
+
924
# Tuned Triton launch configurations for compute-capability 8.x (Ampere) GPUs.
# Same layout as _capability_90_config: mode -> (dtype, head_dim) ->
# (BLOCK_M, BLOCK_N, num_warps, num_stages); smaller tiles than the Hopper
# table, presumably to fit Ampere's smaller shared memory — TODO confirm.
_capability_80_config = {
    "fwd": {
        (torch.bfloat16, 64): (64, 64, 4, 3),
        (torch.bfloat16, 128): (64, 64, 8, 3),
        (torch.float32, 64): (64, 32, 4, 2),
        (torch.float32, 128): (64, 32, 8, 1),
    },
    "bwd_dq": {
        (torch.bfloat16, 64): (64, 64, 4, 3),
        (torch.bfloat16, 128): (64, 32, 4, 2),
        (torch.float32, 64): (32, 32, 4, 2),
        (torch.float32, 128): (32, 32, 4, 2),
    },
    "bwd_dkdv": {
        (torch.bfloat16, 64): (64, 64, 4, 3),
        (torch.bfloat16, 128): (32, 32, 4, 2),
        (torch.float32, 64): (32, 32, 4, 1),
        (torch.float32, 128): (16, 64, 8, 1),
    },
    "bwd_drfa_kv": {
        (torch.bfloat16, 64): (64, 64, 4, 3),
        (torch.bfloat16, 128): (64, 32, 4, 3),
        (torch.float32, 64): (32, 32, 4, 1),
        (torch.float32, 128): (32, 32, 4, 1),
    }
}
950
+
951
def _get_config(dtype, head_dim, mode) -> tuple[int, int, int, int]:
    """Select a kernel launch configuration for the current CUDA device.

    Returns a ``(BLOCK_M, BLOCK_N, num_warps, num_stages)`` tuple for the
    given tensor ``dtype``, ``head_dim`` and kernel ``mode`` (one of
    ``"fwd"``, ``"bwd_dq"``, ``"bwd_dkdv"``, ``"bwd_drfa_kv"``).  Hopper
    (sm90+) and Ampere (sm80+) use the tuned lookup tables, with a
    conservative per-architecture default for untabulated (dtype, head_dim)
    pairs; older devices fall back to fixed choices split by mode and dtype.
    """
    capability = torch.cuda.get_device_capability()
    if capability >= (9, 0):
        return _capability_90_config[mode].get((dtype, head_dim), (32, 32, 4, 1))
    if capability >= (8, 0):
        return _capability_80_config[mode].get((dtype, head_dim), (16, 16, 4, 1))
    # Pre-Ampere fallbacks: only mode ("fwd" vs backward) and dtype matter.
    if mode == "fwd":
        return (32, 16, 4, 2) if dtype == torch.float32 else (64, 32, 4, 2)
    return (16, 16, 4, 1) if dtype == torch.float32 else (32, 32, 4, 1)
969
+
970
# -----------------------------------------------------------------------------
# Fused forward kernel for EVA aggregation.
#
# Each program instance handles one BLOCK_M-row tile of queries for a single
# (batch, head) pair and combines two score sources under one streaming
# (online) softmax:
#   1) exact attention over the "local singleton" keys/values (K, V) lying in
#      the query rows' current window, and
#   2) attention over per-chunk RFA summaries (RFA_K, RFA_V) of the chunks
#      strictly before that window (skipped entirely when EMPTY_RFA_KV != 0).
# The running-max (m_i) / running-denominator (d_i) recurrence is carried
# across both loops, so the result equals a softmax over the concatenation of
# both score sets.  The normalized output tile is written to Out and the
# per-row log-sum-exp (natural-log units) to LSE.
#
# MASK_TYPE == 1: boolean masks WindowMask / ChunkMask are supplied, where a
# True entry means "masked out"; the masks are assumed to already encode
# causality.  MASK_TYPE == 0: plain causal masking is computed on the fly.
# Scores are scaled by softmax_scale; exponentials use base 2 (tl.exp2) with
# the log2(e) factor folded into qk_scale.
# -----------------------------------------------------------------------------
@triton.heuristics(
    {
        "EVEN_M": lambda args: args["seqlen_q"] % args["BLOCK_M"] == 0,
        "EVEN_N": lambda args: args["seqlen_k"] % args["BLOCK_N"] == 0,
        "EVEN_C": lambda args: args["nchunks"] % args["BLOCK_N"] == 0,
        "EVEN_W": lambda args: args["WINDOW_SIZE"] % args["BLOCK_N"] == 0,
        "EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"],
    }
)
@triton.jit
def _fwd_eva_agg_kernel(
    Q,
    K,
    V,
    RFA_K,
    RFA_V,
    WindowMask,
    ChunkMask,
    Out,
    LSE,
    softmax_scale,
    stride_qb, stride_qh, stride_qm,
    stride_kb, stride_kh, stride_kn,
    stride_vb, stride_vh, stride_vn,
    stride_rfa_kb, stride_rfa_kh, stride_rfa_kc,
    stride_rfa_vb, stride_rfa_vh, stride_rfa_vc,
    stride_window_mask_b, stride_window_mask_m,
    stride_chunk_mask_b, stride_chunk_mask_m,
    stride_ob, stride_oh, stride_om,
    stride_lse_b, stride_lse_h,
    nheads,
    seqlen_q,
    seqlen_k,
    nchunks,
    headdim,
    CHUNKS_PER_WINDOW: tl.constexpr,
    WINDOW_SIZE: tl.constexpr,
    MASK_TYPE: tl.constexpr,
    EMPTY_RFA_KV: tl.constexpr,
    BLOCK_HEADDIM: tl.constexpr,
    EVEN_M: tl.constexpr,
    EVEN_N: tl.constexpr,
    EVEN_W: tl.constexpr,
    EVEN_C: tl.constexpr,
    EVEN_HEADDIM: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
):
    # Grid: axis 0 tiles the query sequence, axis 1 is flattened batch*heads.
    start_m = tl.program_id(0)
    off_bh = tl.program_id(1)
    off_h = off_bh % nheads
    off_b = off_bh // nheads
    # initialize offsets
    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
    # Index of the window containing this query tile (assumes each BLOCK_M
    # tile lies within a single window, i.e. WINDOW_SIZE >= BLOCK_M and the
    # tile does not straddle a window boundary — enforced by the launcher's
    # size constraints; TODO confirm).
    offs_w = (start_m * BLOCK_M) // WINDOW_SIZE
    offs_n = tl.arange(0, BLOCK_N)
    offs_c = tl.arange(0, BLOCK_N)
    offs_d = tl.arange(0, BLOCK_HEADDIM)
    # Per-tile base pointers; the innermost dim is assumed contiguous
    # (stride 1), which the launcher enforces via .contiguous().
    q_ptrs = (
        Q +
        off_b * stride_qb +
        off_h * stride_qh +
        (offs_m[:, None] * stride_qm + offs_d[None, :])
    )
    k_ptrs = (
        K +
        off_b * stride_kb +
        off_h * stride_kh +
        (offs_n[:, None] * stride_kn + offs_d[None, :])
    )
    v_ptrs = (
        V +
        off_b * stride_vb +
        off_h * stride_vh +
        (offs_n[:, None] * stride_vn + offs_d[None, :])
    )
    if EMPTY_RFA_KV == 0:
        rfa_k_ptrs = (
            RFA_K +
            off_b * stride_rfa_kb +
            off_h * stride_rfa_kh +
            (offs_c[:, None] * stride_rfa_kc + offs_d[None, :])
        )
        rfa_v_ptrs = (
            RFA_V +
            off_b * stride_rfa_vb +
            off_h * stride_rfa_vh +
            (offs_c[:, None] * stride_rfa_vc + offs_d[None, :])
        )

    # Fold log2(e) into the scale so tl.exp2 computes e^(softmax_scale * qk).
    qk_scale = softmax_scale
    qk_scale *= 1.4426950408889634  # log2(e)
    if MASK_TYPE == 1:
        # Masks are broadcast over heads (no head stride).
        window_mask_ptrs = (
            WindowMask +
            off_b * stride_window_mask_b +
            (offs_m[:, None] * stride_window_mask_m + offs_n[None, :])
        )
        if EMPTY_RFA_KV == 0:
            chunk_mask_ptrs = (
                ChunkMask +
                off_b * stride_chunk_mask_b +
                (offs_m[:, None] * stride_chunk_mask_m + offs_c[None, :])
            )

    # Online-softmax state: running max (m_i), running denominator (d_i),
    # and the unnormalized output accumulator (acc_o).
    m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
    d_i = tl.zeros([BLOCK_M], dtype=tl.float32)
    acc_o = tl.zeros([BLOCK_M, BLOCK_HEADDIM], dtype=tl.float32)
    # load q: it will stay in SRAM throughout
    # [2022-10-30] TD: Triton bug - in the case of EVEN_M=True and EVEN_N=False, if we just call
    # tl.load(q_ptrs), we get the wrong output!
    if EVEN_M & EVEN_N:
        if EVEN_HEADDIM:
            q = tl.load(
                q_ptrs
            )
        else:
            q = tl.load(
                q_ptrs,
                mask=offs_d[None, :] < headdim,
                other=0.0
            )
    else:
        if EVEN_HEADDIM:
            q = tl.load(
                q_ptrs,
                mask=offs_m[:, None] < seqlen_q,
                other=0.0
            )
        else:
            q = tl.load(
                q_ptrs,
                mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
                other=0.0
            )
    # loop over k, v and update accumulator
    # Iterate over local singletons;
    # so we only iterate over blocks within the current window
    start_idx_n = offs_w * WINDOW_SIZE
    end_idx_n = tl.minimum((start_m + 1) * BLOCK_M, seqlen_k)
    for start_n in range(start_idx_n, end_idx_n, BLOCK_N):
        start_n = tl.multiple_of(start_n, BLOCK_N)
        # -- compute qk ----
        if EVEN_N & EVEN_M:
            if EVEN_HEADDIM:
                k = tl.load(
                    k_ptrs + start_n * stride_kn
                )
            else:
                k = tl.load(
                    k_ptrs + start_n * stride_kn,
                    mask=offs_d[None, :] < headdim,
                    other=0.0
                )
        else:
            if EVEN_HEADDIM:
                k = tl.load(
                    k_ptrs + start_n * stride_kn,
                    mask=(start_n + offs_n)[:, None] < seqlen_k,
                    other=0.0,
                )
            else:
                k = tl.load(
                    k_ptrs + start_n * stride_kn,
                    mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim),
                    other=0.0,
                )
        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
        qk += tl.dot(q, tl.trans(k))
        # Trying to combine the two masks seem to make the result wrong
        if not EVEN_N:  # Need to mask out otherwise the softmax is wrong
            qk += tl.where((start_n + offs_n)[None, :] < seqlen_k, 0, float("-inf"))

        if MASK_TYPE == 1:
            # The mask tile is indexed relative to the window start, hence
            # the (start_n - start_idx_n) column offset.
            if EVEN_M & EVEN_W:
                window_mask = tl.load(
                    window_mask_ptrs + start_n - start_idx_n
                )
            else:
                window_mask = tl.load(
                    window_mask_ptrs + start_n - start_idx_n,
                    mask=(offs_m[:, None] < seqlen_q)
                    & ((start_n - start_idx_n + offs_n)[None, :] < WINDOW_SIZE),
                    other=1,
                )
            # Slightly faster to multiply the softmax_scale in the tl.exp below since the compiler
            # can then fuse the mult and add into an fma instruction. But if we have bias we need to
            # to multiply with softmax_scale here.
            # we assume mask already implies the causal masking
            qk = qk * qk_scale
            qk = tl.where(window_mask, float("-inf"), qk)
            m_ij = tl.maximum(tl.max(qk, 1), m_i)
            # Rows whose scores are all -inf would yield NaN from exp2(-inf - -inf);
            # substitute 0 for their max so p comes out 0 for those rows.
            masked_out_rows = (m_ij == float("-inf"))
            m_ij_masked = tl.where(masked_out_rows, 0, m_ij)
            p = tl.exp2(qk - m_ij_masked[:, None])
        else:
            # No explicit mask: apply the causal constraint directly.
            qk += tl.where(offs_m[:, None] >= (start_n + offs_n)[None, :], 0, float("-inf"))
            m_ij = tl.maximum(tl.max(qk, 1) * qk_scale, m_i)
            masked_out_rows = (m_ij == float("-inf"))
            m_ij_masked = tl.where(masked_out_rows, 0, m_ij)
            p = tl.exp2(qk * qk_scale - m_ij_masked[:, None])

        d_ij = tl.sum(p, 1)

        # scale acc_o
        prev_scale = tl.exp2(m_i - m_ij_masked)
        # # -- update output accumulator --
        acc_o = acc_o * prev_scale[:, None]
        # update acc_o
        if EVEN_N & EVEN_M:  # If we just do "if EVEN_N", there seems to be some race condition
            if EVEN_HEADDIM:
                v = tl.load(
                    v_ptrs + start_n * stride_vn
                )
            else:
                v = tl.load(
                    v_ptrs + start_n * stride_vn,
                    mask=offs_d[None, :] < headdim,
                    other=0.0
                )
        else:
            if EVEN_HEADDIM:
                v = tl.load(
                    v_ptrs + start_n * stride_vn,
                    mask=(start_n + offs_n)[:, None] < seqlen_k,
                    other=0.0,
                )
            else:
                v = tl.load(
                    v_ptrs + start_n * stride_vn,
                    mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim),
                    other=0.0,
                )
        p = p.to(v.dtype)
        acc_o = tl.dot(p, v, acc_o)

        # -- update statistics
        d_i = d_i * prev_scale + d_ij
        m_i = m_ij

    if EMPTY_RFA_KV == 0:
        # Iterate over RFA chunks
        # we only iterate over chunks before the current local singleton window
        end_idx_c = tl.minimum(offs_w * CHUNKS_PER_WINDOW, nchunks)
        for start_c in range(0, end_idx_c, BLOCK_N):
            start_c = tl.multiple_of(start_c, BLOCK_N)
            # -- compute qk ----
            if EVEN_C & EVEN_M:
                if EVEN_HEADDIM:
                    rfa_k = tl.load(
                        rfa_k_ptrs + start_c * stride_rfa_kc
                    )
                else:
                    rfa_k = tl.load(
                        rfa_k_ptrs + start_c * stride_rfa_kc,
                        mask=offs_d[None, :] < headdim,
                        other=0.0
                    )
            else:
                if EVEN_HEADDIM:
                    rfa_k = tl.load(
                        rfa_k_ptrs + start_c * stride_rfa_kc,
                        mask=(start_c + offs_c)[:, None] < nchunks,
                        other=0.0,
                    )
                else:
                    rfa_k = tl.load(
                        rfa_k_ptrs + start_c * stride_rfa_kc,
                        mask=((start_c + offs_c)[:, None] < nchunks) & (offs_d[None, :] < headdim),
                        other=0.0,
                    )
            qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
            qk += tl.dot(q, tl.trans(rfa_k))
            # Trying to combine the two masks seem to make the result wrong
            if not EVEN_C:  # Need to mask out otherwise the softmax is wrong
                qk += tl.where((start_c + offs_c)[None, :] < nchunks, 0, float("-inf"))

            if MASK_TYPE == 1:
                if EVEN_C & EVEN_M:
                    chunk_mask = tl.load(
                        chunk_mask_ptrs + start_c
                    )
                else:
                    chunk_mask = tl.load(
                        chunk_mask_ptrs + start_c,
                        mask=(offs_m[:, None] < seqlen_q) & ((start_c + offs_c)[None, :] < nchunks),
                        other=1,
                    )
                # Slightly faster to multiply the softmax_scale in the tl.exp below since the compiler
                # can then fuse the mult and add into an fma instruction. But if we have bias we need to
                # to multiply with softmax_scale here.
                # we assume mask already implies the causal masking
                qk = qk * qk_scale
                qk = tl.where(chunk_mask, float("-inf"), qk)
                m_ij = tl.maximum(tl.max(qk, 1), m_i)
                masked_out_rows = (m_ij == float("-inf"))
                m_ij_masked = tl.where(masked_out_rows, 0, m_ij)
                p = tl.exp2(qk - m_ij_masked[:, None])
            else:
                # No causal term here: all chunks iterated are strictly before
                # this window, so they are always visible.
                m_ij = tl.maximum(tl.max(qk, 1) * qk_scale, m_i)
                masked_out_rows = (m_ij == float("-inf"))
                m_ij_masked = tl.where(masked_out_rows, 0, m_ij)
                p = tl.exp2(qk * qk_scale - m_ij_masked[:, None])

            d_ij = tl.sum(p, 1)

            # scale acc_o
            prev_scale = tl.exp2(m_i - m_ij_masked)
            # # -- update output accumulator --
            acc_o = acc_o * prev_scale[:, None]
            # update acc_o
            # TODO: If we just do "if EVEN_N", there seems to be some race condition ?
            if EVEN_C & EVEN_M:
                if EVEN_HEADDIM:
                    rfa_v = tl.load(
                        rfa_v_ptrs + start_c * stride_rfa_vc
                    )
                else:
                    rfa_v = tl.load(
                        rfa_v_ptrs + start_c * stride_rfa_vc,
                        mask=offs_d[None, :] < headdim,
                        other=0.0
                    )
            else:
                if EVEN_HEADDIM:
                    # NOTE(review): these masks use offs_n where the rfa_k
                    # loads use offs_c; both are tl.arange(0, BLOCK_N) so the
                    # values are identical, but offs_c would be clearer.
                    rfa_v = tl.load(
                        rfa_v_ptrs + start_c * stride_rfa_vc,
                        mask=(start_c + offs_n)[:, None] < nchunks,
                        other=0.0,
                    )
                else:
                    rfa_v = tl.load(
                        rfa_v_ptrs + start_c * stride_rfa_vc,
                        mask=((start_c + offs_n)[:, None] < nchunks) & (offs_d[None, :] < headdim),
                        other=0.0,
                    )
            p = p.to(rfa_v.dtype)
            acc_o = tl.dot(p, rfa_v, acc_o)

            # -- update statistics
            d_i = d_i * prev_scale + d_ij
            m_i = m_ij

    # for rows that are all -inf, set d_i to 1.0
    d_i = tl.where(d_i == 0.0, 1.0, d_i)
    # multiply by log(2)
    # Convert the base-2 running stats to a natural-log LSE:
    # ln(sum exp(s)) = (m + log2(d)) * ln(2).
    lse_m = (m_i + tl.math.log2(d_i)) * 0.6931471805599453
    acc_o = acc_o / d_i[:, None]
    # TODO: understand why rematerialize offsets to save registers?
    start_m = tl.program_id(0)
    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_d = tl.arange(0, BLOCK_HEADDIM)
    out_ptrs = (
        Out +
        off_b * stride_ob +
        off_h * stride_oh +
        (offs_m[:, None] * stride_om + offs_d[None, :])
    )
    if EVEN_M:
        if EVEN_HEADDIM:
            tl.store(
                out_ptrs, acc_o
            )
        else:
            tl.store(
                out_ptrs, acc_o,
                mask=offs_d[None, :] < headdim
            )
    else:
        if EVEN_HEADDIM:
            tl.store(
                out_ptrs, acc_o,
                mask=offs_m[:, None] < seqlen_q
            )
        else:
            tl.store(
                out_ptrs, acc_o,
                mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim)
            )
    lse_ptrs = (
        LSE +
        off_b * stride_lse_b +
        off_h * stride_lse_h +
        offs_m
    )
    if EVEN_M:
        tl.store(
            lse_ptrs, lse_m,
        )
    else:
        tl.store(
            lse_ptrs, lse_m,
            mask=offs_m < seqlen_q
        )
1365
+
1366
def triton_eva_agg_fwd(
    q, k, v, rfa_k, rfa_v,
    window_mask,
    chunk_mask,
    softmax_scale,
    window_size,
    chunks_per_window
):
    """Launch the fused EVA-aggregation forward kernel.

    Args:
        q, k, v: ``(batch, nheads, seqlen, head_dim)`` CUDA tensors
            (bf16 or fp32).
        rfa_k, rfa_v: per-chunk RFA summaries of shape
            ``(batch, nheads, nchunks, head_dim)``, or both ``None`` to
            disable the RFA branch.
        window_mask: optional ``(batch, 1, seqlen_q, window_size)`` bool
            mask (True = masked out); when given, ``chunk_mask`` of shape
            ``(batch, 1, seqlen_q, nchunks)`` is required as well.
        softmax_scale: score scale; defaults to ``1/sqrt(head_dim)``
            when falsy.
        window_size: local-attention window length in tokens.
        chunks_per_window: number of RFA chunks per window; must be a
            positive multiple of the kernel's BLOCK_N.

    Returns:
        ``(o, lse)``: the attention output (same shape/dtype as ``q``) and
        the per-row log-sum-exp as an fp32 ``(batch, nheads, seqlen_q)``
        tensor.
    """
    if rfa_k is None and rfa_v is None:
        empty_rfa_kv = 1

        # The kernel requires a contiguous innermost dimension.
        q, k, v = [
            x if x.stride(-1) == 1 else x.contiguous()
            for x in [q, k, v]
        ]
    else:
        assert rfa_k is not None and rfa_v is not None, "Both rfa_k and rfa_v must either be None or have values at the same time."
        empty_rfa_kv = 0

        q, k, v, rfa_k, rfa_v = [
            x if x.stride(-1) == 1 else x.contiguous()
            for x in [q, k, v, rfa_k, rfa_v]
        ]

    # shape constraints
    batch, nheads, seqlen_q, head_dim = q.shape
    _, _, seqlen_k, _ = k.shape
    if empty_rfa_kv == 0:
        nchunks = rfa_k.shape[-2]
        assert rfa_k.shape == (batch, nheads, nchunks, head_dim)
        assert rfa_v.shape == (batch, nheads, nchunks, head_dim)
        assert q.dtype == k.dtype == v.dtype == rfa_k.dtype == rfa_v.dtype
    else:
        nchunks = 0
        assert q.dtype == k.dtype == v.dtype, "All tensors must have the same type"
    assert k.shape == (batch, nheads, seqlen_k, head_dim)
    assert v.shape == (batch, nheads, seqlen_k, head_dim)

    assert head_dim <= 128, "We only test head dimensions up to 128"
    # assert q.dtype in [torch.float16, torch.bfloat16], "Only support fp16 and bf16"
    assert q.dtype in [torch.bfloat16, torch.float], "Only support bf16 and fp32 for now"
    assert q.is_cuda and k.is_cuda and v.is_cuda
    softmax_scale = softmax_scale or 1.0 / math.sqrt(head_dim)

    mask_type = 0
    if window_mask is not None:
        mask_type = 1
        assert window_mask.dtype == torch.bool
        assert window_mask.is_cuda
        assert window_mask.dim() == 4
        assert window_mask.shape == (batch, 1, seqlen_q, window_size)
        if window_mask.stride(-1) != 1:
            window_mask = window_mask.contiguous()

        # A window mask implies a chunk mask for the RFA branch.
        assert chunk_mask is not None
        assert chunk_mask.dtype == torch.bool
        assert chunk_mask.is_cuda
        assert chunk_mask.dim() == 4
        assert chunk_mask.shape == (batch, 1, seqlen_q, nchunks)
        if chunk_mask.stride(-1) != 1:
            chunk_mask = chunk_mask.contiguous()

    # Stride tuples default to zeros when the corresponding tensor is
    # absent so the kernel signature stays fixed.
    chunk_mask_strides = (
        (chunk_mask.stride(0), chunk_mask.stride(2))
        if mask_type == 1 else
        (0, 0)
    )
    window_mask_strides = (
        (window_mask.stride(0), window_mask.stride(2))
        if mask_type == 1 else
        (0, 0)
    )

    rfa_k_strides = (
        (rfa_k.stride(0), rfa_k.stride(1), rfa_k.stride(2))
        if empty_rfa_kv == 0 else
        (0, 0, 0)
    )
    rfa_v_strides = (
        (rfa_v.stride(0), rfa_v.stride(1), rfa_v.stride(2))
        if empty_rfa_kv == 0 else
        (0, 0, 0)
    )

    o = torch.empty_like(q)
    lse = torch.empty((q.shape[0], q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32)

    BLOCK_HEADDIM = max(triton.next_power_of_2(head_dim), 16)

    BLOCK_M, BLOCK_N, num_warps, num_stages = _get_config(q.dtype, head_dim, "fwd")

    # Fixed assertion messages: the first check is >= (not strictly greater)
    # and both refer to BLOCK_N.
    assert chunks_per_window >= BLOCK_N, "chunks_per_window must be at least BLOCK_N"
    assert chunks_per_window % BLOCK_N == 0, "chunks_per_window must be a multiple of BLOCK_N"

    grid = lambda META: (triton.cdiv(seqlen_q, META["BLOCK_M"]), batch * nheads)
    _fwd_eva_agg_kernel[grid](
        q,
        k,
        v,
        rfa_k,
        rfa_v,
        window_mask,
        chunk_mask,
        o,
        lse,
        softmax_scale,
        q.stride(0), q.stride(1), q.stride(2),
        k.stride(0), k.stride(1), k.stride(2),
        v.stride(0), v.stride(1), v.stride(2),
        rfa_k_strides[0], rfa_k_strides[1], rfa_k_strides[2],
        rfa_v_strides[0], rfa_v_strides[1], rfa_v_strides[2],
        window_mask_strides[0], window_mask_strides[1],
        chunk_mask_strides[0], chunk_mask_strides[1],
        o.stride(0), o.stride(1), o.stride(2),
        lse.stride(0), lse.stride(1),
        nheads,
        seqlen_q,
        seqlen_k,
        nchunks,
        head_dim,
        chunks_per_window,
        window_size,
        mask_type,
        empty_rfa_kv,
        BLOCK_HEADDIM,
        BLOCK_M=BLOCK_M,
        BLOCK_N=BLOCK_N,
        num_warps=num_warps,
        num_stages=num_stages,
    )
    return o, lse
1497
+
1498
def triton_eva_agg_bwd(
    do,
    q, k, v, rfa_k, rfa_v,
    window_mask, chunk_mask,
    o, lse,
    dq, dk, dv, d_rfa_k, d_rfa_v,
    softmax_scale,
    window_size,
    chunks_per_window,
    empty_rfa_kv,
    mask_type,
):
    """Launch the EVA-aggregation backward kernels, filling gradient buffers.

    Fills ``dq``, ``dk``, ``dv`` (and ``d_rfa_k``/``d_rfa_v`` when
    ``empty_rfa_kv == 0``) in place via three Triton kernels: one for dq,
    one for dk/dv over the local keys/values, and one for the RFA-chunk
    gradients.

    Args:
        do: upstream gradient w.r.t. the forward output ``o``.
        q, k, v, rfa_k, rfa_v, window_mask, chunk_mask: forward inputs
            (``rfa_k``/``rfa_v`` are ``None`` when ``empty_rfa_kv == 1``).
        o, lse: forward output and per-row log-sum-exp.
        dq, dk, dv, d_rfa_k, d_rfa_v: pre-allocated gradient buffers with
            contiguous innermost dimension.
        softmax_scale: score scale; defaults to ``1/sqrt(head_dim)``
            when falsy.
        window_size, chunks_per_window: same meaning as in the forward.
        empty_rfa_kv: 1 to skip the RFA branch, 0 otherwise.
        mask_type: 1 when boolean masks are supplied, 0 for on-the-fly
            causal masking.

    Returns:
        None; gradients are written into the provided buffers.
    """
    if do.stride(-1) != 1:
        do = do.contiguous()

    # shape constraints
    batch, nheads, seqlen_q, head_dim = q.shape
    _, _, seqlen_k, _ = k.shape
    if empty_rfa_kv == 0:
        nchunks = rfa_k.shape[-2]
        assert rfa_k.shape == (batch, nheads, nchunks, head_dim)
        assert rfa_v.shape == (batch, nheads, nchunks, head_dim)
        # BUGFIX: the rfa_k/rfa_v stride checks used to live in the
        # unconditional assert below, which dereferenced None and raised
        # AttributeError whenever empty_rfa_kv == 1. They belong here,
        # where rfa_k/rfa_v are guaranteed to be tensors.
        assert rfa_k.stride(-1) == rfa_v.stride(-1) == 1
        assert d_rfa_k.stride(-1) == d_rfa_v.stride(-1) == 1
        assert q.dtype == k.dtype == v.dtype == rfa_k.dtype == rfa_v.dtype
    else:
        nchunks = 0
        assert q.dtype == k.dtype == v.dtype, "All tensors must have the same type"

    assert lse.shape == (batch, nheads, seqlen_q)
    assert q.stride(-1) == k.stride(-1) == v.stride(-1) == o.stride(-1) == 1
    assert dq.stride(-1) == dk.stride(-1) == dv.stride(-1) == 1
    softmax_scale = softmax_scale or 1.0 / math.sqrt(head_dim)

    assert head_dim <= 128, "We only test head dimensions up to 128"

    # Stride tuples default to zeros when the corresponding tensor is absent
    # so the kernel signatures stay fixed.
    window_mask_strides = (
        (window_mask.stride(0), window_mask.stride(2))
        if mask_type == 1 else
        (0, 0)
    )
    chunk_mask_strides = (
        (chunk_mask.stride(0), chunk_mask.stride(2))
        if mask_type == 1 else
        (0, 0)
    )

    rfa_k_strides = (
        (rfa_k.stride(0), rfa_k.stride(1), rfa_k.stride(2))
        if empty_rfa_kv == 0 else
        (0, 0, 0)
    )
    rfa_v_strides = (
        (rfa_v.stride(0), rfa_v.stride(1), rfa_v.stride(2))
        if empty_rfa_kv == 0 else
        (0, 0, 0)
    )

    d_rfa_k_strides = (
        (d_rfa_k.stride(0), d_rfa_k.stride(1), d_rfa_k.stride(2))
        if empty_rfa_kv == 0 else
        (0, 0, 0)
    )
    d_rfa_v_strides = (
        (d_rfa_v.stride(0), d_rfa_v.stride(1), d_rfa_v.stride(2))
        if empty_rfa_kv == 0 else
        (0, 0, 0)
    )

    BLOCK_HEADDIM = max(triton.next_power_of_2(head_dim), 16)

    # Row-wise dot(do, o), needed by the softmax backward (the "delta" term);
    # accumulated in fp32 for accuracy, stored back in do's dtype.
    do_t_o = torch.sum(do.to(torch.float32) * o.to(torch.float32), dim=-1).to(do.dtype)

    BLOCK_M, BLOCK_N, num_warps, num_stages = _get_config(q.dtype, head_dim, "bwd_dq")

    # Fixed assertion messages: the first check is >= (not strictly greater)
    # and both refer to BLOCK_N.
    assert chunks_per_window >= BLOCK_N, "chunks_per_window must be at least BLOCK_N"
    assert chunks_per_window % BLOCK_N == 0, "chunks_per_window must be a multiple of BLOCK_N"
    grid = lambda META: (
        triton.cdiv(seqlen_q, META["BLOCK_M"]),
        batch * nheads,
    )
    _bwd_eva_agg_kernel_dq[grid](
        q,
        k,
        v,
        rfa_k,
        rfa_v,
        window_mask,
        chunk_mask,
        do,
        lse,
        do_t_o,
        dq,
        softmax_scale,
        q.stride(0), q.stride(1), q.stride(2),
        k.stride(0), k.stride(1), k.stride(2),
        v.stride(0), v.stride(1), v.stride(2),
        rfa_k_strides[0], rfa_k_strides[1], rfa_k_strides[2],
        rfa_v_strides[0], rfa_v_strides[1], rfa_v_strides[2],
        window_mask_strides[0], window_mask_strides[1],
        chunk_mask_strides[0], chunk_mask_strides[1],
        do.stride(0), do.stride(1), do.stride(2),
        lse.stride(0), lse.stride(1),
        do_t_o.stride(0), do_t_o.stride(1),
        dq.stride(0), dq.stride(1), dq.stride(2),
        nheads,
        seqlen_q,
        seqlen_k,
        nchunks,
        head_dim,
        chunks_per_window,
        window_size,
        mask_type,
        empty_rfa_kv,
        BLOCK_HEADDIM,
        BLOCK_M=BLOCK_M,
        BLOCK_N=BLOCK_N,
        num_warps=num_warps,
        num_stages=num_stages,
    )

    BLOCK_M, BLOCK_N, num_warps, num_stages = _get_config(q.dtype, head_dim, "bwd_dkdv")
    grid = lambda META: (
        triton.cdiv(seqlen_k, META["BLOCK_N"]),
        batch * nheads,
    )
    _bwd_eva_agg_kernel_dkdv[grid](
        q,
        k,
        v,
        window_mask,
        do,
        lse,
        do_t_o,
        dk,
        dv,
        softmax_scale,
        q.stride(0), q.stride(1), q.stride(2),
        k.stride(0), k.stride(1), k.stride(2),
        v.stride(0), v.stride(1), v.stride(2),
        window_mask_strides[0], window_mask_strides[1],
        do.stride(0), do.stride(1), do.stride(2),
        lse.stride(0), lse.stride(1),
        do_t_o.stride(0), do_t_o.stride(1),
        dk.stride(0), dk.stride(1), dk.stride(2),
        dv.stride(0), dv.stride(1), dv.stride(2),
        nheads,
        seqlen_q,
        seqlen_k,
        head_dim,
        window_size,
        mask_type,
        BLOCK_HEADDIM,
        BLOCK_M=BLOCK_M,
        BLOCK_N=BLOCK_N,
        num_warps=num_warps,
        num_stages=num_stages,
    )
    if empty_rfa_kv == 0:
        BLOCK_M, BLOCK_N, num_warps, num_stages = _get_config(q.dtype, head_dim, "bwd_drfa_kv")
        grid = lambda META: (
            triton.cdiv(nchunks, META["BLOCK_N"]),
            batch * nheads,
        )
        _bwd_eva_agg_kernel_drfa_kv[grid](
            q,
            rfa_k,
            rfa_v,
            chunk_mask,
            do,
            lse,
            do_t_o,
            d_rfa_k,
            d_rfa_v,
            softmax_scale,
            q.stride(0), q.stride(1), q.stride(2),
            rfa_k_strides[0], rfa_k_strides[1], rfa_k_strides[2],
            rfa_v_strides[0], rfa_v_strides[1], rfa_v_strides[2],
            chunk_mask_strides[0], chunk_mask_strides[1],
            do.stride(0), do.stride(1), do.stride(2),
            lse.stride(0), lse.stride(1),
            do_t_o.stride(0), do_t_o.stride(1),
            d_rfa_k_strides[0], d_rfa_k_strides[1], d_rfa_k_strides[2],
            d_rfa_v_strides[0], d_rfa_v_strides[1], d_rfa_v_strides[2],
            nheads,
            seqlen_q,
            nchunks,
            head_dim,
            chunks_per_window,
            window_size,
            mask_type,
            BLOCK_HEADDIM,
            BLOCK_M=BLOCK_M,
            BLOCK_N=BLOCK_N,
            num_warps=num_warps,
            num_stages=num_stages,
        )
1694
+
1695
+
1696
class EvaAggFunc(torch.autograd.Function):
    """Autograd bridge for the fused EVA-aggregation Triton kernels.

    ``forward`` dispatches to ``triton_eva_agg_fwd`` and stashes everything
    the backward pass needs on the context; ``backward`` allocates gradient
    buffers and fills them via ``triton_eva_agg_bwd``.
    """

    @staticmethod
    def forward(ctx, q, k, v, rfa_k, rfa_v, window_mask, chunk_mask, softmax_scale=None, window_size=None, chunks_per_window=None):
        # rfa_k / rfa_v must be supplied (or omitted) as a pair.
        if rfa_k is None and rfa_v is None:
            empty_rfa_kv = 1
        else:
            assert rfa_k is not None and rfa_v is not None, "Both rfa_k and rfa_v must either be None or have values at the same time."
            empty_rfa_kv = 0

        mask_type = 1 if window_mask is not None else 0

        out, lse = triton_eva_agg_fwd(
            q, k, v, rfa_k, rfa_v, window_mask, chunk_mask, softmax_scale, window_size, chunks_per_window
        )

        ctx.save_for_backward(q, k, v, out, lse, rfa_k, rfa_v, window_mask, chunk_mask)
        ctx.softmax_scale = softmax_scale
        ctx.window_size = window_size
        ctx.chunks_per_window = chunks_per_window
        ctx.empty_rfa_kv = empty_rfa_kv
        ctx.mask_type = mask_type
        return out

    @staticmethod
    def backward(ctx, do):
        q, k, v, out, lse, rfa_k, rfa_v, window_mask, chunk_mask = ctx.saved_tensors

        grad_q = torch.empty_like(q)
        grad_k = torch.empty_like(k)
        grad_v = torch.empty_like(v)
        if ctx.empty_rfa_kv == 0:
            grad_rfa_k = torch.empty_like(rfa_k)
            grad_rfa_v = torch.empty_like(rfa_v)
        else:
            grad_rfa_k = None
            grad_rfa_v = None

        triton_eva_agg_bwd(
            do,
            q,
            k,
            v,
            rfa_k,
            rfa_v,
            window_mask,
            chunk_mask,
            out,
            lse,
            grad_q,
            grad_k,
            grad_v,
            grad_rfa_k,
            grad_rfa_v,
            softmax_scale=ctx.softmax_scale,
            window_size=ctx.window_size,
            chunks_per_window=ctx.chunks_per_window,
            empty_rfa_kv=ctx.empty_rfa_kv,
            mask_type=ctx.mask_type,
        )
        # One gradient slot per forward argument; the non-tensor arguments
        # (masks and hyper-parameters) get None.
        return grad_q, grad_k, grad_v, grad_rfa_k, grad_rfa_v, None, None, None, None, None
1755
+
1756
+
1757
def eva_agg_func_triton(
    q, k, v, rfa_k, rfa_v,
    window_mask, chunk_mask,
    softmax_scale=None, window_size=None, chunks_per_window=None,
):
    """Functional entry point for EVA aggregation attention.

    Thin convenience wrapper that forwards every argument, in order, to
    ``EvaAggFunc.apply`` so the fused Triton kernels participate in
    autograd.
    """
    return EvaAggFunc.apply(
        q, k, v, rfa_k, rfa_v,
        window_mask, chunk_mask,
        softmax_scale, window_size, chunks_per_window,
    )
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/eva_cache.py ADDED
@@ -0,0 +1,761 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Optional, Tuple, List, Any, Union
2
+ import torch
3
+ from transformers.cache_utils import Cache
4
+
5
class EvaCache(Cache):
    """
    A cache that grows dynamically as more tokens are generated. This is the default for generative models.

    It stores the Key and Value states as a list of tensors, one for each layer. The expected shape for each tensor is
    `[batch_size, num_heads, seq_len, head_dim]`.
    """

    def __init__(self) -> None:
        # Per-layer key/value states of the current (possibly partial) local window.
        self.w_k: List[torch.Tensor] = []
        self.w_v: List[torch.Tensor] = []

        # Per-layer query/key/value states accumulated for the current (partial) chunk.
        self.rf_q: List[torch.Tensor] = []
        self.rf_k: List[torch.Tensor] = []
        self.rf_v: List[torch.Tensor] = []

        # Per-layer random-feature-attention statistics for completed chunks.
        self.softmax_phi_k_v: List[torch.Tensor] = []
        self.log_sum_phi_k: List[torch.Tensor] = []
        self.rf_k_bar: List[torch.Tensor] = []
        self._seen_tokens = 0  # Used in `generate` to keep tally of how many tokens the cache has seen

        # attention masks temporary buffer
        self.rf_mask: List[Optional[torch.Tensor]] = []
        self.s_mask: List[torch.Tensor] = []
        self.chunk_mask: List[torch.Tensor] = []

    def __len__(self):
        """
        Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value corresponds
        to the number of layers in the model.
        """
        return len(self.w_k)

    def get_usable_length(self, new_seq_length: int, layer_idx: Optional[int] = 0) -> int:
        """Given the sequence length of the new inputs, returns the usable length of the cache."""
        # Cache without size limit -> all cache is usable
        # Cache with size limit -> if the length cache plus the length of the new inputs is larger the maximum cache
        # length, we will need to evict part of the cache (and thus not all cache is usable)
        max_length = self.get_max_length()
        previous_seq_length = self.get_seq_length(layer_idx)
        if max_length is not None and previous_seq_length + new_seq_length > max_length:
            return max_length - new_seq_length
        return previous_seq_length

    def reorder_cache(self, beam_idx: torch.LongTensor):
        """Reorders the cache for beam search, given the selected beam indices."""
        # NOTE(review): rf_q/rf_k/rf_v/rf_mask entries may be None after a chunk
        # dump (see update_chunks / update_mask); index_select on a None entry
        # would raise here — presumably beam search is only used in states where
        # all entries are tensors. TODO confirm.
        for layer_idx in range(len(self.w_k)):
            device = self.w_k[layer_idx].device
            self.w_k[layer_idx] = self.w_k[layer_idx].index_select(0, beam_idx.to(device))

            device = self.w_v[layer_idx].device
            self.w_v[layer_idx] = self.w_v[layer_idx].index_select(0, beam_idx.to(device))

            device = self.rf_q[layer_idx].device
            self.rf_q[layer_idx] = self.rf_q[layer_idx].index_select(0, beam_idx.to(device))

            device = self.rf_k[layer_idx].device
            self.rf_k[layer_idx] = self.rf_k[layer_idx].index_select(0, beam_idx.to(device))

            device = self.rf_v[layer_idx].device
            self.rf_v[layer_idx] = self.rf_v[layer_idx].index_select(0, beam_idx.to(device))

            device = self.softmax_phi_k_v[layer_idx].device
            self.softmax_phi_k_v[layer_idx] = self.softmax_phi_k_v[layer_idx].index_select(0, beam_idx.to(device))

            device = self.log_sum_phi_k[layer_idx].device
            self.log_sum_phi_k[layer_idx] = self.log_sum_phi_k[layer_idx].index_select(0, beam_idx.to(device))

            device = self.rf_k_bar[layer_idx].device
            self.rf_k_bar[layer_idx] = self.rf_k_bar[layer_idx].index_select(0, beam_idx.to(device))

            device = self.rf_mask[layer_idx].device
            self.rf_mask[layer_idx] = self.rf_mask[layer_idx].index_select(0, beam_idx.to(device))

            device = self.s_mask[layer_idx].device
            self.s_mask[layer_idx] = self.s_mask[layer_idx].index_select(0, beam_idx.to(device))

            device = self.chunk_mask[layer_idx].device
            self.chunk_mask[layer_idx] = self.chunk_mask[layer_idx].index_select(0, beam_idx.to(device))

    @property
    def seen_tokens(self):
        # Backwards-compatible accessor for the private token counter.
        if hasattr(self, "_seen_tokens"):
            return self._seen_tokens
        else:
            return None

    def update_past_len(
        self,
        cur_q_len: int,
        layer_idx: int
    ):
        """Advance the global seen-token counter (only once per step, at layer 0)
        and return the updated total."""
        # Update the number of seen tokens
        if layer_idx == 0:
            self._seen_tokens += cur_q_len
        return self._seen_tokens

    def update_mask(
        self,
        prev_s_mask,
        cur_s_mask,
        chunk_mask,
        rf_mask,
        layer_idx,
        window_size,
        chunk_size,
    ):
        """Update the cached attention masks for singletons (`s_mask`),
        intra-chunk tokens (`rf_mask`) and inter-chunk attention (`chunk_mask`).

        Returns `(prev_s_mask, cur_s_mask, prev_chunk_mask, cur_chunk_mask, dump_rf_mask)`,
        where `dump_rf_mask` is a full chunk's worth of intra-chunk mask ready
        to be consumed, or None.

        NOTE(review): `q_len` is only assigned in the prefill branch below; the
        code presumes `s_mask`/`rf_mask`/`chunk_mask` lists are populated in
        lockstep per layer so that the decode branches never read `q_len` —
        verify against callers.
        """
        ############################################
        # compute masks for singletons
        ############################################
        q_len = None
        if len(self.s_mask) <= layer_idx:
            q_len = chunk_mask.shape[-2]
            # prefill stage
            # q is of shape [b, h, n, d]
            if q_len < window_size:
                assert prev_s_mask is None

            # w_v = # [b, h, 1, j, d]
            # store the past window-wise key-value pairs
            self.s_mask.append(cur_s_mask[..., -1:, :] if cur_s_mask is not None else prev_s_mask[..., -1, -1:, :])
        else:
            # decoding stage
            prev_s_mask = None

            cached_s_mask = self.s_mask[layer_idx]
            assert cached_s_mask is not None
            if cached_s_mask.shape[-1] == window_size:
                # the cached window is full: start a fresh window with only the
                # incoming token's mask
                cur_s_mask = cur_s_mask
            else:
                cur_s_mask = torch.cat([cached_s_mask, cur_s_mask], dim=-1)

            # store the past window-wise key-value pairs
            self.s_mask[layer_idx] = cur_s_mask

        ############################################
        # compute masks for intra-chunks
        ############################################
        dump_rf_mask = None
        if len(self.rf_mask) <= layer_idx:
            # initialize chunk stats
            # prefill stage
            if q_len < chunk_size:
                cur_rf_mask = rf_mask
            else:
                if q_len % chunk_size == 0:
                    # all tokens form complete chunks: dump everything
                    dump_rf_mask = rf_mask
                    cur_rf_mask = None
                else:
                    # split into complete chunks (dumped) and the trailing
                    # partial chunk (kept in the cache)
                    remainder_tokens = q_len % chunk_size
                    if rf_mask is not None:
                        dump_rf_mask, cur_rf_mask = torch.split(rf_mask, [q_len - remainder_tokens, remainder_tokens], dim=-2)
                    else:
                        dump_rf_mask = None
                        cur_rf_mask = None
            self.rf_mask.append(cur_rf_mask)
        else:
            past_rf_mask = self.rf_mask[layer_idx]
            if past_rf_mask is not None:
                # when decoding tokens, we always assume the
                # incoming token mask is 0 (not masked)
                cur_rf_mask = torch.cat([past_rf_mask, rf_mask], dim=-2)
            else:
                # we do not need to use rf_mask anymore after we receive generated tokens
                cur_rf_mask = None
            # We need to store rf_k_bar and RFA-results that
            # compute the per-chunk RFA.

            # Dump the chunk if the len of current chunk reaches <chunk_size>.
            if cur_rf_mask is not None and cur_rf_mask.shape[-2] == chunk_size:
                dump_rf_mask = cur_rf_mask
                cur_rf_mask = None

            self.rf_mask[layer_idx] = cur_rf_mask

        ############################################
        # compute masks for inter chunks
        ############################################
        if len(self.chunk_mask) <= layer_idx:
            # prefill stage
            # q is of shape [b, h, n, d]
            if q_len < window_size:
                cur_chunk_mask = chunk_mask
                prev_chunk_mask = None
            else:
                if q_len % window_size == 0:
                    cur_chunk_mask = None
                    prev_chunk_mask = chunk_mask
                else:
                    remainder_tokens = q_len % window_size
                    # [b, h, n-r, d] [b, h, r, d]
                    prev_chunk_mask, cur_chunk_mask = torch.split(chunk_mask, [q_len - remainder_tokens, remainder_tokens], dim=-2)
                bsz, num_heads, _, head_dim = prev_chunk_mask.shape
                # fold the sequence axis into [num_windows, window_size]
                prev_chunk_mask = prev_chunk_mask.reshape(bsz, num_heads, -1, window_size, head_dim)

                assert prev_s_mask is not None
                if prev_s_mask.shape[-3] == 1 and prev_chunk_mask.shape[-3] > 1:
                    # need to expand
                    prev_s_mask = prev_s_mask.expand(-1, -1, prev_chunk_mask.shape[-3], -1, -1)
            # w_v = # [b, h, 1, j, d]
            # store the past window-wise key-value pairs
            self.chunk_mask.append(cur_chunk_mask[..., -1:, :] if cur_chunk_mask is not None else prev_chunk_mask[..., -1, -1:, :])
        else:
            # decoding stage
            prev_chunk_mask = None
            cur_chunk_mask = self.chunk_mask[layer_idx]

            # if the current sequence length reaches <chunk_size>,
            # we append a new 1 to the end of chunk_mask
            seen_seq_len = self.get_seq_length(layer_idx)
            if seen_seq_len > 0 and seen_seq_len % chunk_size == 0:
                past_chunk_mask = self.chunk_mask[layer_idx]
                if past_chunk_mask is not None:
                    # when decoding tokens, we always assume the
                    # incoming token mask is 0 (not masked)
                    cur_chunk_mask = torch.cat([past_chunk_mask, chunk_mask], dim=-1)
                else:
                    cur_chunk_mask = chunk_mask
                self.chunk_mask[layer_idx] = cur_chunk_mask

            # if the len of current sequence reaches <window_size> + 1,
            # we turn on the mask for most recent chunks
            if seen_seq_len > 0 and seen_seq_len % window_size == 1:
                cur_chunk_mask = self.chunk_mask[layer_idx]
                # we do not need to use rf_mask anymore after we receive generated tokens
                num_chunks_per_window = window_size // chunk_size
                # NOTE(review): in-place edit of the cached tensor; False marks
                # these chunk slots as attendable in this mask convention —
                # confirm against the attention implementation.
                cur_chunk_mask[..., -num_chunks_per_window:] = False
                self.chunk_mask[layer_idx] = cur_chunk_mask

        return (prev_s_mask, cur_s_mask, prev_chunk_mask, cur_chunk_mask, dump_rf_mask)

    def update_singletons(
        self,
        q,
        k,
        v,
        layer_idx,
        window_size,
    ):
        """Split incoming q/k/v into completed windows ("past") and the current
        partial window, caching the latter per layer.

        Returns `((past_w_q, past_w_k, past_w_v), (w_q, w_k, w_v))`, where the
        "past" tensors are reshaped to `[b, h, num_windows, window_size, d]`
        during prefill and are None while decoding.
        """
        if len(self.w_k) <= layer_idx:
            # prefill stage
            # q is of shape [b, h, n, d]
            q_len = q.shape[-2]
            if q_len < window_size:
                w_q = q
                w_k = k
                w_v = v
                past_w_q = past_w_k = past_w_v = None
            else:
                if q_len % window_size == 0:
                    w_q = None
                    w_k = None
                    w_v = None
                    past_w_q = q
                    past_w_k = k
                    past_w_v = v
                else:
                    remainder_tokens = q_len % window_size
                    # [b, h, n-r, d] [b, h, r, d]
                    past_w_q, w_q = torch.split(q, [q_len - remainder_tokens, remainder_tokens], dim=-2)
                    past_w_k, w_k = torch.split(k, [q_len - remainder_tokens, remainder_tokens], dim=-2)
                    past_w_v, w_v = torch.split(v, [q_len - remainder_tokens, remainder_tokens], dim=-2)
                bsz, num_heads, _, head_dim = past_w_q.shape
                past_w_q = past_w_q.reshape(bsz, num_heads, -1, window_size, head_dim)
                past_w_k = past_w_k.reshape(bsz, num_heads, -1, window_size, head_dim)
                past_w_v = past_w_v.reshape(bsz, num_heads, -1, window_size, head_dim)
            # w_q = q[..., None, -window_size:, :] # [b, h, 1, j, d]
            # w_k = # [b, h, 1, j, d]
            # w_v = # [b, h, 1, j, d]
            # store the past window-wise key-value pairs
            # if w_k is None, it means we happen to pass in a sqeuence that is divisible by window_size
            # we leave the cache with window_size-sized kv pairs to be cleared next iteration
            self.w_k.append(w_k if w_k is not None else past_w_k[..., -1, :, :])
            self.w_v.append(w_v if w_v is not None else past_w_v[..., -1, :, :])
        else:
            # decoding stage
            past_w_q = past_w_k = past_w_v = None
            # this is implemented as either a sliding window or fixed window
            w_q = q  # [b, h, 1, d]
            w_k = k  # [b, h, 1, d]
            w_v = v  # [b, h, 1, d]

            cached_w_k = self.w_k[layer_idx]
            assert cached_w_k is not None  # [b, h, j, d]
            if cached_w_k.shape[-2] == window_size:
                # cached window is full: start a fresh window from this token
                w_k = w_k
            else:
                w_k = torch.cat([cached_w_k, w_k], dim=-2)

            cached_w_v = self.w_v[layer_idx]
            assert cached_w_v is not None
            if cached_w_v.shape[-2] == window_size:
                w_v = w_v
            else:
                w_v = torch.cat([cached_w_v, w_v], dim=-2)

            # store the past window-wise key-value pairs
            self.w_k[layer_idx] = w_k
            self.w_v[layer_idx] = w_v
        return (past_w_q, past_w_k, past_w_v), (w_q, w_k, w_v)

    def update_chunks(
        self,
        q,
        k,
        v,
        layer_idx,
        chunk_size
    ):
        """Accumulate q/k/v into the current chunk buffer; once a full chunk of
        `chunk_size` tokens is available, return it as `(dump_q, dump_k, dump_v)`
        (each None otherwise) and reset the buffer."""
        q_len = q.shape[-2]
        dump_q = None
        dump_k = None
        dump_v = None
        if len(self.rf_q) <= layer_idx:
            # initialize chunk stats
            # prefill stage
            if q_len < chunk_size:
                rf_q = q
                rf_k = k
                rf_v = v
            else:
                if q_len % chunk_size == 0:
                    rf_q = None
                    rf_k = None
                    rf_v = None
                    dump_q = q
                    dump_k = k
                    dump_v = v
                else:
                    remainder_tokens = q_len % chunk_size
                    # [b, h, n-r, d] [b, h, r, d]
                    dump_q, rf_q = torch.split(q, [q_len - remainder_tokens, remainder_tokens], dim=-2)
                    dump_k, rf_k = torch.split(k, [q_len - remainder_tokens, remainder_tokens], dim=-2)
                    dump_v, rf_v = torch.split(v, [q_len - remainder_tokens, remainder_tokens], dim=-2)
            self.rf_q.append(rf_q)
            self.rf_k.append(rf_k)
            self.rf_v.append(rf_v)
        else:
            # decode tokens
            # add query, key & value to the current chunk.
            past_rf_q = self.rf_q[layer_idx]
            if past_rf_q is not None:
                rf_q = torch.cat([past_rf_q, q], dim=-2)
            else:
                rf_q = q

            past_rf_k = self.rf_k[layer_idx]
            if past_rf_k is not None:
                rf_k = torch.cat([past_rf_k, k], dim=-2)
            else:
                rf_k = k

            past_rf_v = self.rf_v[layer_idx]
            if past_rf_v is not None:
                rf_v = torch.cat([past_rf_v, v], dim=-2)
            else:
                rf_v = v

            # We need to store rf_k_bar and RFA-results that
            # compute the per-chunk RFA.

            # Dump the chunk if the len of current chunk reaches <chunk_size>.
            if rf_q.shape[-2] == chunk_size:
                dump_q = rf_q
                dump_k = rf_k
                dump_v = rf_v
                # clear the chunk
                rf_q = None
                rf_k = None
                rf_v = None

            self.rf_q[layer_idx] = rf_q
            self.rf_k[layer_idx] = rf_k
            self.rf_v[layer_idx] = rf_v

        return dump_q, dump_k, dump_v

    def update_chunk_rfas(
        self,
        softmax_phi_k_v,
        log_sum_phi_k,
        rf_k_bar,
        layer_idx,
        random_feature_dim
    ):
        """Append newly computed per-chunk RFA statistics to the per-layer cache
        and return the concatenated tensors.

        The concatenation axis for `softmax_phi_k_v` / `log_sum_phi_k` depends
        on `random_feature_dim` (-2 when it is 1, -3 otherwise); presumably this
        reflects whether the random-feature axis is kept — confirm against the
        attention module.
        """
        if len(self.softmax_phi_k_v) <= layer_idx:
            # prefill stage
            self.softmax_phi_k_v.append(softmax_phi_k_v)
            self.log_sum_phi_k.append(log_sum_phi_k)
            self.rf_k_bar.append(rf_k_bar)
        else:
            # token decoding
            past_softmax_phi_k_v = self.softmax_phi_k_v[layer_idx]
            past_log_sum_phi_k = self.log_sum_phi_k[layer_idx]
            past_rf_k_bar = self.rf_k_bar[layer_idx]

            if past_softmax_phi_k_v is not None:
                if random_feature_dim == 1:
                    dim = -2
                else:
                    dim = -3
                softmax_phi_k_v = torch.cat([past_softmax_phi_k_v, softmax_phi_k_v], dim=dim)

            if past_log_sum_phi_k is not None:
                if random_feature_dim == 1:
                    dim = -2
                else:
                    dim = -3
                log_sum_phi_k = torch.cat([past_log_sum_phi_k, log_sum_phi_k], dim=dim)

            if past_rf_k_bar is not None:
                rf_k_bar = torch.cat([past_rf_k_bar, rf_k_bar], dim=-2)

            self.softmax_phi_k_v[layer_idx] = softmax_phi_k_v
            self.log_sum_phi_k[layer_idx] = log_sum_phi_k
            self.rf_k_bar[layer_idx] = rf_k_bar

        return softmax_phi_k_v, log_sum_phi_k, rf_k_bar

    def get_chunk_rfas(self, layer_idx):
        """Return the cached `(softmax_phi_k_v, log_sum_phi_k, rf_k_bar)` for a
        layer, or a triple of None when the layer has no entries yet."""
        if len(self.softmax_phi_k_v) <= layer_idx:
            return (
                None,
                None,
                None
            )
        else:
            return (
                self.softmax_phi_k_v[layer_idx],
                self.log_sum_phi_k[layer_idx],
                self.rf_k_bar[layer_idx]
            )

    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
        if len(self.w_k) <= layer_idx:
            return 0
        return self._seen_tokens

    def get_max_length(self) -> Optional[int]:
        """Returns the maximum sequence length of the cached states. DynamicCache does not have a maximum length."""
        return None

    def update(
        self,
        layer_idx: int,
        cache_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # The generic HF `Cache.update` API is intentionally unsupported;
        # callers must use the specialized update_* methods above.
        raise NotImplementedError("`update` is not used in Eva Cache.")
453
+
454
class EvaStaticCacheForTriton(Cache):
    """
    A variant of EvaCache for eva's triton kernels

    Unlike `EvaCache`, the per-layer window key/value buffers are preallocated
    at a fixed `[batch_size, num_key_value_heads, window_size, head_dim]` shape
    and filled in place, tracked by a write cursor (`past_window_pos`).
    """

    def __init__(
        self,
        batch_size,
        num_key_value_heads,
        window_size,
        head_dim,
        num_layers,
        dtype,
        device
    ) -> None:
        # Preallocated per-layer window K/V buffers, written in place.
        self.past_window_k: List[torch.Tensor] = []
        self.past_window_v: List[torch.Tensor] = []

        cache_shape = (batch_size, num_key_value_heads, window_size, head_dim)
        for idx in range(num_layers):
            new_window_k = torch.zeros(cache_shape, dtype=dtype, device=device)
            new_window_v = torch.zeros(cache_shape, dtype=dtype, device=device)
            self.past_window_k.append(new_window_k)
            self.past_window_v.append(new_window_v)

        # Per-layer write cursor into the window buffers (0..window_size).
        self.past_window_pos: List[int] = []

        # Per-layer random-feature-attention key/value summaries of completed chunks.
        self.rfa_k: List[torch.Tensor] = []
        self.rfa_v: List[torch.Tensor] = []
        # self.rfa_mask: List[torch.Tensor] = []

        self._seen_tokens = 0  # Used in `generate` to keep tally of how many tokens the cache has seen

        # attention masks temporary buffer
        self.rf_mask: List[Optional[torch.Tensor]] = []
        self.s_mask: List[torch.Tensor] = []

    def __len__(self):
        """
        Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value corresponds
        to the number of layers in the model.
        """
        return len(self.past_window_pos)

    def get_usable_length(self, new_seq_length: int, layer_idx: Optional[int] = 0) -> int:
        """Given the sequence length of the new inputs, returns the usable length of the cache."""
        # Cache without size limit -> all cache is usable
        # Cache with size limit -> if the length cache plus the length of the new inputs is larger the maximum cache
        # length, we will need to evict part of the cache (and thus not all cache is usable)
        max_length = self.get_max_length()
        previous_seq_length = self.get_seq_length(layer_idx)
        if max_length is not None and previous_seq_length + new_seq_length > max_length:
            return max_length - new_seq_length
        return previous_seq_length

    def reorder_cache(self, beam_idx: torch.LongTensor):
        """Reorders the cache for beam search, given the selected beam indices."""
        # NOTE(review): rf_mask/s_mask entries may be None (see update_mask);
        # index_select on a None entry would raise — presumably beam search only
        # runs in states where these are tensors. TODO confirm.
        for layer_idx in range(len(self.past_window_k)):
            device = self.past_window_k[layer_idx].device
            self.past_window_k[layer_idx] = self.past_window_k[layer_idx].index_select(0, beam_idx.to(device))

            device = self.past_window_v[layer_idx].device
            self.past_window_v[layer_idx] = self.past_window_v[layer_idx].index_select(0, beam_idx.to(device))

            device = self.rfa_k[layer_idx].device
            self.rfa_k[layer_idx] = self.rfa_k[layer_idx].index_select(0, beam_idx.to(device))

            device = self.rfa_v[layer_idx].device
            self.rfa_v[layer_idx] = self.rfa_v[layer_idx].index_select(0, beam_idx.to(device))

            # device = self.rfa_mask[layer_idx].device
            # self.rfa_mask[layer_idx] = self.rfa_mask[layer_idx].index_select(0, beam_idx.to(device))

            device = self.rf_mask[layer_idx].device
            self.rf_mask[layer_idx] = self.rf_mask[layer_idx].index_select(0, beam_idx.to(device))

            device = self.s_mask[layer_idx].device
            self.s_mask[layer_idx] = self.s_mask[layer_idx].index_select(0, beam_idx.to(device))

    @property
    def seen_tokens(self):
        # Backwards-compatible accessor for the private token counter.
        if hasattr(self, "_seen_tokens"):
            return self._seen_tokens
        else:
            return None

    def update_past_len(
        self,
        cur_q_len: int,
        layer_idx: int
    ):
        """Advance the global seen-token counter (only once per step, at layer 0)
        and return the updated total."""
        # Update the number of seen tokens
        if layer_idx == 0:
            self._seen_tokens += cur_q_len
        return self._seen_tokens

    def update_mask(
        self,
        s_mask,
        rf_mask,
        layer_idx,
        window_size,
    ):
        """Update cached singleton (`s_mask`) and intra-chunk (`rf_mask`) masks.

        Returns `(dump_s_mask, dump_rf_mask)`: the singleton mask to use for the
        current step, and a full window's worth of intra-chunk mask ready to be
        consumed (or None).
        """
        ############################################
        # compute masks for singletons
        ############################################
        if len(self.s_mask) <= layer_idx:
            # prefill stage
            # q is of shape [b, h, n, d]
            # s_v = # [b, h, 1, j, d]
            # store the past window-wise key-value pairs
            if s_mask is None:
                cur_s_mask = None
            else:
                q_len = s_mask.shape[-2]
                # s_mask is of shape [b, h, n, w]
                # let r = q_len % window_size
                # if r == 0, the mask to be appended is of shape [b, h, 1, w]
                # otherwise, r < w, the mask to be appended is of shape [b, h, 1, r]
                remainder_tokens = q_len % window_size
                if remainder_tokens == 0:
                    cur_s_mask = None
                else:
                    cur_s_mask = s_mask[..., -1:, :remainder_tokens]
            self.s_mask.append(cur_s_mask)
            # we use the passed s_mask for subsequent computations
            dump_s_mask = s_mask
        else:
            # decoding stage
            past_s_mask = self.s_mask[layer_idx]
            if past_s_mask is None:
                assert s_mask is None
                cur_s_mask = None
            else:
                assert s_mask is not None
                cur_s_mask = torch.cat([past_s_mask, s_mask], dim=-1)

            dump_s_mask = cur_s_mask
            if cur_s_mask is not None and cur_s_mask.shape[-1] == window_size:
                # window complete: drop the cached mask so the next step starts fresh
                cur_s_mask = None
            # store the past window-wise key-value pairs
            self.s_mask[layer_idx] = cur_s_mask

        ############################################
        # compute masks for intra-chunks
        ############################################
        dump_rf_mask = None
        if len(self.rf_mask) <= layer_idx:
            # initialize chunk stats
            # prefill stage
            if rf_mask is None:
                cur_rf_mask = None
            else:
                q_len = rf_mask.shape[-2]
                if q_len < window_size:
                    dump_rf_mask = None
                    cur_rf_mask = rf_mask
                else:
                    if q_len % window_size == 0:
                        dump_rf_mask = rf_mask
                        cur_rf_mask = None
                    else:
                        remainder_tokens = q_len % window_size
                        dump_rf_mask, cur_rf_mask = torch.split(rf_mask, [q_len - remainder_tokens, remainder_tokens], dim=-2)
            self.rf_mask.append(cur_rf_mask)
        else:
            past_rf_mask = self.rf_mask[layer_idx]
            if past_rf_mask is not None:
                # when decoding tokens, we always assume the
                # incoming token mask is 0 (not masked)
                cur_rf_mask = torch.cat([past_rf_mask, rf_mask], dim=-2)
            else:
                cur_rf_mask = None

            if cur_rf_mask is not None and cur_rf_mask.shape[-2] == window_size:
                dump_rf_mask = cur_rf_mask
                cur_rf_mask = None

            self.rf_mask[layer_idx] = cur_rf_mask

        return dump_s_mask, dump_rf_mask

    def update_singletons_and_chunks(
        self,
        k,
        v,
        layer_idx,
        window_size,
    ):
        """Write incoming k/v into the fixed-size window buffer and split off
        completed windows.

        Returns `(s_k, s_v, dump_k, dump_v)`: the singleton K/V visible to the
        current step, and the K/V of completed windows to be summarized into
        chunk RFAs (or None when no window completed).
        """
        if len(self.past_window_pos) <= layer_idx:
            # prefill stage
            s_k = k
            s_v = v
            input_len = k.shape[-2]
            window_pos = 0
            if input_len <= window_size:
                new_window_pos = window_pos + input_len

                cached_window_k = k
                cached_window_v = v
                dump_k = None
                dump_v = None
            else:
                remainder_tokens = input_len % window_size
                if remainder_tokens == 0:
                    # keep the last full window in the buffer; it is rolled over
                    # (cursor reset) on the next decode step
                    remainder_tokens = window_size
                new_window_pos = window_pos + remainder_tokens

                # [b, h, n-r, d] [b, h, r, d]
                cached_window_k = k[..., -remainder_tokens:, :]
                cached_window_v = v[..., -remainder_tokens:, :]
                dump_k = k[..., :-remainder_tokens, :]
                dump_v = v[..., :-remainder_tokens, :]
            # store the past window-wise key-value pairs
            self.past_window_k[layer_idx][:, :, window_pos : new_window_pos, :] = cached_window_k
            self.past_window_v[layer_idx][:, :, window_pos : new_window_pos, :] = cached_window_v
            self.past_window_pos.append(new_window_pos)
        else:
            # decoding stage
            # if the previous cache has full tokens,
            # roll back to the first elements
            if self.past_window_pos[layer_idx] == window_size:
                self.past_window_pos[layer_idx] = 0
                # clone: the buffer is overwritten below, but the dumped copy
                # must keep the old window contents
                dump_k = self.past_window_k[layer_idx].clone()
                dump_v = self.past_window_v[layer_idx].clone()
            else:
                dump_k = None
                dump_v = None

            input_len = k.shape[-2]
            window_pos = self.past_window_pos[layer_idx]
            new_window_pos = window_pos + input_len

            self.past_window_k[layer_idx][:, :, window_pos : new_window_pos, :] = k
            self.past_window_v[layer_idx][:, :, window_pos : new_window_pos, :] = v

            s_k = self.past_window_k[layer_idx][:, :, : new_window_pos, :]
            s_v = self.past_window_v[layer_idx][:, :, : new_window_pos, :]

            self.past_window_pos[layer_idx] = new_window_pos

        return s_k, s_v, dump_k, dump_v

    def update_chunk_rfas(
        self,
        rfa_k,
        rfa_v,
        layer_idx,
    ):
        """Append newly computed chunk-level RFA key/value summaries to the
        per-layer cache and return the concatenated tensors."""
        if len(self.rfa_k) <= layer_idx:
            # prefill stage
            self.rfa_k.append(rfa_k)
            self.rfa_v.append(rfa_v)
        else:
            # token decoding
            past_rfa_k = self.rfa_k[layer_idx]
            past_rfa_v = self.rfa_v[layer_idx]

            if past_rfa_k is not None:
                rfa_k = torch.cat([past_rfa_k, rfa_k], dim=-2)

            if past_rfa_v is not None:
                rfa_v = torch.cat([past_rfa_v, rfa_v], dim=-2)

            self.rfa_k[layer_idx] = rfa_k
            self.rfa_v[layer_idx] = rfa_v

        return rfa_k, rfa_v

    def get_past_window_pos(self, layer_idx):
        """Return the window-buffer write cursor for a layer, or None if the
        layer has no entry yet."""
        if len(self.past_window_pos) <= layer_idx:
            return None
        else:
            return self.past_window_pos[layer_idx]

    def get_past_window_kv(self, layer_idx):
        """Return the valid (written) prefix of the window K/V buffers for a
        layer, or `(None, None)` if the layer has no entry yet."""
        if len(self.past_window_pos) <= layer_idx:
            return None, None
        else:
            return (
                self.past_window_k[layer_idx][:, :, : self.past_window_pos[layer_idx], :],
                self.past_window_v[layer_idx][:, :, : self.past_window_pos[layer_idx], :]
            )

    def get_chunk_rfas(self, layer_idx):
        """Return the cached `(rfa_k, rfa_v)` for a layer, or `(None, None)` if
        the layer has no entry yet."""
        if len(self.rfa_k) <= layer_idx:
            return None, None
        else:
            return self.rfa_k[layer_idx], self.rfa_v[layer_idx]

    def get_seq_length(self, layer_idx = 0) -> int:
        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
        # layer_idx must be provided since otherwise
        # any layer > 0 can only get the updated _seen_tokens
        if len(self.past_window_pos) <= layer_idx:
            return 0
        return self._seen_tokens

    def get_max_length(self) -> Optional[int]:
        """Returns the maximum sequence length of the cached states. DynamicCache does not have a maximum length."""
        return None

    def update(
        self,
        layer_idx: int,
        cache_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # The generic HF `Cache.update` API is intentionally unsupported;
        # callers must use the specialized update_* methods above.
        raise NotImplementedError("`update` is not used in Eva Cache.")
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/eva_prep_kv_kernel.py ADDED
@@ -0,0 +1,1017 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import math
3
+ import torch
4
+ import triton
5
+ import triton.language as tl
6
+
7
# Forward kernel: for every CHUNK_SIZE-long chunk of the key/value sequence,
# compute one aggregated key (Out_RFA_K) and one aggregated value (Out_RFA_V)
# as softmax-weighted averages over the chunk. Each program handles BLOCK_N
# positions = CHUNKS_PER_BLOCK chunks for one (batch, head) pair.
@triton.heuristics(
    {
        # EVEN_N / EVEN_HEADDIM let fully in-bounds programs skip load/store masks.
        "EVEN_N": lambda args: args["seqlen"] % args["BLOCK_N"] == 0,
        "EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"],
    }
)
@triton.jit
def _fwd_eva_prep_kv_kernel(
    K, # [b, h, n, d]
    V, # [b, h, n, d]
    PARAM_MU, # [1, h, 1, 1, d]
    PARAM_PHI, # [1, h, 1, 1, d]
    Mask, # [b, h, n, 1]
    Out_RFA_K, # [b, h, c, d]
    Out_RFA_V, # [b, h, c, d]
    softmax_scale,
    stride_kb, stride_kh, stride_kn,
    stride_vb, stride_vh, stride_vn,
    stride_mu_h,
    stride_phi_h,
    stride_mb, stride_mn,
    stride_ok_b, stride_ok_h, stride_ok_c,
    stride_ov_b, stride_ov_h, stride_ov_c,
    nheads,
    seqlen,
    nchunks,
    headdim,
    CHUNKS_PER_BLOCK: tl.constexpr,
    CHUNK_SIZE: tl.constexpr,
    MASK_TYPE: tl.constexpr,  # 1 -> a boolean padding mask is supplied
    BLOCK_HEADDIM: tl.constexpr,
    EVEN_N: tl.constexpr,
    EVEN_HEADDIM: tl.constexpr,
    BLOCK_N: tl.constexpr,
):
    start_n = tl.program_id(0)
    offs_bh = tl.program_id(1)
    offs_h = offs_bh % nheads
    offs_b = offs_bh // nheads
    # initialize offsets
    # we load BLOCK_N keys and values each time, and
    # reshape it to [CHUNKS_PER_BLOCK, CHUNK_SIZE]
    offs_c = tl.arange(0, CHUNKS_PER_BLOCK)
    offs_m = tl.arange(0, CHUNK_SIZE)
    offs_d = tl.arange(0, BLOCK_HEADDIM)

    # K/V pointers laid out as [CHUNKS_PER_BLOCK, CHUNK_SIZE, BLOCK_HEADDIM].
    k_ptrs = (
        K +
        offs_b * stride_kb +
        offs_h * stride_kh +
        (
            (
                start_n * BLOCK_N +
                offs_c[:, None, None] * CHUNK_SIZE +
                offs_m[None, :, None]
            ) * stride_kn +
            offs_d[None, None, :]
        )
    )
    v_ptrs = (
        V +
        offs_b * stride_vb +
        offs_h * stride_vh +
        (
            (
                start_n * BLOCK_N +
                offs_c[:, None, None] * CHUNK_SIZE +
                offs_m[None, :, None]
            ) * stride_vn +
            offs_d[None, None, :]
        )
    )
    param_mu_ptrs = (
        PARAM_MU +
        offs_h * stride_mu_h +
        offs_d[None, None, :]
    )
    param_phi_ptrs = (
        PARAM_PHI +
        offs_h * stride_phi_h +
        offs_d[None, None, :]
    )
    # Softmaxes below run in base-2 exponent space: logits are scaled by
    # log2(e) so tl.exp2 can be used instead of tl.exp.
    log2e = 1.4426950408889634
    if MASK_TYPE == 1:
        m_ptrs = (
            Mask +
            offs_b * stride_mb +
            (
                (
                    start_n * BLOCK_N +
                    offs_c[:, None] * CHUNK_SIZE +
                    offs_m[None, :]
                ) * stride_mn
            )
        )
    # Load keys, masking out-of-range rows / head dims with zeros as needed.
    if EVEN_N:
        if EVEN_HEADDIM:
            k = tl.load(
                k_ptrs
            )
        else:
            k = tl.load(
                k_ptrs,
                mask=offs_d[None, None, :] < headdim,
                other=0.0
            )
    else:
        if EVEN_HEADDIM:
            k = tl.load(
                k_ptrs,
                mask=(
                    start_n * BLOCK_N +
                    offs_c[:, None, None] * CHUNK_SIZE +
                    offs_m[None, :, None]
                ) < seqlen,
                other=0.0
            )
        else:
            k = tl.load(
                k_ptrs,
                mask=(
                    (
                        start_n * BLOCK_N +
                        offs_c[:, None, None] * CHUNK_SIZE +
                        offs_m[None, :, None]
                    ) < seqlen
                ) & (offs_d[None, None, :] < headdim),
                other=0.0
            )

    # RFA-K weights: softmax over each chunk of <k, mu> logits.
    param_mu = tl.load(param_mu_ptrs).to(k.dtype)
    rfa_k_c_w = tl.zeros([CHUNKS_PER_BLOCK, CHUNK_SIZE], dtype=tl.float32)
    rfa_k_c_w += tl.sum(k * param_mu, axis=-1)
    rfa_k_c_w *= log2e
    if MASK_TYPE == 1:
        if EVEN_N:
            mask = tl.load(
                m_ptrs
            )
        else:
            # other=1 so out-of-range positions count as masked.
            mask = tl.load(
                m_ptrs,
                mask=(
                    start_n * BLOCK_N +
                    offs_c[:, None] * CHUNK_SIZE +
                    offs_m[None, :]
                ) < seqlen,
                other=1,
            )
        # True in Mask means "exclude this position".
        rfa_k_c_w = tl.where(mask, float("-inf"), rfa_k_c_w)

    # Numerically stable softmax per chunk; fully-masked rows would give
    # max == -inf and denom == 0, so both are patched to keep the math finite
    # (their output is garbage but is never meaningful downstream).
    m_rfa_k_c_w = tl.max(rfa_k_c_w, axis=-1)
    masked_out_rows_rfa_k = (m_rfa_k_c_w == float("-inf"))
    m_rfa_k_c_w_masked = tl.where(masked_out_rows_rfa_k, 0, m_rfa_k_c_w)
    rfa_k_c_w = tl.exp2(rfa_k_c_w - m_rfa_k_c_w_masked[:, None])
    denom_k = tl.sum(rfa_k_c_w, axis=-1)
    denom_k = tl.where(denom_k == 0.0, 1.0, denom_k)
    rfa_k_c_w = rfa_k_c_w / denom_k[:, None]
    # Weighted average of the chunk's keys -> one aggregated key per chunk.
    rfa_k_c = tl.sum(k * rfa_k_c_w[:, :, None].to(k.dtype), axis=-2)
    # TODO: understand why rematerialize offsets to save registers?
    offs_out_c = start_n * CHUNKS_PER_BLOCK + tl.arange(0, CHUNKS_PER_BLOCK)
    out_rfa_k_ptrs = (
        Out_RFA_K +
        offs_b * stride_ok_b +
        offs_h * stride_ok_h +
        (offs_out_c[:, None] * stride_ok_c + offs_d[None, :])
    )

    if EVEN_N:
        if EVEN_HEADDIM:
            tl.store(
                out_rfa_k_ptrs, rfa_k_c
            )
        else:
            tl.store(
                out_rfa_k_ptrs, rfa_k_c,
                mask=offs_d[None, :] < headdim
            )
    else:
        if EVEN_HEADDIM:
            # Out-of-range chunks are simply not stored.
            tl.store(
                out_rfa_k_ptrs, rfa_k_c,
                mask=offs_out_c[:, None] < nchunks
            )
        else:
            tl.store(
                out_rfa_k_ptrs, rfa_k_c,
                mask=(offs_out_c[:, None] < nchunks) & (offs_d[None, :] < headdim)
            )

    # RFA-V weights: softmax over scaled (<k, phi> - 0.5*||k||^2) logits.
    param_phi = tl.load(param_phi_ptrs).to(k.dtype)
    rfa_v_c_w = tl.zeros([CHUNKS_PER_BLOCK, CHUNK_SIZE], dtype=tl.float32)
    rfa_v_c_w += tl.sum(k * param_phi, axis=-1)
    rfa_v_c_w -= (0.5 * tl.sum(k * k, axis=-1))
    rfa_v_c_w *= log2e * softmax_scale
    if not EVEN_N: # Need to mask out otherwise the softmax is wrong
        rfa_v_c_w += tl.where(
            (
                start_n * BLOCK_N +
                offs_c[:, None] * CHUNK_SIZE +
                offs_m[None, :]
            ) < seqlen,
            0,
            float("-inf")
        )

    if MASK_TYPE == 1:
        rfa_v_c_w = tl.where(mask, float("-inf"), rfa_v_c_w)

    if EVEN_N:
        if EVEN_HEADDIM:
            v = tl.load(
                v_ptrs
            )
        else:
            v = tl.load(
                v_ptrs,
                mask=offs_d[None, None, :] < headdim,
                other=0.0
            )
    else:
        if EVEN_HEADDIM:
            v = tl.load(
                v_ptrs,
                mask=(
                    start_n * BLOCK_N +
                    offs_c[:, None, None] * CHUNK_SIZE +
                    offs_m[None, :, None]
                ) < seqlen,
                other=0.0
            )
        else:
            v = tl.load(
                v_ptrs,
                mask=(
                    (
                        start_n * BLOCK_N +
                        offs_c[:, None, None] * CHUNK_SIZE +
                        offs_m[None, :, None]
                    ) < seqlen
                ) & (offs_d[None, None, :] < headdim),
                other=0.0
            )

    # Same stable-softmax pattern as for the keys above.
    m_rfa_v_c_w = tl.max(rfa_v_c_w, axis=-1)
    masked_out_rows_rfa_v = (m_rfa_v_c_w == float("-inf"))
    m_rfa_v_c_w_masked = tl.where(masked_out_rows_rfa_v, 0, m_rfa_v_c_w)
    rfa_v_c_w = tl.exp2(rfa_v_c_w - m_rfa_v_c_w_masked[:, None])
    denom_v = tl.sum(rfa_v_c_w, axis=-1)
    denom_v = tl.where(denom_v == 0.0, 1.0, denom_v)
    rfa_v_c_w = rfa_v_c_w / denom_v[:, None]
    rfa_v_c = tl.sum(v * rfa_v_c_w[:, :, None].to(v.dtype), axis=-2)

    offs_out_c = start_n * CHUNKS_PER_BLOCK + tl.arange(0, CHUNKS_PER_BLOCK)
    out_rfa_v_ptrs = (
        Out_RFA_V +
        offs_b * stride_ov_b +
        offs_h * stride_ov_h +
        (offs_out_c[:, None] * stride_ov_c + offs_d[None, :])
    )
    if EVEN_N:
        if EVEN_HEADDIM:
            tl.store(
                out_rfa_v_ptrs, rfa_v_c
            )
        else:
            tl.store(
                out_rfa_v_ptrs, rfa_v_c,
                mask=offs_d[None, :] < headdim
            )
    else:
        if EVEN_HEADDIM:
            tl.store(
                out_rfa_v_ptrs, rfa_v_c,
                mask=offs_out_c[:, None] < nchunks
            )
        else:
            tl.store(
                out_rfa_v_ptrs, rfa_v_c,
                mask=(offs_out_c[:, None] < nchunks) & (offs_d[None, :] < headdim)
            )
293
# Backward kernel: given gradients of the aggregated keys/values
# (D_RFA_K / D_RFA_V), recompute the forward chunk softmaxes and produce
# gradients w.r.t. K, V and the per-head parameters mu / phi. Parameter
# gradients are written per launch-group into the *_PARTIAL buffers and
# reduced on the host.
@triton.heuristics(
    {
        "EVEN_N": lambda args: args["seqlen"] % args["BLOCK_N"] == 0,
        "EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"],
    }
)
@triton.jit
def _bwd_eva_prep_kv_kernel(
    RFA_K, # [b, h, c, d]
    RFA_V, # [b, h, c, d]
    K, # [b, h, n, d]
    V, # [b, h, n, d]
    PARAM_MU, # [1, h, 1, 1, d]
    PARAM_PHI, # [1, h, 1, 1, d]
    Mask, # [b, h, n, 1]
    D_RFA_K, # [b, h, c, d]
    D_RFA_V, # [b, h, c, d]
    D_K, # [b, h, n, d]
    D_V, # [b, h, n, d]
    D_PARAM_MU_PARTIAL, # [b, h, g, d]
    D_PARAM_PHI_PARTIAL, # [b, h, g, d]
    softmax_scale,
    stride_rfa_k_b, stride_rfa_k_h, stride_rfa_k_c,
    stride_rfa_v_b, stride_rfa_v_h, stride_rfa_v_c,
    stride_kb, stride_kh, stride_kn,
    stride_vb, stride_vh, stride_vn,
    stride_mu_h,
    stride_phi_h,
    stride_mb, stride_mn,
    stride_d_rfa_k_b, stride_d_rfa_k_h, stride_d_rfa_k_c,
    stride_d_rfa_v_b, stride_d_rfa_v_h, stride_d_rfa_v_c,
    stride_d_k_b, stride_d_k_h, stride_d_k_n,
    stride_d_v_b, stride_d_v_h, stride_d_v_n,
    stride_d_mu_b, stride_d_mu_h, stride_d_mu_g,
    stride_d_phi_b, stride_d_phi_h, stride_d_phi_g,
    nheads,
    seqlen,
    nchunks,
    headdim,
    CHUNKS_PER_BLOCK: tl.constexpr,
    CHUNK_SIZE: tl.constexpr,
    MASK_TYPE: tl.constexpr,
    BLOCK_HEADDIM: tl.constexpr,
    EVEN_N: tl.constexpr,
    EVEN_HEADDIM: tl.constexpr,
    BLOCK_N: tl.constexpr,
):
    start_n = tl.program_id(0)
    offs_bh = tl.program_id(1)
    offs_h = offs_bh % nheads
    offs_b = offs_bh // nheads
    # initialize offsets
    # we load BLOCK_N keys and values each time, and
    # reshape it to [CHUNKS_PER_BLOCK, CHUNK_SIZE]
    offs_c = tl.arange(0, CHUNKS_PER_BLOCK)
    offs_m = tl.arange(0, CHUNK_SIZE)
    offs_d = tl.arange(0, BLOCK_HEADDIM)

    # Chunk indices handled by this program in the [b, h, c, d] tensors.
    offs_rfa_c = start_n * CHUNKS_PER_BLOCK + offs_c

    k_ptrs = (
        K +
        offs_b * stride_kb +
        offs_h * stride_kh +
        (
            (
                start_n * BLOCK_N +
                offs_c[:, None, None] * CHUNK_SIZE +
                offs_m[None, :, None]
            ) * stride_kn +
            offs_d[None, None, :]
        )
    )
    rfa_k_ptrs = (
        RFA_K +
        offs_b * stride_rfa_k_b +
        offs_h * stride_rfa_k_h +
        (offs_rfa_c[:, None] * stride_rfa_k_c + offs_d[None, :])
    )
    rfa_v_ptrs = (
        RFA_V +
        offs_b * stride_rfa_v_b +
        offs_h * stride_rfa_v_h +
        (offs_rfa_c[:, None] * stride_rfa_v_c + offs_d[None, :])
    )

    d_rfa_k_ptrs = (
        D_RFA_K +
        offs_b * stride_d_rfa_k_b +
        offs_h * stride_d_rfa_k_h +
        (offs_rfa_c[:, None] * stride_d_rfa_k_c + offs_d[None, :])
    )
    d_rfa_v_ptrs = (
        D_RFA_V +
        offs_b * stride_d_rfa_v_b +
        offs_h * stride_d_rfa_v_h +
        (offs_rfa_c[:, None] * stride_d_rfa_v_c + offs_d[None, :])
    )

    param_mu_ptrs = (
        PARAM_MU +
        offs_h * stride_mu_h +
        offs_d[None, None, :]
    )
    param_phi_ptrs = (
        PARAM_PHI +
        offs_h * stride_phi_h +
        offs_d[None, None, :]
    )

    # Logits are kept in base-2 exponent space (see forward kernel).
    log2e = 1.4426950408889634
    if MASK_TYPE == 1:
        m_ptrs = (
            Mask +
            offs_b * stride_mb +
            (
                (
                    start_n * BLOCK_N +
                    offs_c[:, None] * CHUNK_SIZE +
                    offs_m[None, :]
                ) * stride_mn
            )
        )
    if EVEN_N:
        if EVEN_HEADDIM:
            k = tl.load(
                k_ptrs
            )
        else:
            k = tl.load(
                k_ptrs,
                mask=offs_d[None, None, :] < headdim,
                other=0.0
            )
    else:
        if EVEN_HEADDIM:
            k = tl.load(
                k_ptrs,
                mask=(
                    start_n * BLOCK_N +
                    offs_c[:, None, None] * CHUNK_SIZE +
                    offs_m[None, :, None]
                ) < seqlen,
                other=0.0
            )
        else:
            k = tl.load(
                k_ptrs,
                mask=(
                    (
                        start_n * BLOCK_N +
                        offs_c[:, None, None] * CHUNK_SIZE +
                        offs_m[None, :, None]
                    ) < seqlen
                ) & (offs_d[None, None, :] < headdim),
                other=0.0
            )

    if EVEN_N:
        if EVEN_HEADDIM:
            rfa_k = tl.load(
                rfa_k_ptrs
            )
        else:
            rfa_k = tl.load(
                rfa_k_ptrs,
                mask=offs_d[None, :] < headdim,
                other=0.0
            )
    else:
        if EVEN_HEADDIM:
            rfa_k = tl.load(
                rfa_k_ptrs,
                mask=offs_rfa_c[:, None] < nchunks,
                other=0.0
            )
        else:
            rfa_k = tl.load(
                rfa_k_ptrs,
                mask=(offs_rfa_c[:, None] < nchunks) & (offs_d[None, :] < headdim),
                other=0.0
            )

    if EVEN_N:
        if EVEN_HEADDIM:
            d_rfa_k = tl.load(
                d_rfa_k_ptrs
            )
        else:
            d_rfa_k = tl.load(
                d_rfa_k_ptrs,
                mask=offs_d[None, :] < headdim,
                other=0.0
            )
    else:
        if EVEN_HEADDIM:
            d_rfa_k = tl.load(
                d_rfa_k_ptrs,
                mask=offs_rfa_c[:, None] < nchunks,
                other=0.0
            )
        else:
            d_rfa_k = tl.load(
                d_rfa_k_ptrs,
                mask=(offs_rfa_c[:, None] < nchunks) & (offs_d[None, :] < headdim),
                other=0.0
            )

    # Recompute the forward mu softmax weights.
    param_mu = tl.load(param_mu_ptrs).to(k.dtype)
    mu_c_w = tl.zeros([CHUNKS_PER_BLOCK, CHUNK_SIZE], dtype=tl.float32)
    mu_c_w += tl.sum(k * param_mu, axis=-1)
    mu_c_w *= log2e

    if not EVEN_N: # Need to mask out otherwise the softmax is wrong
        mu_c_w += tl.where(
            (
                start_n * BLOCK_N +
                offs_c[:, None] * CHUNK_SIZE +
                offs_m[None, :]
            ) < seqlen,
            0,
            float("-inf")
        )

    if MASK_TYPE == 1:
        if EVEN_N:
            mask = tl.load(
                m_ptrs
            )
        else:
            # other=1 so out-of-range positions count as masked.
            mask = tl.load(
                m_ptrs,
                mask=(
                    start_n * BLOCK_N +
                    offs_c[:, None] * CHUNK_SIZE +
                    offs_m[None, :]
                ) < seqlen,
                other=1,
            )
        mu_c_w = tl.where(mask, float("-inf"), mu_c_w)

    # [c, w]
    # Stable softmax; fully-masked rows patched as in the forward kernel.
    m_mu_c_w = tl.max(mu_c_w, axis=-1)
    masked_out_rows_mu = (m_mu_c_w == float("-inf"))
    m_mu_c_w_masked = tl.where(masked_out_rows_mu, 0, m_mu_c_w)
    mu_c_w = tl.exp2(mu_c_w - m_mu_c_w_masked[:, None])
    denom_mu = tl.sum(mu_c_w, axis=-1)
    denom_mu = tl.where(denom_mu == 0.0, 1.0, denom_mu)
    mu_tilde_c_w = mu_c_w / denom_mu[:, None]
    mu_tilde_c_w = mu_tilde_c_w.to(k.dtype)
    # Softmax backward: d_logit = (d_weight - <d_out, out>) * weight.
    # [c, d] [c, w, d] -> [c, w]
    d_mu_tilde_c_w = tl.sum(d_rfa_k[:, None, :] * k, axis=-1)
    # [c, d] [c, d] -> [c]
    d_out_rfa_k_t_rfa_k = tl.sum(d_rfa_k * rfa_k, axis=-1)[:, None]
    d_mu_c_w = (d_mu_tilde_c_w - d_out_rfa_k_t_rfa_k) * mu_tilde_c_w

    # [c, w] [c, w, d] -> [d]
    d_param_mu = tl.sum(tl.sum(d_mu_c_w[:, :, None] * k, axis=0), axis=0)
    # [c, w] [c, d] + [c, w] [1, 1, d] -> [c, w, d]
    d_k = mu_tilde_c_w[:, :, None] * d_rfa_k[:, None, :] + d_mu_c_w[:, :, None] * param_mu

    # Per-group partial gradient for mu; host sums over batch and groups.
    d_param_mu_partial_ptrs = (
        D_PARAM_MU_PARTIAL +
        offs_b * stride_d_mu_b +
        offs_h * stride_d_mu_h +
        start_n * stride_d_mu_g +
        offs_d
    )
    if EVEN_HEADDIM:
        tl.store(
            d_param_mu_partial_ptrs, d_param_mu
        )
    else:
        tl.store(
            d_param_mu_partial_ptrs, d_param_mu,
            mask=offs_d < headdim
        )

    v_ptrs = (
        V +
        offs_b * stride_vb +
        offs_h * stride_vh +
        (
            (
                start_n * BLOCK_N +
                offs_c[:, None, None] * CHUNK_SIZE +
                offs_m[None, :, None]
            ) * stride_vn +
            offs_d[None, None, :]
        )
    )
    if EVEN_N:
        if EVEN_HEADDIM:
            v = tl.load(
                v_ptrs
            )
        else:
            v = tl.load(
                v_ptrs,
                mask=offs_d[None, None, :] < headdim,
                other=0.0
            )
    else:
        if EVEN_HEADDIM:
            v = tl.load(
                v_ptrs,
                mask=(
                    start_n * BLOCK_N +
                    offs_c[:, None, None] * CHUNK_SIZE +
                    offs_m[None, :, None]
                ) < seqlen,
                other=0.0
            )
        else:
            v = tl.load(
                v_ptrs,
                mask=(
                    (
                        start_n * BLOCK_N +
                        offs_c[:, None, None] * CHUNK_SIZE +
                        offs_m[None, :, None]
                    ) < seqlen
                ) & (offs_d[None, None, :] < headdim),
                other=0.0
            )

    if EVEN_N:
        if EVEN_HEADDIM:
            rfa_v = tl.load(
                rfa_v_ptrs
            )
        else:
            rfa_v = tl.load(
                rfa_v_ptrs,
                mask=offs_d[None, :] < headdim,
                other=0.0
            )
    else:
        if EVEN_HEADDIM:
            rfa_v = tl.load(
                rfa_v_ptrs,
                mask=offs_rfa_c[:, None] < nchunks,
                other=0.0
            )
        else:
            rfa_v = tl.load(
                rfa_v_ptrs,
                mask=(offs_rfa_c[:, None] < nchunks) & (offs_d[None, :] < headdim),
                other=0.0
            )

    if EVEN_N:
        if EVEN_HEADDIM:
            d_rfa_v = tl.load(
                d_rfa_v_ptrs
            )
        else:
            d_rfa_v = tl.load(
                d_rfa_v_ptrs,
                mask=offs_d[None, :] < headdim,
                other=0.0
            )
    else:
        if EVEN_HEADDIM:
            d_rfa_v = tl.load(
                d_rfa_v_ptrs,
                mask=offs_rfa_c[:, None] < nchunks,
                other=0.0
            )
        else:
            d_rfa_v = tl.load(
                d_rfa_v_ptrs,
                mask=(offs_rfa_c[:, None] < nchunks) & (offs_d[None, :] < headdim),
                other=0.0
            )

    # Recompute the forward phi softmax weights.
    param_phi = tl.load(param_phi_ptrs).to(k.dtype)
    phi_c_w = tl.zeros([CHUNKS_PER_BLOCK, CHUNK_SIZE], dtype=tl.float32)
    phi_c_w += tl.sum(k * param_phi, axis=-1)
    phi_c_w -= (0.5 * tl.sum(k * k, axis=-1))
    phi_c_w *= log2e * softmax_scale
    if not EVEN_N: # Need to mask out otherwise the softmax is wrong
        phi_c_w += tl.where(
            (
                start_n * BLOCK_N +
                offs_c[:, None] * CHUNK_SIZE +
                offs_m[None, :]
            ) < seqlen,
            0,
            float("-inf")
        )

    if MASK_TYPE == 1:
        phi_c_w = tl.where(mask, float("-inf"), phi_c_w)

    m_phi_c_w = tl.max(phi_c_w, axis=-1)
    masked_out_rows_phi = (m_phi_c_w == float("-inf"))
    m_phi_c_w_masked = tl.where(masked_out_rows_phi, 0, m_phi_c_w)
    phi_c_w = tl.exp2(phi_c_w - m_phi_c_w_masked[:, None])
    denom_phi = tl.sum(phi_c_w, axis=-1)
    denom_phi = tl.where(denom_phi == 0.0, 1.0, denom_phi)
    phi_tilde_c_w = phi_c_w / denom_phi[:, None]
    phi_tilde_c_w = phi_tilde_c_w.to(k.dtype)
    # Softmax backward through the value aggregation.
    d_phi_tilde_c_w = tl.sum(d_rfa_v[:, None, :] * v, axis=-1)
    d_out_rfa_v_t_rfa_v = tl.sum(d_rfa_v * rfa_v, axis=-1)[:, None]
    d_phi_c_w = (d_phi_tilde_c_w.to(tl.float32) - d_out_rfa_v_t_rfa_v.to(tl.float32)) * phi_tilde_c_w

    # softmax_scale enters the phi logits, so it appears in these gradients.
    d_param_phi = tl.sum(tl.sum(d_phi_c_w[:, :, None] * k * softmax_scale, axis=0), axis=0)
    d_v = phi_tilde_c_w[:, :, None] * d_rfa_v[:, None, :]
    # [c, w, d] + [c, w] * [1, 1, d] - [c, w, d]
    # (param_phi - k) comes from d/dk of (<k, phi> - 0.5*||k||^2).
    d_k = d_k + softmax_scale * d_phi_c_w[:, :, None] * (param_phi - k)

    d_k_ptrs = (
        D_K +
        offs_b * stride_d_k_b +
        offs_h * stride_d_k_h +
        (
            (
                start_n * BLOCK_N +
                offs_c[:, None, None] * CHUNK_SIZE +
                offs_m[None, :, None]
            ) * stride_d_k_n +
            offs_d[None, None, :]
        )
    )
    d_v_ptrs = (
        D_V +
        offs_b * stride_d_v_b +
        offs_h * stride_d_v_h +
        (
            (
                start_n * BLOCK_N +
                offs_c[:, None, None] * CHUNK_SIZE +
                offs_m[None, :, None]
            ) * stride_d_v_n +
            offs_d[None, None, :]
        )
    )
    if EVEN_N:
        if EVEN_HEADDIM:
            tl.store(
                d_k_ptrs, d_k
            )
            tl.store(
                d_v_ptrs, d_v
            )
        else:
            tl.store(
                d_k_ptrs, d_k,
                mask=offs_d[None, None, :] < headdim
            )
            tl.store(
                d_v_ptrs, d_v,
                mask=offs_d[None, None, :] < headdim
            )
    else:
        if EVEN_HEADDIM:
            tl.store(
                d_k_ptrs, d_k,
                mask=(
                    (
                        start_n * BLOCK_N +
                        offs_c[:, None, None] * CHUNK_SIZE +
                        offs_m[None, :, None]
                    ) < seqlen
                ),
            )
            tl.store(
                d_v_ptrs, d_v,
                mask=(
                    (
                        start_n * BLOCK_N +
                        offs_c[:, None, None] * CHUNK_SIZE +
                        offs_m[None, :, None]
                    ) < seqlen
                ),
            )
        else:
            tl.store(
                d_k_ptrs, d_k,
                mask=(
                    (
                        start_n * BLOCK_N +
                        offs_c[:, None, None] * CHUNK_SIZE +
                        offs_m[None, :, None]
                    ) < seqlen
                ) & (offs_d[None, None, :] < headdim),
            )
            tl.store(
                d_v_ptrs, d_v,
                mask=(
                    (
                        start_n * BLOCK_N +
                        offs_c[:, None, None] * CHUNK_SIZE +
                        offs_m[None, :, None]
                    ) < seqlen
                ) & (offs_d[None, None, :] < headdim),
            )
    # Per-group partial gradient for phi; host sums over batch and groups.
    d_param_phi_partial_ptrs = (
        D_PARAM_PHI_PARTIAL +
        offs_b * stride_d_phi_b +
        offs_h * stride_d_phi_h +
        start_n * stride_d_phi_g +
        offs_d
    )
    if EVEN_HEADDIM:
        tl.store(
            d_param_phi_partial_ptrs, d_param_phi
        )
    else:
        tl.store(
            d_param_phi_partial_ptrs, d_param_phi,
            mask=offs_d < headdim
        )
813
def triton_eva_prep_kv_fwd(k, v, param_mu, param_phi, mask, softmax_scale, chunksize):
    """Host-side launcher for the forward RFA K/V preparation kernel.

    Splits the length-``seqlen`` key/value sequences into ``seqlen // chunksize``
    chunks and computes, per chunk, one aggregated key and one aggregated value
    (softmax-weighted averages, see ``_fwd_eva_prep_kv_kernel``).

    Args:
        k, v: ``[batch, nheads, seqlen, head_dim]`` CUDA tensors (bf16 or fp32).
        param_mu, param_phi: ``[1, nheads, 1, 1, head_dim]`` per-head parameters.
        mask: optional ``[batch, 1, seqlen, 1]`` bool tensor; True entries are
            excluded from the chunk softmaxes.
        softmax_scale: scale applied to the phi logits; falsy values default to
            ``1/sqrt(head_dim)``.
        chunksize: chunk length; must evenly divide ``seqlen``.

    Returns:
        ``(out_rfa_k, out_rfa_v)``, each ``[batch, nheads, nchunks, head_dim]``.
    """
    # The kernel assumes the head dimension is contiguous.
    k, v, param_mu, param_phi = [
        x if x.stride(-1) == 1 else x.contiguous()
        for x in [k, v, param_mu, param_phi]
    ]

    # shape constraints
    batch, nheads, seqlen, head_dim = k.shape
    assert seqlen % chunksize == 0, "seqlen must be divisible by chunksize"
    nchunks = seqlen // chunksize
    assert k.shape == (batch, nheads, seqlen, head_dim)
    assert v.shape == (batch, nheads, seqlen, head_dim)
    assert param_mu.shape == (1, nheads, 1, 1, head_dim)
    assert param_phi.shape == (1, nheads, 1, 1, head_dim)
    assert head_dim <= 128, "We only test head dimensions up to 128"
    assert k.dtype == v.dtype == param_mu.dtype == param_phi.dtype, "All tensors must have the same type"
    assert k.dtype in [torch.bfloat16, torch.float], "Only support bf16 and fp32 for now"
    assert k.is_cuda and v.is_cuda
    softmax_scale = softmax_scale or 1.0 / math.sqrt(head_dim)

    mask_type = 0
    if mask is not None:
        mask_type = 1
        assert mask.dtype == torch.bool
        assert mask.is_cuda
        assert mask.dim() == 4
        assert mask.shape == (batch, 1, seqlen, 1)
        if mask.stride(-1) != 1:
            mask = mask.contiguous()
    mask_strides = (
        (mask.stride(0), mask.stride(2))
        if mask_type == 1 else
        (0, 0)
    )
    out_rfa_k = torch.empty((batch, nheads, nchunks, head_dim), dtype=k.dtype, device=k.device)
    out_rfa_v = torch.empty((batch, nheads, nchunks, head_dim), dtype=v.dtype, device=v.device)

    BLOCK_HEADDIM = max(triton.next_power_of_2(head_dim), 16)
    BLOCK = 128
    num_warps = 4 if head_dim <= 64 else 8

    # BUGFIX: the previous check, `(BLOCK > chunksize) & (BLOCK % chunksize) == 0`,
    # parsed as `((BLOCK > chunksize) & (BLOCK % chunksize)) == 0` because `&`
    # binds tighter than `==` in Python, so it only required the low bit of the
    # remainder to be zero (e.g. chunksize=48 passed even though 128 % 48 != 0).
    # Divisibility alone also implies chunksize <= BLOCK.
    assert BLOCK % chunksize == 0, "BLOCK must be divisible by chunksize"
    chunks_per_block = BLOCK // chunksize

    # One program per BLOCK_N sequence positions per (batch, head).
    grid = lambda META: (triton.cdiv(seqlen, META["BLOCK_N"]), batch * nheads)
    _fwd_eva_prep_kv_kernel[grid](
        k,
        v,
        param_mu,
        param_phi,
        mask,
        out_rfa_k,
        out_rfa_v,
        softmax_scale,
        k.stride(0), k.stride(1), k.stride(2),
        v.stride(0), v.stride(1), v.stride(2),
        param_mu.stride(1),
        param_phi.stride(1),
        mask_strides[0], mask_strides[1],
        out_rfa_k.stride(0), out_rfa_k.stride(1), out_rfa_k.stride(2),
        out_rfa_v.stride(0), out_rfa_v.stride(1), out_rfa_v.stride(2),
        nheads,
        seqlen,
        nchunks,
        head_dim,
        chunks_per_block,
        chunksize,
        mask_type,
        BLOCK_HEADDIM,
        BLOCK_N=BLOCK,
        num_warps=num_warps,
        num_stages=1,
    )
    return out_rfa_k, out_rfa_v
888
def triton_eva_prep_kv_bwd(
    d_rfa_k, d_rfa_v,
    k, v, param_mu, param_phi,
    mask,
    rfa_k, rfa_v,
    d_k, d_v, d_param_mu, d_param_phi,
    softmax_scale,
    mask_type,
    chunksize
):
    """Host-side launcher for the backward RFA K/V preparation kernel.

    Fills ``d_k``, ``d_v``, ``d_param_mu`` and ``d_param_phi`` in place with the
    gradients of ``triton_eva_prep_kv_fwd`` given output gradients
    ``d_rfa_k`` / ``d_rfa_v``. Per-head parameter gradients are accumulated in
    fp32 partial buffers (one slot per launch group) and reduced on the host.

    Args:
        d_rfa_k, d_rfa_v: ``[batch, nheads, nchunks, head_dim]`` output grads.
        k, v, param_mu, param_phi, mask, softmax_scale, chunksize: the forward
            inputs (same shapes/semantics as in ``triton_eva_prep_kv_fwd``).
        rfa_k, rfa_v: forward outputs, needed by the softmax backward.
        d_k, d_v, d_param_mu, d_param_phi: preallocated gradient tensors,
            written in place.
        mask_type: 1 when ``mask`` is present, else 0.

    Returns:
        None; results are written into the ``d_*`` arguments.
    """
    # The kernel assumes the last (head) dimension is contiguous.
    d_rfa_k, d_rfa_v = [
        x if x.stride(-1) == 1 else x.contiguous()
        for x in [d_rfa_k, d_rfa_v]
    ]

    # shape constraints
    batch, nheads, seqlen, head_dim = k.shape
    assert seqlen % chunksize == 0, "seqlen must be divisible by chunksize"
    nchunks = seqlen // chunksize
    softmax_scale = softmax_scale or 1.0 / math.sqrt(head_dim)

    mask_strides = (
        (mask.stride(0), mask.stride(2))
        if mask_type == 1 else
        (0, 0)
    )

    BLOCK_HEADDIM = max(triton.next_power_of_2(head_dim), 16)
    BLOCK = 128
    num_warps = 4 if head_dim <= 64 else 8

    # BUGFIX: the previous check, `(BLOCK > chunksize) & (BLOCK % chunksize) == 0`,
    # parsed as `((BLOCK > chunksize) & (BLOCK % chunksize)) == 0` because `&`
    # binds tighter than `==` in Python, so it only required the low bit of the
    # remainder to be zero. Divisibility alone also implies chunksize <= BLOCK.
    assert BLOCK % chunksize == 0, "BLOCK must be divisible by chunksize"
    chunks_per_block = BLOCK // chunksize

    # One fp32 partial-gradient slot per launch group to avoid atomics.
    partial_groups = triton.cdiv(seqlen, BLOCK)
    d_param_mu_partial = torch.zeros((batch, nheads, partial_groups, head_dim), dtype=torch.float32, device=d_rfa_k.device)
    d_param_phi_partial = torch.zeros((batch, nheads, partial_groups, head_dim), dtype=torch.float32, device=d_rfa_k.device)
    grid = lambda META: (partial_groups, batch * nheads)
    _bwd_eva_prep_kv_kernel[grid](
        rfa_k, # [b, h, c, d]
        rfa_v, # [b, h, c, d]
        k, # [b, h, n, d]
        v, # [b, h, n, d]
        param_mu, # [1, h, 1, 1, d]
        param_phi, # [1, h, 1, 1, d]
        mask, # [b, h, n, 1]
        d_rfa_k, # [b, h, c, d]
        d_rfa_v, # [b, h, c, d]
        d_k, # [b, h, n, d]
        d_v, # [b, h, n, d]
        d_param_mu_partial, # [b, h, g, d]
        d_param_phi_partial, # [b, h, g, d]
        softmax_scale,
        rfa_k.stride(0), rfa_k.stride(1), rfa_k.stride(2),
        rfa_v.stride(0), rfa_v.stride(1), rfa_v.stride(2),
        k.stride(0), k.stride(1), k.stride(2),
        v.stride(0), v.stride(1), v.stride(2),
        param_mu.stride(1),
        param_phi.stride(1),
        mask_strides[0], mask_strides[1],
        d_rfa_k.stride(0), d_rfa_k.stride(1), d_rfa_k.stride(2),
        d_rfa_v.stride(0), d_rfa_v.stride(1), d_rfa_v.stride(2),
        d_k.stride(0), d_k.stride(1), d_k.stride(2),
        d_v.stride(0), d_v.stride(1), d_v.stride(2),
        d_param_mu_partial.stride(0), d_param_mu_partial.stride(1), d_param_mu_partial.stride(2),
        d_param_phi_partial.stride(0), d_param_phi_partial.stride(1), d_param_phi_partial.stride(2),
        nheads,
        seqlen,
        nchunks,
        head_dim,
        chunks_per_block,
        chunksize,
        mask_type,
        BLOCK_HEADDIM,
        BLOCK_N=BLOCK,
        num_warps=num_warps,
        num_stages=1,
    )
    # Reduce partials over batch and groups: [b, h, g, d] -> [1, h, 1, 1, d].
    d_param_mu.copy_(d_param_mu_partial.sum(dim=(0, -2), keepdim=True).unsqueeze(-2).to(d_param_mu.dtype))
    d_param_phi.copy_(d_param_phi_partial.sum(dim=(0, -2), keepdim=True).unsqueeze(-2).to(d_param_phi.dtype))
971
class EvaPrepKVFunc(torch.autograd.Function):
    """Autograd bridge around the Triton RFA K/V preparation launchers.

    Forward delegates to ``triton_eva_prep_kv_fwd`` and stashes everything the
    backward pass needs; backward delegates to ``triton_eva_prep_kv_bwd``.
    """

    @staticmethod
    def forward(ctx, k, v, param_mu, param_phi, mask, softmax_scale=None, chunksize=None):
        mask_type = 0 if mask is None else 1
        rfa_k, rfa_v = triton_eva_prep_kv_fwd(
            k, v, param_mu, param_phi, mask, softmax_scale, chunksize
        )
        # Keep forward inputs and outputs; backward recomputes the chunk
        # softmaxes from them.
        ctx.save_for_backward(k, v, param_mu, param_phi, mask, rfa_k, rfa_v)
        ctx.softmax_scale = softmax_scale
        ctx.chunksize = chunksize
        ctx.mask_type = mask_type
        return rfa_k, rfa_v

    @staticmethod
    def backward(ctx, d_rfa_k, d_rfa_v):
        k, v, param_mu, param_phi, mask, rfa_k, rfa_v = ctx.saved_tensors
        # Preallocate gradient buffers; the launcher fills them in place.
        d_k, d_v, d_param_mu, d_param_phi = (
            torch.empty_like(t) for t in (k, v, param_mu, param_phi)
        )
        triton_eva_prep_kv_bwd(
            d_rfa_k, d_rfa_v,
            k, v, param_mu, param_phi,
            mask,
            rfa_k, rfa_v,
            d_k, d_v, d_param_mu, d_param_phi,
            ctx.softmax_scale,
            ctx.mask_type,
            ctx.chunksize,
        )
        # No gradients for mask, softmax_scale, or chunksize.
        return d_k, d_v, d_param_mu, d_param_phi, None, None, None
1006
def eva_prep_kv_func_triton(
    k, v,
    param_mu, param_phi,
    mask,
    softmax_scale=None, chunksize=None
):
    """Functional entry point: differentiable chunked RFA K/V preparation.

    Thin wrapper forwarding all arguments to ``EvaPrepKVFunc.apply``.
    """
    return EvaPrepKVFunc.apply(k, v, param_mu, param_phi, mask, softmax_scale, chunksize)
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/eva_pt_ref.py ADDED
@@ -0,0 +1,420 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Tuple, Union
2
+ import torch
3
+ from torch import nn
4
+
5
+ MASK_MIN_VALUE = -10e10
6
+
7
def rotate_half(x: torch.Tensor) -> torch.Tensor:
    """
    Rotate the last dimension of *x* by half its width.

    The first half of the last dim is moved to the back, and the (negated)
    second half is moved to the front: ``[x1, x2] -> [-x2, x1]``.

    Args:
        x: Rotary embedded tensor
    Return:
        Tensor with half of last dim negated and rotated to the front.
    """
    half_width = x.shape[-1] // 2
    first_half, second_half = x.split(half_width, dim=-1)
    return torch.cat((-second_half, first_half), dim=-1)
17
+
18
def apply_rotary_pos_emb(q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor,
                         position_ids: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Apply rotary embedding (cos, sin) to the query and key tensor on the sequence dimension.

    The legends for dimensions are defined as:
        num_heads: number of attention heads
        current_seq_len: the current batch's sequence length, should be either 1 or max_seq_len
        max_seq_len: the static sequence length, different from current_seq_len in cached inference case where it is always
            maximum length, e.g. the length of static sequence length of KV cache

    Args:
        q: Query tensor, of size (batch_size, num_heads, current_seq_len, head_dim)
        k: Key tensor, of size (batch_size, num_key_value_heads, current_seq_len, head_dim)
        cos: Cosine base of rotary embedding, 4-D and broadcastable against q/k
            with head_dim as its last dimension (the code checks cos.shape[3]).
        sin: Sine base of rotary embedding, same layout as cos.
        position_ids: The position indices of the tokens corresponding to the query and key tensors. It has a size of
            (batch_size, current_seq_len) or (1, current_seq_len). NOTE: only
            shape-validated here; cos/sin are assumed to be pre-gathered for
            these positions.

    Returns:
        Tuple of (embedded query, embedded key), each the same size as its input.
    """
    # BUGFIX (annotation only): the function returns a 2-tuple, but was
    # annotated `-> torch.Tensor`; the docstring also described cos as 2-D
    # while the assertion below requires a 4-D tensor.
    bs, nheads, cur_seq_len, head_dim = q.shape
    assert len(
        k.shape) == 4, f"k should be of shape (batch_size, num_heads, current_seq_len, head_dim), got {k.shape} instead"
    assert k.shape[0] == bs, f"k has a different batch_size {k.shape[0]} compared to q {bs}"
    assert list(k.shape[2:]) == [cur_seq_len,
                                 head_dim], f"k has different current_seq_len and/or head_dim compared to q"
    assert cos.shape[3] == head_dim, f"cos should have dim of head dim {head_dim}, got {cos.shape[3]} instead"
    assert list(position_ids.shape) in [[bs, cur_seq_len], [1, cur_seq_len]],\
        f"position_ids should be of shape {[bs, cur_seq_len]} or {[1, cur_seq_len]}, got {position_ids.shape} instead"

    # Standard rotary transform: x' = x*cos + rotate_half(x)*sin.
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
55
+
56
def attention_op(
    q,
    k,
    v,
    attn_mask,
    mixedp_attn,
    head_dim_scaling
):
    """Plain scaled-dot-product attention: softmax(scale * q k^T + mask) @ v.

    Args:
        q, k, v: attention inputs; the last two dims are (seq, head_dim).
        attn_mask: optional boolean mask; True positions are filled with
            MASK_MIN_VALUE before the softmax.
        mixedp_attn: when True, scores are promoted to fp32 before scaling
            and softmax for numerical stability.
        head_dim_scaling: multiplicative score scale (typically head_dim**-0.5).

    Returns:
        Attention output with the same dtype as q.
    """
    scores = q @ k.transpose(-2, -1)
    if mixedp_attn:
        scores = scores.to(torch.float)
    scores = scores * head_dim_scaling
    if attn_mask is not None:
        scores = scores.masked_fill(attn_mask, MASK_MIN_VALUE)

    probs = torch.softmax(scores, dim=-1).to(q.dtype)
    return probs @ v
74
+
75
def prm_projection(
    x: torch.Tensor,
    projection_matrix: torch.Tensor,
    mixedp_attn: bool = False
):
    """
    Constructs nonnegative kernel features for fast softmax attention.

    Computes ``d**-0.5 * (P x^T - 0.5*||x||^2)`` where P is the projection
    matrix and d is the feature dimension of x.

    Args:
        x: input for which features are computed, shape [..., m, d]
        projection_matrix: random matrix used to compute features, shape [..., r, d]
        mixedp_attn: promote intermediates to fp32 before combining.
    Returns:
        Random features for fast attention, shape [..., r, m].
    """
    inv_sqrt_d = (x.shape[-1] ** -0.5)
    projected = projection_matrix @ x.transpose(-1, -2)  # [..., r, m]
    half_sq_norm = torch.sum(x ** 2, dim=-1).unsqueeze(-2) * 0.5  # [..., 1, m]
    if mixedp_attn:
        projected = projected.to(torch.float)
        half_sq_norm = half_sq_norm.to(torch.float)
    return inv_sqrt_d * (projected - half_sq_norm)
98
+
99
class EvaAttention(nn.Module):
    """EVA attention layer.

    Combines two pathways over the sequence:
      * exact softmax attention within fixed-size local windows
        (the "singleton" path), and
      * compressed chunk-level summaries built via random-feature
        attention (RFA), giving each query access to history outside
        its window.

    Shape legend used in comments: b=batch, h=heads, w=windows, c=chunks,
    i/j/m=token positions, d=head_dim, r=random-feature dim.
    """

    def __init__(self, config, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        # 1/sqrt(d): standard attention temperature, reused as RFA scaling
        self.head_dim_scaling = self.head_dim ** -0.5

        self.max_position_embeddings = config.max_position_embeddings

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

        self.window_size = config.window_size

        self.num_chunks = config.num_chunks
        self.chunk_size = config.chunk_size
        if self.chunk_size is not None:
            # every window must consist of a whole number of chunks
            assert self.window_size >= self.chunk_size and self.window_size % self.chunk_size == 0
            # chunk_size overrides the number of landmarks
            self.num_chunks = None

        # NOTE(review): assumes config.chunk_size is not None; a None
        # chunk_size would raise a TypeError on this division — confirm
        # that all shipped configs set chunk_size.
        self.chunks_per_window = int(self.window_size // self.chunk_size)
        self.random_feature_dim = 1
        # learnable projection used to build the random features (phi)
        self.adaptive_phi = nn.Parameter(
            torch.randn(
                1,
                self.num_heads,
                1,
                1,
                self.head_dim
            ).clamp(-1., 1.) * self.head_dim_scaling
        )
        # learnable query that pools each chunk's keys into a landmark key
        self.adaptive_mu_k = nn.Parameter(
            torch.randn(
                1,
                self.num_heads,
                1,
                1,
                self.head_dim
            ).clamp(-1., 1.) * self.head_dim_scaling
        )

    def _generate_feature_map(self, rf_q, rf_k, rf_v):
        """Return (random-feature weights, per-chunk landmark keys).

        ``rf_k_bar`` is a softmax-weighted average of each chunk's keys,
        scored against the learned query ``adaptive_mu_k``.
        """
        rf_k_logits = torch.sum(self.adaptive_mu_k.to(rf_k.dtype) * rf_k, dim=-1, keepdim=True) # b h c m 1
        if self.config.mixedp_attn:
            # run the softmax in fp32 for numerical stability
            rf_k_logits = rf_k_logits.to(torch.float)
        rf_k_weights = torch.softmax(rf_k_logits, dim=-2).to(rf_k.dtype)
        rf_k_bar = torch.sum(rf_k_weights * rf_k, dim=-2)
        weights = self.adaptive_phi.to(rf_k.dtype)
        return weights, rf_k_bar

    def _calculate_chunk_rfa_cache(self, rf_q, rf_k, rf_v, weights, rf_mask=None):
        """Compute per-chunk softmax(phi(k))-weighted value summaries.

        Returns ``(softmax_phi_k_v, log_sum_phi_k)``; the normalizer
        ``log_sum_phi_k`` is only needed when ``random_feature_dim > 1``
        and is None here (r == 1).
        """
        # log phi(k) = (w . k - ||k||^2 / 2) / sqrt(d), same kernel as prm_projection
        proj_x = torch.sum(weights * rf_k, dim=-1, keepdim=True)
        norm = torch.sum(rf_k ** 2, dim=-1, keepdim=True) * 0.5 # [b, h, c, m, 1]
        if self.config.mixedp_attn:
            proj_x = proj_x.to(torch.float)
            norm = norm.to(torch.float)
        log_phi_k = self.head_dim_scaling * (proj_x - norm)

        if rf_mask is not None:
            # masked positions must not contribute to the chunk summary
            log_phi_k = log_phi_k.masked_fill(rf_mask, MASK_MIN_VALUE)

        # [b, h, c, m, r]: normalize over positions m within each chunk
        softmax_phi_k = torch.softmax(log_phi_k, dim=-2).to(rf_k.dtype)
        softmax_phi_k_v = torch.sum(softmax_phi_k * rf_v, dim=-2)
        log_sum_phi_k = None
        return softmax_phi_k_v, log_sum_phi_k

    def _calculate_chunk_rfa(self, q, softmax_phi_k_v, log_sum_phi_k, weights):
        """Turn cached chunk summaries into per-chunk RFA outputs for ``q``."""
        if self.random_feature_dim == 1:
            # when r = 1, the snis weights becomes 1, so this takes no effect
            # [b, h, c, r, d] -> [b, h, c, d]
            return softmax_phi_k_v
        else:
            # [b, h, c, r, d] [b, h, 1, s, d] -> [b, h, c, r, s]
            log_phi_q = prm_projection(q.unsqueeze(-3), weights, self.config.mixedp_attn)
            # self-normalized importance-sampling weights over random features
            # [b, h, c, r, s] [b, h, c, r, 1] -> [b, h, c, r, s]
            sniw = torch.softmax(log_phi_q + log_sum_phi_k, dim=-1).to(q.dtype)
            # [b, h, c, r, s] [b, h, c, r, d] -> [b, h, c, s, d] -> [b, h, s, c, d]
            rfa_per_chunk = torch.matmul(sniw.transpose(-1, -2), softmax_phi_k_v).transpose(-3, -2)
            return rfa_per_chunk

    def window_partition(self, x, window_size=None):
        """Split the position dim into windows: [..., g*w, d] -> [..., g, w, d]."""
        window_size = window_size if window_size is not None else self.window_size

        gw, d = x.shape[-2:]
        leading_dims = x.shape[:-2]
        n_groups = gw // window_size
        return x.reshape(*leading_dims, n_groups, window_size, d)

    def window_merge(self, x, window_size=None):
        """Inverse of ``window_partition``: [..., g, w, d] -> [..., g*w, d]."""
        g, w, d = x.shape[-3:]
        leading_dims = x.shape[:-3]
        return x.reshape(*leading_dims, g * w, d)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cos: Optional[torch.Tensor] = None,
        sin: Optional[torch.Tensor] = None,
        multibyte_decoding: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Run EVA attention.

        ``attention_mask`` is a tuple: length 4 when decoding with a cache
        (prev/cur singleton masks + chunk + intra-chunk masks), length 3 at
        training time (window + chunk + intra-chunk masks).

        Returns ``(attn_output, None, past_key_value)``; attention weights
        are never materialized (``output_attentions`` must be False).
        """
        assert not output_attentions
        bsz, q_len, _ = hidden_states.size()

        ############################################
        # initialize past states if not provided
        ############################################
        if use_cache and past_key_value is None:
            raise ValueError("`use_cache=True` requires `past_key_value` to be provided")
        if use_cache and multibyte_decoding:
            raise NotImplementedError("Multibyte decoding is not supported for PyTorch native implementation")
        if len(attention_mask) == 4:
            assert use_cache
            prev_causal_mask, cur_causal_mask, chunk_causal_mask, intra_chunk_mask = attention_mask
        elif len(attention_mask) == 3:
            assert not use_cache
            # bug fix: the first entry was previously unpacked as
            # `window_causal_mask` but read below as `prev_causal_mask`,
            # raising NameError in the no-cache (training) path.
            prev_causal_mask, chunk_causal_mask, intra_chunk_mask = attention_mask
        else:
            # bug fix: message previously claimed "length 2 or 3" while the
            # branches above accept lengths 3 and 4.
            raise NotImplementedError("Only attention-mask tuple with length 3 or 4 is supported")

        ############################################
        # compute q, k, v from hidden states
        ############################################
        # [b, h, q_len, d]
        q = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        # [b, h, kv_len, d]
        k = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        # [b, h, kv_len, d]
        v = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)

        if use_cache:
            past_key_value.update_past_len(q.shape[-2], self.layer_idx)

        ############################################
        # apply rotary positional embeddings to q, k
        ############################################
        q, k = apply_rotary_pos_emb(q, k, cos, sin, position_ids)

        ############################################
        # compute q, k, v stats for the local window
        ############################################
        if use_cache:
            (prev_w_q, prev_w_k, prev_w_v), (cur_w_q, cur_w_k, cur_w_v) = past_key_value.update_singletons(
                q,
                k,
                v,
                self.layer_idx,
                self.window_size,
            )
        else:
            prev_w_q = self.window_partition(q) # [b, h, w, i, d]
            prev_w_k = self.window_partition(k) # [b, h, w, j, d]
            prev_w_v = self.window_partition(v) # [b, h, w, j, d]
            # during training, we assume window_size divides seq_len so no remainders
            cur_w_q = cur_w_k = cur_w_v = None

        ############################################
        # compute q, k, v stats for chunk-level RFAs
        ############################################
        if use_cache:
            dump_q, dump_k, dump_v = past_key_value.update_chunks(q, k, v, self.layer_idx, self.chunk_size)
        else:
            dump_q, dump_k, dump_v = q, k, v

        if use_cache:
            prev_s_mask, cur_s_mask, prev_chunk_mask, cur_chunk_mask, dump_rf_mask = past_key_value.update_mask(
                prev_s_mask=prev_causal_mask,
                cur_s_mask=cur_causal_mask,
                chunk_mask=chunk_causal_mask,
                rf_mask=intra_chunk_mask,
                layer_idx=self.layer_idx,
                window_size=self.window_size,
                chunk_size=self.chunk_size,
            )
        else:
            prev_s_mask = self.window_partition(prev_causal_mask) # [1, 1, w, i, j]
            cur_s_mask = None
            prev_chunk_mask = self.window_partition(chunk_causal_mask)
            cur_chunk_mask = None
            dump_rf_mask = intra_chunk_mask
            if prev_s_mask.shape[-3] == 1:
                # broadcast the singleton-window mask across all windows
                prev_s_mask = prev_s_mask.expand(-1, -1, prev_chunk_mask.shape[-3], -1, -1)

        if (
            dump_q is not None and
            dump_k is not None and
            dump_v is not None
        ):
            # [b, h, c, j, d]
            rf_q = self.window_partition(dump_q, window_size=self.chunk_size)
            # [b, h, c, j, d]
            rf_k = self.window_partition(dump_k, window_size=self.chunk_size)
            # [b, h, c, j, d]
            rf_v = self.window_partition(dump_v, window_size=self.chunk_size)

            if dump_rf_mask is not None:
                rf_mask = self.window_partition(dump_rf_mask, window_size=self.chunk_size)
                # zero out masked positions so they do not leak into summaries
                rf_q = rf_q.masked_fill(rf_mask, 0.)
                rf_k = rf_k.masked_fill(rf_mask, 0.)
                rf_v = rf_v.masked_fill(rf_mask, 0.)
            else:
                rf_mask = None
        else:
            rf_q = None
            rf_k = None
            rf_v = None
            rf_mask = None

        if rf_q is not None:
            weights, rf_k_bar = self._generate_feature_map(rf_q, rf_k, rf_v)
            softmax_phi_k_v, log_sum_phi_k = self._calculate_chunk_rfa_cache(rf_q, rf_k, rf_v, weights, rf_mask=rf_mask)
            if use_cache:
                softmax_phi_k_v, log_sum_phi_k, rf_k_bar = past_key_value.update_chunk_rfas(
                    softmax_phi_k_v, log_sum_phi_k, rf_k_bar, self.layer_idx, 1
                )
        elif use_cache:
            # no new full chunk this step: reuse cached chunk summaries
            weights = None
            softmax_phi_k_v, log_sum_phi_k, rf_k_bar = past_key_value.get_chunk_rfas(self.layer_idx)
        else:
            weights = None
            softmax_phi_k_v = None
            log_sum_phi_k = None
            rf_k_bar = None

        if rf_k_bar is not None:
            rfa_per_chunk = self._calculate_chunk_rfa(q, softmax_phi_k_v, log_sum_phi_k, weights)
        ############################################
        # compute meta-attention weights for
        # - group-wise RFAs and
        # - singletons (equivalent to exact local attention)
        ############################################
        if prev_w_k is not None:
            if rf_k_bar is not None:
                num_windows = prev_w_k.shape[-3]
                # rf_k_bar and rfa_per_chunk take the shape [b, h, c, d]
                # -> [b, h, 1, c, d] -> [b, h, w, c, d]
                prev_rf_k_bar = rf_k_bar.unsqueeze(-3).expand(-1, -1, num_windows, -1, -1)
                prev_rfa_per_chunk = rfa_per_chunk.unsqueeze(-3).expand(-1, -1, num_windows, -1, -1)
                # append landmark keys/values after the window singletons
                prev_agg_k = torch.cat([prev_w_k, prev_rf_k_bar], dim=-2)
                prev_agg_v = torch.cat([prev_w_v, prev_rfa_per_chunk], dim=-2)

                prev_attn_mask = torch.cat([prev_s_mask, prev_chunk_mask], dim=-1)
            else:
                prev_agg_k = prev_w_k
                prev_agg_v = prev_w_v
                prev_attn_mask = prev_s_mask

            prev_attn_output = attention_op(
                q=prev_w_q,
                k=prev_agg_k,
                v=prev_agg_v,
                attn_mask=prev_attn_mask,
                mixedp_attn=self.config.mixedp_attn,
                head_dim_scaling=self.head_dim_scaling
            )
            prev_attn_output = self.window_merge(prev_attn_output)

        if cur_w_k is not None:
            if rf_k_bar is not None:
                # rf_k_bar and rfa_per_chunk take the shape [b, h, c, d]
                # cur_w_k and cur_w_v also has shape [b, h, m, d]
                cur_agg_k = torch.cat([cur_w_k, rf_k_bar], dim=-2)
                cur_agg_v = torch.cat([cur_w_v, rfa_per_chunk], dim=-2)

                cur_attn_mask = torch.cat([cur_s_mask, cur_chunk_mask], dim=-1)
            else:
                cur_agg_k = cur_w_k
                cur_agg_v = cur_w_v
                cur_attn_mask = cur_s_mask

            cur_attn_output = attention_op(
                q=cur_w_q,
                k=cur_agg_k,
                v=cur_agg_v,
                attn_mask=cur_attn_mask,
                mixedp_attn=self.config.mixedp_attn,
                head_dim_scaling=self.head_dim_scaling
            )

        if prev_w_k is not None and cur_w_k is not None:
            attn_output = torch.cat([prev_attn_output, cur_attn_output], dim=-2)
        elif prev_w_k is not None:
            attn_output = prev_attn_output
        elif cur_w_k is not None:
            attn_output = cur_attn_output
        else:
            raise ValueError("There must be some bug")

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).reshape(bsz, q_len, self.hidden_size)
        attn_output = self.o_proj(attn_output)

        attn_weights = None

        return attn_output, attn_weights, past_key_value
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.47.1"
7
+ }
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/image_processing_evabyte.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ """Image processor class for EvaByte."""
3
+
4
+ from typing import Dict, List, Optional, Union, Tuple
5
+
6
+ import io
7
+ from transformers.image_processing_utils import BaseImageProcessor
8
+ from transformers.image_utils import (
9
+ ImageInput,
10
+ PILImageResampling,
11
+ valid_images,
12
+ validate_preprocess_arguments,
13
+ )
14
+ from PIL import Image
15
+
16
def _get_qtable_bytes():
    """Return hard-coded JPEG quantization-table segments keyed by quality.

    Each value is a minimal JPEG byte stream: SOI (``\\xff\\xd8``) followed by
    two DQT segments (``\\xff\\xdb``, luminance then chrominance tables) and
    EOI (``\\xff\\xd9``). ``jpeg_merge_qtables`` splices these tables into an
    abbreviated JPEG stream that was saved without them.
    NOTE(review): presumably these were captured from Pillow's encoder output
    at the given quality levels — confirm before editing the byte payloads.
    """
    return {
        5: b'\xff\xd8\xff\xdb\x00C\x00\xa0nx\x8cxd\xa0\x8c\x82\x8c\xb4\xaa\xa0\xbe\xf0\xff\xff\xf0\xdc\xdc\xf0\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xdb\x00C\x01\xa0\xb4\xb4\xf0\xd2\xf0\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xd9',
        10: b'\xff\xd8\xff\xdb\x00C\x00P7<F<2PFAFZUP_x\xc8\x82xnnx\xf5\xaf\xb9\x91\xc8\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xdb\x00C\x01PZZxix\xeb\x82\x82\xeb\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xd9',
        15: b'\xff\xd8\xff\xdb\x00C\x005%(/(!5/+/<95?P\x85WPIIP\xa3u{a\x85\xc1\xaa\xcb\xc8\xbe\xaa\xba\xb7\xd5\xf0\xff\xff\xd5\xe2\xff\xe6\xb7\xba\xff\xff\xff\xff\xff\xff\xff\xff\xff\xce\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xdb\x00C\x015<<PFP\x9dWW\x9d\xff\xdc\xba\xdc\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xd9',
        20: b'\xff\xd8\xff\xdb\x00C\x00(\x1c\x1e#\x1e\x19(#!#-+(0<dA<77<{X]Id\x91\x80\x99\x96\x8f\x80\x8c\x8a\xa0\xb4\xe6\xc3\xa0\xaa\xda\xad\x8a\x8c\xc8\xff\xcb\xda\xee\xf5\xff\xff\xff\x9b\xc1\xff\xff\xff\xfa\xff\xe6\xfd\xff\xf8\xff\xdb\x00C\x01(--<5<vAAv\xf8\xa5\x8c\xa5\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xff\xd9',
        25: b'\xff\xd8\xff\xdb\x00C\x00 \x16\x18\x1c\x18\x14 \x1c\x1a\x1c$" &0P40,,0bFJ:Ptfzxrfpn\x80\x90\xb8\x9c\x80\x88\xae\x8anp\xa0\xda\xa2\xae\xbe\xc4\xce\xd0\xce|\x9a\xe2\xf2\xe0\xc8\xf0\xb8\xca\xce\xc6\xff\xdb\x00C\x01 $$0*0^44^\xc6\x84p\x84\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xff\xd9',
        30: b'\xff\xd8\xff\xdb\x00C\x00\x1b\x12\x14\x17\x14\x11\x1b\x17\x16\x17\x1e\x1c\x1b (B+(%%(Q:=0B`Ued_U][jx\x99\x81jq\x90s[]\x85\xb5\x86\x90\x9e\xa3\xab\xad\xabg\x80\xbc\xc9\xba\xa6\xc7\x99\xa8\xab\xa4\xff\xdb\x00C\x01\x1b\x1e\x1e(#(N++N\xa4n]n\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xff\xd9',
        50: b'\xff\xd8\xff\xdb\x00C\x00\x10\x0b\x0c\x0e\x0c\n\x10\x0e\r\x0e\x12\x11\x10\x13\x18(\x1a\x18\x16\x16\x181#%\x1d(:3=<9387@H\\N@DWE78PmQW_bghg>Mqypdx\\egc\xff\xdb\x00C\x01\x10\x12\x12\x18\x15\x18/\x1a\x1a/cB8Bcccccccccccccccccccccccccccccccccccccccccccccccccc\xff\xd9',
        75: b'\xff\xd8\xff\xdb\x00C\x00\x08\x06\x06\x07\x06\x05\x08\x07\x07\x07\t\t\x08\n\x0c\x14\r\x0c\x0b\x0b\x0c\x19\x12\x13\x0f\x14\x1d\x1a\x1f\x1e\x1d\x1a\x1c\x1c $.\' ",#\x1c\x1c(7),01444\x1f\'9=82<.342\xff\xdb\x00C\x01\x08\t\t\x0c\x0b\x0c\x18\r\r\x182!\x1c!22222222222222222222222222222222222222222222222222\xff\xd9',
        95: b'\xff\xd8\xff\xdb\x00C\x00\x02\x01\x01\x01\x01\x01\x02\x01\x01\x01\x02\x02\x02\x02\x02\x04\x03\x02\x02\x02\x02\x05\x04\x04\x03\x04\x06\x05\x06\x06\x06\x05\x06\x06\x06\x07\t\x08\x06\x07\t\x07\x06\x06\x08\x0b\x08\t\n\n\n\n\n\x06\x08\x0b\x0c\x0b\n\x0c\t\n\n\n\xff\xdb\x00C\x01\x02\x02\x02\x02\x02\x02\x05\x03\x03\x05\n\x07\x06\x07\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\xff\xd9',
        100: b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\xff\xdb\x00C\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\xff\xd9',
    }
29
+
30
+
31
+ def _resize_if_exceeding_max_len(
32
+ width: int, height: int, min_len: Optional[int] = 16, max_len: Optional[int] = None
33
+ ) -> Tuple[int, int]:
34
+ """
35
+ Get the output size of the image after resizing given a dictionary specifying the max and min sizes.
36
+
37
+ Args:
38
+ height (`int`):
39
+ Height of the input image.
40
+ width (`int`):
41
+ Width of the input image.
42
+ max_len (`Dict[str, int]`, *optional*, defaults to the maximum size of the image):
43
+ Defines the maximum dimensions of the image.
44
+
45
+ Returns:
46
+ The output size of the image after resizing.
47
+ """
48
+ max_len = max(height, width) if max_len is None else max_len
49
+ aspect_ratio = width / height
50
+
51
+ if width >= height and width > max_len:
52
+ width = max_len
53
+ height = int(width / aspect_ratio)
54
+ if height % 2 != 0:
55
+ height += 1
56
+ elif height > width and height > max_len:
57
+ height = max_len
58
+ width = int(height * aspect_ratio)
59
+ if width % 2 != 0:
60
+ width += 1
61
+
62
+ # Avoid resizing to a size smaller than 1
63
+ height = max(height, min_len)
64
+ width = max(width, min_len)
65
+ return width, height
66
+
67
class EvaByteImageProcessor(BaseImageProcessor):
    """Image processor that converts images into JPEG byte streams for EvaByte.

    Unlike tensor-producing processors, ``preprocess`` returns raw JPEG bytes
    (one list of byte strings per sample); the byte-level model consumes the
    JPEG stream directly.
    """

    # no tensor inputs are produced, so no model input names are declared
    model_input_names = []

    def __init__(
        self,
        do_resize: bool = True,
        resample: PILImageResampling = PILImageResampling.LANCZOS,
        size: Optional[Dict[str, int]] = None,
        do_convert_rgb: bool = True,
        jpeg_quality: int = 25,
        jpeg_subsampling: str = "4:2:0",
        jpeg_streamtype: int = 2,
        jpeg_restart_marker_blocks: int = 1,
        **kwargs,
    ) -> None:
        # jpeg_streamtype / jpeg_restart_marker_blocks are passed straight
        # to Pillow's JPEG encoder; NOTE(review): streamtype=2 presumably
        # selects an abbreviated stream without quantization tables (hence
        # `jpeg_merge_qtables`) — confirm against Pillow's JpegEncode options.
        super().__init__(**kwargs)
        self.do_resize = do_resize
        self.resample = resample
        self.size = size if size is not None else {"longest_edge": 384}
        self.do_convert_rgb = do_convert_rgb
        self.jpeg_quality = jpeg_quality
        self.jpeg_subsampling = jpeg_subsampling
        self.jpeg_streamtype = jpeg_streamtype
        self.jpeg_restart_marker_blocks = jpeg_restart_marker_blocks

    def jpeg_encode(
        self,
        image,
        jpeg_quality,
        jpeg_subsampling,
        jpeg_streamtype,
        jpeg_restart_marker_blocks,
    ):
        """Encode a PIL image to JPEG bytes with the given encoder options."""
        with io.BytesIO() as output:
            image.save(
                output,
                format="JPEG",
                quality=jpeg_quality,
                subsampling=jpeg_subsampling,
                streamtype=jpeg_streamtype,
                restart_marker_blocks=jpeg_restart_marker_blocks
            )
            jpeg_bytes = output.getvalue()
        return jpeg_bytes

    def jpeg_merge_qtables(
        self,
        image_bytes,
        jpeg_quality=None,
    ):
        """Splice the quality-matched quantization tables into a JPEG stream.

        The stored table blob starts with SOI (2 bytes) and ends with EOI
        (2 bytes); both are stripped so the DQT segments land right after
        the SOI marker of ``image_bytes``.
        """
        if jpeg_quality is None:
            jpeg_quality = self.jpeg_quality
        qtable_bytes = _get_qtable_bytes()[jpeg_quality]
        return image_bytes[:2] + qtable_bytes[2:-2] + image_bytes[2:]

    def resize(
        self,
        image: Image.Image,
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.LANCZOS,
    ) -> Image.Image:
        """Resize a PIL image according to ``size``.

        ``size`` either caps the longest edge (key ``"longest_edge"``) while
        preserving aspect ratio, or gives explicit ``"width"``/``"height"``.
        """
        if "longest_edge" in size:
            width, height = image.size
            # Find the output size, when rescaling the longest edge to max_len and preserving the aspect ratio
            width, height = _resize_if_exceeding_max_len(width, height, max_len=size["longest_edge"])
            size = (width, height)
        elif "width" in size and "height" in size:
            size = (size["width"], size["height"])
        else:
            raise ValueError("size must be a dictionary with key 'longest_edge' or 'height' and 'width'.")
        resized_image = image.resize(size, resample=resample)
        return resized_image

    def preprocess(
        self,
        images: ImageInput,
        do_resize: Optional[bool] = None,
        resample = None,
        size: Optional[Dict[str, int]] = None,
        do_convert_rgb: Optional[bool] = None,
        jpeg_quality: Optional[int] = None,
        jpeg_subsampling: Optional[str] = None,
        jpeg_streamtype: Optional[int] = None,
        jpeg_restart_marker_blocks: Optional[int] = None,
    ):
        """Convert nested lists of images into nested lists of JPEG bytes.

        ``images`` is expected to be a list of lists (one inner list per
        sample). Every argument falls back to the processor's configured
        default when None. Returns the same nesting with each image replaced
        by its encoded JPEG byte string.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        resample = resample if resample is not None else self.resample
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

        jpeg_quality = jpeg_quality if jpeg_quality is not None else self.jpeg_quality
        jpeg_subsampling = jpeg_subsampling if jpeg_subsampling is not None else self.jpeg_subsampling
        jpeg_streamtype = jpeg_streamtype if jpeg_streamtype is not None else self.jpeg_streamtype
        jpeg_restart_marker_blocks = jpeg_restart_marker_blocks if jpeg_restart_marker_blocks is not None else self.jpeg_restart_marker_blocks

        if images is not None and not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        validate_preprocess_arguments(
            do_resize=do_resize,
            size=size,
            resample=resample,
        )
        images_list = images
        if do_convert_rgb:
            # normalize every image to 3-channel RGB before JPEG encoding
            images_list = [
                [
                    image.convert("RGB") for image in images
                ]
                for images in images_list
            ]

        if do_resize:
            images_list = [
                [
                    self.resize(image=image, size=size, resample=resample)
                    for image in images
                ]
                for images in images_list
            ]

        jpeg_bytes = [
            [
                self.jpeg_encode(
                    image,
                    jpeg_quality,
                    jpeg_subsampling,
                    jpeg_streamtype,
                    jpeg_restart_marker_blocks
                ) for image in images
            ]
            for images in images_list
        ]
        return jpeg_bytes
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/model.safetensors.index.json ADDED
@@ -0,0 +1,450 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 57058938880
4
+ },
5
+ "weight_map": {
6
+ "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
7
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
8
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
9
+ "model.layers.1.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
10
+ "model.layers.1.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
11
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
12
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
13
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
14
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
15
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
16
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
17
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
18
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
19
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
20
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
21
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
22
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
23
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
24
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
25
+ "model.layers.13.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
26
+ "model.layers.13.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
27
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
28
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
29
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
30
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
31
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
32
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
33
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
34
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
35
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
36
+ "model.layers.21.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
37
+ "model.layers.21.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
38
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
39
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
40
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
41
+ "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
42
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
43
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
44
+ "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
45
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors",
46
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
47
+ "model.layers.29.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
48
+ "model.layers.29.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
49
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
50
+ "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
51
+ "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
52
+ "model.layers.32.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
53
+ "model.layers.32.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
54
+ "model.layers.34.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
55
+ "model.layers.36.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
56
+ "model.layers.36.input_layernorm.weight": "model-00003-of-00003.safetensors",
57
+ "model.layers.36.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
58
+ "model.layers.37.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
59
+ "model.layers.37.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
60
+ "model.layers.37.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
61
+ "model.layers.37.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
62
+ "model.layers.39.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
63
+ "model.layers.2.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
64
+ "model.layers.26.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
65
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
66
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
67
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
68
+ "model.layers.3.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
69
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
70
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
71
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
72
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
73
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
74
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
75
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
76
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
77
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
78
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
79
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
80
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
81
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
82
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
83
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
84
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
85
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
86
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
87
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
88
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
89
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
90
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
91
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
92
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
93
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
94
+ "model.layers.27.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
95
+ "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
96
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
97
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
98
+ "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
99
+ "model.layers.33.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
100
+ "model.layers.33.input_layernorm.weight": "model-00003-of-00003.safetensors",
101
+ "model.layers.33.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
102
+ "model.layers.34.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
103
+ "model.layers.34.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
104
+ "model.layers.36.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
105
+ "model.layers.37.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
106
+ "model.layers.37.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
107
+ "model.layers.39.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
108
+ "model.layers.3.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
109
+ "model.layers.27.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
110
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
111
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
112
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
113
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
114
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
115
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
116
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
117
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
118
+ "model.layers.4.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
119
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
120
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
121
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
122
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
123
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
124
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
125
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
126
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
127
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
128
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
129
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
130
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
131
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
132
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
133
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
134
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
135
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
136
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
137
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
138
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
139
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
140
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
141
+ "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
142
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
143
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
144
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
145
+ "model.layers.28.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
146
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
147
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors",
148
+ "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
149
+ "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
150
+ "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
151
+ "model.layers.32.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
152
+ "model.layers.33.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
153
+ "model.layers.33.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
154
+ "model.layers.35.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
155
+ "model.layers.37.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
156
+ "model.layers.37.input_layernorm.weight": "model-00003-of-00003.safetensors",
157
+ "model.layers.37.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
158
+ "model.layers.38.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
159
+ "model.layers.38.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
160
+ "model.layers.4.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
161
+ "model.layers.28.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
162
+ "model.layers.5.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
163
+ "model.layers.0.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
164
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
165
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
166
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
167
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
168
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
169
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
170
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
171
+ "model.layers.9.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
172
+ "model.layers.9.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
173
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
174
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
175
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
176
+ "model.layers.12.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
177
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
178
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
179
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
180
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
181
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
182
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
183
+ "model.layers.17.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
184
+ "model.layers.17.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
185
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
186
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
187
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
188
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
189
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
190
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
191
+ "model.layers.23.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
192
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
193
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
194
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
195
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
196
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
197
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
198
+ "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
199
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
200
+ "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
201
+ "model.layers.32.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
202
+ "model.layers.32.input_layernorm.weight": "model-00003-of-00003.safetensors",
203
+ "model.layers.32.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
204
+ "model.layers.33.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
205
+ "model.layers.33.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
206
+ "model.layers.33.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
207
+ "model.layers.33.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
208
+ "model.layers.35.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
209
+ "model.layers.36.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
210
+ "model.layers.36.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
211
+ "model.layers.38.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
212
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
213
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
214
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
215
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
216
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
217
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
218
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
219
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
220
+ "model.layers.5.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
221
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
222
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
223
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
224
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
225
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
226
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
227
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
228
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
229
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
230
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
231
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
232
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
233
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
234
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
235
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
236
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
237
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
238
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
239
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
240
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
241
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
242
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
243
+ "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
244
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
245
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors",
246
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
247
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
248
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
249
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
250
+ "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
251
+ "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
252
+ "model.layers.32.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
253
+ "model.layers.34.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
254
+ "model.layers.34.input_layernorm.weight": "model-00003-of-00003.safetensors",
255
+ "model.layers.34.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
256
+ "model.layers.35.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
257
+ "model.layers.35.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
258
+ "model.layers.37.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
259
+ "model.layers.38.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
260
+ "model.layers.38.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
261
+ "model.layers.6.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
262
+ "model.layers.30.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
263
+ "model.layers.6.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
264
+ "model.layers.30.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
265
+ "model.layers.7.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
266
+ "model.layers.31.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
267
+ "model.layers.7.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
268
+ "model.layers.31.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
269
+ "model.layers.8.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
270
+ "model.layers.32.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
271
+ "model.layers.2.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
272
+ "model.layers.14.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
273
+ "model.layers.14.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
274
+ "model.layers.22.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
275
+ "model.layers.22.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
276
+ "model.layers.38.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
277
+ "model.layers.38.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
278
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
279
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
280
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
281
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
282
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
283
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
284
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
285
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
286
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
287
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
288
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
289
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
290
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
291
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
292
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
293
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
294
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
295
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
296
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
297
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
298
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
299
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
300
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
301
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
302
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
303
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
304
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
305
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
306
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
307
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
308
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors",
309
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
310
+ "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
311
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
312
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
313
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
314
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
315
+ "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
316
+ "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
317
+ "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
318
+ "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
319
+ "model.layers.32.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
320
+ "model.layers.32.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
321
+ "model.layers.34.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
322
+ "model.layers.35.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
323
+ "model.layers.35.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
324
+ "model.layers.37.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
325
+ "model.layers.39.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
326
+ "model.layers.39.input_layernorm.weight": "model-00003-of-00003.safetensors",
327
+ "model.layers.39.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
328
+ "model.norm.weight": "model-00003-of-00003.safetensors",
329
+ "lm_head.weight": "model-00003-of-00003.safetensors",
330
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
331
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
332
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
333
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
334
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
335
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
336
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
337
+ "model.layers.8.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
338
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
339
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
340
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
341
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
342
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
343
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
344
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
345
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
346
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
347
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
348
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
349
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
350
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
351
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
352
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
353
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
354
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
355
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
356
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
357
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
358
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
359
+ "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
360
+ "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
361
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
362
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
363
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
364
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
365
+ "model.layers.32.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
366
+ "model.layers.33.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
367
+ "model.layers.34.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
368
+ "model.layers.34.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
369
+ "model.layers.36.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
370
+ "model.layers.38.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
371
+ "model.layers.38.input_layernorm.weight": "model-00003-of-00003.safetensors",
372
+ "model.layers.38.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
373
+ "model.layers.39.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
374
+ "model.layers.39.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
375
+ "model.layers.10.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
376
+ "model.layers.34.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
377
+ "model.layers.10.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
378
+ "model.layers.34.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
379
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
380
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
381
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
382
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
383
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
384
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
385
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
386
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
387
+ "model.layers.11.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
388
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
389
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors",
390
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
391
+ "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
392
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
393
+ "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
394
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
395
+ "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
396
+ "model.layers.33.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
397
+ "model.layers.35.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
398
+ "model.layers.35.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
399
+ "model.layers.35.input_layernorm.weight": "model-00003-of-00003.safetensors",
400
+ "model.layers.35.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
401
+ "model.layers.36.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
402
+ "model.layers.36.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
403
+ "model.layers.38.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
404
+ "model.layers.39.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
405
+ "model.layers.39.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
406
+ "model.layers.16.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
407
+ "model.layers.16.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
408
+ "model.layers.24.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
409
+ "model.layers.24.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
410
+ "model.layers.11.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
411
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
412
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
413
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
414
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
415
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
416
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
417
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
418
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
419
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
420
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
421
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
422
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
423
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
424
+ "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
425
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
426
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
427
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
428
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
429
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
430
+ "model.layers.35.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
431
+ "model.layers.12.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
432
+ "model.layers.36.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
433
+ "model.layers.36.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
434
+ "model.layers.0.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
435
+ "model.layers.15.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
436
+ "model.layers.20.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
437
+ "model.layers.20.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
438
+ "model.layers.25.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
439
+ "model.layers.25.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
440
+ "model.layers.15.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
441
+ "model.layers.39.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
442
+ "model.layers.39.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
443
+ "model.layers.18.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
444
+ "model.layers.18.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
445
+ "model.layers.23.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
446
+ "model.layers.19.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
447
+ "model.layers.19.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
448
+ "model.layers.26.self_attn.adaptive_phi": "model-00003-of-00003.safetensors"
449
+ }
450
+ }
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/modeling_evabyte.py ADDED
@@ -0,0 +1,912 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional, Tuple, Union
2
+ import math
3
+ import torch
4
+ import torch.nn.functional as F
5
+ import torch.utils.checkpoint
6
+ from torch import nn
7
+ from torch.nn import CrossEntropyLoss
8
+ from transformers.activations import ACT2FN
9
+ from transformers.cache_utils import Cache
10
+ from transformers.modeling_outputs import (
11
+ BaseModelOutputWithPast,
12
+ CausalLMOutputWithPast,
13
+ )
14
+ from transformers.modeling_utils import PreTrainedModel
15
+
16
+ from .configuration_evabyte import EvaByteConfig
17
+ from .multibyte_decoding_evabyte import MultiByteDecodingMixin
18
+ try:
19
+ import triton
20
+ USE_TRITON_IMPL = True
21
+ from .eva import EvaAttention
22
+ from .eva_agg_kernel import triton_eva_agg_fwd
23
+ from .eva_prep_kv_kernel import triton_eva_prep_kv_fwd
24
+ except ImportError:
25
+ USE_TRITON_IMPL = False
26
+ print("WARNING: triton is not installed, using fallback EVA which might be slow and throw errors")
27
+ from .eva_pt_ref import EvaAttention
28
+ from .eva_cache import EvaCache, EvaStaticCacheForTriton
29
+
30
# Large negative fill value for masked attention logits: effectively -inf while
# remaining finite in fp32. NOTE(review): not referenced anywhere in this chunk —
# presumably consumed by the attention implementations; confirm before removing.
MASK_MIN_VALUE = -10e10
31
+
32
def prepare_eva_attention_mask(
    seq_len,
    device,
    chunk_size,
    window_size,
    use_cache=False,
    cache=None
):
    """
    Prepare attention masks for EVA.

    Returns a pair of boolean masks (True presumably marks *masked* positions —
    confirm against the EvaAttention consumers):
      * ``chunk_causal_mask``: visibility of chunk-level summaries per query row,
        shaped [1, 1, rows, num_chunks] after the final reshape.
      * ``window_causal_mask``: strictly-causal mask within a single window,
        shaped [1, 1, 1, window_size, window_size].

    Args:
        seq_len: number of (new) query positions being processed.
        device: device on which masks are allocated.
        chunk_size: EVA chunk (summary) granularity.
        window_size: EVA window size; assumed a multiple of chunk_size.
        use_cache: when True, account for tokens already held in ``cache``.
        cache: decoding cache exposing ``get_seq_length()``; required if
            ``use_cache`` is True.
    """
    chunk_causal_mask = None
    window_causal_mask = None
    if use_cache:
        cached_seq_len = cache.get_seq_length()
        total_seq_len = seq_len + cached_seq_len
        # cached_seq_len will be 0 during prefilling.
        # Pad the total length up to a whole number of windows.
        # padded_seq_len = chunk_size * math.ceil(total_seq_len / chunk_size)
        padded_seq_len = window_size * math.ceil(total_seq_len / window_size)
        num_chunks = padded_seq_len // chunk_size
    else:
        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        assert seq_len % chunk_size == 0
        num_chunks = seq_len // chunk_size

        assert seq_len % window_size == 0

    # create causal mask
    ################################
    # generate chunked causal masks
    ################################
    # [b, h, j, c, c]
    chunks_per_window = window_size // chunk_size
    if num_chunks >= chunks_per_window:
        # Upper triangular over chunk indices: queries may not see summaries of
        # their own or later chunks.
        chunk_causal_mask = torch.ones(
            (chunk_size, num_chunks, num_chunks),
            device=device,
            dtype=torch.bool
        ).triu(0)

        num_blocks = num_chunks // chunks_per_window
        # Regroup chunks into window-sized blocks so the block diagonal (chunks
        # inside the query's own window) can be masked wholesale below.
        chunk_causal_mask = chunk_causal_mask.reshape(
            chunk_size,
            num_blocks,
            chunks_per_window,
            num_blocks,
            chunks_per_window
        ).transpose(-2, -3)

        block_diag_zero = (
            torch.eye(num_blocks, device=device, dtype=torch.bool)
            .unsqueeze(-1)
            .unsqueeze(-1)
            .unsqueeze(0)
        )

        # Mask out (set to True) all block-diagonal entries: intra-window chunk
        # summaries are presumably handled by the window-attention path instead.
        chunk_causal_mask = chunk_causal_mask.masked_fill(block_diag_zero, True)

        # Reshape back to original size
        chunk_causal_mask = (
            chunk_causal_mask
            .transpose(-2, -3)
            .reshape(chunk_size, num_chunks, num_chunks)
            .transpose(-2, -3)
            .reshape(chunk_size * num_chunks, num_chunks)
            .unsqueeze(0)
            .unsqueeze(0)
        )
    else:
        # Fewer chunks than one window: no whole window precedes any query, so a
        # plain causal pattern over chunks suffices.
        chunk_causal_mask = torch.ones(
            (1, 1, chunk_size, num_chunks, num_chunks),
            device=device,
            dtype=torch.bool,
        ).triu(0).transpose(-2, -3)  # [1, 1, c, j, c]
        chunk_causal_mask = chunk_causal_mask.reshape(
            1, 1, chunk_size * num_chunks, num_chunks
        )  # [1, 1, n, c]

    if use_cache:
        # Keep only rows corresponding to the new (uncached) query positions.
        chunk_causal_mask = chunk_causal_mask[..., cached_seq_len : cached_seq_len + seq_len, :]

    # Strictly-causal mask inside one window (True above the diagonal).
    window_causal_mask = torch.ones(
        (1, 1, 1, window_size, window_size),
        device=device
    ).triu(1).to(torch.bool)
    return (chunk_causal_mask, window_causal_mask)
120
+
121
def pad_to_multiple(tensor, multiple, dim=-2, value=0, create_mask=False, left_padding=False):
    """Pad `tensor` along `dim` (negative index only) up to the next multiple of `multiple`.

    Args:
        tensor: input tensor; dim 0 is assumed to be the batch dimension.
        multiple: target divisor for the padded length.
        dim: axis to pad, given as a negative index.
        value: fill value for the padded entries.
        create_mask: when True, also return a [batch, padded_len] bool mask that is
            True on real positions and False on padding.
        left_padding: pad at the front of `dim` instead of the back.

    Returns:
        The (possibly unchanged) tensor, or a `(tensor, mask)` pair when
        `create_mask` is True.
    """
    assert dim < 0  # only accept ``dim'' index in a reverse manner
    length = int(tensor.shape[dim])
    ratio = length / multiple
    if ratio.is_integer():
        # Already aligned: nothing to pad.
        if not create_mask:
            return tensor
        all_valid = torch.ones(
            size=(tensor.shape[0], tensor.shape[dim]), dtype=torch.bool, device=tensor.device
        )
        return tensor, all_valid
    pad_amount = math.ceil(ratio) * multiple - length
    # F.pad consumes (before, after) pairs starting from the LAST dim, so emit
    # zero pairs for every dim after the target one.
    trailing_pairs = (0, 0) * (-1 - dim)
    if left_padding:
        padded = F.pad(tensor, (*trailing_pairs, pad_amount, 0), value=value)
    else:
        padded = F.pad(tensor, (*trailing_pairs, 0, pad_amount), value=value)
    if not create_mask:
        return padded
    # assume dim 0 is the batch size
    mask = torch.ones(
        size=(padded.shape[0], padded.shape[dim]), dtype=torch.bool, device=padded.device
    )
    if left_padding:
        mask[:, :pad_amount] = False
    else:
        mask[:, -pad_amount:] = False
    return padded, mask
146
+
147
class EvaByteRMSNorm(nn.Module):
    """RMS normalization with an optional "unit offset" parameterization.

    When ``config.norm_add_unit_offset`` is set, the scale is stored as a
    zero-initialized delta and applied as ``(1 + weight)``; otherwise the scale
    is a one-initialized weight applied directly.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.fp32_ln = True
        self.variance_epsilon = config.rms_norm_eps
        self.add_unit_offset = config.norm_add_unit_offset
        init_fn = torch.zeros if self.add_unit_offset else torch.ones
        self.weight = nn.Parameter(init_fn(config.hidden_size))

    def forward(self, hidden_states):
        # Normalize in fp32 (or bf16 if fp32_ln is disabled) for stability,
        # then cast back to the input dtype.
        compute_dtype = torch.float32 if self.fp32_ln else torch.bfloat16
        x = hidden_states.to(compute_dtype)
        inv_rms = torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.variance_epsilon)
        x = x * inv_rms
        scale = (1 + self.weight) if self.add_unit_offset else self.weight
        return (scale * x).type_as(hidden_states)
168
+
169
class EvaByteRotaryEmbedding(torch.nn.Module):
    """Caches cos/sin tables for rotary position embeddings.

    The table covers ``max_position_embeddings`` positions up front and is
    regrown lazily whenever a longer sequence is requested.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        exponents = torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim
        self.register_buffer("inv_freq", 1.0 / (self.base ** exponents), persistent=False)

        # Build the initial cache up to the configured maximum length.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings,
            device=self.inv_freq.device,
            dtype=torch.get_default_dtype(),
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """(Re)build cached cos/sin tables covering positions [0, seq_len)."""
        self.max_seq_len_cached = seq_len
        positions = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
        angles = torch.outer(positions, self.inv_freq)
        # Duplicate along the last dim so the table spans the full head size.
        table = torch.cat((angles, angles), dim=-1)
        self.register_buffer("cos_cached", table.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", table.sin().to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        # x: [bs, num_attention_heads, seq_len, head_size]; only its dtype and
        # device are consulted here.
        if seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        if seq_len < self.max_seq_len_cached:
            cos_out = self.cos_cached[:seq_len]
            sin_out = self.sin_cached[:seq_len]
        else:
            cos_out = self.cos_cached
            sin_out = self.sin_cached

        return (
            cos_out.to(dtype=x.dtype),
            sin_out.to(dtype=x.dtype),
        )
213
+
214
+
215
+
216
class EvaByteLinearScalingRotaryEmbedding(EvaByteRotaryEmbedding):
    """EvaByteRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        # Must be set before super().__init__ triggers the first cache build.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """Rebuild the cos/sin cache with positions shrunk by `scaling_factor`."""
        self.max_seq_len_cached = seq_len
        positions = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
        positions = positions / self.scaling_factor

        angles = torch.outer(positions, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        table = torch.cat((angles, angles), dim=-1)
        self.register_buffer("cos_cached", table.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", table.sin().to(dtype), persistent=False)
232
+
233
+
234
class EvaByteDynamicNTKScalingRotaryEmbedding(EvaByteRotaryEmbedding):
    """EvaByteRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        # Must be set before super().__init__ triggers the first cache build.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """Rebuild the cos/sin cache, rescaling the base frequency when the
        requested length exceeds the trained context window."""
        self.max_seq_len_cached = seq_len

        if seq_len > self.max_position_embeddings:
            # NTK-aware base rescaling; note inv_freq is overwritten in place and,
            # matching the original code, is not restored for shorter sequences.
            ntk_ratio = (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
            rescaled_base = self.base * ntk_ratio ** (self.dim / (self.dim - 2))
            inv_freq = 1.0 / (rescaled_base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
            self.register_buffer("inv_freq", inv_freq, persistent=False)

        positions = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)

        angles = torch.outer(positions, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        table = torch.cat((angles, angles), dim=-1)
        self.register_buffer("cos_cached", table.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", table.sin().to(dtype), persistent=False)
256
+
257
+
258
class EvaByteMLP(nn.Module):
    """Gated feed-forward block: ``down(act(gate(x)) * up(x))`` (SwiGLU-style)."""

    def __init__(self, config, layer_idx: int = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        # Three bias-free projections of the gated MLP.
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        """Apply the gated feed-forward transform; output has the same hidden size as `x`."""
        gated = self.act_fn(self.gate_proj(x)) * self.up_proj(x)
        return self.down_proj(gated)
273
+
274
class EvaByteDecoderLayer(nn.Module):
    """One EvaByte transformer block: EVA self-attention then a gated MLP, each
    in a pre-RMSNorm residual branch (residual optionally accumulated in fp32)."""

    def __init__(self, config: EvaByteConfig, layer_idx: int = None):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.self_attn = EvaAttention(config=config, layer_idx=layer_idx)
        self.mlp = EvaByteMLP(config, layer_idx=layer_idx)
        self.input_layernorm = EvaByteRMSNorm(config)
        self.post_attention_layernorm = EvaByteRMSNorm(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cos: Optional[torch.Tensor] = None,
        sin: Optional[torch.Tensor] = None,
        multibyte_decoding: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """Returns (hidden_states[, attn_weights][, present_key_value])."""
        # --- self-attention sub-layer (pre-norm residual) ---
        skip = hidden_states.float() if self.config.fp32_skip_add else hidden_states
        normed = self.input_layernorm(hidden_states)
        attn_out, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=normed,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cos=cos,
            sin=sin,
            multibyte_decoding=multibyte_decoding,
        )
        # Cast the (possibly fp32) residual sum back to the branch dtype.
        hidden_states = (skip + attn_out).to(attn_out.dtype)

        # --- feed-forward sub-layer (pre-norm residual) ---
        skip = hidden_states.float() if self.config.fp32_skip_add else hidden_states
        mlp_out = self.mlp(self.post_attention_layernorm(hidden_states))
        hidden_states = (skip + mlp_out).to(mlp_out.dtype)

        outputs = (hidden_states, )
        if output_attentions:
            outputs += (self_attn_weights, )
        if use_cache:
            outputs += (present_key_value, )
        return outputs
330
+
331
class EvaBytePreTrainedModel(PreTrainedModel):
    """Shared PreTrainedModel base: config class, weight init, and
    gradient-checkpointing toggling for EvaByte models."""

    config_class = EvaByteConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["EvaByteDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"

    def _init_weights(self, module):
        """Normal(0, initializer_range) init for Linear/Embedding weights;
        zero biases and zero the padding embedding row."""
        std = getattr(self.config, "initializer_range", 0.02)
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()

    def _set_gradient_checkpointing(self, module, value=False):
        # Only the decoder stack (EvaByteModel) owns the checkpointing flag.
        if isinstance(module, EvaByteModel):
            module.gradient_checkpointing = value
352
+
353
class EvaByteModel(EvaBytePreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`EvaByteDecoderLayer`]

    Args:
        config: EvaByteConfig
    """
    def __init__(self, config: EvaByteConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.max_position_embeddings = self.config.max_position_embeddings

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList([EvaByteDecoderLayer(config, layer_idx=layer_idx) for layer_idx in range(config.num_hidden_layers)])
        self.norm = EvaByteRMSNorm(config)

        self.gradient_checkpointing = False
        # RoPE base frequency (theta), used by _init_rope below.
        self.rope = config.rope_theta
        # Initialize weights and apply final processing
        self.post_init()
        self._init_rope()

    def _init_rope(self):
        # Choose the rotary-embedding variant from config.rope_scaling:
        # None -> vanilla; "linear" / "dynamic" -> the corresponding scaled variant.
        if self.config.rope_scaling is None:
            self.rotary_emb = EvaByteRotaryEmbedding(self.head_dim,
                                                     max_position_embeddings=self.max_position_embeddings,
                                                     base=self.rope)
        else:
            scaling_type = self.config.rope_scaling["type"]
            scaling_factor = self.config.rope_scaling["factor"]
            if scaling_type == "linear":
                self.rotary_emb = EvaByteLinearScalingRotaryEmbedding(
                    self.head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope)
            elif scaling_type == "dynamic":
                self.rotary_emb = EvaByteDynamicNTKScalingRotaryEmbedding(
                    self.head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope)
            else:
                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def _helper_padding_mask(
        self,
        padding_mask,
        causal_mask
    ):
        # Symmetrize the padding mask (a padded row masks both its queries and
        # its keys), then OR it into the causal mask. True = masked.
        padding_mask = torch.logical_or(padding_mask, padding_mask.transpose(-1, -2))
        return torch.logical_or(padding_mask, causal_mask)

    def _prepare_eva_generation_attn_mask_triton(
        self,
        attention_mask,
        input_ids,
        use_cache,
        past_key_values
    ):
        """Build masks consumed by the Triton EVA kernels.

        NOTE(review): the decode branch returns a 3-tuple
        (cur_s_mask, cur_rf_mask, rfa_chunks_dummy_mask) while the prefill path
        returns a 2-tuple (s_mask, rf_mask) or (None, None) — presumably the
        Triton EvaAttention consumer handles both arities; confirm.
        """
        batch_size, seq_len = input_ids.shape
        if use_cache and past_key_values.get_seq_length() > 0:
            # decoding phase
            if past_key_values.rf_mask[0] is not None:
                cur_rf_mask = torch.zeros(
                    (batch_size, 1, seq_len, 1),
                    dtype=past_key_values.rf_mask[0].dtype,
                    device=past_key_values.rf_mask[0].device
                )
            else:
                cur_rf_mask = None

            if past_key_values.s_mask[0] is not None:
                cur_s_mask = torch.zeros(
                    (batch_size, 1, seq_len, 1),
                    dtype=past_key_values.s_mask[0].dtype,
                    device=past_key_values.s_mask[0].device
                )
            else:
                cur_s_mask = None

            seen_tokens = past_key_values.get_seq_length()
            if seen_tokens <= self.config.window_size:
                # Still inside the first window: no chunk-level summaries exist yet.
                rfa_chunks_dummy_mask = None
            else:
                if cur_s_mask is not None:
                    chunks_per_window = int(self.config.window_size // self.config.chunk_size)
                    # the ongoing decoding step would be (seen_seq_len + 1)-th token
                    num_windows_seen_so_far = seen_tokens // self.config.window_size
                    rfa_chunks_dummy_mask = torch.zeros(
                        (batch_size, 1, seq_len, num_windows_seen_so_far * chunks_per_window),
                        dtype=past_key_values.s_mask[0].dtype,
                        device=past_key_values.s_mask[0].device
                    )
                else:
                    rfa_chunks_dummy_mask = None
            # rf_mask and cur_mask are 0s because we do not want to mask them
            return (cur_s_mask, cur_rf_mask, rfa_chunks_dummy_mask)

        # Prefill phase: only build explicit masks when real padding is present.
        if attention_mask is not None and torch.any(attention_mask == 0.0):
            # Pad the HF-style mask (1 = keep, 0 = padding) to a whole number of windows.
            padded_attention_mask = pad_to_multiple(
                attention_mask,
                self.config.window_size,
                dim=-1,
                value=0,
                create_mask=False,
                left_padding=False
            )
            # convert 0 -> padding to 1 -> padding
            padded_rf_mask = ~padded_attention_mask.unsqueeze(1).unsqueeze(-1).to(torch.bool) # [b, 1, n, 1]
            # [b, 1, w, j, 1]
            padded_w_attn_mask = padded_rf_mask.reshape(batch_size, 1, -1, self.config.window_size, 1).to(torch.bool)
            # [b, 1, w, j, 1] [b, 1, w, 1, j] -> [b, 1, w, j, j]
            w_padding_mask = torch.logical_or(padded_w_attn_mask, padded_w_attn_mask.transpose(-1, -2))
            w_causal_mask = torch.ones(
                (1, 1, 1, self.config.window_size, self.config.window_size),
                device=input_ids.device
            ).triu(1).to(torch.bool)
            s_mask = torch.logical_or(w_padding_mask, w_causal_mask)
            s_mask = s_mask.reshape(batch_size, 1, -1, self.config.window_size)
            s_mask = s_mask[..., :seq_len, :]
            # negate the attention mask to get the padding mask
            rf_mask = ~attention_mask.unsqueeze(1).unsqueeze(-1).to(torch.bool) # [b, 1, n, 1]
            return (s_mask, rf_mask)
        else:
            return (None, None)

    def _prepare_eva_generation_attn_mask(
        self,
        attention_mask,
        input_ids,
        use_cache,
        past_key_values
    ):
        """Build masks for the PyTorch-reference EVA attention.

        Returns (prev_window_mask, cur_window_mask, chunk_causal_mask, rf_mask);
        True = masked throughout.
        """
        batch_size, seq_len = input_ids.shape
        if use_cache and past_key_values.get_seq_length() > 0:
            # decoding phase
            if past_key_values.rf_mask[0] is not None:
                rf_mask = torch.zeros(
                    (batch_size, 1, seq_len, 1),
                    dtype=past_key_values.rf_mask[0].dtype,
                    device=past_key_values.rf_mask[0].device
                )
            else:
                rf_mask = None

            cur_causal_mask = torch.zeros(
                (batch_size, 1, seq_len, 1),
                dtype=torch.bool,
                device=input_ids.device
            )

            chunk_causal_mask = torch.ones(
                (batch_size, 1, seq_len, 1),
                dtype=torch.bool,
                device=input_ids.device
            )
            # chunk_causal_mask are 1s because we will mask them by default and
            # will be unmasked when the current singleton attention is processed over
            return (None, cur_causal_mask, chunk_causal_mask, rf_mask)

        # Prefill: derive the chunk-level causal mask, then per-window masks.
        true_num_chunks = seq_len // self.config.chunk_size
        chunk_causal_mask, _ = prepare_eva_attention_mask(
            seq_len,
            input_ids.device,
            self.config.chunk_size,
            self.config.window_size,
            use_cache=use_cache,
            cache=past_key_values
        )
        chunk_causal_mask = chunk_causal_mask[..., :seq_len, :true_num_chunks]
        if attention_mask is not None and torch.any(attention_mask == 0.0):
            # convert 0 -> padding to 1 -> padding
            rf_mask = ~attention_mask.unsqueeze(1).unsqueeze(-1).to(torch.bool) # [b, 1, n, 1]
        else:
            rf_mask = None

        if seq_len < self.config.window_size:
            # Whole sequence fits in one (partial) window.
            cur_window_mask = torch.ones(
                (1, 1, seq_len, seq_len),
                device=input_ids.device
            ).triu(1).to(torch.bool)
            if rf_mask is not None:
                cur_window_mask = self._helper_padding_mask(rf_mask, cur_window_mask)
            prev_window_mask = None
        else:
            if seq_len % self.config.window_size == 0:
                # Exact number of full windows; no trailing partial window.
                num_windows = seq_len // self.config.window_size
                cur_window_mask = None
                prev_window_mask = torch.ones(
                    (1, 1, num_windows, self.config.window_size, self.config.window_size),
                    device=input_ids.device
                ).triu(1).to(torch.bool)
                if rf_mask is not None:
                    prev_rf_mask = rf_mask.reshape(batch_size, 1, -1, self.config.window_size, 1)
                    prev_window_mask = self._helper_padding_mask(prev_rf_mask, prev_window_mask)
            else:
                # Full windows plus a trailing partial window of remainder_tokens.
                num_windows = seq_len // self.config.window_size
                remainder_tokens = seq_len % self.config.window_size
                cur_window_mask = torch.ones(
                    (1, 1, remainder_tokens, remainder_tokens),
                    device=input_ids.device
                ).triu(1).to(torch.bool)
                prev_window_mask = torch.ones(
                    (1, 1, num_windows, self.config.window_size, self.config.window_size),
                    device=input_ids.device
                ).triu(1).to(torch.bool)
                if rf_mask is not None:
                    prev_rf_mask, cur_rf_mask = torch.split(rf_mask, [seq_len - remainder_tokens, remainder_tokens], dim=-2)
                    cur_window_mask = self._helper_padding_mask(cur_rf_mask, cur_window_mask)
                    prev_rf_mask = prev_rf_mask.reshape(batch_size, 1, -1, self.config.window_size, 1)
                    prev_window_mask = self._helper_padding_mask(prev_rf_mask, prev_window_mask)

        return (prev_window_mask, cur_window_mask, chunk_causal_mask, rf_mask)

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        multibyte_decoding: Optional[bool] = None,
    ) -> Tuple:
        """Run the decoder stack.

        NOTE(review): `input_ids.shape` is read unconditionally below, so the
        embeds-only calling convention (input_ids=None, inputs_embeds given)
        would crash despite passing the XOR check — confirm callers always
        supply input_ids.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (output_hidden_states
                                if output_hidden_states is not None else self.config.output_hidden_states)
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
            )

        if self.gradient_checkpointing and self.training and use_cache:
            raise ValueError("`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...")

        batch_size, seq_len = input_ids.shape
        #### Step 0. Hack
        if (not self.training) and (not use_cache) and (not multibyte_decoding):
            # forward-only inference mode.
            # We tweak use_cache to be True to reuse code for generation
            use_cache = True
            device = input_ids.device if input_ids is not None else None
            if position_ids is None:
                position_ids = torch.arange(0, seq_len, device=device, dtype=int).reshape(1, -1).expand(batch_size, -1)

        #### Step 1. Prepare caches if in inference mode
        if use_cache:
            if past_key_values is not None:
                assert isinstance(past_key_values, Cache)
            else:
                # Fresh cache: static Triton-backed cache when kernels are
                # available, otherwise the PyTorch reference cache.
                if not USE_TRITON_IMPL:
                    past_key_values = EvaCache()
                else:
                    past_key_values = EvaStaticCacheForTriton(
                        input_ids.shape[0],
                        self.config.num_attention_heads,
                        self.config.window_size,
                        self.config.hidden_size // self.config.num_attention_heads,
                        self.config.num_hidden_layers,
                        self.embed_tokens.weight.dtype,
                        self.embed_tokens.weight.device,
                    )

        #### Step 2. Build attention masks (skipped for multibyte decoding,
        # which supplies its own mask via `attention_mask`).
        if not multibyte_decoding:
            if use_cache:
                if USE_TRITON_IMPL:
                    causal_mask = self._prepare_eva_generation_attn_mask_triton(
                        attention_mask,
                        input_ids,
                        use_cache,
                        past_key_values
                    )
                else:
                    causal_mask = self._prepare_eva_generation_attn_mask(
                        attention_mask,
                        input_ids,
                        use_cache,
                        past_key_values
                    )
            else:
                assert self.training
                assert seq_len % self.config.window_size == 0, "Training is only tested for sequences that are a multiple of window_size"
                # for training, we need to pass in the attention mask
                # usually calculated by _prepare_training_attn_mask()
                causal_mask = attention_mask
        else:
            assert use_cache
            causal_mask = attention_mask

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        max_seq_length = past_seen_tokens + inputs_embeds.shape[1]

        hidden_states = inputs_embeds

        if position_ids is None:
            assert not use_cache, "during decoding we must explicitly pass position_ids to the model call"
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            position_ids = torch.arange(past_seen_tokens, max_seq_length, device=device, dtype=int).reshape(1, -1).expand(batch_size, -1)

        #### Step 3. Gather per-position RoPE tables once, shared by all layers.
        cos, sin = self.rotary_emb(hidden_states, seq_len=max_seq_length)
        assert len(cos.shape) == 2, f"cos should be of shape (max_seq_len, head_dim), got {cos.shape} instead"
        assert sin.shape == cos.shape, f"sin should be of shape (max_seq_len, head_dim), got {sin.shape} instead"
        assert len(position_ids.shape) == 2, f"position_ids should be of 2D, got {position_ids.shape} instead"
        cos = cos[position_ids, :]
        sin = sin[position_ids, :]
        # Insert a broadcast head dim: [b, 1, s, head_dim].
        cos = cos.unsqueeze(1)
        sin = sin.unsqueeze(1)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states, )

            if self.gradient_checkpointing and self.training:
                layer_outputs = torch.utils.checkpoint.checkpoint(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cos,
                    sin,
                    multibyte_decoding,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cos=cos,
                    sin=sin,
                    multibyte_decoding=multibyte_decoding,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                # Cache position in the layer output shifts by one when
                # attention weights are also returned.
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1], )

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states, )

        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
741
+
742
+
743
class EvaByteForCausalLM(EvaBytePreTrainedModel, MultiByteDecodingMixin):
    """Causal byte-level LM: EvaByteModel decoder plus an LM head that may emit
    `num_pred_heads` byte predictions per position (multibyte decoding)."""

    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        EvaBytePreTrainedModel.__init__(self, config)

        self.model = EvaByteModel(config)
        self.vocab_size = config.vocab_size
        # define multibyte prediction heads
        if hasattr(config, "num_pred_heads") and config.num_pred_heads > 1:
            # Single fused projection; reshaped to per-head logits in forward().
            self.lm_head = nn.Linear(config.hidden_size, config.vocab_size * config.num_pred_heads, bias=False)
        else:
            self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        return_all_pred_logits: Optional[bool] = None,
        multibyte_decoding: Optional[bool] = None) -> Union[Tuple, CausalLMOutputWithPast]:
        """Compute logits (and an unreduced per-token loss when `labels` given).

        NOTE(review): the loss does NOT shift logits/labels despite the
        `shift_*` variable names — labels are presumably pre-aligned by the
        caller (as is typical for multibyte heads); confirm against training
        code. `reduction="none"` means `loss` is a per-token vector.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (output_hidden_states
                                if output_hidden_states is not None else self.config.output_hidden_states)
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is None:
            assert past_key_values is None

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            multibyte_decoding=multibyte_decoding,
        )

        hidden_states = outputs[0]

        logits = self.lm_head(hidden_states)
        if self.config.fp32_logits:
            logits = logits.float()

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss(reduction="none")
            if hasattr(self.config, "num_pred_heads") and self.config.num_pred_heads > 1:
                # Unfold the fused head dim, then flatten all heads/positions.
                shift_logits = logits.view(logits.shape[0], logits.shape[1], self.config.num_pred_heads, self.config.vocab_size)
                # shift_logits = shift_logits.view(-1, logits.shape[1] * self.config.num_pred_heads, self.config.vocab_size)
                shift_logits = shift_logits.view(-1, self.config.vocab_size)
            else:
                shift_logits = logits.view(-1, self.config.vocab_size)
            shift_labels = labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if hasattr(self.config, "num_pred_heads") and self.config.num_pred_heads > 1:
            # [b, s, heads, vocab]; by default expose only head 0 (next byte).
            all_pred_logits = logits.reshape(logits.shape[0], logits.shape[1], self.config.num_pred_heads, self.config.vocab_size)

            if return_all_pred_logits:
                logits = all_pred_logits
            else:
                logits = all_pred_logits[..., 0, :]

        if not return_dict:
            output = (logits, ) + outputs[1:]
            return (loss, ) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


    def prepare_inputs_for_generation(self,
                                      input_ids,
                                      past_key_values=None,
                                      attention_mask=None,
                                      inputs_embeds=None,
                                      use_cache=True,
                                      **kwargs):
        """Slice inputs and derive position_ids for each generation step.

        Shapes per phase:
          prefill:   input_ids b x s; attention_mask None or b x s; position_ids b x s
          token gen: input_ids b x 1; attention_mask b x 1 x s; position_ids b x 1
        """
        past_length = 0
        if past_key_values is not None:
            assert isinstance(past_key_values, Cache)
            past_length = past_key_values.get_seq_length()

            # Keep only the tokens not yet consumed by the cache.
            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length):]
            elif past_length < input_ids.shape[1]:
                input_ids = input_ids[:, past_length:]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # Derive positions from the padding mask (padded slots pinned to 1).
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1]:]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        # must initialize position_ids at each step during GPU inference
        assert position_ids is not None
        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,
            }
        )
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        # Beam-search reordering.
        # NOTE(review): iterates legacy tuple-of-tuples caches, while the rest
        # of this file asserts `Cache` objects — presumably only reachable with
        # legacy caches; confirm before relying on beam search.
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (tuple(
                past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), )
        return reordered_past
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/multibyte_decoding_evabyte.py ADDED
@@ -0,0 +1,881 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
# The implementation of multibyte decoding is largely adapted from
3
+ # Medusa decoding: https://github.com/FasterDecoding/Medusa
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from transformers.generation.stopping_criteria import (
7
+ MaxLengthCriteria,
8
+ StoppingCriteriaList,
9
+ )
10
+ from typing import Union, List
11
+ from .eva_cache import EvaStaticCacheForTriton
12
+ from .eva_prep_kv_kernel import triton_eva_prep_kv_fwd
13
+
14
class MultibyteEosTokenCriteria:
    """
    Stopping criterion that halts generation as soon as any configured
    end-of-sequence token shows up among the last `new_tokens` tokens.

    Adapted from
    https://github.com/huggingface/transformers/blob/main/src/transformers/generation/stopping_criteria.py#L446
    By default, it uses the `model.generation_config.eos_token_id`.

    Args:
        eos_token_ids (`Union[int, List[int]]`):
            The id(s) of the *end-of-sequence* token.
    """

    def __init__(self, eos_token_ids: Union[int, List[int]]):
        # Normalize a single id into a list so __call__ can iterate uniformly.
        self.eos_token_ids = [eos_token_ids] if isinstance(eos_token_ids, int) else eos_token_ids

    def __call__(self, input_ids: torch.LongTensor, new_tokens: int) -> bool:
        # Only the trailing `new_tokens` positions are inspected.
        tail = input_ids[:, input_ids.shape[-1] - new_tokens:]
        return any(bool(torch.any(tail == eos_id)) for eos_id in self.eos_token_ids)
40
+
41
def build_tree(spec):
    """Expand a per-depth children spec into a flat list of tree paths.

    ``spec[d][i]`` is the number of children of the i-th node at depth ``d``
    (nodes without an entry get zero children). Each node is encoded as the
    tuple of child indices on the path from the root; the root itself (the
    empty tuple) is excluded from the result. Nodes are listed depth by
    depth, preserving parent order within each depth.
    """
    levels = [[()]]  # depth 0 holds only the root
    for spec_list in spec:
        children = []
        for parent_idx, parent in enumerate(levels[-1]):
            fanout = spec_list[parent_idx] if parent_idx < len(spec_list) else 0
            children.extend(parent + (k,) for k in range(fanout))
        levels.append(children)
    return [path for level in levels for path in level if path]
62
+
63
# Pre-built draft trees (flat lists of root-relative paths) for the
# multibyte speculative-decoding heads; constructed once at import time.
# 95-node tree: wider/deeper, more verification compute per step.
evabyte_7b_95 = build_tree(
    [
        [10],
        [10, 8, 2, 2, 1, 1],
        [10, 4, 2, 1, 0, 0, 0, 0, 0, 0, 2, 1, 1, 0, 0, 0, 0, 0, 1],
        [8, 2, 2, 1, 0, 0, 0, 0, 0, 0, 1],
        [6, 2, 1, 1],
        [4, 2, 1, 1],
        [4, 2, 1],
    ]
)
# 31-node tree: a smaller, cheaper alternative.
evabyte_7b_31 = build_tree(
    [
        [4],
        [3, 2, 1, 1],
        [3, 2, 1, 1],
        [2, 1, 1],
        [2, 1],
        [2, 1],
        [2, 1],
    ]
)
TOPK = 10 # topk for sparse tree (10 is a placeholder and it is sufficient)
86
+
87
def pad_path(path, length, pad_value=-2):
    """Right-pad `path` with `pad_value` until it holds `length` entries.

    A new list is returned; the input is never mutated. If `path` is
    already at least `length` long it is returned unchanged in content
    (no truncation happens).

    Example:
        >>> pad_path([1, 2, 3], 5)
        [1, 2, 3, -2, -2]
    """
    shortfall = length - len(path)
    return path + [pad_value] * shortfall
108
+
109
def reset_past_key_values(passed_key_values):
    """Zero out the current-length counters of a legacy KV cache, in place.

    Intended for baseline-model evaluation: for every layer, both the key
    entry and the value entry have their ``current_length`` tensor filled
    with zero so the cache can be reused from a clean state.

    Args:
        passed_key_values: per-layer key/value pairs, each element carrying
            a ``current_length`` tensor.

    Returns:
        The same cache object with all lengths reset.
    """
    for layer_cache in passed_key_values:
        for kv in (layer_cache[0], layer_cache[1]):
            kv.current_length.fill_(0)
    return passed_key_values
127
+
128
def get_nucleus_one_token(logit, temperature, top_p):
    """
    Performs token sampling based on the nucleus (top-p) sampling method.

    This function selects a token from a given logit distribution using the nucleus sampling strategy.
    It allows for more controlled and diverse generation compared to traditional top-k sampling.

    Args:
        logit (torch.Tensor): The logits from a language model output, expected to be a 2D tensor (BxC).
        temperature (float): A temperature parameter to control the randomness in sampling.
            Higher values increase diversity, lower values make selections more deterministic.
        top_p (float): The cumulative probability threshold for nucleus sampling.
            It controls the size of the set of high-probability tokens to consider for sampling.

    Returns:
        torch.Tensor: A tensor containing the indices of the sampled tokens.
    """
    if top_p >= 1:
        # Degenerate case: the nucleus covers the whole vocabulary, so plain
        # temperature sampling suffices.
        return torch.multinomial(F.softmax(logit / temperature, dim=-1), 1)
    logit = logit / temperature
    probs = torch.softmax(logit, dim=-1)
    # NOTE: despite the name, `sorted_logits` holds sorted *probabilities*.
    sorted_logits, sorted_indices = torch.sort(probs, descending=True)
    cum_probs = torch.cumsum(sorted_logits, dim=-1)
    # Mask tokens outside the nucleus; the shift-by-one keeps the first token
    # that crosses the top-p threshold inside the nucleus.
    sorted_indices_to_remove = cum_probs > top_p
    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
    sorted_indices_to_remove[..., 0] = 0
    # Map the removal mask back from sorted order to vocabulary order.
    indices_to_remove = sorted_indices_to_remove.scatter(dim=1, index=sorted_indices, src=sorted_indices_to_remove)
    logit[indices_to_remove] = float('-inf')
    sampled_tokens = torch.multinomial(F.softmax(logit, dim=-1), 1)
    return sampled_tokens
158
+
159
def get_typical_one_token(logit, temperature, posterior_threshold, posterior_alpha):
    """
    Implements token sampling based on the typical sampling method.

    This function selects a token from a given logit distribution using the typical sampling strategy,
    aiming to balance between diversity and likelihood in a more nuanced way compared to traditional methods.

    Args:
        logit (torch.Tensor): The logits from a language model output, expected to be a 2D tensor.
        temperature (float): A parameter to control the randomness in sampling.
            Higher values increase diversity, lower values make selections more deterministic.
        posterior_threshold (float): A threshold to decide the lower bound of probabilities to be considered for sampling.
        posterior_alpha (float): A scaling factor applied to the entropy-based adaptive threshold.

    Returns:
        torch.Tensor: A tensor containing the indices of the sampled tokens.
    """
    logit = logit / temperature
    probs = torch.softmax(logit, dim=-1)
    # Shannon entropy of the scaled distribution (1e-5 avoids log(0)).
    entropy = -torch.sum(
        probs * torch.log(probs + 1e-5), dim=-1
    )
    # Adaptive per-row cutoff: the fixed floor, capped by an entropy-based
    # bound so low-entropy (confident) rows get a tighter cutoff.
    threshold = torch.minimum(
        torch.ones_like(entropy) * posterior_threshold,
        torch.exp(-entropy) * posterior_alpha,
    )
    # Drop tokens whose probability falls below the adaptive threshold.
    indices_to_remove = probs < threshold.unsqueeze(-1)
    logit[indices_to_remove] = float('-inf')
    sampled_tokens = torch.multinomial(F.softmax(logit, dim=-1), 1)
    return sampled_tokens
189
+
190
+
191
+
192
def generate_medusa_buffers(medusa_choices, device="cuda"):
    """
    Generate buffers for the Medusa structure based on the provided choices.

    Parameters:
    - medusa_choices (list): A nested list representing tree in the Medusa structure.
    - device (str): Device to which the tensors should be moved. Default is "cuda".

    Returns:
    - dict: A dictionary containing buffers related to the Medusa structure.
    """

    # Sort the medusa_choices based on their lengths and then their values
    sorted_medusa_choices = sorted(medusa_choices, key=lambda x: (len(x), x))
    # +1 accounts for the root token at flat position 0
    medusa_len = len(sorted_medusa_choices) + 1

    # Initialize depth_counts to keep track of how many choices have a particular depth
    depth_counts = [0] * max([len(path) for path in sorted_medusa_choices])
    for path in sorted_medusa_choices:
        depth_counts[len(path) - 1] += 1

    # Create the attention mask for Medusa: each node may attend only to
    # itself (the eye), the root (column 0), and its ancestors.
    medusa_attn_mask = torch.eye(medusa_len, medusa_len)
    medusa_attn_mask[:, 0] = 1
    start = 0
    for i in range(len(depth_counts)):
        for j in range(depth_counts[i]):
            cur_medusa_choice = sorted_medusa_choices[start + j]
            # retrieve ancestor position
            if len(cur_medusa_choice) == 1:
                # depth-1 nodes only have the root ancestor, already unmasked
                continue
            ancestor_idx = []
            for c in range(len(cur_medusa_choice) - 1):
                ancestor_idx.append(sorted_medusa_choices.index(cur_medusa_choice[:c+1]) + 1)
            medusa_attn_mask[j + start + 1, ancestor_idx] = 1
        start += depth_counts[i]

    # Generate tree indices for the Medusa structure: maps each flat node to
    # its position in the [1 + TOPK * n_heads] concatenated candidate list.
    medusa_tree_indices = torch.zeros(medusa_len, dtype=torch.long)
    medusa_tree_indices[0] = 0
    start = 0
    for i in range(len(depth_counts)):
        for j in range(depth_counts[i]):
            cur_medusa_choice = sorted_medusa_choices[start + j]
            medusa_tree_indices[start + j + 1] = cur_medusa_choice[-1] + TOPK * i + 1
        start += depth_counts[i]

    # Generate position IDs for the Medusa structure (position id == depth;
    # the root stays at 0).
    medusa_position_ids = torch.zeros(medusa_len, dtype=torch.long)
    start = 0
    for i in range(len(depth_counts)):
        medusa_position_ids[start + 1: start + depth_counts[i] + 1] = i + 1
        start += depth_counts[i]

    # Generate retrieval indices for Medusa structure verification: one row
    # per root-to-leaf path. Iterating in reverse (longest paths first) lets
    # shorter prefixes be skipped because they are covered by a longer path.
    retrieve_indices_nest = []
    retrieve_paths = []
    for i in range(len(sorted_medusa_choices)):
        cur_medusa_choice = sorted_medusa_choices[-i-1]
        retrieve_indice = []
        if cur_medusa_choice in retrieve_paths:
            continue
        else:
            for c in range(len(cur_medusa_choice)):
                retrieve_indice.append(sorted_medusa_choices.index(cur_medusa_choice[:c+1]))
                retrieve_paths.append(cur_medusa_choice[:c+1])
        retrieve_indices_nest.append(retrieve_indice)
    max_length = max([len(x) for x in retrieve_indices_nest])
    # pad_path fills with -2; after the +1 shift below padding becomes -1
    retrieve_indices = [pad_path(path, max_length) for path in retrieve_indices_nest]
    retrieve_indices = torch.tensor(retrieve_indices, dtype=torch.long)
    # Shift by one and prepend a zero column so every path starts at the root.
    retrieve_indices = retrieve_indices + 1
    retrieve_indices = torch.cat([torch.zeros((retrieve_indices.shape[0], 1), dtype=torch.long), retrieve_indices], dim=1)

    # Aggregate the generated buffers into a dictionary
    medusa_buffers = {
        "medusa_attn_mask": medusa_attn_mask.unsqueeze(0).unsqueeze(0),
        "tree_indices": medusa_tree_indices,
        "medusa_position_ids": medusa_position_ids.unsqueeze(0),
        "retrieve_indices": retrieve_indices,
    }

    # Move the tensors in the dictionary to the specified device
    medusa_buffers = {
        k: v.clone().to(device)
        if isinstance(v, torch.Tensor)
        else torch.tensor(v, device=device)
        for k, v in medusa_buffers.items()
    }
    return medusa_buffers
281
+
282
def generate_candidates(
    medusa_logits,
    logits,
    tree_indices,
    retrieve_indices,
    temperature = 0,
    posterior_threshold=0.3,
    posterior_alpha = 0.09,
    top_p=0.8,
    sampling = 'typical',
    fast = False
):
    """Assemble the flattened tree of draft-token candidates for one step.

    The next base token comes from `logits` (greedy when `temperature == 0`
    or `fast`, otherwise typical/nucleus sampling); speculative tokens are
    the top-`TOPK` predictions of each medusa head. `tree_indices` arranges
    them into the tree layout and `retrieve_indices` unflattens the tree
    into one row per root-to-leaf path.

    Returns:
        tuple: (tree_candidate_ids with a leading batch dim of 1,
        unflattened_candidate_ids with one row per candidate path).
    """
    # Say we have 3 heads, and the top-4 for each head are:
    # [10, 3, 8, 4]
    # [9, 5, 1, 6]
    # [7, 16, 3, 2]

    # candidates_id = 10
    if temperature == 0 or fast:
        candidates_ids = torch.argmax(logits[:, -1]).unsqueeze(0)
    else:
        if sampling == 'typical':
            candidates_ids = get_typical_one_token(logits[:, -1], temperature, posterior_threshold, posterior_alpha).squeeze(0)
        elif sampling == 'nucleus':
            candidates_ids = get_nucleus_one_token(logits[:, -1], temperature, top_p).squeeze(0)
        else:
            raise NotImplementedError

    # this calculates the top-k medusa logits
    # candidates_medusa_id = [
    #     [9, 5, 1, 6]
    #     [7, 16, 3, 2]
    # ]
    candidates_medusa_ids = torch.topk(medusa_logits[:, 0, -1], TOPK, dim=-1).indices

    # [10, 9, 5, 1, 6, 7, 16, 3, 2]
    candidate_ids = torch.cat([candidates_ids, candidates_medusa_ids.view(-1)], dim=-1)

    # based on the pre-defined tree_indices, select the corresponding candidates
    # if we select top-2 and top-3 for the two heads (we select top-1 for the first head):
    # tree_candidates = [10, 9, 5, 7, 16, 3, 7, 16, 3]
    tree_candidate_ids = candidate_ids[tree_indices]

    # tree_candidate_ids = [10, 9, 5, 7, 16, 3, 7, 16, 3, 0]
    # Sometimes the tree_indices are padded, so we append a zero here
    # so that all padded indices select the appended zero.
    tree_candidate_ids_ext = torch.cat(
        [
            tree_candidate_ids,
            torch.zeros((1), dtype=torch.long, device=tree_candidate_ids.device)
        ],
        dim=0
    )
    # [[10, 9, 7], [10, 9, 16], [10, 9, 3], [10, 5, 7], [10, 5, 16], [10, 5, 3]]
    # (padded retrieve indices are -1 and therefore pick the appended zero)
    unflattened_candidate_ids = tree_candidate_ids_ext[retrieve_indices]

    # restore the batch dimension expected by the forward pass
    tree_candidate_ids = tree_candidate_ids.unsqueeze(0)

    return tree_candidate_ids, unflattened_candidate_ids
341
+
342
def get_nucleus_posterior_mask(logits, candidates, temperature, top_p):
    """
    Generates a posterior mask for token candidates using nucleus (top-p) sampling.

    This function applies nucleus sampling to a set of logits, and then generates a mask indicating
    which candidate tokens are selected. It adapts the sampling strategy to accommodate for
    temperature scaling and cumulative probability thresholding.

    Args:
        logits (torch.Tensor): A tensor of logits from a language model output.
        candidates (torch.Tensor): A tensor of candidate tokens to compare against sampled tokens.
        temperature (float): A parameter to scale the logits, controlling randomness in sampling.
        top_p (float): The cumulative probability threshold for nucleus sampling.

    Returns:
        torch.Tensor: A posterior mask indicating which candidate tokens match the sampled tokens.
    """
    # adapted from https://github.com/huggingface/transformers/blob/18a879f47576822aa1a5c49aecb27d89bfa5fa69/examples/run_generation.py#L79

    # Apply temperature
    # (the last position has no candidate token to verify, hence [:, :-1])
    logits = logits[:, :-1] / temperature
    n_samples, n_tokens = logits.shape[0], logits.shape[1]
    # Flatten (path, position) pairs so each row is sampled independently.
    logits = logits.view(n_samples*n_tokens, -1)
    if top_p >= 1:
        # Nucleus covers the whole vocabulary: plain temperature sampling.
        sampled_tokens = torch.multinomial(F.softmax(logits, dim=-1), 1)
        sampled_tokens = sampled_tokens.view(n_samples, n_tokens)
        posterior_mask = (candidates[:, 1:] == sampled_tokens).int()
        return posterior_mask
    # Convert to probabilities (softmax)
    probs = F.softmax(logits, dim=-1)
    # Sort the probabilities
    # NOTE: despite the name, `sorted_logits` holds sorted probabilities.
    sorted_logits, sorted_indices = torch.sort(probs, descending=True)

    # Compute cumulative probabilities
    cum_probs = torch.cumsum(sorted_logits, dim=-1)

    # Create mask for the top-p nucleus; the shift keeps the first token that
    # crosses the threshold inside the nucleus.
    sorted_indices_to_remove = cum_probs > top_p
    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
    sorted_indices_to_remove[..., 0] = 0

    # Map the removal mask back from sorted order to vocabulary order.
    indices_to_remove = sorted_indices_to_remove.scatter(dim=1, index=sorted_indices, src=sorted_indices_to_remove)


    # Remove low-probability tokens
    logits[indices_to_remove] = float('-inf')
    # Sample from the remaining tokens
    sampled_tokens = torch.multinomial(F.softmax(logits, dim=-1), 1)
    sampled_tokens = sampled_tokens.view(n_samples, n_tokens)
    # Create a mask for selected tokens
    posterior_mask = (candidates[:, 1:] == sampled_tokens).int()

    return posterior_mask
395
+
396
def get_typical_posterior_mask(logits, candidates, temperature, posterior_threshold, posterior_alpha):
    """
    Generates a posterior mask for token candidates using typical sampling.

    Args:
        logits (torch.Tensor): A tensor of logits from a language model output.
        candidates (torch.Tensor): A tensor of candidate tokens to compare against sampled tokens.
        temperature (float): A parameter to scale the logits, controlling randomness in sampling.
        posterior_threshold (float): The minimum threshold for probabilities to be considered in sampling.
        posterior_alpha (float): A scaling factor applied to the entropy-based adaptive threshold.

    Returns:
        torch.Tensor: A posterior mask indicating which candidate tokens match the sampled tokens.
    """
    # The last position has no candidate token to verify, hence [:, :-1].
    logits = logits[:, :-1] / temperature
    n_samples, n_tokens = logits.shape[0], logits.shape[1]
    # Flatten (path, position) pairs so each row is sampled independently.
    logits = logits.view(n_samples*n_tokens, -1)
    probs = F.softmax(logits, dim=-1)
    # Shannon entropy of each row (1e-5 avoids log(0)).
    entropy = -torch.sum(
        probs * torch.log(probs + 1e-5), dim=-1
    )
    # Adaptive cutoff: fixed floor capped by an entropy-based bound.
    threshold = torch.minimum(
        torch.ones_like(entropy) * posterior_threshold,
        torch.exp(-entropy) * posterior_alpha,
    )
    indices_to_remove = probs < threshold.unsqueeze(-1)
    logits[indices_to_remove] = float('-inf')
    sampled_tokens = torch.multinomial(F.softmax(logits, dim=-1), 1)
    sampled_tokens = sampled_tokens.view(n_samples, n_tokens)
    # 1 where the drafted candidate equals the freshly sampled token.
    posterior_mask = (candidates[:, 1:] == sampled_tokens).int()
    return posterior_mask
425
+
426
+
427
+
428
def evaluate_posterior(
    logits,
    candidates,
    temperature,
    posterior_threshold=0.3,
    posterior_alpha = 0.09,
    top_p=0.8,
    sampling = 'typical',
    fast = True
):
    """Verify draft candidates against the model's own predictions.

    Each candidate row is a drafted token path; this scores how many leading
    draft tokens along each path are accepted, and picks the best path.

    Args:
        logits (torch.Tensor): model logits per candidate path,
            shape [n_candidates, path_len, vocab_size].
        candidates (torch.Tensor): drafted token ids, [n_candidates, path_len].
        temperature (float): 0 means greedy verification; otherwise the
            `sampling` strategy ('typical' or 'nucleus') decides acceptance.
        posterior_threshold (float): probability floor (typical sampling).
        posterior_alpha (float): scale on the entropy-based bound (typical).
        top_p (float): nucleus threshold (nucleus sampling).
        sampling (str): 'typical' or 'nucleus'.
        fast (bool): deterministic threshold test instead of re-sampling
            (typical sampling only).

    Returns:
        tuple: (best_candidate as a long tensor index, accept_length int).
    """
    if logits.shape[1] <= 1:
        # Nothing to verify: accept zero draft tokens from candidate 0.
        return torch.tensor(0, dtype=torch.long, device=candidates.device), 0
    # Greedy decoding based on temperature value
    if temperature == 0:
        # Find the tokens that match the maximum logits for each position in the sequence
        posterior_mask = (
            candidates[:, 1:] == torch.argmax(logits[:, :-1], dim=-1)
        ).int()
        # cumprod keeps only the leading run of matches along each path
        candidates_accept_length = (torch.cumprod(posterior_mask, dim=1)).sum(dim=1)
        accept_length = candidates_accept_length.max().item()
        # Choose the best candidate
        if accept_length == 0:
            # Default to the first candidate if none are accepted
            best_candidate = torch.tensor(0, dtype=torch.long, device=candidates.device)
        else:
            best_candidate = torch.argmax(candidates_accept_length).to(torch.long)
        return best_candidate, accept_length
    elif sampling == 'typical':
        if fast:
            # Deterministic variant: accept a draft token whenever its model
            # probability clears the adaptive typical-sampling threshold.
            posterior_prob = torch.softmax(logits[:, :-1] / temperature, dim=-1)
            candidates_prob = torch.gather(
                posterior_prob, dim=-1, index=candidates[:, 1:].unsqueeze(-1)
            ).squeeze(-1)
            posterior_entropy = -torch.sum(
                posterior_prob * torch.log(posterior_prob + 1e-5), dim=-1
            )  # torch.sum(torch.log(*)) is faster than torch.prod
            threshold = torch.minimum(
                torch.ones_like(posterior_entropy) * posterior_threshold,
                torch.exp(-posterior_entropy) * posterior_alpha,
            )
            posterior_mask = candidates_prob > threshold
            candidates_accept_length = (torch.cumprod(posterior_mask, dim=1)).sum(dim=1)

            # Choose the best candidate based on the evaluated posterior probabilities
            accept_length = candidates_accept_length.max().item()
            if accept_length == 0:
                # If no candidates are accepted, just choose the first one
                best_candidate = torch.tensor(0, dtype=torch.long, device=candidates.device)
            else:
                best_candidates = torch.where(candidates_accept_length == accept_length)[0]
                # Accept the best one according to likelihood
                likelihood = torch.sum(
                    torch.log(candidates_prob[best_candidates, :accept_length]), dim=-1
                )
                best_candidate = best_candidates[torch.argmax(likelihood)]
            return best_candidate, accept_length
        # Calculate posterior probabilities and thresholds for candidate selection
        posterior_mask = get_typical_posterior_mask(logits, candidates, temperature, posterior_threshold, posterior_alpha)
        candidates_accept_length = (torch.cumprod(posterior_mask, dim=1)).sum(dim=1)
        # Choose the best candidate based on the evaluated posterior probabilities
        accept_length = candidates_accept_length.max().item()

        if accept_length == 0:
            # If no candidates are accepted, just choose the first one
            best_candidate = torch.tensor(0, dtype=torch.long, device=candidates.device)
        else:
            best_candidate = torch.argmax(candidates_accept_length).to(torch.long)
            # Accept the best one according to likelihood
        return best_candidate, accept_length
    elif sampling == 'nucleus':
        assert top_p < 1.0 + 1e-6, "top_p should between 0 and 1"
        posterior_mask = get_nucleus_posterior_mask(logits, candidates, temperature, top_p)
        candidates_accept_length = (torch.cumprod(posterior_mask, dim=1)).sum(dim=1)
        accept_length = candidates_accept_length.max().item()
        # Choose the best candidate
        if accept_length == 0:
            # Default to the first candidate if none are accepted
            best_candidate = torch.tensor(0, dtype=torch.long, device=candidates.device)
        else:
            best_candidate = torch.argmax(candidates_accept_length).to(torch.long)
        return best_candidate, accept_length
    else:
        raise NotImplementedError
511
+
512
def update_inference_inputs(
    input_ids,
    medusa_logits,
    logits,
    candidate_ids,
    best_candidate,
    accept_length,
):
    """Commit the accepted speculative tokens for the next tree iteration.

    Appends the first ``accept_length + 1`` tokens of the winning candidate
    path to ``input_ids`` and keeps only the logits at the last accepted
    position, re-adding the batch dimension that indexing removes.

    Returns:
        tuple: (input_ids, medusa_logits, logits, new_token) where
        ``new_token`` is the number of tokens appended this step.
    """
    n_accepted = accept_length + 1
    accepted_ids = candidate_ids[None, best_candidate, :n_accepted]
    extended_ids = torch.cat([input_ids, accepted_ids], dim=-1)
    next_logits = logits[None, best_candidate, accept_length:n_accepted]
    next_medusa_logits = medusa_logits[:, None, best_candidate, accept_length:n_accepted]
    # n_accepted doubles as the new-token counter for the caller
    return extended_ids, next_medusa_logits, next_logits, n_accepted
536
+
537
def split_logits(full_logits):
    """Separate the base LM head from the speculative (medusa) heads.

    Args:
        full_logits: tensor of shape [b, n, heads, vocab_size]; head 0 is
            the ordinary next-token head.

    Returns:
        tuple: (medusa_logits of shape [heads - 1, b, n, vocab_size],
        logits of shape [b, n, vocab_size]).
    """
    base_head = full_logits[..., 0, :]
    spec_heads = full_logits[..., 1:, :].permute(2, 0, 1, 3)
    return spec_heads, base_head
542
+
543
+ class MultiByteDecodingMixin:
544
    def multi_byte_pred_update_cache(
        self,
        past_key_values,
        retrieve_indices,
        best_candidate,
        new_tokens,
    ):
        """Compact the KV cache after a tree-decoding step.

        During tree decoding the window cache holds K/V for *all* tree nodes;
        only the accepted path must be kept. For each layer the accepted
        positions are gathered to directly follow the previous window content,
        and when the window overflows, one full window is dumped into
        chunk-level RFA state via `triton_eva_prep_kv_fwd` and the remainder
        is shifted to the front of the window buffer.

        Args:
            past_key_values: the Eva cache (mutated in place).
            retrieve_indices: per-path node indices within the tree block.
            best_candidate: index of the accepted candidate path.
            new_tokens: number of accepted tokens (accept_length + 1).

        Returns:
            The updated `past_key_values`.
        """
        prev_window_len = past_key_values.get_past_window_pos(0)
        # absolute window positions of the accepted tree nodes
        select_indices = (
            retrieve_indices[best_candidate, : new_tokens] + prev_window_len
        )
        for layer_idx in range(self.config.num_hidden_layers):

            past_key_values.update_past_len(new_tokens, layer_idx)

            past_window_k = past_key_values.past_window_k[layer_idx]
            past_window_v = past_key_values.past_window_v[layer_idx]

            tgt_window_k = past_window_k[..., select_indices, :]
            tgt_window_v = past_window_v[..., select_indices, :]

            dst_window_k = past_window_k[..., prev_window_len : prev_window_len + new_tokens, :]
            dst_window_v = past_window_v[..., prev_window_len : prev_window_len + new_tokens, :]

            # move accepted K/V so they immediately follow the previous window content
            dst_window_k.copy_(tgt_window_k, non_blocking=True)
            dst_window_v.copy_(tgt_window_v, non_blocking=True)

            new_window_len = prev_window_len + new_tokens
            if new_window_len >= self.config.window_size:
                # a single step never spans more than one extra window
                assert new_window_len < 2 * self.config.window_size

                # dump the completed window for RFA summarization below
                dump_k = past_window_k[..., :self.config.window_size, :].clone()
                dump_v = past_window_v[..., :self.config.window_size, :].clone()

                _window_len = new_window_len - self.config.window_size

                if _window_len > 0:
                    # shift the overflow back to the start of the window buffer
                    new_window_k = past_window_k[..., self.config.window_size : new_window_len, :]
                    new_window_v = past_window_v[..., self.config.window_size : new_window_len, :]

                    _dst_window_k = past_window_k[..., : _window_len, :]
                    _dst_window_v = past_window_v[..., : _window_len, :]

                    _dst_window_k.copy_(new_window_k, non_blocking=True)
                    _dst_window_v.copy_(new_window_v, non_blocking=True)

                past_key_values.past_window_pos[layer_idx] = _window_len
            else:
                dump_k = None
                dump_v = None
                past_key_values.past_window_pos[layer_idx] = new_window_len

            if dump_k is not None and dump_v is not None:
                # summarize the dumped window into chunk-level RFA state
                rfa_k, rfa_v = triton_eva_prep_kv_fwd(
                    dump_k, dump_v,
                    self.model.layers[layer_idx].self_attn.adaptive_mu_k,
                    self.model.layers[layer_idx].self_attn.adaptive_phi,
                    None,
                    self.model.layers[layer_idx].self_attn.head_dim_scaling,
                    self.model.layers[layer_idx].self_attn.chunk_size
                )
                rfa_k, rfa_v = past_key_values.update_chunk_rfas(
                    rfa_k, rfa_v, layer_idx
                )
        return past_key_values
609
+
610
+ def _multi_byte_pred_update_cache_when_prefil_len_eq_window_size(
611
+ self,
612
+ past_key_values,
613
+ ):
614
+ prev_window_len = past_key_values.get_past_window_pos(0)
615
+ for layer_idx in range(self.config.num_hidden_layers):
616
+
617
+ past_window_k = past_key_values.past_window_k[layer_idx]
618
+ past_window_v = past_key_values.past_window_v[layer_idx]
619
+
620
+ new_window_len = prev_window_len
621
+ if new_window_len == self.config.window_size:
622
+ dump_k = past_window_k[..., :self.config.window_size, :].clone()
623
+ dump_v = past_window_v[..., :self.config.window_size, :].clone()
624
+ past_key_values.past_window_pos[layer_idx] = 0
625
+
626
+ if dump_k is not None and dump_v is not None:
627
+ rfa_k, rfa_v = triton_eva_prep_kv_fwd(
628
+ dump_k, dump_v,
629
+ self.model.layers[layer_idx].self_attn.adaptive_mu_k,
630
+ self.model.layers[layer_idx].self_attn.adaptive_phi,
631
+ None,
632
+ self.model.layers[layer_idx].self_attn.head_dim_scaling,
633
+ self.model.layers[layer_idx].self_attn.chunk_size
634
+ )
635
+ rfa_k, rfa_v = past_key_values.update_chunk_rfas(
636
+ rfa_k, rfa_v, layer_idx
637
+ )
638
+ return past_key_values
639
+
640
    def multi_byte_pred_update_attn_mask(
        self,
        last_iter_new_tokens,
        tree_candidate_ids,
        past_attn_mask,
        medusa_attn_mask,
        past_key_values,
    ):
        """Build the attention mask for the next tree-decoding forward pass.

        Concatenates a dense all-ones block over the already-cached context
        with the sparse tree mask that encodes ancestor relations among the
        candidate nodes.

        Args:
            last_iter_new_tokens: tokens accepted in the previous iteration.
            tree_candidate_ids: [batch, tree_len] candidate ids for this step.
            past_attn_mask: the context part of last iteration's mask, or
                None to (re)build it from scratch.
            medusa_attn_mask: [1, 1, tree_len, tree_len] tree ancestor mask.
            past_key_values: the Eva cache (only read here).

        Returns:
            tuple: (tree_attn_mask for this forward pass, past_attn_mask to
            reuse next iteration); 1 marks unmasked positions.
        """
        batch_size, tree_candidate_len = tree_candidate_ids.shape
        seen_tokens = past_key_values.get_seq_length()
        # NOTE: past_key_values has been updated so now
        # seen_tokens includes new tokens from the last tree iteration
        assert seen_tokens > 0
        # so one iteration would not cross two windows
        assert last_iter_new_tokens < self.config.window_size

        if past_attn_mask is not None and seen_tokens < self.config.window_size:
            # still inside the first window: extend the previous mask over the
            # tokens accepted last iteration
            past_attn_mask = torch.cat(
                [
                    past_attn_mask,
                    torch.ones(
                        [batch_size, 1, tree_candidate_len, last_iter_new_tokens],
                        dtype=torch.bool,
                        device=self.device
                    )
                ],
                dim=-1
            )
        else:
            # we initialize attn mask each time when
            # 1. the model crosses the window boundary, or
            # 2. after prefilling
            chunks_per_window = int(self.config.window_size // self.config.chunk_size)

            # completed windows contribute `chunks_per_window` summary slots
            # each; the current partial window contributes its raw tokens
            window_tokens = seen_tokens % self.config.window_size
            num_windows_seen_so_far = seen_tokens // self.config.window_size
            attn_mask_len = num_windows_seen_so_far * chunks_per_window + window_tokens
            past_attn_mask = torch.ones(
                (batch_size, 1, tree_candidate_len, attn_mask_len),
                dtype=torch.bool,
                device=self.device
            )

        # note that 1 indicates the position is not masked
        tree_attn_mask = torch.cat(
            [
                past_attn_mask,
                medusa_attn_mask.to(torch.bool)
            ],
            dim=-1
        )
        return tree_attn_mask, past_attn_mask
692
+
693
+ @torch.no_grad()
694
+ def multi_byte_generate(
695
+ self,
696
+ input_ids,
697
+ attention_mask=None,
698
+ temperature=0.0,
699
+ max_length=None,
700
+ max_new_tokens=None,
701
+ stopping_criteria=None,
702
+ posterior_threshold=0.09,
703
+ posterior_alpha=0.3,
704
+ top_p=0.8,
705
+ sampling='typical',
706
+ fast=True,
707
+ do_sample=False,
708
+ medusa_choices=None,
709
+ return_acc_lengths=False
710
+ ):
711
+ if do_sample or temperature > 0.0:
712
+ fast = False
713
+
714
+ ### Prepare `max_length` depending on other stopping criteria.
715
+ if max_new_tokens is not None:
716
+ max_length = max_new_tokens + input_ids.shape[-1]
717
+ elif max_new_tokens is None and max_length is None:
718
+ max_length = getattr(self.config, "max_position_embeddings", 32768)
719
+
720
+ ### Set up stopping criteria
721
+ eos_stop_criteria = MultibyteEosTokenCriteria(self.generation_config.eos_token_id)
722
+ stop_criteria = StoppingCriteriaList()
723
+ if max_length is not None:
724
+ max_position_embeddings = getattr(self.config, "max_position_embeddings", None)
725
+ stop_criteria.append(
726
+ MaxLengthCriteria(
727
+ max_length=max_length,
728
+ max_position_embeddings=max_position_embeddings,
729
+ )
730
+ )
731
+ if stopping_criteria is not None and len(stopping_criteria) > 0:
732
+ stop_criteria.extend(stopping_criteria)
733
+
734
+ assert input_ids.shape[0] == 1, "Only support batch size 1 for now"
735
+ assert attention_mask is None, "Only support attention mask None for now"
736
+ # Avoid modifying the input_ids in-place
737
+ input_ids = input_ids.clone()
738
+ position_ids = torch.arange(0, input_ids.shape[1], device=self.device, dtype=int).reshape(1, -1)
739
+
740
+ ####################################################
741
+ # 0. initialize the medusa buffers
742
+ ####################################################
743
+ if medusa_choices is None:
744
+ medusa_choices = evabyte_7b_95
745
+ medusa_buffers = generate_medusa_buffers(
746
+ medusa_choices, device=self.device
747
+ )
748
+
749
+ past_key_values = EvaStaticCacheForTriton(
750
+ input_ids.shape[0],
751
+ self.config.num_attention_heads,
752
+ # we add 256 to allow tree ids
753
+ self.config.window_size + 256,
754
+ self.config.hidden_size // self.config.num_attention_heads,
755
+ self.config.num_hidden_layers,
756
+ self.lm_head.weight.dtype,
757
+ self.lm_head.weight.device,
758
+ )
759
+ # prefill to get medusa logits and logits
760
+ full_logits, past_key_values = self.forward(
761
+ input_ids,
762
+ attention_mask=attention_mask,
763
+ position_ids=position_ids,
764
+ use_cache=True,
765
+ past_key_values=past_key_values,
766
+ return_all_pred_logits=True,
767
+ multibyte_decoding=False,
768
+ )
769
+ # handles an edge case where the prefill length == window_size
770
+ # we force the previous window to be dumped into RFA chunks
771
+ past_key_values = self._multi_byte_pred_update_cache_when_prefil_len_eq_window_size(
772
+ past_key_values
773
+ )
774
+ medusa_logits, logits = split_logits(full_logits)
775
+
776
+ past_attn_mask = None
777
+ last_iter_new_tokens = 0
778
+ max_iters = 32768
779
+ if return_acc_lengths:
780
+ acc_lengths = []
781
+ for _ in range(max_iters):
782
+ ####################################################
783
+ # 1. generate candidate_ids with topk predictions from Medusa heads
784
+ ####################################################
785
+ tree_candidate_ids, unflattened_candidate_ids = generate_candidates(
786
+ medusa_logits,
787
+ logits,
788
+ medusa_buffers["tree_indices"],
789
+ medusa_buffers["retrieve_indices"],
790
+ temperature=temperature,
791
+ posterior_alpha=posterior_alpha,
792
+ posterior_threshold=posterior_threshold,
793
+ top_p=top_p,
794
+ sampling=sampling,
795
+ fast=fast,
796
+ )
797
+
798
+ ####################################################
799
+ # 2. Build the medusa attention mask and position ids
800
+ ####################################################
801
+ # NOTE: 1 indicates the position is not masked
802
+ medusa_attn_mask, past_attn_mask = self.multi_byte_pred_update_attn_mask(
803
+ last_iter_new_tokens,
804
+ tree_candidate_ids,
805
+ past_attn_mask,
806
+ medusa_buffers["medusa_attn_mask"],
807
+ past_key_values,
808
+ )
809
+ medusa_position_ids = medusa_buffers["medusa_position_ids"] + input_ids.shape[1]
810
+
811
+ ####################################################
812
+ # 3. tree decoding
813
+ ####################################################
814
+ tree_full_logits, past_key_values = self.forward(
815
+ tree_candidate_ids,
816
+ past_key_values=past_key_values,
817
+ attention_mask=medusa_attn_mask,
818
+ position_ids=medusa_position_ids,
819
+ return_all_pred_logits=True,
820
+ multibyte_decoding=True,
821
+ )
822
+ _medusa_logits, _logits = split_logits(tree_full_logits)
823
+ medusa_logits = _medusa_logits[..., 0, medusa_buffers["retrieve_indices"], :]
824
+ logits = _logits[..., 0, medusa_buffers["retrieve_indices"], :]
825
+
826
+ ####################################################
827
+ # 4. candidate selection
828
+ ####################################################
829
+ # if the current iteration, with tree tokens, crosses window
830
+ # boundaries, trim the condidate_ids to be within the window
831
+ # so that those exceeded tokens (which will be inaccurate)
832
+ # will not be considered
833
+ tree_depth = unflattened_candidate_ids.shape[-1]
834
+ if tree_depth + past_key_values.get_past_window_pos(0) > self.config.window_size:
835
+ max_acc_len = self.config.window_size - past_key_values.get_past_window_pos(0)
836
+ _trimmed_unflattened_candidate_ids = unflattened_candidate_ids[:, :max_acc_len]
837
+ _trimmed_logits = logits[:, :max_acc_len]
838
+ else:
839
+ _trimmed_unflattened_candidate_ids = unflattened_candidate_ids
840
+ _trimmed_logits = logits
841
+ best_candidate, accept_length = evaluate_posterior(
842
+ _trimmed_logits,
843
+ _trimmed_unflattened_candidate_ids,
844
+ temperature,
845
+ posterior_threshold,
846
+ posterior_alpha,
847
+ top_p=top_p,
848
+ sampling=sampling,
849
+ fast=fast
850
+ )
851
+
852
+ ####################################################
853
+ # 5. update model inputs and caches
854
+ ####################################################
855
+ input_ids, medusa_logits, logits, last_iter_new_tokens = update_inference_inputs(
856
+ input_ids,
857
+ medusa_logits,
858
+ logits,
859
+ unflattened_candidate_ids,
860
+ best_candidate,
861
+ accept_length,
862
+ )
863
+
864
+ past_key_values = self.multi_byte_pred_update_cache(
865
+ past_key_values,
866
+ medusa_buffers["retrieve_indices"],
867
+ best_candidate,
868
+ last_iter_new_tokens,
869
+ )
870
+
871
+ if return_acc_lengths:
872
+ acc_lengths.append(last_iter_new_tokens)
873
+ if stop_criteria(input_ids, None) or eos_stop_criteria(input_ids, last_iter_new_tokens):
874
+ if return_acc_lengths:
875
+ return input_ids, acc_lengths
876
+ else:
877
+ return input_ids
878
+ if return_acc_lengths:
879
+ return input_ids, acc_lengths
880
+ else:
881
+ return input_ids
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/preprocessor_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoImageProcessor": "image_processing_evabyte.EvaByteImageProcessor",
4
+ "AutoProcessor": "processing_evabyte.EvaByteProcessor"
5
+ },
6
+ "do_convert_rgb": true,
7
+ "do_resize": true,
8
+ "image_processor_type": "EvaByteImageProcessor",
9
+ "jpeg_quality": 25,
10
+ "jpeg_restart_marker_blocks": 1,
11
+ "jpeg_streamtype": 2,
12
+ "jpeg_subsampling": "4:2:0",
13
+ "processor_class": "EvaByteProcessor",
14
+ "resample": 1,
15
+ "size": {
16
+ "longest_edge": 384
17
+ }
18
+ }
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/processing_evabyte.py ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ """
3
+ Processor class for EvaByte.
4
+ """
5
+ import base64
6
+ from io import BytesIO
7
+
8
+ import requests
9
+ import os
10
+ import PIL
11
+ from PIL import Image
12
+
13
+ from typing import List, Optional, Union
14
+
15
+ from transformers.feature_extraction_utils import BatchFeature
16
+ from transformers.image_utils import ImageInput, is_valid_image
17
+ from transformers.processing_utils import ProcessorMixin
18
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
19
+ from transformers.utils import TensorType, to_py_obj
20
+
21
def fetch_image(image: Union[str, "PIL.Image.Image"]) -> Image.Image:
    """Resolve an image reference into a ``PIL.Image.Image``.

    Accepted inputs (checked in this order):
      * an already-loaded ``PIL.Image.Image`` (returned as-is),
      * an ``http://`` / ``https://`` URL (downloaded with ``requests``),
      * a path to an existing local file,
      * a ``data:image/...`` URI with a base64-encoded payload,
      * anything else is handed directly to ``Image.open`` (e.g. a
        file-like object).

    Raises:
        ValueError: if a ``data:image/`` URI carries an invalid base64
            payload, or if no branch managed to produce an image.
    """
    image_obj = None
    if isinstance(image, Image.Image):
        image_obj = image
    elif image.startswith("http://") or image.startswith("https://"):
        image_obj = Image.open(BytesIO(requests.get(image, timeout=None).content))
    elif os.path.isfile(image):
        image_obj = Image.open(image)
    elif image.startswith("data:image/"):
        # Keep only the payload after "data:image/...;base64,"
        image = image.split(",")[1]
        # Try to load as base64
        try:
            b64 = base64.decodebytes(image.encode())
            # BUG FIX: bind the decoded image to `image_obj`. The previous
            # code assigned it to `image`, so `image_obj` stayed None and
            # every valid data URI raised "Unrecognized image input" below.
            image_obj = Image.open(BytesIO(b64))
        except Exception as e:
            raise ValueError(
                f"Incorrect image source. Must be a valid URL starting with `http://` or `https://`, a valid path to an image file, or a base64 encoded string. Got {image}. Failed with {e}"
            ) from e
    else:
        image_obj = Image.open(image)
    if image_obj is None:
        raise ValueError(f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}")

    return image_obj
45
+
46
def is_url(val) -> bool:
    """Return True when *val* is a string that starts with "http"."""
    if not isinstance(val, str):
        return False
    return val.startswith("http")
48
+
49
def is_file(val) -> bool:
    """Return True when *val* is a string naming an existing regular file."""
    if isinstance(val, str):
        return os.path.isfile(val)
    return False
51
+
52
def is_image_or_image_url(elem):
    """True when *elem* is an http-style URL string, a valid image object, or an existing file path."""
    for predicate in (is_url, is_valid_image, is_file):
        if predicate(elem):
            return True
    return False
54
+
55
# Jinja chat template for vision-language conversations. Compared with the
# text-only template in tokenization_evabyte.py, message content may be a
# list of {"type": "image"|"text", ...} parts; image parts render as the
# literal "<image_placeholder>" marker, which EvaByteProcessor.__call__
# later splits on to splice in image byte streams.
vl_chat_template = """
{{- bos_token }}
{%- if messages[0]['role'] == 'system' %}
{%- set system_message = messages[0]['content'] %}
{%- set messages = messages[1:] %}
{%- else %}
{%- set system_message = "" %}
{%- endif %}

{{- '<|start_header_id|>system<|end_header_id|>\n\n' + system_message + '<|eot_id|>'}}

{%- for message in messages %}
{%- if (message['role'] != 'user') and (message['role'] != 'assistant') %}
{{- raise_exception('Conversation roles must be user or assistant') }}
{%- endif %}

{%- if message['content'] is string %}
{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }}
{%- else %}
{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }}
{%- for content in message['content'] %}
{%- if content['type'] == 'image' %}
{{- '<image_placeholder>\n' }}
{%- elif content['type'] == 'text' %}
{{- content['text'] }}
{%- endif %}
{%- endfor %}
{{- '<|eot_id|>' }}
{%- endif %}
{%- endfor %}

{%- if add_generation_prompt %}
{{- '<|start_header_id|>' + 'assistant' + '<|end_header_id|>\n\n' }}
{%- endif %}
"""
90
+
91
class EvaByteProcessor(ProcessorMixin):
    r"""
    Constructs a EvaByte processor which wraps a EvaByte image processor and a EvaByte tokenizer into a single processor.

    [`EvaByteProcessor`] offers all the functionalities of [`EvaByteImageProcessor`] and [`EvaByteTokenizer`]. See the
    [`~EvaByteProcessor.__call__`] and [`~EvaByteProcessor.decode`] for more information.

    Args:
        image_processor ([`EvaByteImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`EvaByteTokenizer`], *optional*):
            The tokenizer is a required input.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        """Wire the two sub-components together and cache the sentinel token
        ids (<t2v_token> = text-to-vision, <v2t_token> = vision-to-text) that
        bracket image byte streams inside a token sequence."""
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)
        self.t2v_token_id = self.tokenizer.convert_tokens_to_ids("<t2v_token>")
        self.v2t_token_id = self.tokenizer.convert_tokens_to_ids("<v2t_token>")
        self.image_placeholder = "<image_placeholder>"
        self.vl_chat_template = vl_chat_template

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        images: ImageInput = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        strip_ending_sentinel: bool = False,
        encode_only: bool = False,
        **kwargs
    ) -> Union[BatchFeature, List[List[int]]]:
        """Tokenize `text` and splice image byte streams into its
        `<image_placeholder>` slots, bracketed by <t2v_token>/<v2t_token>.

        Args:
            text: a single string (batch size 1 only) containing one
                `<image_placeholder>` marker per image.
            images: raw bytes / list(s) of bytes, or PIL images / URLs /
                file paths (converted to bytes via the image processor).
            strip_ending_sentinel: drop a trailing <t2v>/<v2t> sentinel —
                useful when generation should continue inside an image.
            encode_only: return plain `List[List[int]]` ids instead of a
                `BatchFeature` with an attention mask.

        NOTE(review): if `images is None`, `image_bytes_list` is never
        assigned and the `len(...)` assertion below raises NameError —
        text-only calls appear unsupported here; confirm with callers.
        """
        # processing pipeline:
        # 1. read images or videos from paths
        # 2. use image_processor to convert images / videos to byte streams
        if images is not None:
            # Already-encoded byte payloads are normalized to the nested
            # list-of-lists layout [batch][image] without re-encoding.
            if isinstance(images, bytes):
                image_bytes_list = [[images]]
            elif isinstance(images, list) and isinstance(images[0], bytes):
                image_bytes_list = [images]
            elif isinstance(images, list) and isinstance(images[0], list) and isinstance(images[0][0], bytes):
                image_bytes_list = images
            else:
                if is_image_or_image_url(images):
                    images = [[images]]
                elif isinstance(images, list) and is_image_or_image_url(images[0]):
                    images = [images]
                elif (
                    # NOTE(review): `not isinstance(images, list)` combined
                    # with subscripting `images[0]` below looks inconsistent
                    # (a non-list may not be subscriptable) — verify intent.
                    not isinstance(images, list)
                    and not isinstance(images[0], list)
                    and not is_image_or_image_url(images[0][0])
                ):
                    raise ValueError(
                        "Invalid input images. Please provide a single image or a list of images or a list of list of images."
                    )
                # Load images if they are URLs
                images = [[fetch_image(im) if is_url(im) or is_file(im) else im for im in sample] for sample in images]
                image_bytes_list = self.image_processor(images=images, **kwargs)

        if not isinstance(text, list):
            text = [text]
        assert len(text) == 1, "Only support batch size 1 for now"
        assert len(text) == len(image_bytes_list), "text and image_bytes_list must have the same length"
        # TODO: invoke SequenceFeatureExtractor to get batched inputs

        # 3. tokenize the text and put images / videos byte streams into the placeholders
        # surrounded by special tokens like "<image>" and "</image>"
        batch_input_ids = []
        if not encode_only:
            batch_attention_mask = []
        else:
            batch_attention_mask = None

        for t, image_bytes in zip(text, image_bytes_list):
            # One placeholder per image => len(splits) == len(images) + 1.
            text_splits = t.split(self.image_placeholder)
            if len(text_splits) != len(image_bytes) + 1:
                raise ValueError(
                    f"The number of image tokens should be equal to the number of images, "
                    f"but got {len(text_splits)} and {len(image_bytes) + 1}"
                )

            input_ids = [self.tokenizer.bos_token_id]
            for i, text_part in enumerate(text_splits):
                # each text part must be non-empty because we added markers around placeholders
                split_tokens = self.tokenizer.encode(text_part, add_special_tokens=False)
                input_ids.extend(split_tokens)
                # Add image bytes after each text part except the last one
                if i < len(image_bytes):
                    input_ids.append(self.t2v_token_id)
                    # Shift raw byte values past the special-token range.
                    input_ids.extend([b + self.tokenizer.offset for b in image_bytes[i]])
                    input_ids.append(self.v2t_token_id)

            if strip_ending_sentinel and (input_ids[-1] in [self.t2v_token_id, self.v2t_token_id]):
                input_ids = input_ids[:-1]

            batch_input_ids.append(input_ids)
            if not encode_only:
                batch_attention_mask.append([1] * len(input_ids))

        if not encode_only:
            # 4. return batch of features
            inputs = BatchFeature({
                "input_ids": batch_input_ids,
                "attention_mask": batch_attention_mask
            }, tensor_type=return_tensors)
            return inputs
            # # Pad sequences
            # padded_inputs = self.tokenizer.pad(
            #     {"input_ids": batch_input_ids},
            #     padding=True,
            #     return_attention_mask=True,
            #     return_tensors=return_tensors,
            # )
            # return BatchFeature(data=padded_inputs)
        else:
            return batch_input_ids

    def image_tokens_to_bytes(self, image_token_ids, jpeg_quality=None):
        """Map image token ids back to raw bytes (undoing the tokenizer
        offset) and re-attach JPEG quantization tables via the image
        processor."""
        image_bytes = bytes([token_id - self.tokenizer.offset for token_id in image_token_ids])
        image_bytes = self.image_processor.jpeg_merge_qtables(image_bytes, jpeg_quality)
        return image_bytes

    def batch_decode(self, sequences, **kwargs):
        """
        This method forwards all its arguments to EvaByteTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        # Each self.decode call returns (text, images); transpose the list of
        # pairs into (list_of_texts, list_of_image_lists).
        rets = [self.decode(seq, **kwargs) for seq in sequences]
        return tuple(map(list, zip(*rets)))

    def decode(self, token_ids, **kwargs):
        """
        Decodes a sequence of input_ids, handling image tokens separately.
        Returns a tuple of (decoded_text, images), where images is a list of bytes.
        """
        if kwargs and "jpeg_quality" in kwargs:
            # Copy so the pop does not mutate the caller's kwargs dict.
            kwargs = kwargs.copy()
            jpeg_quality = kwargs.pop("jpeg_quality")
        else:
            jpeg_quality = None

        token_ids = to_py_obj(token_ids)
        # Find indices of t2v_token_id and v2t_token_id
        t2v_indices = [i for i, token_id in enumerate(token_ids) if token_id == self.t2v_token_id]
        v2t_indices = [i for i, token_id in enumerate(token_ids) if token_id == self.v2t_token_id]

        # Check for correct pairing of t2v and v2t tokens
        if len(t2v_indices) != len(v2t_indices):
            raise ValueError("Mismatched number of t2v and v2t tokens in token_ids: {} and {}".format(t2v_indices, v2t_indices))

        # Ensure t2v and v2t tokens are in the correct order
        for t2v_idx, v2t_idx in zip(t2v_indices, v2t_indices):
            if t2v_idx >= v2t_idx:
                raise ValueError("Found t2v_token_id after v2t_token_id in token_ids")

        # Initialize the start index
        images = []
        decoded_text = ""

        start = 0
        # Iterate over pairs of t2v and v2t indices
        for t2v_idx, v2t_idx in zip(t2v_indices, v2t_indices):
            # Decode text tokens before the image
            text_token_ids = token_ids[start:t2v_idx]
            if len(text_token_ids) > 0:
                decoded_text += self.tokenizer.decode(text_token_ids, **kwargs)

            # Insert image placeholder
            decoded_text += self.image_placeholder

            # Extract image tokens and convert them to bytes
            image_token_ids = token_ids[t2v_idx + 1 : v2t_idx]
            image_bytes = self.image_tokens_to_bytes(image_token_ids, jpeg_quality)
            images.append(image_bytes)

            # Update the start index to the token after v2t_token_id
            start = v2t_idx + 1

        # Decode any remaining text tokens after the last image
        if start < len(token_ids):
            text_token_ids = token_ids[start:]
            decoded_text += self.tokenizer.decode(text_token_ids, **kwargs)

        return decoded_text, images

    @property
    def model_input_names(self):
        # Union of both components' input names, deduplicated while
        # preserving order (dict.fromkeys keeps first occurrence).
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/processor_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_evabyte.EvaByteProcessor"
4
+ },
5
+ "processor_class": "EvaByteProcessor"
6
+ }
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/special_tokens_map.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<repo_name>",
4
+ "<file_sep>",
5
+ "<t2v_token>",
6
+ "<v2t_token>",
7
+ "<|start_header_id|>",
8
+ "<|end_header_id|>",
9
+ "<|eot_id|>",
10
+ "<extra_id_12>",
11
+ "<extra_id_13>",
12
+ "<extra_id_14>",
13
+ "<extra_id_15>",
14
+ "<extra_id_16>",
15
+ "<extra_id_17>",
16
+ "<extra_id_18>",
17
+ "<extra_id_19>",
18
+ "<extra_id_20>",
19
+ "<extra_id_21>",
20
+ "<extra_id_22>",
21
+ "<extra_id_23>",
22
+ "<extra_id_24>",
23
+ "<extra_id_25>",
24
+ "<extra_id_26>",
25
+ "<extra_id_27>",
26
+ "<extra_id_28>",
27
+ "<extra_id_29>",
28
+ "<extra_id_30>",
29
+ "<extra_id_31>",
30
+ "<extra_id_32>",
31
+ "<extra_id_33>",
32
+ "<extra_id_34>",
33
+ "<extra_id_35>",
34
+ "<extra_id_36>",
35
+ "<extra_id_37>",
36
+ "<extra_id_38>",
37
+ "<extra_id_39>",
38
+ "<extra_id_40>",
39
+ "<extra_id_41>",
40
+ "<extra_id_42>",
41
+ "<extra_id_43>",
42
+ "<extra_id_44>",
43
+ "<extra_id_45>",
44
+ "<extra_id_46>",
45
+ "<extra_id_47>",
46
+ "<extra_id_48>",
47
+ "<extra_id_49>",
48
+ "<extra_id_50>",
49
+ "<extra_id_51>",
50
+ "<extra_id_52>",
51
+ "<extra_id_53>",
52
+ "<extra_id_54>",
53
+ "<extra_id_55>",
54
+ "<extra_id_56>",
55
+ "<extra_id_57>",
56
+ "<extra_id_58>",
57
+ "<extra_id_59>",
58
+ "<extra_id_60>",
59
+ "<extra_id_61>",
60
+ "<extra_id_62>",
61
+ "<extra_id_63>"
62
+ ],
63
+ "bos_token": {
64
+ "content": "<bos>",
65
+ "lstrip": false,
66
+ "normalized": true,
67
+ "rstrip": false,
68
+ "single_word": false
69
+ },
70
+ "eos_token": {
71
+ "content": "<eos>",
72
+ "lstrip": false,
73
+ "normalized": true,
74
+ "rstrip": false,
75
+ "single_word": false
76
+ },
77
+ "pad_token": {
78
+ "content": "<pad>",
79
+ "lstrip": false,
80
+ "normalized": true,
81
+ "rstrip": false,
82
+ "single_word": false
83
+ },
84
+ "sep_token": {
85
+ "content": "<sep>",
86
+ "lstrip": false,
87
+ "normalized": true,
88
+ "rstrip": false,
89
+ "single_word": false
90
+ },
91
+ "unk_token": {
92
+ "content": "<unk>",
93
+ "lstrip": false,
94
+ "normalized": true,
95
+ "rstrip": false,
96
+ "single_word": false
97
+ }
98
+ }
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/tokenization_evabyte.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+
3
+ """ Tokenization class for model EvaByte."""
4
+
5
+
6
+ from typing import List, Optional, Tuple
7
+
8
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
9
+ from transformers.utils import logging
10
+
11
+
12
+ logger = logging.get_logger(__name__)
13
+
14
+
15
# Default text-only Jinja chat template installed on every EvaByteTokenizer:
# an optional leading system message, then alternating user/assistant turns
# wrapped in <|start_header_id|>/<|end_header_id|>/<|eot_id|> markers.
chat_template = """
{{- bos_token }}
{%- if messages[0]['role'] == 'system' %}
{%- set system_message = messages[0]['content'] %}
{%- set messages = messages[1:] %}
{%- else %}
{%- set system_message = "" %}
{%- endif %}

{{- '<|start_header_id|>system<|end_header_id|>\n\n' + system_message + '<|eot_id|>'}}

{%- for message in messages %}
{%- if (message['role'] != 'user') and (message['role'] != 'assistant') %}
{{- raise_exception('Conversation roles must be user or assistant') }}
{%- endif %}

{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }}
{%- endfor %}

{%- if add_generation_prompt %}
{{- '<|start_header_id|>' + 'assistant' + '<|end_header_id|>\n\n' }}
{%- endif %}
"""
+ """
38
+
39
class EvaByteTokenizer(PreTrainedTokenizer):
    """Byte-level tokenizer: each UTF-8 byte is one token, shifted by
    `self.offset` (the number of special tokens, 64 by default) so that
    ids 0..offset-1 are reserved for pad/bos/eos/unk/sep plus the
    additional special tokens."""

    def __init__(
        self,
        bos_token="<bos>",
        eos_token="<eos>",
        unk_token="<unk>",
        sep_token="<sep>",
        pad_token="<pad>",
        extra_ids=59,
        additional_special_tokens=None,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ) -> None:
        # The five base special tokens occupy ids 0-4; extra/special tokens
        # start at id 5.
        num_base_special_tokens = 5
        # Add extra_ids to the special token list
        if extra_ids > 0 and additional_special_tokens is None:
            additional_special_tokens = [f"<extra_id_{i}>" for i in range(num_base_special_tokens, extra_ids + num_base_special_tokens)]
        elif extra_ids > 0 and additional_special_tokens is not None and len(additional_special_tokens) > 0:
            # Check that we have the right number of extra_id special tokens
            extra_tokens = len(set(filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens)))
            if extra_tokens != extra_ids:
                raise ValueError(
                    f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
                    " provided to EvaByteTokenizer. In this case the additional_special_tokens must include the"
                    " extra_ids tokens"
                )

        # NOTE(review): with extra_ids == 0 and additional_special_tokens
        # None, the loop below iterates over None and raises — confirm that
        # configuration is never used.
        #### override some reserved tokens to support chat template
        for i, token in enumerate(additional_special_tokens):
            if token == "<extra_id_5>":
                token = "<repo_name>"
            elif token == "<extra_id_6>":
                token = "<file_sep>"
            elif token == "<extra_id_7>":
                token = "<t2v_token>"
            elif token == "<extra_id_8>":
                token = "<v2t_token>"
            elif token == "<extra_id_9>":
                token = "<|start_header_id|>"
            elif token == "<extra_id_10>":
                token = "<|end_header_id|>"
            elif token == "<extra_id_11>":
                token = "<|eot_id|>"
            additional_special_tokens[i] = token

        # lstrip and rstrip are set to False because we don't want to strip the whitespace from the special tokens
        # this would be important for the byte tokenizer
        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token

        self._added_tokens_decoder = {
            0: pad_token,
            1: bos_token,
            2: eos_token,
            3: unk_token, # unk_token is a placeholder
            4: sep_token,
            **{i: AddedToken(t, lstrip=False, rstrip=False) for i, t in enumerate(additional_special_tokens, start=num_base_special_tokens)},
        }
        # Byte values are shifted by this amount when mapped to ids.
        self.offset = len(self._added_tokens_decoder)
        self._utf_vocab_size = 2**8  # utf is 8 bits
        self.add_bos_token = True
        self.add_eos_token = False
        super().__init__(
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            extra_ids=0,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )
        self.chat_template = chat_template


    @property
    def vocab_size(self):
        # Base vocabulary = 256 byte values; special tokens live on top of
        # this via the offset.
        return self._utf_vocab_size

    def get_vocab(self):
        """Return the full token->id mapping (special tokens + 256 bytes)."""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size + self.offset)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.build_inputs_with_special_tokens
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Wrap one or two sequences with BOS/EOS according to
        `add_bos_token` / `add_eos_token`."""
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = bos_token_id + token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        return output

    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.get_special_tokens_mask
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        bos_token_id = [1] if self.add_bos_token else []
        eos_token_id = [1] if self.add_eos_token else []

        if token_ids_1 is None:
            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
        return (
            bos_token_id
            + ([0] * len(token_ids_0))
            + eos_token_id
            + bos_token_id
            + ([0] * len(token_ids_1))
            + eos_token_id
        )

    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.create_token_type_ids_from_sequences
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence | second sequence |
        ```

        if token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)

        if token_ids_1 is not None:
            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)

        return output

    def _tokenize(self, text: str) -> List[str]:
        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
        # One single-character token per UTF-8 byte (chr(byte_value)).
        tokens = [chr(i) for i in text.encode("utf-8")]
        return tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        # Only single-character byte tokens are valid; anything else maps to
        # None (multi-char strings are handled as added tokens upstream).
        if len(token) != 1:
            token_id = None
        else:
            token_id = ord(token) + self.offset

        return token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a byte (str) using the vocab."""
        token = chr(index - self.offset)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of bytes (string) to a single string."""
        bstring = b""
        for token in tokens:
            # NOTE(review): `added_tokens_decoder` is keyed by int ids while
            # `token` here is a str, so this branch looks unreachable; the
            # `added_tokens_encoder` branch below handles special-token
            # strings — confirm before relying on the first branch.
            if token in self.added_tokens_decoder:
                tok_string = self.added_tokens_decoder[token].encode("utf-8")
            elif token in self.added_tokens_encoder:
                tok_string = token.encode("utf-8")
            else:
                tok_string = bytes([ord(token)])
            bstring += tok_string
        # Invalid UTF-8 byte sequences are silently dropped.
        string = bstring.decode("utf-8", errors="ignore")
        return string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """No vocabulary file to save for a byte-level tokenizer."""
        return ()
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-40000/tokenizer_config.json ADDED
@@ -0,0 +1,596 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<pad>",
5
+ "lstrip": false,
6
+ "normalized": true,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<bos>",
13
+ "lstrip": false,
14
+ "normalized": true,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "<eos>",
21
+ "lstrip": false,
22
+ "normalized": true,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "<sep>",
37
+ "lstrip": false,
38
+ "normalized": true,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "5": {
44
+ "content": "<repo_name>",
45
+ "lstrip": false,
46
+ "normalized": true,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": false
50
+ },
51
+ "6": {
52
+ "content": "<file_sep>",
53
+ "lstrip": false,
54
+ "normalized": true,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": false
58
+ },
59
+ "7": {
60
+ "content": "<t2v_token>",
61
+ "lstrip": false,
62
+ "normalized": true,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": false
66
+ },
67
+ "8": {
68
+ "content": "<v2t_token>",
69
+ "lstrip": false,
70
+ "normalized": true,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": false
74
+ },
75
+ "9": {
76
+ "content": "<|start_header_id|>",
77
+ "lstrip": false,
78
+ "normalized": true,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": false
82
+ },
83
+ "10": {
84
+ "content": "<|end_header_id|>",
85
+ "lstrip": false,
86
+ "normalized": true,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": false
90
+ },
91
+ "11": {
92
+ "content": "<|eot_id|>",
93
+ "lstrip": false,
94
+ "normalized": true,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": false
98
+ },
99
+ "12": {
100
+ "content": "<extra_id_12>",
101
+ "lstrip": false,
102
+ "normalized": true,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": false
106
+ },
107
+ "13": {
108
+ "content": "<extra_id_13>",
109
+ "lstrip": false,
110
+ "normalized": true,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": false
114
+ },
115
+ "14": {
116
+ "content": "<extra_id_14>",
117
+ "lstrip": false,
118
+ "normalized": true,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": false
122
+ },
123
+ "15": {
124
+ "content": "<extra_id_15>",
125
+ "lstrip": false,
126
+ "normalized": true,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": false
130
+ },
131
+ "16": {
132
+ "content": "<extra_id_16>",
133
+ "lstrip": false,
134
+ "normalized": true,
135
+ "rstrip": false,
136
+ "single_word": false,
137
+ "special": false
138
+ },
139
+ "17": {
140
+ "content": "<extra_id_17>",
141
+ "lstrip": false,
142
+ "normalized": true,
143
+ "rstrip": false,
144
+ "single_word": false,
145
+ "special": false
146
+ },
147
+ "18": {
148
+ "content": "<extra_id_18>",
149
+ "lstrip": false,
150
+ "normalized": true,
151
+ "rstrip": false,
152
+ "single_word": false,
153
+ "special": false
154
+ },
155
+ "19": {
156
+ "content": "<extra_id_19>",
157
+ "lstrip": false,
158
+ "normalized": true,
159
+ "rstrip": false,
160
+ "single_word": false,
161
+ "special": false
162
+ },
163
+ "20": {
164
+ "content": "<extra_id_20>",
165
+ "lstrip": false,
166
+ "normalized": true,
167
+ "rstrip": false,
168
+ "single_word": false,
169
+ "special": false
170
+ },
171
+ "21": {
172
+ "content": "<extra_id_21>",
173
+ "lstrip": false,
174
+ "normalized": true,
175
+ "rstrip": false,
176
+ "single_word": false,
177
+ "special": false
178
+ },
179
+ "22": {
180
+ "content": "<extra_id_22>",
181
+ "lstrip": false,
182
+ "normalized": true,
183
+ "rstrip": false,
184
+ "single_word": false,
185
+ "special": false
186
+ },
187
+ "23": {
188
+ "content": "<extra_id_23>",
189
+ "lstrip": false,
190
+ "normalized": true,
191
+ "rstrip": false,
192
+ "single_word": false,
193
+ "special": false
194
+ },
195
+ "24": {
196
+ "content": "<extra_id_24>",
197
+ "lstrip": false,
198
+ "normalized": true,
199
+ "rstrip": false,
200
+ "single_word": false,
201
+ "special": false
202
+ },
203
+ "25": {
204
+ "content": "<extra_id_25>",
205
+ "lstrip": false,
206
+ "normalized": true,
207
+ "rstrip": false,
208
+ "single_word": false,
209
+ "special": false
210
+ },
211
+ "26": {
212
+ "content": "<extra_id_26>",
213
+ "lstrip": false,
214
+ "normalized": true,
215
+ "rstrip": false,
216
+ "single_word": false,
217
+ "special": false
218
+ },
219
+ "27": {
220
+ "content": "<extra_id_27>",
221
+ "lstrip": false,
222
+ "normalized": true,
223
+ "rstrip": false,
224
+ "single_word": false,
225
+ "special": false
226
+ },
227
+ "28": {
228
+ "content": "<extra_id_28>",
229
+ "lstrip": false,
230
+ "normalized": true,
231
+ "rstrip": false,
232
+ "single_word": false,
233
+ "special": false
234
+ },
235
+ "29": {
236
+ "content": "<extra_id_29>",
237
+ "lstrip": false,
238
+ "normalized": true,
239
+ "rstrip": false,
240
+ "single_word": false,
241
+ "special": false
242
+ },
243
+ "30": {
244
+ "content": "<extra_id_30>",
245
+ "lstrip": false,
246
+ "normalized": true,
247
+ "rstrip": false,
248
+ "single_word": false,
249
+ "special": false
250
+ },
251
+ "31": {
252
+ "content": "<extra_id_31>",
253
+ "lstrip": false,
254
+ "normalized": true,
255
+ "rstrip": false,
256
+ "single_word": false,
257
+ "special": false
258
+ },
259
+ "32": {
260
+ "content": "<extra_id_32>",
261
+ "lstrip": false,
262
+ "normalized": true,
263
+ "rstrip": false,
264
+ "single_word": false,
265
+ "special": false
266
+ },
267
+ "33": {
268
+ "content": "<extra_id_33>",
269
+ "lstrip": false,
270
+ "normalized": true,
271
+ "rstrip": false,
272
+ "single_word": false,
273
+ "special": false
274
+ },
275
+ "34": {
276
+ "content": "<extra_id_34>",
277
+ "lstrip": false,
278
+ "normalized": true,
279
+ "rstrip": false,
280
+ "single_word": false,
281
+ "special": false
282
+ },
283
+ "35": {
284
+ "content": "<extra_id_35>",
285
+ "lstrip": false,
286
+ "normalized": true,
287
+ "rstrip": false,
288
+ "single_word": false,
289
+ "special": false
290
+ },
291
+ "36": {
292
+ "content": "<extra_id_36>",
293
+ "lstrip": false,
294
+ "normalized": true,
295
+ "rstrip": false,
296
+ "single_word": false,
297
+ "special": false
298
+ },
299
+ "37": {
300
+ "content": "<extra_id_37>",
301
+ "lstrip": false,
302
+ "normalized": true,
303
+ "rstrip": false,
304
+ "single_word": false,
305
+ "special": false
306
+ },
307
+ "38": {
308
+ "content": "<extra_id_38>",
309
+ "lstrip": false,
310
+ "normalized": true,
311
+ "rstrip": false,
312
+ "single_word": false,
313
+ "special": false
314
+ },
315
+ "39": {
316
+ "content": "<extra_id_39>",
317
+ "lstrip": false,
318
+ "normalized": true,
319
+ "rstrip": false,
320
+ "single_word": false,
321
+ "special": false
322
+ },
323
+ "40": {
324
+ "content": "<extra_id_40>",
325
+ "lstrip": false,
326
+ "normalized": true,
327
+ "rstrip": false,
328
+ "single_word": false,
329
+ "special": false
330
+ },
331
+ "41": {
332
+ "content": "<extra_id_41>",
333
+ "lstrip": false,
334
+ "normalized": true,
335
+ "rstrip": false,
336
+ "single_word": false,
337
+ "special": false
338
+ },
339
+ "42": {
340
+ "content": "<extra_id_42>",
341
+ "lstrip": false,
342
+ "normalized": true,
343
+ "rstrip": false,
344
+ "single_word": false,
345
+ "special": false
346
+ },
347
+ "43": {
348
+ "content": "<extra_id_43>",
349
+ "lstrip": false,
350
+ "normalized": true,
351
+ "rstrip": false,
352
+ "single_word": false,
353
+ "special": false
354
+ },
355
+ "44": {
356
+ "content": "<extra_id_44>",
357
+ "lstrip": false,
358
+ "normalized": true,
359
+ "rstrip": false,
360
+ "single_word": false,
361
+ "special": false
362
+ },
363
+ "45": {
364
+ "content": "<extra_id_45>",
365
+ "lstrip": false,
366
+ "normalized": true,
367
+ "rstrip": false,
368
+ "single_word": false,
369
+ "special": false
370
+ },
371
+ "46": {
372
+ "content": "<extra_id_46>",
373
+ "lstrip": false,
374
+ "normalized": true,
375
+ "rstrip": false,
376
+ "single_word": false,
377
+ "special": false
378
+ },
379
+ "47": {
380
+ "content": "<extra_id_47>",
381
+ "lstrip": false,
382
+ "normalized": true,
383
+ "rstrip": false,
384
+ "single_word": false,
385
+ "special": false
386
+ },
387
+ "48": {
388
+ "content": "<extra_id_48>",
389
+ "lstrip": false,
390
+ "normalized": true,
391
+ "rstrip": false,
392
+ "single_word": false,
393
+ "special": false
394
+ },
395
+ "49": {
396
+ "content": "<extra_id_49>",
397
+ "lstrip": false,
398
+ "normalized": true,
399
+ "rstrip": false,
400
+ "single_word": false,
401
+ "special": false
402
+ },
403
+ "50": {
404
+ "content": "<extra_id_50>",
405
+ "lstrip": false,
406
+ "normalized": true,
407
+ "rstrip": false,
408
+ "single_word": false,
409
+ "special": false
410
+ },
411
+ "51": {
412
+ "content": "<extra_id_51>",
413
+ "lstrip": false,
414
+ "normalized": true,
415
+ "rstrip": false,
416
+ "single_word": false,
417
+ "special": false
418
+ },
419
+ "52": {
420
+ "content": "<extra_id_52>",
421
+ "lstrip": false,
422
+ "normalized": true,
423
+ "rstrip": false,
424
+ "single_word": false,
425
+ "special": false
426
+ },
427
+ "53": {
428
+ "content": "<extra_id_53>",
429
+ "lstrip": false,
430
+ "normalized": true,
431
+ "rstrip": false,
432
+ "single_word": false,
433
+ "special": false
434
+ },
435
+ "54": {
436
+ "content": "<extra_id_54>",
437
+ "lstrip": false,
438
+ "normalized": true,
439
+ "rstrip": false,
440
+ "single_word": false,
441
+ "special": false
442
+ },
443
+ "55": {
444
+ "content": "<extra_id_55>",
445
+ "lstrip": false,
446
+ "normalized": true,
447
+ "rstrip": false,
448
+ "single_word": false,
449
+ "special": false
450
+ },
451
+ "56": {
452
+ "content": "<extra_id_56>",
453
+ "lstrip": false,
454
+ "normalized": true,
455
+ "rstrip": false,
456
+ "single_word": false,
457
+ "special": false
458
+ },
459
+ "57": {
460
+ "content": "<extra_id_57>",
461
+ "lstrip": false,
462
+ "normalized": true,
463
+ "rstrip": false,
464
+ "single_word": false,
465
+ "special": false
466
+ },
467
+ "58": {
468
+ "content": "<extra_id_58>",
469
+ "lstrip": false,
470
+ "normalized": true,
471
+ "rstrip": false,
472
+ "single_word": false,
473
+ "special": false
474
+ },
475
+ "59": {
476
+ "content": "<extra_id_59>",
477
+ "lstrip": false,
478
+ "normalized": true,
479
+ "rstrip": false,
480
+ "single_word": false,
481
+ "special": false
482
+ },
483
+ "60": {
484
+ "content": "<extra_id_60>",
485
+ "lstrip": false,
486
+ "normalized": true,
487
+ "rstrip": false,
488
+ "single_word": false,
489
+ "special": false
490
+ },
491
+ "61": {
492
+ "content": "<extra_id_61>",
493
+ "lstrip": false,
494
+ "normalized": true,
495
+ "rstrip": false,
496
+ "single_word": false,
497
+ "special": false
498
+ },
499
+ "62": {
500
+ "content": "<extra_id_62>",
501
+ "lstrip": false,
502
+ "normalized": true,
503
+ "rstrip": false,
504
+ "single_word": false,
505
+ "special": false
506
+ },
507
+ "63": {
508
+ "content": "<extra_id_63>",
509
+ "lstrip": false,
510
+ "normalized": true,
511
+ "rstrip": false,
512
+ "single_word": false,
513
+ "special": false
514
+ }
515
+ },
516
+ "additional_special_tokens": [
517
+ "<repo_name>",
518
+ "<file_sep>",
519
+ "<t2v_token>",
520
+ "<v2t_token>",
521
+ "<|start_header_id|>",
522
+ "<|end_header_id|>",
523
+ "<|eot_id|>",
524
+ "<extra_id_12>",
525
+ "<extra_id_13>",
526
+ "<extra_id_14>",
527
+ "<extra_id_15>",
528
+ "<extra_id_16>",
529
+ "<extra_id_17>",
530
+ "<extra_id_18>",
531
+ "<extra_id_19>",
532
+ "<extra_id_20>",
533
+ "<extra_id_21>",
534
+ "<extra_id_22>",
535
+ "<extra_id_23>",
536
+ "<extra_id_24>",
537
+ "<extra_id_25>",
538
+ "<extra_id_26>",
539
+ "<extra_id_27>",
540
+ "<extra_id_28>",
541
+ "<extra_id_29>",
542
+ "<extra_id_30>",
543
+ "<extra_id_31>",
544
+ "<extra_id_32>",
545
+ "<extra_id_33>",
546
+ "<extra_id_34>",
547
+ "<extra_id_35>",
548
+ "<extra_id_36>",
549
+ "<extra_id_37>",
550
+ "<extra_id_38>",
551
+ "<extra_id_39>",
552
+ "<extra_id_40>",
553
+ "<extra_id_41>",
554
+ "<extra_id_42>",
555
+ "<extra_id_43>",
556
+ "<extra_id_44>",
557
+ "<extra_id_45>",
558
+ "<extra_id_46>",
559
+ "<extra_id_47>",
560
+ "<extra_id_48>",
561
+ "<extra_id_49>",
562
+ "<extra_id_50>",
563
+ "<extra_id_51>",
564
+ "<extra_id_52>",
565
+ "<extra_id_53>",
566
+ "<extra_id_54>",
567
+ "<extra_id_55>",
568
+ "<extra_id_56>",
569
+ "<extra_id_57>",
570
+ "<extra_id_58>",
571
+ "<extra_id_59>",
572
+ "<extra_id_60>",
573
+ "<extra_id_61>",
574
+ "<extra_id_62>",
575
+ "<extra_id_63>"
576
+ ],
577
+ "auto_map": {
578
+ "AutoProcessor": "processing_evabyte.EvaByteProcessor",
579
+ "AutoTokenizer": [
580
+ "tokenization_evabyte.EvaByteTokenizer",
581
+ null
582
+ ]
583
+ },
584
+ "bos_token": "<bos>",
585
+ "chat_template": "\n{{- bos_token }}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{{- '<|start_header_id|>system<|end_header_id|>\n\n' + system_message + '<|eot_id|>'}}\n\n{%- for message in messages %}\n {%- if (message['role'] != 'user') and (message['role'] != 'assistant') %}\n {{- raise_exception('Conversation roles must be user or assistant') }}\n {%- endif %}\n\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }}\n{%- endfor %}\n\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>' + 'assistant' + '<|end_header_id|>\n\n' }}\n{%- endif %}\n",
586
+ "clean_up_tokenization_spaces": false,
587
+ "eos_token": "<eos>",
588
+ "extra_ids": 0,
589
+ "extra_special_tokens": {},
590
+ "model_max_length": 1000000000000000019884624838656,
591
+ "pad_token": "<pad>",
592
+ "processor_class": "EvaByteProcessor",
593
+ "sep_token": "<sep>",
594
+ "tokenizer_class": "EvaByteTokenizer",
595
+ "unk_token": "<unk>"
596
+ }
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/README.md ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
4
+ # EvaByte Model Card
5
+
6
+ **EvaByte** is a 6.5B **byte-level language model** built upon an improved architecture with multibyte prediction and EVA -- an efficient attention mechanism designed for scalability and performance. Trained on 1.5T bytes spanning natural language text, math, and code, EvaByte demonstrates the viability of efficient byte-level processing at scale -- rivaling top open-source tokenizer-based LMs using 5x less training data, excelling in coding tasks, and decoding up to 2x faster.
7
+
8
+ ## Model Resources
9
+
10
+ - **Repository:** https://github.com/openevabyte/evabyte
11
+ - **Blog:** https://hkunlp.github.io/blog/2025/evabyte and https://sambanova.ai/blog/evabyte-efficient-byte-level-language-models-at-scale
12
+ - **Paper:** Coming soon
13
+
14
+ ## Model Details
15
+
16
+ EvaByte is trained using the performant SambaNova SN30 RDU system with a batch size of 8M bytes and 32K context length. The training process consists of 3 phases: after pre-training on 1.2T bytes (yielding **EvaByte-Phase1**), two independent annealing runs (100B and 200B bytes respectively) are conducted with learning rate linearly decayed from 1e-4 to 0. The resulting checkpoints are merged via model soup (**EvaByte**), which then undergoes supervised fine-tuning (**EvaByte-SFT**).
17
+
18
+ | Stage | Model |
19
+ |:----- |:-----|
20
+ | Base (before annealing) | [EvaByte-Phase1](https://huggingface.co/evabyte/EvaByte-Phase1) |
21
+ | Base | [EvaByte](https://huggingface.co/evabyte/EvaByte) <-- you are here |
22
+ | SFT | [EvaByte-SFT](https://huggingface.co/evabyte/EvaByte-SFT) |
23
+
24
+
25
+ ## Usage
26
+
27
+ **Note:** Make sure to set `trust_remote_code=True` when loading the model (or tokenizer), as our implementation includes custom code.
28
+
29
+ The code snippet below demonstrates EvaByte-6.5B for completion:
30
+
31
+ ```python
32
+ from transformers import AutoTokenizer, AutoModelForCausalLM
33
+ import torch
34
+
35
+ # Load model and tokenizer
36
+ tokenizer = AutoTokenizer.from_pretrained("evabyte/EvaByte", trust_remote_code=True)
37
+ model = AutoModelForCausalLM.from_pretrained("evabyte/EvaByte", torch_dtype=torch.bfloat16, trust_remote_code=True).eval().to("cuda")
38
+
39
+ prompt = "The quick brown fox jumps "
40
+
41
+ # Tokenize input
42
+ # Option 1: standard HF tokenizer interface
43
+ input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
44
+
45
+ # Option 2: Direct UTF-8 byte encoding with offset
46
+ # Note: Each byte is offset by 64 with <bos> prepended.
47
+ input_ids = torch.tensor([[1] + [b + 64 for b in prompt.encode("utf-8")]]).to("cuda")
48
+
49
+ # byte-by-byte generation (default)
50
+ generation_output = model.generate(
51
+ input_ids=input_ids,
52
+ max_new_tokens=32
53
+ )
54
+ # alternatively, use faster multibyte generation
55
+ generation_output = model.multi_byte_generate(
56
+ input_ids=input_ids,
57
+ max_new_tokens=32
58
+ )
59
+
60
+ # Decode and print the output
61
+ response = tokenizer.decode(
62
+ generation_output[0][input_ids.shape[1]:],
63
+ skip_special_tokens=False,
64
+ clean_up_tokenization_spaces=False
65
+ )
66
+ print(response)
67
+ # Sample output:
68
+ # over the lazy dog.\n\nThe quick
69
+ ```
70
+
71
+ ### ⚙️ Generation Modes
72
+
73
+ EvaByte supports two generation interfaces:
74
+ - `model.generate()`: The default generation method compatible with Huggingface `transformers` library. This approach generates one byte at a time and might be slow.
75
+ - `model.multi_byte_generate()`: A faster alternative that generates multiple bytes per step and usually yields the same result as `model.generate()` under greedy decoding, with the implementation adapted from [Medusa](https://github.com/FasterDecoding/Medusa). `model.multi_byte_generate()` supports a subset of arguments in `model.generate()`:
76
+ - `input_ids`: the input byte ids.
77
+ - `temperature`: the temperature for sampling.
78
+ - `max_length`: the maximum length of the generated sequence.
79
+ - `max_new_tokens`: the maximum number of new bytes to generate.
80
+ - `stopping_criteria`: the [stopping criteria](https://huggingface.co/docs/transformers/v4.47.1/en/internal/generation_utils#transformers.StoppingCriteria) for generation.
81
+ - `top_p`: the top-p parameter for sampling.
82
+ - `do_sample`: greedy decoding or sampling.
83
+
84
+ **Notes and Limitations:**
85
+ - `device_map="auto"` is not supported for >2 GPUs.
86
+ - Only batch size of 1 (with `attention_mask=None`) is supported for decoding.
87
+ - `torch_dtype=torch.bfloat16` is required.
88
+ - The multibyte generation `model.multi_byte_generate()` might return extra bytes after the end-of-sequence sentinel, due to the nature of the multibyte decoding. Manual truncation or cleaning may be needed.
89
+
90
+ ## Bias, Risks, and Limitations
91
+ As a pretrained base model, **EvaByte** has not been fine-tuned for chat or instruction following, so users should not expect reliable performance in conversational or instruction-based tasks. Like other base models, it does not incorporate any moderation mechanisms, making it possible to generate potentially harmful or inappropriate content.
92
+
93
+ ## Evaluation
94
+
95
+ For detailed evaluation results, check out our blog post at [SambaNova](https://sambanova.ai/blog/evabyte-efficient-byte-level-language-models-at-scale) or [HKUNLP](https://hkunlp.github.io/blog/2025/evabyte).
96
+
97
+ ## Citation
98
+ ```bibtex
99
+ @misc{evabyte,
100
+ title = {EvaByte: Efficient Byte-level Language Models at Scale},
101
+ url = {https://hkunlp.github.io/blog/2025/evabyte},
102
+ author = {Lin Zheng and Xueliang Zhao and Guangtao Wang and Chen Wu and David Dong and Angela Wang and Mingran Wang and Yun Du and Haige Bo and Amol Sharma and Bo Li and Kejie Zhang and Changran Hu and Urmish Thakker and Lingpeng Kong},
103
+ year = {2025}
104
+ }
105
+ ```
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": null,
3
+ "architectures": [
4
+ "EvaByteForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_class": "eva",
8
+ "attention_dropout": 0.0,
9
+ "auto_map": {
10
+ "AutoConfig": "configuration_evabyte.EvaByteConfig",
11
+ "AutoModelForCausalLM": "modeling_evabyte.EvaByteForCausalLM"
12
+ },
13
+ "bos_token_id": 1,
14
+ "chunk_size": 16,
15
+ "eos_token_id": 2,
16
+ "fp32_ln": true,
17
+ "fp32_logits": true,
18
+ "fp32_skip_add": false,
19
+ "hidden_act": "silu",
20
+ "hidden_size": 5120,
21
+ "init_cutoff_factor": null,
22
+ "init_fn": "v2",
23
+ "init_std": 0.01275,
24
+ "initializer_range": 0.01275,
25
+ "intermediate_size": 16384,
26
+ "lazy_init": true,
27
+ "max_position_embeddings": 16384,
28
+ "max_seq_length": 16384,
29
+ "mixedp_attn": true,
30
+ "model_type": "evabyte",
31
+ "norm_add_unit_offset": true,
32
+ "num_attention_heads": 40,
33
+ "num_chunks": null,
34
+ "num_hidden_layers": 40,
35
+ "num_key_value_heads": 40,
36
+ "num_pred_heads": 1,
37
+ "pad_token_id": 0,
38
+ "return_dict": false,
39
+ "rms_norm_eps": 1e-06,
40
+ "rope_scaling": null,
41
+ "rope_theta": 100000.0,
42
+ "tie_word_embeddings": false,
43
+ "torch_dtype": "bfloat16",
44
+ "transformers_version": "4.47.1",
45
+ "use_cache": true,
46
+ "vocab_size": 320,
47
+ "window_size": 2048
48
+ }
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/configuration_evabyte.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ EvaByte configuration"""
2
+
3
+ from transformers.configuration_utils import PretrainedConfig
4
+
5
class EvaByteConfig(PretrainedConfig):
    """Configuration class for EvaByte models.

    Holds the transformer backbone hyperparameters (sizes, activation,
    RMSNorm epsilon, RoPE settings) together with EVA-attention-specific
    settings (local window size and chunk/landmark layout). The attributes
    mirror the arguments accepted by ``__init__``.
    """

    model_type = "evabyte"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=320,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        norm_add_unit_offset=False,
        init_fn="mitchell",
        init_std=0.006,
        init_cutoff_factor=None,
        attention_class="mha",
        window_size=512,
        num_chunks=None,
        chunk_size=256,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # For backward compatibility: configs that predate grouped-query
        # attention use one KV head per attention head.
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        # Fail fast on a malformed rope_scaling dict rather than at runtime.
        self._rope_scaling_validation()
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        self.norm_add_unit_offset = norm_add_unit_offset
        self.init_fn = init_fn
        self.init_std = init_std
        self.init_cutoff_factor = init_cutoff_factor

        # Attention-specific parameters (EVA): local window plus either a
        # fixed number of landmark chunks or a fixed chunk size.
        self.attention_class = attention_class
        self.window_size = window_size
        self.num_chunks = num_chunks
        self.chunk_size = chunk_size

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.

        Raises:
            ValueError: if `rope_scaling` is not `None` and is not a dict of
                exactly ``{"type": "linear"|"dynamic", "factor": number > 1}``.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, " f"got {self.rope_scaling}"
            )
        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_factor = self.rope_scaling.get("factor", None)
        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
            raise ValueError(
                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
            )
        # Accept ints as well as floats (JSON configs commonly store e.g. 2
        # rather than 2.0); bool is excluded since it subclasses int.
        if (
            rope_scaling_factor is None
            or isinstance(rope_scaling_factor, bool)
            or not isinstance(rope_scaling_factor, (int, float))
            or rope_scaling_factor <= 1.0
        ):
            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/eva.py ADDED
@@ -0,0 +1,424 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Optional, Tuple, List, Any, Union
2
+ import torch
3
+ from torch import nn
4
+ import torch.nn.functional as F
5
+ from .eva_agg_kernel import eva_agg_func_triton
6
+ from .eva_prep_kv_kernel import eva_prep_kv_func_triton
7
+ try:
8
+ import triton
9
+ USE_TRITON_IMPL = True
10
+ except ImportError:
11
+ USE_TRITON_IMPL = False
12
+ raise ImportError("Triton is not installed. Please install it by running `pip install triton`.")
13
+
14
def rotate_half(x: torch.Tensor) -> torch.Tensor:
    """
    Rotate the two halves of the last dimension: (x1, x2) -> (-x2, x1).

    Args:
        x: Rotary embedded tensor.
    Return:
        Tensor of the same shape with the negated second half moved in
        front of the first half along the last dimension.
    """
    half_size = x.shape[-1] // 2
    first_half, second_half = x.split(half_size, dim=-1)
    return torch.cat((-second_half, first_half), dim=-1)
24
+
25
def apply_rotary_pos_emb(q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor,
                         position_ids: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Apply rotary position embeddings (cos, sin) to the query and key tensors.

    Dimension legend:
        num_heads: number of attention heads
        current_seq_len: this batch's sequence length (1 during single-step
            cached decoding, otherwise the full sequence length)
        head_dim: per-head feature dimension

    Args:
        q: Query tensor of shape (batch_size, num_heads, current_seq_len, head_dim).
        k: Key tensor of shape (batch_size, num_heads, current_seq_len, head_dim).
        cos: Cosine table; expected 4-D and broadcastable against q/k, with
            cos.shape[3] == head_dim (enforced by the assertion below).
        sin: Sine table with the same layout as `cos`.
        position_ids: Token positions of shape (batch_size, current_seq_len)
            or (1, current_seq_len). NOTE(review): only shape-validated here,
            never indexed -- presumably cos/sin are already gathered per
            position by the caller; confirm.

    Returns:
        Tuple of (q_embed, k_embed), each the same shape as its input.
    """
    bs, nheads, cur_seq_len, head_dim = q.shape
    # Validate k against q's layout before mixing them with the same tables.
    assert len(
        k.shape) == 4, f"k should be of shape (batch_size, num_heads, current_seq_len, head_dim), got {k.shape} instead"
    assert k.shape[0] == bs, f"k has a different batch_size {k.shape[0]} compared to q {bs}"
    assert list(k.shape[2:]) == [cur_seq_len,
                                 head_dim], f"k has different current_seq_len and/or head_dim compared to q"
    assert cos.shape[3] == head_dim, f"cos should have dim of head dim {head_dim}, got {cos.shape[3]} instead"
    assert list(position_ids.shape) in [[bs, cur_seq_len], [1, cur_seq_len]],\
        f"position_ids should be of shape {[bs, cur_seq_len]} or {[1, cur_seq_len]}, got {position_ids.shape} instead"

    # Standard RoPE: pairwise rotation expressed as x*cos + rotate_half(x)*sin.
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
62
+
63
+ class EvaAttention(nn.Module):
64
+ """
65
+ Causal EVA for language modeling.
66
+ """
67
+
68
    def __init__(self, config, layer_idx: Optional[int] = None):
        """Initialize an EVA attention layer.

        Args:
            config: model configuration; this reads hidden_size,
                num_attention_heads, max_position_embeddings, window_size,
                num_chunks and chunk_size.
            layer_idx: index of this layer, used elsewhere to address the
                per-layer entries of the KV cache.
        """
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        # 1/sqrt(head_dim): the usual attention scaling factor, also reused
        # below to scale the adaptive parameters at initialization.
        self.head_dim_scaling = self.head_dim ** -0.5

        self.max_position_embeddings = config.max_position_embeddings

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )
        # Separate Q/K/V/O projections, all bias-free.
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

        self.window_size = config.window_size

        self.num_chunks = config.num_chunks
        self.chunk_size = config.chunk_size
        if self.chunk_size is not None:
            # Each local window must cover a whole number of chunks.
            assert self.window_size >= self.chunk_size and self.window_size % self.chunk_size == 0
            # chunk_size overrides the number of landmarks
            self.num_chunks = None

        # NOTE(review): this divides by chunk_size unconditionally, so a config
        # with chunk_size=None (num_chunks-only) would raise here -- confirm
        # that chunk_size is always set for this attention class.
        self.chunks_per_window = int(self.window_size // self.chunk_size)
        # Learned per-head vectors of shape [1, num_heads, 1, 1, head_dim],
        # drawn from a standard normal clamped to [-1, 1] and scaled by
        # 1/sqrt(head_dim). NOTE(review): presumably the landmark/random-feature
        # parameters consumed by the EVA Triton kernels -- confirm against
        # eva_agg_kernel / eva_prep_kv_kernel.
        self.adaptive_phi = nn.Parameter(
            torch.randn(
                1,
                self.num_heads,
                1,
                1,
                self.head_dim
            ).clamp(-1., 1.) * self.head_dim_scaling
        )
        self.adaptive_mu_k = nn.Parameter(
            torch.randn(
                1,
                self.num_heads,
                1,
                1,
                self.head_dim
            ).clamp(-1., 1.) * self.head_dim_scaling
        )
117
+
118
    def _triton_forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cos: Optional[torch.Tensor] = None,
        sin: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Triton-backed EVA attention forward.

        Handles three running modes, inferred from `use_cache` and the cache state:
          * prefilling: cache is empty; full-sequence triton aggregation kernel.
          * decoding:   cache already holds tokens; SDPA over cached window keys/values
                        plus chunk-level RFA summaries.
          * plain forward (no cache): full-sequence triton aggregation kernel.

        `attention_mask` is a tuple whose arity/ordering depends on the mode (see the
        per-mode unpacking below); `cos`/`sin` are the rotary embedding tables.
        Returns `(attn_output, None, past_key_value)` — attention weights are never
        materialized (hence the `output_attentions` assert).
        """
        assert not output_attentions
        bsz, q_len, _ = hidden_states.size()

        if use_cache:
            if past_key_value is None:
                raise ValueError
            # cached inference always passes mode-specific mask tuples
            assert isinstance(attention_mask, tuple)

        # infer the model's running mode from how much the cache has seen
        is_prefilling = use_cache and past_key_value.get_seq_length(self.layer_idx) == 0
        is_decoding = use_cache and past_key_value.get_seq_length(self.layer_idx) > 0

        # NOTE(review): the tuple element order differs between the decoding and
        # no-cache branches (intra_chunk vs chunk mask swapped) — presumably this
        # mirrors how callers build the masks per mode; confirm against the caller.
        if is_prefilling:
            assert len(attention_mask) == 2
            window_mask, intra_chunk_mask = attention_mask
            chunk_mask = None
        elif is_decoding:
            assert len(attention_mask) == 3
            window_mask, intra_chunk_mask, chunk_mask = attention_mask
        else:
            if attention_mask is not None:
                assert isinstance(attention_mask, tuple) and len(attention_mask) == 3
                window_mask, chunk_mask, intra_chunk_mask = attention_mask
            else:
                window_mask, chunk_mask, intra_chunk_mask = None, None, None

        ############################################
        # compute q, k, v from hidden states
        ############################################
        # [b, h, q_len, d]
        q = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        # [b, h, kv_len, d]
        k = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        # [b, h, kv_len, d]
        v = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)

        if use_cache:
            # record the number of newly seen positions for this layer
            past_key_value.update_past_len(q.shape[-2], self.layer_idx)

        ############################################
        # apply rotary positional embeddings to q, k
        ############################################
        q, k = apply_rotary_pos_emb(q, k, cos, sin, position_ids)

        ############################################
        # update and get cached singleton tokens
        # update and cache k and v for calculating chunk-level RFAs
        ############################################
        if use_cache:
            # s_k/s_v: singleton (exact) keys/values to attend to;
            # dump_k/dump_v: completed-chunk keys/values to summarize via RFA
            # (None when the current chunk is not yet full)
            s_k, s_v, dump_k, dump_v = past_key_value.update_singletons_and_chunks(
                k,
                v,
                self.layer_idx,
                self.window_size,
            )
        else:
            s_k, s_v = k, v
            dump_k, dump_v = k, v

        if use_cache:
            singleton_mask, dump_rf_mask = past_key_value.update_mask(
                s_mask=window_mask,
                rf_mask=intra_chunk_mask,
                layer_idx=self.layer_idx,
                window_size=self.window_size,
            )
        else:
            singleton_mask = window_mask
            dump_rf_mask = intra_chunk_mask

        if dump_k is not None and dump_v is not None:
            # 1. in prefilling, the input shape is
            #     dump_k/dump_v: [b, h, n, d]
            #     rfa_k/rfa_v: [b, h, n // c, d]
            # 2. in decoding, the input shape is
            #     k/v: [b, h, w, d]
            #     rfa_k/rfa_v: [b, h, w//c, d]
            # 3. in forward inference; the seq_len is already divisible
            rfa_k, rfa_v = eva_prep_kv_func_triton(
                dump_k, dump_v,
                self.adaptive_mu_k, self.adaptive_phi,
                dump_rf_mask, self.head_dim_scaling, self.chunk_size
            )
            # rfa_mask = get_rfa_chunk_mask(dump_rf_mask)
            if use_cache:
                rfa_k, rfa_v = past_key_value.update_chunk_rfas(
                    rfa_k, rfa_v, self.layer_idx
                )
        elif use_cache:
            # if there are not enough elements within the last chunk,
            # we will only use the cached chunk-level RFAs
            rfa_k, rfa_v = past_key_value.get_chunk_rfas(self.layer_idx)
        else:
            rfa_k, rfa_v = None, None

        ############################################
        # compute the full attention output
        ############################################
        if is_prefilling:
            # prefilling
            # 1. in prefilling, the input shape is
            #     q: [b, h, n, d]
            #     k/v: [b, h, n, d]
            #     rfa_k/rfa_v: [b, h, n // c, d]
            attn_output = eva_agg_func_triton(
                q, s_k, s_v,
                rfa_k, rfa_v,
                singleton_mask, chunk_mask,
                self.head_dim_scaling, self.window_size, self.chunks_per_window
            )
        elif is_decoding:
            # 2. in decoding, the input shape is
            #     q: [b, h, 1, d] or [b, h, z, d] (for multi-byte prediction)
            #     k/v: [b, h, 1 + s, d]
            #     rfa_k/rfa_v: [b, h, n // c, d]
            if rfa_k is not None and rfa_v is not None:
                # we only take the chunk-level RFAs not in the current window
                seen_seq_len = past_key_value.get_seq_length(self.layer_idx)
                if seen_seq_len <= self.window_size:
                    # still within the first window: exact attention only
                    agg_k = s_k
                    agg_v = s_v
                    attn_mask = singleton_mask
                else:
                    # NOTE: we already updated the cache so the length now
                    # includes the current token
                    # we subtract 1 from seen_seq_len because we want
                    # if seen_seq_len = 2048 -> num_windows_seen_so_far = 0
                    # if seen_seq_len = 4096 -> num_windows_seen_so_far = 1
                    # if seen_seq_len = 4097 -> num_windows_seen_so_far = 2
                    # NOTE the cat order should be taken care of;
                    # should align with the order based on which
                    # the attention mask is constructed
                    num_windows_seen_so_far = (seen_seq_len - 1) // self.window_size
                    agg_k = torch.cat([s_k, rfa_k[..., :num_windows_seen_so_far * self.chunks_per_window, :]], dim=-2)
                    agg_v = torch.cat([s_v, rfa_v[..., :num_windows_seen_so_far * self.chunks_per_window, :]], dim=-2)
                    if singleton_mask is not None:
                        assert chunk_mask is not None
                        attn_mask = torch.cat([singleton_mask, chunk_mask], dim=-1)
                    else:
                        attn_mask = singleton_mask
            else:
                agg_k = s_k
                agg_v = s_v
                attn_mask = singleton_mask
            attn_output = F.scaled_dot_product_attention(
                q, agg_k, agg_v,
                attn_mask=attn_mask,
                is_causal=False,
                dropout_p=0.0,
                scale=self.head_dim_scaling
            )
        else:
            # 3. in single-forward inference
            attn_output = eva_agg_func_triton(
                q, s_k, s_v,
                rfa_k, rfa_v,
                singleton_mask, chunk_mask,
                self.head_dim_scaling, self.window_size, self.chunks_per_window
            )
        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )
        # [b, h, q_len, d] -> [b, q_len, h*d] -> output projection
        attn_output = attn_output.transpose(1, 2).reshape(bsz, q_len, self.hidden_size)
        attn_output = self.o_proj(attn_output)
        attn_weights = None
        return attn_output, attn_weights, past_key_value
+
298
    def _multibyte_decoding_forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cos: Optional[torch.Tensor] = None,
        sin: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Speculative multi-byte decoding step.

        Unlike `_triton_forward`, this path writes the new k/v directly into the
        cache's window buffers in place (without advancing the cache's bookkeeping
        via `update_*` calls) and attends with SDPA over cached chunk-level RFAs
        concatenated with the window keys/values. `attention_mask` must be a single
        boolean tensor laid out as [rfa chunks | window tokens] along the key axis.
        Returns `(attn_output, None, past_key_value)`.
        """
        # during multi-byte forwarding, we only read caches and do not update them
        assert not output_attentions
        bsz, q_len, _ = hidden_states.size()

        if use_cache and past_key_value is None:
            raise ValueError

        assert USE_TRITON_IMPL
        assert isinstance(attention_mask, torch.Tensor) and attention_mask.dtype == torch.bool

        # this path is only valid once the cache has been prefilled
        assert use_cache and past_key_value.get_seq_length(self.layer_idx) > 0

        ############################################
        # compute q, k, v from hidden states
        ############################################
        # [b, h, q_len, d]
        q = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        # [b, h, kv_len, d]
        k = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        # [b, h, kv_len, d]
        v = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)

        ############################################
        # apply rotary positional embeddings to q, k
        ############################################
        q, k = apply_rotary_pos_emb(q, k, cos, sin, position_ids)

        ############################################
        # update and get cached singleton tokens
        ############################################
        # write the new keys/values into the preallocated window buffers at the
        # current write position; note past_window_pos itself is NOT advanced here
        input_len = k.shape[-2]
        window_pos = past_key_value.past_window_pos[self.layer_idx]
        new_window_pos = window_pos + input_len

        past_key_value.past_window_k[self.layer_idx][:, :, window_pos : new_window_pos, :] = k
        past_key_value.past_window_v[self.layer_idx][:, :, window_pos : new_window_pos, :] = v
        s_k = past_key_value.past_window_k[self.layer_idx][:, :, : new_window_pos, :]
        s_v = past_key_value.past_window_v[self.layer_idx][:, :, : new_window_pos, :]

        rfa_k, rfa_v = past_key_value.get_chunk_rfas(self.layer_idx)

        ############################################
        # compute the full attention output
        ############################################
        # 2. in decoding, the input shape is
        #     q: [b, h, 1, d] or [b, h, z, d] (for multi-byte prediction)
        #     k/v: [b, h, 1 + s, d]
        #     rfa_k/rfa_v: [b, h, n // c, d]
        if rfa_k is not None and rfa_v is not None:
            # NOTE the cat order should be taken care of;
            # should align with the order based on which
            # the attention mask is constructed
            # agg_k = torch.cat([s_k, rfa_k], dim=-2)
            # agg_v = torch.cat([s_v, rfa_v], dim=-2)
            agg_k = torch.cat([rfa_k, s_k], dim=-2)
            agg_v = torch.cat([rfa_v, s_v], dim=-2)
        else:
            agg_k = s_k
            agg_v = s_v
        attn_output = F.scaled_dot_product_attention(
            q, agg_k, agg_v,
            attn_mask=attention_mask,
            is_causal=False,
            dropout_p=0.0,
            scale=self.head_dim_scaling
        )

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )
        # [b, h, q_len, d] -> [b, q_len, h*d] -> output projection
        attn_output = attn_output.transpose(1, 2).reshape(bsz, q_len, self.hidden_size)
        attn_output = self.o_proj(attn_output)
        attn_weights = None
        return attn_output, attn_weights, past_key_value
385
+
386
+ def forward(
387
+ self,
388
+ hidden_states: torch.Tensor,
389
+ attention_mask: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None,
390
+ position_ids: Optional[torch.LongTensor] = None,
391
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
392
+ output_attentions: bool = False,
393
+ use_cache: bool = False,
394
+ cos: Optional[torch.Tensor] = None,
395
+ sin: Optional[torch.Tensor] = None,
396
+ multibyte_decoding: Optional[bool] = False,
397
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
398
+ assert not output_attentions
399
+ if use_cache and past_key_value is None:
400
+ raise ValueError
401
+
402
+ assert USE_TRITON_IMPL
403
+ if use_cache and multibyte_decoding:
404
+ return self._multibyte_decoding_forward(
405
+ hidden_states,
406
+ attention_mask=attention_mask,
407
+ position_ids=position_ids,
408
+ past_key_value=past_key_value,
409
+ output_attentions=output_attentions,
410
+ use_cache=use_cache,
411
+ cos=cos,
412
+ sin=sin,
413
+ )
414
+ else:
415
+ return self._triton_forward(
416
+ hidden_states,
417
+ attention_mask=attention_mask,
418
+ position_ids=position_ids,
419
+ past_key_value=past_key_value,
420
+ output_attentions=output_attentions,
421
+ use_cache=use_cache,
422
+ cos=cos,
423
+ sin=sin,
424
+ )
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/eva_agg_kernel.py ADDED
@@ -0,0 +1,1766 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import math
3
+ import torch
4
+ import triton
5
+ import triton.language as tl
6
+
7
# Backward kernel for the "singleton" (exact, intra-window) part of EVA
# attention: accumulates dK and dV for one BLOCK_N slice of keys/values.
# Grid: axis 0 = key blocks, axis 1 = batch * heads. Each program loads its
# K/V tile once, then loops over the query blocks that can attend to it
# (from the tile's own diagonal block up to the end of its window).
# Inputs follow the flash-attention backward recipe: LSE is the per-row
# log-sum-exp from the forward pass; DO_T_O is a per-row scalar combined as
# (dp - do_t_o) below — presumably rowwise sum(dO * O), the standard
# flash-attention "delta"; confirm against the forward/preprocess kernel.
# MASK_TYPE == 1 means an explicit per-window boolean mask is supplied
# (nonzero entries are masked OUT); otherwise a plain causal mask is applied.
@triton.heuristics(
    {
        "EVEN_M": lambda args: args["seqlen_q"] % args["BLOCK_M"] == 0,
        "EVEN_N": lambda args: args["seqlen_k"] % args["BLOCK_N"] == 0,
        "EVEN_W": lambda args: args["WINDOW_SIZE"] % args["BLOCK_N"] == 0,
        "EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"],
    }
)
@triton.jit
def _bwd_eva_agg_kernel_dkdv(
    Q,
    K,
    V,
    WindowMask,
    DO,
    LSE,
    DO_T_O,
    DK,
    DV,
    softmax_scale,
    stride_qb, stride_qh, stride_qm,
    stride_kb, stride_kh, stride_kn,
    stride_vb, stride_vh, stride_vn,
    stride_window_mask_b, stride_window_mask_m,
    stride_do_b, stride_do_h, stride_do_m,
    stride_lse_b, stride_lse_h,
    stride_do_t_o_b, stride_do_t_o_h,
    stride_dk_b, stride_dk_h, stride_dk_n,
    stride_dv_b, stride_dv_h, stride_dv_n,
    nheads,
    seqlen_q,
    seqlen_k,
    headdim,
    WINDOW_SIZE: tl.constexpr,
    MASK_TYPE: tl.constexpr,
    BLOCK_HEADDIM: tl.constexpr,
    EVEN_M: tl.constexpr,
    EVEN_N: tl.constexpr,
    EVEN_W: tl.constexpr,
    EVEN_HEADDIM: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
):
    # decompose the flattened (batch * head) program id
    off_bh = tl.program_id(1)
    off_h = off_bh % nheads
    off_b = off_bh // nheads

    start_n = tl.program_id(0)
    # determine which window the current KV block belongs to
    offs_w = (start_n * BLOCK_N) // WINDOW_SIZE
    offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N)
    offs_m = tl.arange(0, BLOCK_M)
    offs_d = tl.arange(0, BLOCK_HEADDIM)

    # initialize pointers (head-dim is contiguous: offs_d added without a stride)
    q_ptrs = (
        Q +
        off_b * stride_qb +
        off_h * stride_qh +
        offs_m[:, None] * stride_qm + offs_d[None, :]
    )
    k_ptrs = (
        K +
        off_b * stride_kb +
        off_h * stride_kh +
        offs_n[:, None] * stride_kn + offs_d[None, :]
    )
    v_ptrs = (
        V +
        off_b * stride_vb +
        off_h * stride_vh +
        offs_n[:, None] * stride_vn + offs_d[None, :]
    )
    do_ptrs = (
        DO +
        off_b * stride_do_b +
        off_h * stride_do_h +
        offs_m[:, None] * stride_do_m + offs_d[None, :]
    )
    # DO_T_O and LSE are one scalar per query row
    do_t_o_ptrs = (
        DO_T_O +
        off_b * stride_do_t_o_b +
        off_h * stride_do_t_o_h +
        offs_m[:, None]
    )
    lse_ptrs = (
        LSE +
        off_b * stride_lse_b +
        off_h * stride_lse_h +
        offs_m[:, None]
    )
    if MASK_TYPE == 1:
        # WindowMask is shared across heads: no head stride; key offset is
        # taken modulo the window (see the -(offs_w * WINDOW_SIZE) shift below)
        m_ptrs = (
            WindowMask +
            off_b * stride_window_mask_b +
            (offs_m[:, None] * stride_window_mask_m + offs_n[None, :])
        )
    dk_ptrs = (
        DK +
        off_b * stride_dk_b +
        off_h * stride_dk_h +
        offs_n[:, None] * stride_dk_n + offs_d[None, :]
    )
    dv_ptrs = (
        DV +
        off_b * stride_dv_b +
        off_h * stride_dv_h +
        offs_n[:, None] * stride_dv_n + offs_d[None, :]
    )

    # 1. for singletons
    # determine start and end of query block: queries before the diagonal
    # cannot attend to this key tile (causality); queries past the end of
    # this tile's window never see it either.
    begin_m = ((start_n * BLOCK_N) // BLOCK_M) * BLOCK_M
    end_m = tl.minimum((offs_w + 1) * WINDOW_SIZE, seqlen_q)

    # fp32 accumulators for the gradients of this key/value tile
    dk = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
    dv = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
    # load K and V once; re-used across all query blocks in the loop below
    if EVEN_N & EVEN_M:
        if EVEN_HEADDIM:
            k = tl.load(k_ptrs)
            v = tl.load(v_ptrs)
        else:
            k = tl.load(k_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
            v = tl.load(v_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
    else:
        if EVEN_HEADDIM:
            k = tl.load(k_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0)
            v = tl.load(v_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0)
        else:
            k = tl.load(
                k_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0
            )
            v = tl.load(
                v_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0
            )
    for start_m in range(begin_m, end_m, BLOCK_M):
        start_m = tl.multiple_of(start_m, BLOCK_M)
        # load q, do, and lse
        if EVEN_M & EVEN_N:
            if EVEN_HEADDIM:
                q = tl.load(
                    q_ptrs + start_m * stride_qm
                )
                do = tl.load(
                    do_ptrs + start_m * stride_do_m
                )
            else:
                q = tl.load(
                    q_ptrs + start_m * stride_qm,
                    mask=offs_d[None, :] < headdim,
                    other=0.0
                )
                do = tl.load(
                    do_ptrs + start_m * stride_do_m,
                    mask=offs_d[None, :] < headdim,
                    other=0.0
                )
            do_t_o = tl.load(
                do_t_o_ptrs + start_m
            )
            lse = tl.load(
                lse_ptrs + start_m
            )
        else:
            if EVEN_HEADDIM:
                q = tl.load(
                    q_ptrs + start_m * stride_qm,
                    mask=(start_m + offs_m)[:, None] < seqlen_q,
                    other=0.0
                )
                do = tl.load(
                    do_ptrs + start_m * stride_do_m,
                    mask=(start_m + offs_m)[:, None] < seqlen_q,
                    other=0.0
                )
            else:
                q = tl.load(
                    q_ptrs + start_m * stride_qm,
                    mask=((start_m + offs_m)[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
                    other=0.0
                )
                do = tl.load(
                    do_ptrs + start_m * stride_do_m,
                    mask=((start_m + offs_m)[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
                    other=0.0
                )
            do_t_o = tl.load(
                do_t_o_ptrs + start_m,
                mask=(start_m + offs_m)[:, None] < seqlen_q,
                other=0.0
            )
            lse = tl.load(
                lse_ptrs + start_m,
                mask=(start_m + offs_m)[:, None] < seqlen_q,
                other=0.0
            )
        # rows that were fully masked in the forward pass have lse == -inf;
        # zero them so exp(qk - lse) stays finite (p comes out 0 there)
        lse = tl.where(lse == float("-inf"), 0.0, lse)
        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
        qk += tl.dot(q, tl.trans(k))
        if not EVEN_M:
            # out-of-range query rows contribute nothing
            qk += tl.where((start_m + offs_m)[:, None] < seqlen_q, 0, float("-inf"))

        if MASK_TYPE == 1:
            # shift the key offset into window-local coordinates
            if EVEN_M & EVEN_W:
                mask = tl.load(
                    m_ptrs + (start_m * stride_window_mask_m) - (offs_w * WINDOW_SIZE)
                )
            else:
                mask = tl.load(
                    m_ptrs + (start_m * stride_window_mask_m) - (offs_w * WINDOW_SIZE),
                    mask=((start_m + offs_m)[:, None] < seqlen_q)
                    & (((start_m * stride_window_mask_m) - (offs_w * WINDOW_SIZE) + offs_n)[None, :] < WINDOW_SIZE),
                    other=1,
                )
            # Slightly faster to multiply the softmax_scale in the tl.exp below since the compiler
            # can then fuse the mult and add into an fma instruction. But if we have bias we need to
            # to multiply with softmax_scale here.
            # we assume mask already implies the causal masking
            # (nonzero mask entries are masked OUT)
            qk = qk * softmax_scale
            qk = tl.where(mask, float("-inf"), qk)
            p = tl.exp(qk - lse)
        else:
            # no explicit mask: apply plain causal masking (q_pos >= k_pos kept)
            qk += tl.where((start_m + offs_m)[:, None] >= offs_n[None, :], 0, float("-inf"))
            p = tl.exp(qk * softmax_scale - lse)

        # dp [M, N]
        dp = tl.dot(do, tl.trans(v))
        # p [M, N], dp [M, N], do_t_o [M, 1] -> ds [M, N]
        # (softmax backward: dS = P * (dP - delta), folded with the scale)
        ds = (p * (dp - do_t_o) * softmax_scale).to(q.dtype)
        # p is fp32 and [M, N], convert to q.dtype
        # do [M, D] -> dv [N, D]
        dv += tl.dot(tl.trans(p.to(do.dtype)), do)
        # dk [N, D]
        dk += tl.dot(tl.trans(ds), q)
    # write back the accumulated gradients for this key/value tile
    if EVEN_N & EVEN_M:
        if EVEN_HEADDIM:
            tl.store(dv_ptrs, dv)
            tl.store(dk_ptrs, dk)
        else:
            tl.store(dv_ptrs, dv, mask=offs_d[None, :] < headdim)
            tl.store(dk_ptrs, dk, mask=offs_d[None, :] < headdim)
    else:
        if EVEN_HEADDIM:
            tl.store(dv_ptrs, dv, mask=offs_n[:, None] < seqlen_k)
            tl.store(dk_ptrs, dk, mask=offs_n[:, None] < seqlen_k)
        else:
            tl.store(dv_ptrs, dv, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim))
            tl.store(dk_ptrs, dk, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim))
255
+
256
# Backward kernel for the chunk-level RFA part of EVA attention: accumulates
# gradients d(RFA_K) and d(RFA_V) for one BLOCK_N slice of RFA chunks.
# Grid: axis 0 = chunk blocks, axis 1 = batch * heads. A chunk summary is only
# visible to queries in LATER windows, so the query loop starts at the end of
# the window this chunk block belongs to and runs to seqlen_q.
# LSE and DO_T_O have the same meaning as in _bwd_eva_agg_kernel_dkdv
# (per-row log-sum-exp and the flash-attention-style per-row delta).
# MASK_TYPE == 1 supplies an explicit query-by-chunk boolean mask
# (nonzero entries are masked OUT).
@triton.heuristics(
    {
        "EVEN_M": lambda args: args["seqlen_q"] % args["BLOCK_M"] == 0,
        "EVEN_C": lambda args: args["nchunks"] % args["BLOCK_N"] == 0,
        "EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"],
    }
)
@triton.jit
def _bwd_eva_agg_kernel_drfa_kv(
    Q,
    RFA_K,
    RFA_V,
    ChunkMask,
    DO,
    LSE,
    DO_T_O,
    D_RFA_K,
    D_RFA_V,
    softmax_scale,
    stride_qb, stride_qh, stride_qm,
    stride_rfa_kb, stride_rfa_kh, stride_rfa_kc,
    stride_rfa_vb, stride_rfa_vh, stride_rfa_vc,
    stride_chunk_mask_b, stride_chunk_mask_m,
    stride_do_b, stride_do_h, stride_do_m,
    stride_lse_b, stride_lse_h,
    stride_do_t_o_b, stride_do_t_o_h,
    stride_d_rfa_k_b, stride_d_rfa_k_h, stride_d_rfa_k_c,
    stride_d_rfa_v_b, stride_d_rfa_v_h, stride_d_rfa_v_c,
    nheads,
    seqlen_q,
    nchunks,
    headdim,
    CHUNKS_PER_WINDOW: tl.constexpr,
    WINDOW_SIZE: tl.constexpr,
    MASK_TYPE: tl.constexpr,
    BLOCK_HEADDIM: tl.constexpr,
    EVEN_M: tl.constexpr,
    EVEN_C: tl.constexpr,
    EVEN_HEADDIM: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
):
    # decompose the flattened (batch * head) program id
    off_bh = tl.program_id(1)
    off_h = off_bh % nheads
    off_b = off_bh // nheads
    start_c = tl.program_id(0)
    # there are 128 chunks per window
    offs_c = start_c * BLOCK_N + tl.arange(0, BLOCK_N)
    # determine which window the current KV block belongs to
    offs_w = (start_c * BLOCK_N) // CHUNKS_PER_WINDOW
    offs_m = tl.arange(0, BLOCK_M)
    offs_d = tl.arange(0, BLOCK_HEADDIM)

    # initialize pointers (head-dim is contiguous: offs_d added without a stride)
    q_ptrs = (
        Q +
        off_b * stride_qb +
        off_h * stride_qh +
        (offs_m[:, None] * stride_qm + offs_d[None, :])
    )
    do_ptrs = (
        DO +
        off_b * stride_do_b +
        off_h * stride_do_h +
        (offs_m[:, None] * stride_do_m + offs_d[None, :])
    )
    # DO_T_O and LSE are one scalar per query row
    do_t_o_ptrs = (
        DO_T_O +
        off_b * stride_do_t_o_b +
        off_h * stride_do_t_o_h +
        (offs_m[:, None])
    )
    lse_ptrs = (
        LSE +
        off_b * stride_lse_b +
        off_h * stride_lse_h +
        (offs_m[:, None])
    )
    rfa_k_ptrs = (
        RFA_K +
        off_b * stride_rfa_kb +
        off_h * stride_rfa_kh +
        (offs_c[:, None] * stride_rfa_kc + offs_d[None, :])
    )
    rfa_v_ptrs = (
        RFA_V +
        off_b * stride_rfa_vb +
        off_h * stride_rfa_vh +
        (offs_c[:, None] * stride_rfa_vc + offs_d[None, :])
    )
    if MASK_TYPE == 1:
        # ChunkMask is shared across heads: no head stride
        rfa_m_ptrs = (
            ChunkMask +
            off_b * stride_chunk_mask_b +
            (offs_m[:, None] * stride_chunk_mask_m + offs_c[None, :])
        )
    d_rfa_k_ptrs = (
        D_RFA_K +
        off_b * stride_d_rfa_k_b +
        off_h * stride_d_rfa_k_h +
        (offs_c[:, None] * stride_d_rfa_k_c + offs_d[None, :])
    )
    d_rfa_v_ptrs = (
        D_RFA_V +
        off_b * stride_d_rfa_v_b +
        off_h * stride_d_rfa_v_h +
        (offs_c[:, None] * stride_d_rfa_v_c + offs_d[None, :])
    )

    # fp32 accumulators for the gradients of this RFA chunk tile
    d_rfa_k = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
    d_rfa_v = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
    # load the RFA keys/values once; re-used across all query blocks
    if EVEN_C & EVEN_M:
        if EVEN_HEADDIM:
            rfa_k = tl.load(rfa_k_ptrs)
            rfa_v = tl.load(rfa_v_ptrs)
        else:
            rfa_k = tl.load(rfa_k_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
            rfa_v = tl.load(rfa_v_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
    else:
        if EVEN_HEADDIM:
            rfa_k = tl.load(rfa_k_ptrs, mask=offs_c[:, None] < nchunks, other=0.0)
            rfa_v = tl.load(rfa_v_ptrs, mask=offs_c[:, None] < nchunks, other=0.0)
        else:
            rfa_k = tl.load(
                rfa_k_ptrs, mask=(offs_c[:, None] < nchunks) & (offs_d[None, :] < headdim), other=0.0
            )
            rfa_v = tl.load(
                rfa_v_ptrs, mask=(offs_c[:, None] < nchunks) & (offs_d[None, :] < headdim), other=0.0
            )
    # chunk summaries are attended to only by queries in strictly later windows
    begin_m = tl.minimum((offs_w + 1) * WINDOW_SIZE, seqlen_q)
    end_m = seqlen_q
    for start_m in range(begin_m, end_m, BLOCK_M):
        start_m = tl.multiple_of(start_m, BLOCK_M)
        # load q, do, and lse
        if EVEN_M:
            if EVEN_HEADDIM:
                q = tl.load(
                    q_ptrs + start_m * stride_qm
                )
                do = tl.load(
                    do_ptrs + start_m * stride_do_m
                )
            else:
                q = tl.load(
                    q_ptrs + start_m * stride_qm,
                    mask=offs_d[None, :] < headdim,
                    other=0.0
                )
                do = tl.load(
                    do_ptrs + start_m * stride_do_m,
                    mask=offs_d[None, :] < headdim,
                    other=0.0
                )
            do_t_o = tl.load(
                do_t_o_ptrs + start_m
            )
            lse = tl.load(
                lse_ptrs + start_m
            )
        else:
            if EVEN_HEADDIM:
                q = tl.load(
                    q_ptrs + start_m * stride_qm,
                    mask=(start_m + offs_m)[:, None] < seqlen_q,
                    other=0.0
                )
                do = tl.load(
                    do_ptrs + start_m * stride_do_m,
                    mask=(start_m + offs_m)[:, None] < seqlen_q,
                    other=0.0
                )
            else:
                q = tl.load(
                    q_ptrs + start_m * stride_qm,
                    mask=((start_m + offs_m)[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
                    other=0.0
                )
                do = tl.load(
                    do_ptrs + start_m * stride_do_m,
                    mask=((start_m + offs_m)[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
                    other=0.0
                )
            do_t_o = tl.load(
                do_t_o_ptrs + start_m,
                mask=(start_m + offs_m)[:, None] < seqlen_q,
                other=0.0
            )
            lse = tl.load(
                lse_ptrs + start_m,
                mask=(start_m + offs_m)[:, None] < seqlen_q,
                other=0.0
            )
        # rows fully masked in the forward pass have lse == -inf; zero them
        # so exp(qk - lse) stays finite (p comes out 0 there)
        lse = tl.where(lse == float("-inf"), 0.0, lse)
        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
        qk += tl.dot(q, tl.trans(rfa_k))
        if not EVEN_M:
            # out-of-range query rows contribute nothing
            qk += tl.where((start_m + offs_m)[:, None] < seqlen_q, 0, float("-inf"))

        if MASK_TYPE == 1:
            if EVEN_M & EVEN_C:
                mask = tl.load(
                    rfa_m_ptrs + (start_m * stride_chunk_mask_m)
                )
            else:
                mask = tl.load(
                    rfa_m_ptrs + (start_m * stride_chunk_mask_m),
                    mask=((start_m + offs_m)[:, None] < seqlen_q)
                    & (offs_c[None, :] < nchunks),
                    other=1,
                )
            # Slightly faster to multiply the softmax_scale in the tl.exp below since the compiler
            # can then fuse the mult and add into an fma instruction. But if we have bias we need to
            # to multiply with softmax_scale here.
            # we assume mask already implies the causal masking
            # (nonzero mask entries are masked OUT)
            qk = qk * softmax_scale
            qk = tl.where(mask, float("-inf"), qk)
            p = tl.exp(qk - lse)
        else:
            # no mask needed: the begin_m loop bound already enforces that only
            # later-window queries see this chunk block
            p = tl.exp(qk * softmax_scale - lse)

        dp = tl.dot(do, tl.trans(rfa_v))
        # softmax backward: dS = P * (dP - delta), folded with the scale
        ds = (p * (dp - do_t_o) * softmax_scale).to(q.dtype)
        # p is fp32, convert to q.dtype
        d_rfa_v += tl.dot(tl.trans(p.to(do.dtype)), do)
        # move softmax_scale to ds to save computation
        d_rfa_k += tl.dot(tl.trans(ds), q)
    # write back the accumulated gradients for this chunk tile
    if EVEN_C & EVEN_M:
        if EVEN_HEADDIM:
            tl.store(d_rfa_v_ptrs, d_rfa_v)
            tl.store(d_rfa_k_ptrs, d_rfa_k)
        else:
            tl.store(d_rfa_v_ptrs, d_rfa_v, mask=offs_d[None, :] < headdim)
            tl.store(d_rfa_k_ptrs, d_rfa_k, mask=offs_d[None, :] < headdim)
    else:
        if EVEN_HEADDIM:
            tl.store(d_rfa_v_ptrs, d_rfa_v, mask=offs_c[:, None] < nchunks)
            tl.store(d_rfa_k_ptrs, d_rfa_k, mask=offs_c[:, None] < nchunks)
        else:
            tl.store(d_rfa_v_ptrs, d_rfa_v, mask=(offs_c[:, None] < nchunks) & (offs_d[None, :] < headdim))
            tl.store(d_rfa_k_ptrs, d_rfa_k, mask=(offs_c[:, None] < nchunks) & (offs_d[None, :] < headdim))
496
+
497
# Backward kernel computing dQ for EVA aggregated attention.
# Launch grid: (cdiv(seqlen_q, BLOCK_M), batch * nheads) — one program per
# BLOCK_M rows of queries for one (batch, head) pair.
@triton.heuristics(
    {
        # EVEN_* flags let the kernel skip bounds masking when a dimension
        # divides evenly into its tile size.
        "EVEN_M": lambda args: args["seqlen_q"] % args["BLOCK_M"] == 0,
        "EVEN_N": lambda args: args["seqlen_k"] % args["BLOCK_N"] == 0,
        "EVEN_C": lambda args: args["nchunks"] % args["BLOCK_N"] == 0,
        "EVEN_W": lambda args: args["WINDOW_SIZE"] % args["BLOCK_N"] == 0,
        "EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"],
    }
)
@triton.jit
def _bwd_eva_agg_kernel_dq(
    Q,
    K,
    V,
    RFA_K,
    RFA_V,
    WindowMask,
    ChunkMask,
    DO,
    LSE,
    DO_T_O,
    DQ,
    softmax_scale,
    stride_qb, stride_qh, stride_qm,
    stride_kb, stride_kh, stride_kn,
    stride_vb, stride_vh, stride_vn,
    stride_rfa_kb, stride_rfa_kh, stride_rfa_kc,
    stride_rfa_vb, stride_rfa_vh, stride_rfa_vc,
    stride_window_mask_b, stride_window_mask_m,
    stride_chunk_mask_b, stride_chunk_mask_m,
    stride_do_b, stride_do_h, stride_do_m,
    stride_lse_b, stride_lse_h,
    stride_do_t_o_b, stride_do_t_o_h,
    stride_dq_b, stride_dq_h, stride_dq_m,
    nheads,
    seqlen_q,
    seqlen_k,
    nchunks,
    headdim,
    CHUNKS_PER_WINDOW: tl.constexpr,
    WINDOW_SIZE: tl.constexpr,
    MASK_TYPE: tl.constexpr,
    EMPTY_RFA_KV: tl.constexpr,
    BLOCK_HEADDIM: tl.constexpr,
    EVEN_M: tl.constexpr,
    EVEN_N: tl.constexpr,
    EVEN_W: tl.constexpr,
    EVEN_C: tl.constexpr,
    EVEN_HEADDIM: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
):
    # Program indices: axis 0 tiles the query rows, axis 1 enumerates
    # flattened (batch, head) pairs.
    start_m = tl.program_id(0)
    off_bh = tl.program_id(1)
    off_h = off_bh % nheads
    off_b = off_bh // nheads
    # initialize offsets
    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
    # offs_w: index of the local-attention window this query tile lives in.
    offs_w = (start_m * BLOCK_M) // WINDOW_SIZE
    offs_n = tl.arange(0, BLOCK_N)
    # offs_c indexes RFA chunks; it equals offs_n element-wise (same arange).
    offs_c = tl.arange(0, BLOCK_N)
    offs_d = tl.arange(0, BLOCK_HEADDIM)
    # TODO: add paratheses or not
    q_ptrs = (
        Q +
        off_b * stride_qb +
        off_h * stride_qh +
        (offs_m[:, None] * stride_qm + offs_d[None, :])
    )
    k_ptrs = (
        K +
        off_b * stride_kb +
        off_h * stride_kh +
        (offs_n[:, None] * stride_kn + offs_d[None, :])
    )
    v_ptrs = (
        V +
        off_b * stride_vb +
        off_h * stride_vh +
        (offs_n[:, None] * stride_vn + offs_d[None, :])
    )
    if EMPTY_RFA_KV == 0:
        rfa_k_ptrs = (
            RFA_K +
            off_b * stride_rfa_kb +
            off_h * stride_rfa_kh +
            (offs_c[:, None] * stride_rfa_kc + offs_d[None, :])
        )
        rfa_v_ptrs = (
            RFA_V +
            off_b * stride_rfa_vb +
            off_h * stride_rfa_vh +
            (offs_c[:, None] * stride_rfa_vc + offs_d[None, :])
        )
    # NOTE(review): this dq_ptrs is never read before it is recomputed from
    # scratch just before the final store below; the early computation is dead.
    dq_ptrs = (
        DQ +
        off_b * stride_dq_b +
        off_h * stride_dq_h +
        (offs_m[:, None] * stride_dq_m + offs_d[None, :])
    )
    do_ptrs = (
        DO +
        off_b * stride_do_b +
        off_h * stride_do_h +
        (offs_m[:, None] * stride_do_m + offs_d[None, :])
    )
    # DO_T_O holds rowwise sum(dO * O); loaded as a column so it broadcasts
    # against [BLOCK_M, BLOCK_N] tiles.
    do_t_o_ptrs = (
        DO_T_O +
        off_b * stride_do_t_o_b +
        off_h * stride_do_t_o_h +
        offs_m[:, None]
    )
    lse_ptrs = (
        LSE +
        off_b * stride_lse_b +
        off_h * stride_lse_h +
        offs_m[:, None]
    )
    ### load q, do, do_t_o, lse ####
    if EVEN_M:
        if EVEN_HEADDIM:
            q = tl.load(
                q_ptrs
            )
            do = tl.load(
                do_ptrs
            )
        else:
            q = tl.load(
                q_ptrs,
                mask=offs_d[None, :] < headdim,
                other=0.0
            )
            do = tl.load(
                do_ptrs,
                mask=offs_d[None, :] < headdim,
                other=0.0
            )
        do_t_o = tl.load(
            do_t_o_ptrs
        )
        lse = tl.load(
            lse_ptrs
        )
    else:
        if EVEN_HEADDIM:
            q = tl.load(
                q_ptrs,
                mask=offs_m[:, None] < seqlen_q,
                other=0.0
            )
            do = tl.load(
                do_ptrs,
                mask=offs_m[:, None] < seqlen_q,
                other=0.0
            )
        else:
            q = tl.load(
                q_ptrs,
                mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
                other=0.0
            )
            do = tl.load(
                do_ptrs,
                mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
                other=0.0
            )
        do_t_o = tl.load(
            do_t_o_ptrs,
            mask=offs_m[:, None] < seqlen_q,
            other=0.0
        )
        lse = tl.load(
            lse_ptrs,
            mask=offs_m[:, None] < seqlen_q,
            other=0.0
        )
    # Fully-masked rows store lse == -inf in the forward pass; neutralize
    # them so exp2 below yields finite values.
    lse = tl.where(lse == float("-inf"), 0.0, lse)
    # LSE was stored in natural-log units by the forward kernel; convert it
    # (and the scale) to base-2 so we can use the faster exp2.
    lse *= 1.4426950408889634  # log2(e)
    qk_scale = softmax_scale
    qk_scale *= 1.4426950408889634  # log2(e)
    if MASK_TYPE == 1:
        window_mask_ptrs = (
            WindowMask +
            off_b * stride_window_mask_b +
            (offs_m[:, None] * stride_window_mask_m + offs_n[None, :])
        )
        if EMPTY_RFA_KV == 0:
            chunk_mask_ptrs = (
                ChunkMask +
                off_b * stride_chunk_mask_b +
                (offs_m[:, None] * stride_chunk_mask_m + offs_c[None, :])
            )

    dq = tl.zeros([BLOCK_M, BLOCK_HEADDIM], dtype=tl.float32)
    # loop over k, v and update accumulator
    # Iterate over local singletons;
    # so we only iterate over blocks within the current window
    start_idx_n = offs_w * WINDOW_SIZE
    end_idx_n = tl.minimum((start_m + 1) * BLOCK_M, seqlen_k)
    for start_n in range(start_idx_n, end_idx_n, BLOCK_N):
        start_n = tl.multiple_of(start_n, BLOCK_N)
        if EVEN_N & EVEN_M:
            if EVEN_HEADDIM:
                k = tl.load(
                    k_ptrs + start_n * stride_kn
                )
            else:
                k = tl.load(
                    k_ptrs + start_n * stride_kn,
                    mask=offs_d[None, :] < headdim,
                    other=0.0
                )
        else:
            if EVEN_HEADDIM:
                k = tl.load(
                    k_ptrs + start_n * stride_kn,
                    mask=(start_n + offs_n)[:, None] < seqlen_k,
                    other=0.0,
                )
            else:
                k = tl.load(
                    k_ptrs + start_n * stride_kn,
                    mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim),
                    other=0.0,
                )
        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
        qk += tl.dot(q, tl.trans(k))
        # Trying to combine the two masks seem to make the result wrong
        if not EVEN_N:  # Need to mask out otherwise the softmax is wrong
            qk += tl.where((start_n + offs_n)[None, :] < seqlen_k, 0, float("-inf"))

        if MASK_TYPE == 1:
            # WindowMask is boolean with True == "masked out" (see tl.where
            # below); out-of-bounds positions load as 1, i.e. masked.
            if EVEN_M & EVEN_W:
                window_mask = tl.load(
                    window_mask_ptrs + start_n - start_idx_n
                )
            else:
                window_mask = tl.load(
                    window_mask_ptrs + start_n - start_idx_n,
                    mask=(offs_m[:, None] < seqlen_q)
                    & ((start_n - start_idx_n + offs_n)[None, :] < WINDOW_SIZE),
                    other=1,
                )
            # Slightly faster to multiply the softmax_scale in the tl.exp below since the compiler
            # can then fuse the mult and add into an fma instruction. But if we have bias we need to
            # to multiply with softmax_scale here.
            # we assume mask already implies the causal masking
            qk = qk * qk_scale
            qk = tl.where(window_mask, float("-inf"), qk)
            # Recompute softmax probabilities from the saved LSE (base-2 domain).
            p = tl.exp2(qk - lse)
        else:
            # No explicit mask: apply causal masking directly.
            qk += tl.where(offs_m[:, None] >= (start_n + offs_n)[None, :], 0, float("-inf"))
            p = tl.exp2(qk * qk_scale - lse)

        if EVEN_N & EVEN_M:
            if EVEN_HEADDIM:
                v = tl.load(
                    v_ptrs + start_n * stride_vn
                )
            else:
                v = tl.load(
                    v_ptrs + start_n * stride_vn,
                    mask=offs_d[None, :] < headdim,
                    other=0.0
                )
        else:
            if EVEN_HEADDIM:
                v = tl.load(
                    v_ptrs + start_n * stride_vn,
                    mask=(start_n + offs_n)[:, None] < seqlen_k,
                    other=0.0,
                )
            else:
                v = tl.load(
                    v_ptrs + start_n * stride_vn,
                    mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim),
                    other=0.0,
                )
        # dS = P * (dP - rowsum(dO * O)); accumulate dQ += dS @ K.
        dp = tl.dot(do, tl.trans(v))
        ds = (p * (dp - do_t_o) * softmax_scale).to(q.dtype)
        dq += tl.dot(ds, k)

    if EMPTY_RFA_KV == 0:
        # Iterate over RFA chunks
        # we only iterate over chunks before the current local singleton window
        end_idx_c = tl.minimum(offs_w * CHUNKS_PER_WINDOW, nchunks)
        for start_c in range(0, end_idx_c, BLOCK_N):
            start_c = tl.multiple_of(start_c, BLOCK_N)
            # -- compute qk ----
            if EVEN_C & EVEN_M:
                if EVEN_HEADDIM:
                    rfa_k = tl.load(
                        rfa_k_ptrs + start_c * stride_rfa_kc
                    )
                else:
                    rfa_k = tl.load(
                        rfa_k_ptrs + start_c * stride_rfa_kc,
                        mask=offs_d[None, :] < headdim,
                        other=0.0
                    )
            else:
                if EVEN_HEADDIM:
                    rfa_k = tl.load(
                        rfa_k_ptrs + start_c * stride_rfa_kc,
                        mask=(start_c + offs_c)[:, None] < nchunks,
                        other=0.0,
                    )
                else:
                    rfa_k = tl.load(
                        rfa_k_ptrs + start_c * stride_rfa_kc,
                        mask=((start_c + offs_c)[:, None] < nchunks) & (offs_d[None, :] < headdim),
                        other=0.0,
                    )
            qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
            qk += tl.dot(q, tl.trans(rfa_k))
            # Trying to combine the two masks seem to make the result wrong
            if not EVEN_C:  # Need to mask out otherwise the softmax is wrong
                qk += tl.where((start_c + offs_c)[None, :] < nchunks, 0, float("-inf"))

            if MASK_TYPE == 1:
                if EVEN_C & EVEN_M:
                    chunk_mask = tl.load(
                        chunk_mask_ptrs + start_c
                    )
                else:
                    chunk_mask = tl.load(
                        chunk_mask_ptrs + start_c,
                        mask=(offs_m[:, None] < seqlen_q) & ((start_c + offs_c)[None, :] < nchunks),
                        other=1,
                    )
                # Slightly faster to multiply the softmax_scale in the tl.exp below since the compiler
                # can then fuse the mult and add into an fma instruction. But if we have bias we need to
                # to multiply with softmax_scale here.
                # we assume mask already implies the causal masking
                qk = qk * qk_scale
                qk = tl.where(chunk_mask, float("-inf"), qk)
                p = tl.exp2(qk - lse)
            else:
                # No causal term here: the loop bound already restricts us to
                # chunks strictly before the current window.
                p = tl.exp2(qk * qk_scale - lse)

            if EVEN_C & EVEN_M:
                if EVEN_HEADDIM:
                    rfa_v = tl.load(
                        rfa_v_ptrs + start_c * stride_rfa_vc
                    )
                else:
                    rfa_v = tl.load(
                        rfa_v_ptrs + start_c * stride_rfa_vc,
                        mask=offs_d[None, :] < headdim,
                        other=0.0
                    )
            else:
                if EVEN_HEADDIM:
                    # NOTE(review): offs_n is used here instead of offs_c; the
                    # two are the same arange, so this is equivalent.
                    rfa_v = tl.load(
                        rfa_v_ptrs + start_c * stride_rfa_vc,
                        mask=(start_c + offs_n)[:, None] < nchunks,
                        other=0.0,
                    )
                else:
                    rfa_v = tl.load(
                        rfa_v_ptrs + start_c * stride_rfa_vc,
                        mask=((start_c + offs_n)[:, None] < nchunks) & (offs_d[None, :] < headdim),
                        other=0.0,
                    )
            dp = tl.dot(do, tl.trans(rfa_v))
            ds = (p * (dp - do_t_o) * softmax_scale).to(q.dtype)
            dq += tl.dot(ds, rfa_k)

    # Rematerialize offsets/pointers before the store (register-pressure trick).
    start_m = tl.program_id(0)
    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_d = tl.arange(0, BLOCK_HEADDIM)
    dq_ptrs = (
        DQ +
        off_b * stride_dq_b +
        off_h * stride_dq_h +
        (offs_m[:, None] * stride_dq_m + offs_d[None, :])
    )
    if EVEN_M:
        if EVEN_HEADDIM:
            tl.store(
                dq_ptrs, dq
            )
        else:
            tl.store(
                dq_ptrs, dq,
                mask=offs_d[None, :] < headdim
            )
    else:
        if EVEN_HEADDIM:
            tl.store(
                dq_ptrs, dq,
                mask=offs_m[:, None] < seqlen_q
            )
        else:
            tl.store(
                dq_ptrs, dq,
                mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim)
            )
896
+
897
# Kernel launch configurations per GPU compute capability.
# Keyed first by kernel mode ("fwd", "bwd_dq", "bwd_dkdv", "bwd_drfa_kv"),
# then by (dtype, head_dim). Each value is a tuple
#   (BLOCK_M, BLOCK_N, num_warps, num_stages)
# as unpacked by _get_config() callers.

# Hopper (SM 9.0+) tuning table.
_capability_90_config = {
    "fwd": {
        (torch.bfloat16, 64): (128, 128, 4, 3),
        (torch.bfloat16, 128): (128, 128, 8, 3),
        (torch.float32, 64): (128, 64, 8, 3),
        (torch.float32, 128): (64, 32, 4, 3),
    },
    "bwd_dq": {
        (torch.bfloat16, 64): (128, 64, 4, 3),
        (torch.bfloat16, 128): (128, 64, 8, 3),
        (torch.float32, 64): (128, 64, 8, 2),
        (torch.float32, 128): (32, 32, 4, 2),
    },
    "bwd_dkdv": {
        (torch.bfloat16, 64): (128, 64, 4, 2),
        (torch.bfloat16, 128): (128, 64, 8, 2),
        (torch.float32, 64): (128, 64, 8, 2),
        (torch.float32, 128): (32, 32, 4, 1),
    },
    "bwd_drfa_kv": {
        (torch.bfloat16, 64): (128, 64, 4, 2),
        (torch.bfloat16, 128): (128, 64, 8, 2),
        (torch.float32, 64): (128, 64, 8, 2),
        (torch.float32, 128): (32, 32, 4, 1),
    }
}

# Ampere (SM 8.0+) tuning table; smaller tiles than Hopper.
_capability_80_config = {
    "fwd": {
        (torch.bfloat16, 64): (64, 64, 4, 3),
        (torch.bfloat16, 128): (64, 64, 8, 3),
        (torch.float32, 64): (64, 32, 4, 2),
        (torch.float32, 128): (64, 32, 8, 1),
    },
    "bwd_dq": {
        (torch.bfloat16, 64): (64, 64, 4, 3),
        (torch.bfloat16, 128): (64, 32, 4, 2),
        (torch.float32, 64): (32, 32, 4, 2),
        (torch.float32, 128): (32, 32, 4, 2),
    },
    "bwd_dkdv": {
        (torch.bfloat16, 64): (64, 64, 4, 3),
        (torch.bfloat16, 128): (32, 32, 4, 2),
        (torch.float32, 64): (32, 32, 4, 1),
        (torch.float32, 128): (16, 64, 8, 1),
    },
    "bwd_drfa_kv": {
        (torch.bfloat16, 64): (64, 64, 4, 3),
        (torch.bfloat16, 128): (64, 32, 4, 3),
        (torch.float32, 64): (32, 32, 4, 1),
        (torch.float32, 128): (32, 32, 4, 1),
    }
}
950
+
951
def _get_config(dtype, head_dim, mode) -> tuple[int, int, int, int]:
    """Select a kernel launch configuration for the current GPU.

    Returns ``(BLOCK_M, BLOCK_N, num_warps, num_stages)`` for the given
    tensor ``dtype``, attention ``head_dim`` and kernel ``mode`` (one of
    ``"fwd"``, ``"bwd_dq"``, ``"bwd_dkdv"``, ``"bwd_drfa_kv"``), looked up
    from the capability-specific tuning tables with conservative fallbacks.
    """
    cap = torch.cuda.get_device_capability()
    if cap >= (9, 0):
        # Hopper and newer.
        return _capability_90_config[mode].get((dtype, head_dim), (32, 32, 4, 1))
    if cap >= (8, 0):
        # Ampere.
        return _capability_80_config[mode].get((dtype, head_dim), (16, 16, 4, 1))
    # Pre-Ampere: no tuning tables, use fixed conservative configs.
    if mode == "fwd":
        return (32, 16, 4, 2) if dtype == torch.float32 else (64, 32, 4, 2)
    return (16, 16, 4, 1) if dtype == torch.float32 else (32, 32, 4, 1)
969
+
970
# Forward kernel for EVA aggregated attention: each program computes one
# BLOCK_M tile of output rows for one (batch, head) pair, attending over
# (a) the local singleton keys/values inside the query tile's window and
# (b) the RFA chunk summaries preceding that window, using an online
# (streaming) softmax. Launch grid: (cdiv(seqlen_q, BLOCK_M), batch * nheads).
@triton.heuristics(
    {
        # EVEN_* flags let the kernel skip bounds masking when a dimension
        # divides evenly into its tile size.
        "EVEN_M": lambda args: args["seqlen_q"] % args["BLOCK_M"] == 0,
        "EVEN_N": lambda args: args["seqlen_k"] % args["BLOCK_N"] == 0,
        "EVEN_C": lambda args: args["nchunks"] % args["BLOCK_N"] == 0,
        "EVEN_W": lambda args: args["WINDOW_SIZE"] % args["BLOCK_N"] == 0,
        "EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"],
    }
)
@triton.jit
def _fwd_eva_agg_kernel(
    Q,
    K,
    V,
    RFA_K,
    RFA_V,
    WindowMask,
    ChunkMask,
    Out,
    LSE,
    softmax_scale,
    stride_qb, stride_qh, stride_qm,
    stride_kb, stride_kh, stride_kn,
    stride_vb, stride_vh, stride_vn,
    stride_rfa_kb, stride_rfa_kh, stride_rfa_kc,
    stride_rfa_vb, stride_rfa_vh, stride_rfa_vc,
    stride_window_mask_b, stride_window_mask_m,
    stride_chunk_mask_b, stride_chunk_mask_m,
    stride_ob, stride_oh, stride_om,
    stride_lse_b, stride_lse_h,
    nheads,
    seqlen_q,
    seqlen_k,
    nchunks,
    headdim,
    CHUNKS_PER_WINDOW: tl.constexpr,
    WINDOW_SIZE: tl.constexpr,
    MASK_TYPE: tl.constexpr,
    EMPTY_RFA_KV: tl.constexpr,
    BLOCK_HEADDIM: tl.constexpr,
    EVEN_M: tl.constexpr,
    EVEN_N: tl.constexpr,
    EVEN_W: tl.constexpr,
    EVEN_C: tl.constexpr,
    EVEN_HEADDIM: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
):
    # Program indices: axis 0 tiles the query rows, axis 1 enumerates
    # flattened (batch, head) pairs.
    start_m = tl.program_id(0)
    off_bh = tl.program_id(1)
    off_h = off_bh % nheads
    off_b = off_bh // nheads
    # initialize offsets
    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
    # offs_w: index of the local-attention window this query tile lives in.
    offs_w = (start_m * BLOCK_M) // WINDOW_SIZE
    offs_n = tl.arange(0, BLOCK_N)
    # offs_c indexes RFA chunks; it equals offs_n element-wise (same arange).
    offs_c = tl.arange(0, BLOCK_N)
    offs_d = tl.arange(0, BLOCK_HEADDIM)
    # TODO: add paratheses or not
    q_ptrs = (
        Q +
        off_b * stride_qb +
        off_h * stride_qh +
        (offs_m[:, None] * stride_qm + offs_d[None, :])
    )
    k_ptrs = (
        K +
        off_b * stride_kb +
        off_h * stride_kh +
        (offs_n[:, None] * stride_kn + offs_d[None, :])
    )
    v_ptrs = (
        V +
        off_b * stride_vb +
        off_h * stride_vh +
        (offs_n[:, None] * stride_vn + offs_d[None, :])
    )
    if EMPTY_RFA_KV == 0:
        rfa_k_ptrs = (
            RFA_K +
            off_b * stride_rfa_kb +
            off_h * stride_rfa_kh +
            (offs_c[:, None] * stride_rfa_kc + offs_d[None, :])
        )
        rfa_v_ptrs = (
            RFA_V +
            off_b * stride_rfa_vb +
            off_h * stride_rfa_vh +
            (offs_c[:, None] * stride_rfa_vc + offs_d[None, :])
        )

    # Work in base-2 exponent domain (exp2 is faster than exp on GPU).
    qk_scale = softmax_scale
    qk_scale *= 1.4426950408889634  # log2(e)
    if MASK_TYPE == 1:
        window_mask_ptrs = (
            WindowMask +
            off_b * stride_window_mask_b +
            (offs_m[:, None] * stride_window_mask_m + offs_n[None, :])
        )
        if EMPTY_RFA_KV == 0:
            chunk_mask_ptrs = (
                ChunkMask +
                off_b * stride_chunk_mask_b +
                (offs_m[:, None] * stride_chunk_mask_m + offs_c[None, :])
            )

    # Online-softmax state: m_i = running row max (base-2 domain),
    # d_i = running denominator, acc_o = unnormalized output accumulator.
    m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
    d_i = tl.zeros([BLOCK_M], dtype=tl.float32)
    acc_o = tl.zeros([BLOCK_M, BLOCK_HEADDIM], dtype=tl.float32)
    # load q: it will stay in SRAM throughout
    # [2022-10-30] TD: Triton bug - in the case of EVEN_M=True and EVEN_N=False, if we just call
    # tl.load(q_ptrs), we get the wrong output!
    if EVEN_M & EVEN_N:
        if EVEN_HEADDIM:
            q = tl.load(
                q_ptrs
            )
        else:
            q = tl.load(
                q_ptrs,
                mask=offs_d[None, :] < headdim,
                other=0.0
            )
    else:
        if EVEN_HEADDIM:
            q = tl.load(
                q_ptrs,
                mask=offs_m[:, None] < seqlen_q,
                other=0.0
            )
        else:
            q = tl.load(
                q_ptrs,
                mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
                other=0.0
            )
    # loop over k, v and update accumulator
    # Iterate over local singletons;
    # so we only iterate over blocks within the current window
    start_idx_n = offs_w * WINDOW_SIZE
    end_idx_n = tl.minimum((start_m + 1) * BLOCK_M, seqlen_k)
    for start_n in range(start_idx_n, end_idx_n, BLOCK_N):
        start_n = tl.multiple_of(start_n, BLOCK_N)
        # -- compute qk ----
        if EVEN_N & EVEN_M:
            if EVEN_HEADDIM:
                k = tl.load(
                    k_ptrs + start_n * stride_kn
                )
            else:
                k = tl.load(
                    k_ptrs + start_n * stride_kn,
                    mask=offs_d[None, :] < headdim,
                    other=0.0
                )
        else:
            if EVEN_HEADDIM:
                k = tl.load(
                    k_ptrs + start_n * stride_kn,
                    mask=(start_n + offs_n)[:, None] < seqlen_k,
                    other=0.0,
                )
            else:
                k = tl.load(
                    k_ptrs + start_n * stride_kn,
                    mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim),
                    other=0.0,
                )
        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
        qk += tl.dot(q, tl.trans(k))
        # Trying to combine the two masks seem to make the result wrong
        if not EVEN_N:  # Need to mask out otherwise the softmax is wrong
            qk += tl.where((start_n + offs_n)[None, :] < seqlen_k, 0, float("-inf"))

        if MASK_TYPE == 1:
            # WindowMask is boolean with True == "masked out" (see tl.where
            # below); out-of-bounds positions load as 1, i.e. masked.
            if EVEN_M & EVEN_W:
                window_mask = tl.load(
                    window_mask_ptrs + start_n - start_idx_n
                )
            else:
                window_mask = tl.load(
                    window_mask_ptrs + start_n - start_idx_n,
                    mask=(offs_m[:, None] < seqlen_q)
                    & ((start_n - start_idx_n + offs_n)[None, :] < WINDOW_SIZE),
                    other=1,
                )
            # Slightly faster to multiply the softmax_scale in the tl.exp below since the compiler
            # can then fuse the mult and add into an fma instruction. But if we have bias we need to
            # to multiply with softmax_scale here.
            # we assume mask already implies the causal masking
            qk = qk * qk_scale
            qk = tl.where(window_mask, float("-inf"), qk)
            m_ij = tl.maximum(tl.max(qk, 1), m_i)
            # Rows that are entirely masked would give exp2(-inf - (-inf));
            # clamp their max to 0 so p stays finite (and sums to 0).
            masked_out_rows = (m_ij == float("-inf"))
            m_ij_masked = tl.where(masked_out_rows, 0, m_ij)
            p = tl.exp2(qk - m_ij_masked[:, None])
        else:
            # No explicit mask: apply causal masking directly.
            qk += tl.where(offs_m[:, None] >= (start_n + offs_n)[None, :], 0, float("-inf"))
            m_ij = tl.maximum(tl.max(qk, 1) * qk_scale, m_i)
            masked_out_rows = (m_ij == float("-inf"))
            m_ij_masked = tl.where(masked_out_rows, 0, m_ij)
            p = tl.exp2(qk * qk_scale - m_ij_masked[:, None])

        d_ij = tl.sum(p, 1)

        # scale acc_o
        prev_scale = tl.exp2(m_i - m_ij_masked)
        # # -- update output accumulator --
        acc_o = acc_o * prev_scale[:, None]
        # update acc_o
        if EVEN_N & EVEN_M:  # If we just do "if EVEN_N", there seems to be some race condition
            if EVEN_HEADDIM:
                v = tl.load(
                    v_ptrs + start_n * stride_vn
                )
            else:
                v = tl.load(
                    v_ptrs + start_n * stride_vn,
                    mask=offs_d[None, :] < headdim,
                    other=0.0
                )
        else:
            if EVEN_HEADDIM:
                v = tl.load(
                    v_ptrs + start_n * stride_vn,
                    mask=(start_n + offs_n)[:, None] < seqlen_k,
                    other=0.0,
                )
            else:
                v = tl.load(
                    v_ptrs + start_n * stride_vn,
                    mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim),
                    other=0.0,
                )
        p = p.to(v.dtype)
        acc_o = tl.dot(p, v, acc_o)

        # -- update statistics
        d_i = d_i * prev_scale + d_ij
        m_i = m_ij

    if EMPTY_RFA_KV == 0:
        # Iterate over RFA chunks
        # we only iterate over chunks before the current local singleton window
        end_idx_c = tl.minimum(offs_w * CHUNKS_PER_WINDOW, nchunks)
        for start_c in range(0, end_idx_c, BLOCK_N):
            start_c = tl.multiple_of(start_c, BLOCK_N)
            # -- compute qk ----
            if EVEN_C & EVEN_M:
                if EVEN_HEADDIM:
                    rfa_k = tl.load(
                        rfa_k_ptrs + start_c * stride_rfa_kc
                    )
                else:
                    rfa_k = tl.load(
                        rfa_k_ptrs + start_c * stride_rfa_kc,
                        mask=offs_d[None, :] < headdim,
                        other=0.0
                    )
            else:
                if EVEN_HEADDIM:
                    rfa_k = tl.load(
                        rfa_k_ptrs + start_c * stride_rfa_kc,
                        mask=(start_c + offs_c)[:, None] < nchunks,
                        other=0.0,
                    )
                else:
                    rfa_k = tl.load(
                        rfa_k_ptrs + start_c * stride_rfa_kc,
                        mask=((start_c + offs_c)[:, None] < nchunks) & (offs_d[None, :] < headdim),
                        other=0.0,
                    )
            qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
            qk += tl.dot(q, tl.trans(rfa_k))
            # Trying to combine the two masks seem to make the result wrong
            if not EVEN_C:  # Need to mask out otherwise the softmax is wrong
                qk += tl.where((start_c + offs_c)[None, :] < nchunks, 0, float("-inf"))

            if MASK_TYPE == 1:
                if EVEN_C & EVEN_M:
                    chunk_mask = tl.load(
                        chunk_mask_ptrs + start_c
                    )
                else:
                    chunk_mask = tl.load(
                        chunk_mask_ptrs + start_c,
                        mask=(offs_m[:, None] < seqlen_q) & ((start_c + offs_c)[None, :] < nchunks),
                        other=1,
                    )
                # Slightly faster to multiply the softmax_scale in the tl.exp below since the compiler
                # can then fuse the mult and add into an fma instruction. But if we have bias we need to
                # to multiply with softmax_scale here.
                # we assume mask already implies the causal masking
                qk = qk * qk_scale
                qk = tl.where(chunk_mask, float("-inf"), qk)
                m_ij = tl.maximum(tl.max(qk, 1), m_i)
                masked_out_rows = (m_ij == float("-inf"))
                m_ij_masked = tl.where(masked_out_rows, 0, m_ij)
                p = tl.exp2(qk - m_ij_masked[:, None])
            else:
                m_ij = tl.maximum(tl.max(qk, 1) * qk_scale, m_i)
                masked_out_rows = (m_ij == float("-inf"))
                m_ij_masked = tl.where(masked_out_rows, 0, m_ij)
                p = tl.exp2(qk * qk_scale - m_ij_masked[:, None])

            d_ij = tl.sum(p, 1)

            # scale acc_o
            prev_scale = tl.exp2(m_i - m_ij_masked)
            # # -- update output accumulator --
            acc_o = acc_o * prev_scale[:, None]
            # update acc_o
            # TODO: If we just do "if EVEN_N", there seems to be some race condition ?
            if EVEN_C & EVEN_M:
                if EVEN_HEADDIM:
                    rfa_v = tl.load(
                        rfa_v_ptrs + start_c * stride_rfa_vc
                    )
                else:
                    rfa_v = tl.load(
                        rfa_v_ptrs + start_c * stride_rfa_vc,
                        mask=offs_d[None, :] < headdim,
                        other=0.0
                    )
            else:
                if EVEN_HEADDIM:
                    # NOTE(review): offs_n is used here instead of offs_c; the
                    # two are the same arange, so this is equivalent.
                    rfa_v = tl.load(
                        rfa_v_ptrs + start_c * stride_rfa_vc,
                        mask=(start_c + offs_n)[:, None] < nchunks,
                        other=0.0,
                    )
                else:
                    rfa_v = tl.load(
                        rfa_v_ptrs + start_c * stride_rfa_vc,
                        mask=((start_c + offs_n)[:, None] < nchunks) & (offs_d[None, :] < headdim),
                        other=0.0,
                    )
            p = p.to(rfa_v.dtype)
            acc_o = tl.dot(p, rfa_v, acc_o)

            # -- update statistics
            d_i = d_i * prev_scale + d_ij
            m_i = m_ij

    # for rows that are all -inf, set d_i to 1.0
    d_i = tl.where(d_i == 0.0, 1.0, d_i)
    # multiply by log(2): converts the base-2 logsumexp back to natural-log
    # units before storing (the backward kernels convert it back with log2(e)).
    lse_m = (m_i + tl.math.log2(d_i)) * 0.6931471805599453
    acc_o = acc_o / d_i[:, None]
    # TODO: understand why rematerialize offsets to save registers?
    start_m = tl.program_id(0)
    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_d = tl.arange(0, BLOCK_HEADDIM)
    out_ptrs = (
        Out +
        off_b * stride_ob +
        off_h * stride_oh +
        (offs_m[:, None] * stride_om + offs_d[None, :])
    )
    if EVEN_M:
        if EVEN_HEADDIM:
            tl.store(
                out_ptrs, acc_o
            )
        else:
            tl.store(
                out_ptrs, acc_o,
                mask=offs_d[None, :] < headdim
            )
    else:
        if EVEN_HEADDIM:
            tl.store(
                out_ptrs, acc_o,
                mask=offs_m[:, None] < seqlen_q
            )
        else:
            tl.store(
                out_ptrs, acc_o,
                mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim)
            )
    lse_ptrs = (
        LSE +
        off_b * stride_lse_b +
        off_h * stride_lse_h +
        offs_m
    )
    if EVEN_M:
        tl.store(
            lse_ptrs, lse_m,
        )
    else:
        tl.store(
            lse_ptrs, lse_m,
            mask=offs_m < seqlen_q
        )
1365
+
1366
def triton_eva_agg_fwd(
    q, k, v, rfa_k, rfa_v,
    window_mask,
    chunk_mask,
    softmax_scale,
    window_size,
    chunks_per_window
):
    """Launch the EVA aggregated-attention forward Triton kernel.

    Args:
        q, k, v: attention tensors of shape (batch, nheads, seqlen, head_dim).
        rfa_k, rfa_v: per-chunk RFA summaries of shape
            (batch, nheads, nchunks, head_dim), or both None when there are
            no chunk-level keys/values.
        window_mask: optional bool mask of shape (batch, 1, seqlen_q,
            window_size); True means "masked out". When given, chunk_mask is
            required as well.
        chunk_mask: bool mask of shape (batch, 1, seqlen_q, nchunks),
            True means "masked out".
        softmax_scale: attention logit scale; defaults to
            1/sqrt(head_dim) when None.
        window_size: length of the local attention window.
        chunks_per_window: number of RFA chunks per window; must be a
            positive multiple of the kernel's BLOCK_N.

    Returns:
        (o, lse): the attention output (same shape as q) and the per-row
        logsumexp of shape (batch, nheads, seqlen_q) in float32.
    """
    if rfa_k is None and rfa_v is None:
        empty_rfa_kv = 1

        # Kernels require unit stride in the head dimension.
        q, k, v = [
            x if x.stride(-1) == 1 else x.contiguous()
            for x in [q, k, v]
        ]
    else:
        assert rfa_k is not None and rfa_v is not None, "Both rfa_k and rfa_v must either be None or have values at the same time."
        empty_rfa_kv = 0

        q, k, v, rfa_k, rfa_v = [
            x if x.stride(-1) == 1 else x.contiguous()
            for x in [q, k, v, rfa_k, rfa_v]
        ]

    # shape constraints
    batch, nheads, seqlen_q, head_dim = q.shape
    _, _, seqlen_k, _ = k.shape
    if empty_rfa_kv == 0:
        nchunks = rfa_k.shape[-2]
        assert rfa_k.shape == (batch, nheads, nchunks, head_dim)
        assert rfa_v.shape == (batch, nheads, nchunks, head_dim)
        assert q.dtype == k.dtype == v.dtype == rfa_k.dtype == rfa_v.dtype
    else:
        nchunks = 0
        assert q.dtype == k.dtype == v.dtype, "All tensors must have the same type"
    assert k.shape == (batch, nheads, seqlen_k, head_dim)
    assert v.shape == (batch, nheads, seqlen_k, head_dim)

    assert head_dim <= 128, "We only test head dimensions up to 128"
    # assert q.dtype in [torch.float16, torch.bfloat16], "Only support fp16 and bf16"
    assert q.dtype in [torch.bfloat16, torch.float], "Only support bf16 and fp32 for now"
    assert q.is_cuda and k.is_cuda and v.is_cuda
    # BUGFIX: use an explicit None check; `softmax_scale or default` would
    # also override a caller-provided scale of 0.0.
    if softmax_scale is None:
        softmax_scale = 1.0 / math.sqrt(head_dim)

    mask_type = 0
    if window_mask is not None:
        mask_type = 1
        assert window_mask.dtype == torch.bool
        assert window_mask.is_cuda
        assert window_mask.dim() == 4
        assert window_mask.shape == (batch, 1, seqlen_q, window_size)
        if window_mask.stride(-1) != 1:
            window_mask = window_mask.contiguous()

        # A window mask requires a matching chunk mask.
        assert chunk_mask is not None
        assert chunk_mask.dtype == torch.bool
        assert chunk_mask.is_cuda
        assert chunk_mask.dim() == 4
        assert chunk_mask.shape == (batch, 1, seqlen_q, nchunks)
        if chunk_mask.stride(-1) != 1:
            chunk_mask = chunk_mask.contiguous()

    # Zero strides are passed when a tensor is absent; the kernel never
    # dereferences them in that case (guarded by MASK_TYPE / EMPTY_RFA_KV).
    chunk_mask_strides = (
        (chunk_mask.stride(0), chunk_mask.stride(2))
        if mask_type == 1 else
        (0, 0)
    )
    window_mask_strides = (
        (window_mask.stride(0), window_mask.stride(2))
        if mask_type == 1 else
        (0, 0)
    )

    rfa_k_strides = (
        (rfa_k.stride(0), rfa_k.stride(1), rfa_k.stride(2))
        if empty_rfa_kv == 0 else
        (0, 0, 0)
    )
    rfa_v_strides = (
        (rfa_v.stride(0), rfa_v.stride(1), rfa_v.stride(2))
        if empty_rfa_kv == 0 else
        (0, 0, 0)
    )

    o = torch.empty_like(q)
    # LSE is kept in float32 regardless of input dtype (backward needs it).
    lse = torch.empty((q.shape[0], q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32)

    BLOCK_HEADDIM = max(triton.next_power_of_2(head_dim), 16)

    BLOCK_M, BLOCK_N, num_warps, num_stages = _get_config(q.dtype, head_dim, "fwd")

    assert chunks_per_window >= BLOCK_N, "chunks_per_window must be at least BLOCK_N"
    assert chunks_per_window % BLOCK_N == 0, "chunks_per_window must be a multiple of BLOCK_N"

    grid = lambda META: (triton.cdiv(seqlen_q, META["BLOCK_M"]), batch * nheads)
    _fwd_eva_agg_kernel[grid](
        q,
        k,
        v,
        rfa_k,
        rfa_v,
        window_mask,
        chunk_mask,
        o,
        lse,
        softmax_scale,
        q.stride(0), q.stride(1), q.stride(2),
        k.stride(0), k.stride(1), k.stride(2),
        v.stride(0), v.stride(1), v.stride(2),
        rfa_k_strides[0], rfa_k_strides[1], rfa_k_strides[2],
        rfa_v_strides[0], rfa_v_strides[1], rfa_v_strides[2],
        window_mask_strides[0], window_mask_strides[1],
        chunk_mask_strides[0], chunk_mask_strides[1],
        o.stride(0), o.stride(1), o.stride(2),
        lse.stride(0), lse.stride(1),
        nheads,
        seqlen_q,
        seqlen_k,
        nchunks,
        head_dim,
        chunks_per_window,
        window_size,
        mask_type,
        empty_rfa_kv,
        BLOCK_HEADDIM,
        BLOCK_M=BLOCK_M,
        BLOCK_N=BLOCK_N,
        num_warps=num_warps,
        num_stages=num_stages,
    )
    return o, lse
1497
+
1498
+ def triton_eva_agg_bwd(
1499
+ do,
1500
+ q, k, v, rfa_k, rfa_v,
1501
+ window_mask, chunk_mask,
1502
+ o, lse,
1503
+ dq, dk, dv, d_rfa_k, d_rfa_v,
1504
+ softmax_scale,
1505
+ window_size,
1506
+ chunks_per_window,
1507
+ empty_rfa_kv,
1508
+ mask_type,
1509
+ ):
1510
+ if do.stride(-1) != 1:
1511
+ do = do.contiguous()
1512
+
1513
+ # shape constraints
1514
+ batch, nheads, seqlen_q, head_dim = q.shape
1515
+ _, _, seqlen_k, _ = k.shape
1516
+ if empty_rfa_kv == 0:
1517
+ nchunks = rfa_k.shape[-2]
1518
+ assert rfa_k.shape == (batch, nheads, nchunks, head_dim)
1519
+ assert rfa_v.shape == (batch, nheads, nchunks, head_dim)
1520
+ assert d_rfa_k.stride(-1) == d_rfa_v.stride(-1) == 1
1521
+ assert q.dtype == k.dtype == v.dtype == rfa_k.dtype == rfa_v.dtype
1522
+ else:
1523
+ nchunks = 0
1524
+ assert q.dtype == k.dtype == v.dtype, "All tensors must have the same type"
1525
+
1526
+ assert lse.shape == (batch, nheads, seqlen_q)
1527
+ assert q.stride(-1) == k.stride(-1) == v.stride(-1) == o.stride(-1) == rfa_k.stride(-1) == rfa_v.stride(-1) == 1
1528
+ assert dq.stride(-1) == dk.stride(-1) == dv.stride(-1) == 1
1529
+ softmax_scale = softmax_scale or 1.0 / math.sqrt(head_dim)
1530
+
1531
+ assert head_dim <= 128, "We only test head dimensions up to 128"
1532
+
1533
+ window_mask_strides = (
1534
+ (window_mask.stride(0), window_mask.stride(2))
1535
+ if mask_type == 1 else
1536
+ (0, 0)
1537
+ )
1538
+ chunk_mask_strides = (
1539
+ (chunk_mask.stride(0), chunk_mask.stride(2))
1540
+ if mask_type == 1 else
1541
+ (0, 0)
1542
+ )
1543
+
1544
+ rfa_k_strides = (
1545
+ (rfa_k.stride(0), rfa_k.stride(1), rfa_k.stride(2))
1546
+ if empty_rfa_kv == 0 else
1547
+ (0, 0, 0)
1548
+ )
1549
+ rfa_v_strides = (
1550
+ (rfa_v.stride(0), rfa_v.stride(1), rfa_v.stride(2))
1551
+ if empty_rfa_kv == 0 else
1552
+ (0, 0, 0)
1553
+ )
1554
+
1555
+ d_rfa_k_strides = (
1556
+ (d_rfa_k.stride(0), d_rfa_k.stride(1), d_rfa_k.stride(2))
1557
+ if empty_rfa_kv == 0 else
1558
+ (0, 0, 0)
1559
+ )
1560
+ d_rfa_v_strides = (
1561
+ (d_rfa_v.stride(0), d_rfa_v.stride(1), d_rfa_v.stride(2))
1562
+ if empty_rfa_kv == 0 else
1563
+ (0, 0, 0)
1564
+ )
1565
+
1566
+ BLOCK_HEADDIM = max(triton.next_power_of_2(head_dim), 16)
1567
+
1568
+ do_t_o = torch.sum(do.to(torch.float32) * o.to(torch.float32), dim=-1).to(do.dtype)
1569
+
1570
+ BLOCK_M, BLOCK_N, num_warps, num_stages = _get_config(q.dtype, head_dim, "bwd_dq")
1571
+
1572
+ assert chunks_per_window >= BLOCK_N, "chunks_per_window must be greater than BLOCK"
1573
+ assert chunks_per_window % BLOCK_N == 0, "chunks_per_window must be a multiple of BLOCK"
1574
+ grid = lambda META: (
1575
+ triton.cdiv(seqlen_q, META["BLOCK_M"]),
1576
+ batch * nheads,
1577
+ )
1578
+ _bwd_eva_agg_kernel_dq[grid](
1579
+ q,
1580
+ k,
1581
+ v,
1582
+ rfa_k,
1583
+ rfa_v,
1584
+ window_mask,
1585
+ chunk_mask,
1586
+ do,
1587
+ lse,
1588
+ do_t_o,
1589
+ dq,
1590
+ softmax_scale,
1591
+ q.stride(0), q.stride(1), q.stride(2),
1592
+ k.stride(0), k.stride(1), k.stride(2),
1593
+ v.stride(0), v.stride(1), v.stride(2),
1594
+ rfa_k_strides[0], rfa_k_strides[1], rfa_k_strides[2],
1595
+ rfa_v_strides[0], rfa_v_strides[1], rfa_v_strides[2],
1596
+ window_mask_strides[0], window_mask_strides[1],
1597
+ chunk_mask_strides[0], chunk_mask_strides[1],
1598
+ do.stride(0), do.stride(1), do.stride(2),
1599
+ lse.stride(0), lse.stride(1),
1600
+ do_t_o.stride(0), do_t_o.stride(1),
1601
+ dq.stride(0), dq.stride(1), dq.stride(2),
1602
+ nheads,
1603
+ seqlen_q,
1604
+ seqlen_k,
1605
+ nchunks,
1606
+ head_dim,
1607
+ chunks_per_window,
1608
+ window_size,
1609
+ mask_type,
1610
+ empty_rfa_kv,
1611
+ BLOCK_HEADDIM,
1612
+ BLOCK_M=BLOCK_M,
1613
+ BLOCK_N=BLOCK_N,
1614
+ num_warps=num_warps,
1615
+ num_stages=num_stages,
1616
+ )
1617
+
1618
+ BLOCK_M, BLOCK_N, num_warps, num_stages = _get_config(q.dtype, head_dim, "bwd_dkdv")
1619
+ grid = lambda META: (
1620
+ triton.cdiv(seqlen_k, META["BLOCK_N"]),
1621
+ batch * nheads,
1622
+ )
1623
+ _bwd_eva_agg_kernel_dkdv[grid](
1624
+ q,
1625
+ k,
1626
+ v,
1627
+ window_mask,
1628
+ do,
1629
+ lse,
1630
+ do_t_o,
1631
+ dk,
1632
+ dv,
1633
+ softmax_scale,
1634
+ q.stride(0), q.stride(1), q.stride(2),
1635
+ k.stride(0), k.stride(1), k.stride(2),
1636
+ v.stride(0), v.stride(1), v.stride(2),
1637
+ window_mask_strides[0], window_mask_strides[1],
1638
+ do.stride(0), do.stride(1), do.stride(2),
1639
+ lse.stride(0), lse.stride(1),
1640
+ do_t_o.stride(0), do_t_o.stride(1),
1641
+ dk.stride(0), dk.stride(1), dk.stride(2),
1642
+ dv.stride(0), dv.stride(1), dv.stride(2),
1643
+ nheads,
1644
+ seqlen_q,
1645
+ seqlen_k,
1646
+ head_dim,
1647
+ window_size,
1648
+ mask_type,
1649
+ BLOCK_HEADDIM,
1650
+ BLOCK_M=BLOCK_M,
1651
+ BLOCK_N=BLOCK_N,
1652
+ num_warps=num_warps,
1653
+ num_stages=num_stages,
1654
+ )
1655
+ if empty_rfa_kv == 0:
1656
+ BLOCK_M, BLOCK_N, num_warps, num_stages = _get_config(q.dtype, head_dim, "bwd_drfa_kv")
1657
+ grid = lambda META: (
1658
+ triton.cdiv(nchunks, META["BLOCK_N"]),
1659
+ batch * nheads,
1660
+ )
1661
+ _bwd_eva_agg_kernel_drfa_kv[grid](
1662
+ q,
1663
+ rfa_k,
1664
+ rfa_v,
1665
+ chunk_mask,
1666
+ do,
1667
+ lse,
1668
+ do_t_o,
1669
+ d_rfa_k,
1670
+ d_rfa_v,
1671
+ softmax_scale,
1672
+ q.stride(0), q.stride(1), q.stride(2),
1673
+ rfa_k_strides[0], rfa_k_strides[1], rfa_k_strides[2],
1674
+ rfa_v_strides[0], rfa_v_strides[1], rfa_v_strides[2],
1675
+ chunk_mask_strides[0], chunk_mask_strides[1],
1676
+ do.stride(0), do.stride(1), do.stride(2),
1677
+ lse.stride(0), lse.stride(1),
1678
+ do_t_o.stride(0), do_t_o.stride(1),
1679
+ d_rfa_k_strides[0], d_rfa_k_strides[1], d_rfa_k_strides[2],
1680
+ d_rfa_v_strides[0], d_rfa_v_strides[1], d_rfa_v_strides[2],
1681
+ nheads,
1682
+ seqlen_q,
1683
+ nchunks,
1684
+ head_dim,
1685
+ chunks_per_window,
1686
+ window_size,
1687
+ mask_type,
1688
+ BLOCK_HEADDIM,
1689
+ BLOCK_M=BLOCK_M,
1690
+ BLOCK_N=BLOCK_N,
1691
+ num_warps=num_warps,
1692
+ num_stages=num_stages,
1693
+ )
1694
+
1695
+
1696
class EvaAggFunc(torch.autograd.Function):
    """Autograd bridge for the Triton EVA aggregation attention kernels.

    The forward pass runs `triton_eva_agg_fwd`; the backward pass dispatches
    to `triton_eva_agg_bwd` with pre-allocated gradient buffers. `rfa_k` and
    `rfa_v` (and their gradients) are optional and must be present or absent
    together.
    """

    @staticmethod
    def forward(ctx, q, k, v, rfa_k, rfa_v, window_mask, chunk_mask, softmax_scale=None, window_size=None, chunks_per_window=None):
        # RFA tensors come as a pair: both None, or both given.
        if rfa_k is None and rfa_v is None:
            empty_rfa_kv = 1
        else:
            assert rfa_k is not None and rfa_v is not None, "Both rfa_k and rfa_v must either be None or have values at the same time."
            empty_rfa_kv = 0

        # mask_type flags whether an explicit window mask is supplied.
        mask_type = 1 if window_mask is not None else 0

        o, lse = triton_eva_agg_fwd(
            q, k, v, rfa_k, rfa_v, window_mask, chunk_mask, softmax_scale, window_size, chunks_per_window
        )

        # Stash everything the backward kernels need.
        ctx.save_for_backward(q, k, v, o, lse, rfa_k, rfa_v, window_mask, chunk_mask)
        ctx.softmax_scale = softmax_scale
        ctx.window_size = window_size
        ctx.chunks_per_window = chunks_per_window
        ctx.empty_rfa_kv = empty_rfa_kv
        ctx.mask_type = mask_type
        return o

    @staticmethod
    def backward(ctx, do):
        q, k, v, o, lse, rfa_k, rfa_v, window_mask, chunk_mask = ctx.saved_tensors

        # Output gradient buffers, filled in-place by the Triton kernels.
        dq = torch.empty_like(q)
        dk = torch.empty_like(k)
        dv = torch.empty_like(v)
        if ctx.empty_rfa_kv == 0:
            d_rfa_k = torch.empty_like(rfa_k)
            d_rfa_v = torch.empty_like(rfa_v)
        else:
            d_rfa_k = None
            d_rfa_v = None

        triton_eva_agg_bwd(
            do,
            q, k, v,
            rfa_k, rfa_v,
            window_mask, chunk_mask,
            o, lse,
            dq, dk, dv,
            d_rfa_k, d_rfa_v,
            softmax_scale=ctx.softmax_scale,
            window_size=ctx.window_size,
            chunks_per_window=ctx.chunks_per_window,
            empty_rfa_kv=ctx.empty_rfa_kv,
            mask_type=ctx.mask_type,
        )
        # One gradient slot per forward argument; the non-tensor args get None.
        return dq, dk, dv, d_rfa_k, d_rfa_v, None, None, None, None, None
1755
+
1756
+
1757
def eva_agg_func_triton(
    q, k, v, rfa_k, rfa_v,
    window_mask, chunk_mask,
    softmax_scale=None, window_size=None, chunks_per_window=None,
):
    """Functional entry point: run EVA aggregation attention via the autograd op."""
    args = (
        q, k, v, rfa_k, rfa_v,
        window_mask, chunk_mask,
        softmax_scale, window_size, chunks_per_window,
    )
    return EvaAggFunc.apply(*args)
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/eva_cache.py ADDED
@@ -0,0 +1,761 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Optional, Tuple, List, Any, Union
2
+ import torch
3
+ from transformers.cache_utils import Cache
4
+
5
class EvaCache(Cache):
    """
    A dynamically growing cache for EVA attention, used during generation.

    Key/value state is held as one tensor per layer; the expected per-tensor
    shape is `[batch_size, num_heads, seq_len, head_dim]`.
    """

    def __init__(self) -> None:
        # Sliding-window (singleton) keys/values, one entry per layer.
        self.w_k: List[torch.Tensor] = []
        self.w_v: List[torch.Tensor] = []

        # Queries/keys/values of the (possibly partial) current chunk.
        self.rf_q: List[torch.Tensor] = []
        self.rf_k: List[torch.Tensor] = []
        self.rf_v: List[torch.Tensor] = []

        # Per-chunk random-feature-attention statistics.
        self.softmax_phi_k_v: List[torch.Tensor] = []
        self.log_sum_phi_k: List[torch.Tensor] = []
        self.rf_k_bar: List[torch.Tensor] = []

        # Running count of tokens seen; `generate` relies on this tally.
        self._seen_tokens = 0

        # Temporary buffers for the attention masks.
        self.rf_mask: List[Optional[torch.Tensor]] = []
        self.s_mask: List[torch.Tensor] = []
        self.chunk_mask: List[torch.Tensor] = []
30
+
31
+ def __len__(self):
32
+ """
33
+ Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value corresponds
34
+ to the number of layers in the model.
35
+ """
36
+ return len(self.w_k)
37
+
38
+ def get_usable_length(self, new_seq_length: int, layer_idx: Optional[int] = 0) -> int:
39
+ """Given the sequence length of the new inputs, returns the usable length of the cache."""
40
+ # Cache without size limit -> all cache is usable
41
+ # Cache with size limit -> if the length cache plus the length of the new inputs is larger the maximum cache
42
+ # length, we will need to evict part of the cache (and thus not all cache is usable)
43
+ max_length = self.get_max_length()
44
+ previous_seq_length = self.get_seq_length(layer_idx)
45
+ if max_length is not None and previous_seq_length + new_seq_length > max_length:
46
+ return max_length - new_seq_length
47
+ return previous_seq_length
48
+
49
+ def reorder_cache(self, beam_idx: torch.LongTensor):
50
+ """Reorders the cache for beam search, given the selected beam indices."""
51
+ for layer_idx in range(len(self.w_k)):
52
+ device = self.w_k[layer_idx].device
53
+ self.w_k[layer_idx] = self.w_k[layer_idx].index_select(0, beam_idx.to(device))
54
+
55
+ device = self.w_v[layer_idx].device
56
+ self.w_v[layer_idx] = self.w_v[layer_idx].index_select(0, beam_idx.to(device))
57
+
58
+ device = self.rf_q[layer_idx].device
59
+ self.rf_q[layer_idx] = self.rf_q[layer_idx].index_select(0, beam_idx.to(device))
60
+
61
+ device = self.rf_k[layer_idx].device
62
+ self.rf_k[layer_idx] = self.rf_k[layer_idx].index_select(0, beam_idx.to(device))
63
+
64
+ device = self.rf_v[layer_idx].device
65
+ self.rf_v[layer_idx] = self.rf_v[layer_idx].index_select(0, beam_idx.to(device))
66
+
67
+ device = self.softmax_phi_k_v[layer_idx].device
68
+ self.softmax_phi_k_v[layer_idx] = self.softmax_phi_k_v[layer_idx].index_select(0, beam_idx.to(device))
69
+
70
+ device = self.log_sum_phi_k[layer_idx].device
71
+ self.log_sum_phi_k[layer_idx] = self.log_sum_phi_k[layer_idx].index_select(0, beam_idx.to(device))
72
+
73
+ device = self.rf_k_bar[layer_idx].device
74
+ self.rf_k_bar[layer_idx] = self.rf_k_bar[layer_idx].index_select(0, beam_idx.to(device))
75
+
76
+ device = self.rf_mask[layer_idx].device
77
+ self.rf_mask[layer_idx] = self.rf_mask[layer_idx].index_select(0, beam_idx.to(device))
78
+
79
+ device = self.s_mask[layer_idx].device
80
+ self.s_mask[layer_idx] = self.s_mask[layer_idx].index_select(0, beam_idx.to(device))
81
+
82
+ device = self.chunk_mask[layer_idx].device
83
+ self.chunk_mask[layer_idx] = self.chunk_mask[layer_idx].index_select(0, beam_idx.to(device))
84
+ @property
85
+ def seen_tokens(self):
86
+ if hasattr(self, "_seen_tokens"):
87
+ return self._seen_tokens
88
+ else:
89
+ return None
90
+
91
+ def update_past_len(
92
+ self,
93
+ cur_q_len: int,
94
+ layer_idx: int
95
+ ):
96
+ # Update the number of seen tokens
97
+ if layer_idx == 0:
98
+ self._seen_tokens += cur_q_len
99
+ return self._seen_tokens
100
+
101
    def update_mask(
        self,
        prev_s_mask,
        cur_s_mask,
        chunk_mask,
        rf_mask,
        layer_idx,
        window_size,
        chunk_size,
    ):
        """Update the cached singleton / intra-chunk / inter-chunk attention masks.

        Handles both the prefill stage (cache list shorter than ``layer_idx``)
        and the token-decoding stage, and returns
        ``(prev_s_mask, cur_s_mask, prev_chunk_mask, cur_chunk_mask, dump_rf_mask)``
        for the current step.
        """
        ############################################
        # compute masks for singletons
        ############################################
        q_len = None
        if len(self.s_mask) <= layer_idx:
            # prefill stage; q is of shape [b, h, n, d]
            # NOTE(review): q_len is inferred from chunk_mask here — assumes
            # chunk_mask's -2 dim equals the query length; confirm at call site.
            q_len = chunk_mask.shape[-2]
            if q_len < window_size:
                # shorter than one window: everything is "current", no completed windows
                assert prev_s_mask is None

            # store the trailing row of the singleton mask for the next step
            self.s_mask.append(cur_s_mask[..., -1:, :] if cur_s_mask is not None else prev_s_mask[..., -1, -1:, :])
        else:
            # decoding stage
            prev_s_mask = None

            cached_s_mask = self.s_mask[layer_idx]
            assert cached_s_mask is not None
            if cached_s_mask.shape[-1] == window_size:
                # cached window is full: start a fresh window with the new mask
                cur_s_mask = cur_s_mask
            else:
                cur_s_mask = torch.cat([cached_s_mask, cur_s_mask], dim=-1)

            # store the past window-wise mask
            self.s_mask[layer_idx] = cur_s_mask

        ############################################
        # compute masks for intra-chunks
        ############################################
        dump_rf_mask = None
        if len(self.rf_mask) <= layer_idx:
            # initialize chunk stats (prefill stage)
            if q_len < chunk_size:
                cur_rf_mask = rf_mask
            else:
                if q_len % chunk_size == 0:
                    # input length divisible by chunk_size: dump everything
                    dump_rf_mask = rf_mask
                    cur_rf_mask = None
                else:
                    # split completed chunks (dumped) from the trailing partial chunk
                    remainder_tokens = q_len % chunk_size
                    if rf_mask is not None:
                        dump_rf_mask, cur_rf_mask = torch.split(rf_mask, [q_len - remainder_tokens, remainder_tokens], dim=-2)
                    else:
                        dump_rf_mask = None
                        cur_rf_mask = None
            self.rf_mask.append(cur_rf_mask)
        else:
            past_rf_mask = self.rf_mask[layer_idx]
            if past_rf_mask is not None:
                # when decoding tokens, we always assume the
                # incoming token mask is 0 (not masked)
                cur_rf_mask = torch.cat([past_rf_mask, rf_mask], dim=-2)
            else:
                # we do not need to use rf_mask anymore after we receive generated tokens
                cur_rf_mask = None

            # Dump the chunk if the len of current chunk reaches <chunk_size>.
            if cur_rf_mask is not None and cur_rf_mask.shape[-2] == chunk_size:
                dump_rf_mask = cur_rf_mask
                cur_rf_mask = None

            self.rf_mask[layer_idx] = cur_rf_mask

        ############################################
        # compute masks for inter chunks
        ############################################
        if len(self.chunk_mask) <= layer_idx:
            # prefill stage; q is of shape [b, h, n, d]
            if q_len < window_size:
                cur_chunk_mask = chunk_mask
                prev_chunk_mask = None
            else:
                if q_len % window_size == 0:
                    cur_chunk_mask = None
                    prev_chunk_mask = chunk_mask
                else:
                    remainder_tokens = q_len % window_size
                    # [b, h, n-r, d] [b, h, r, d]
                    prev_chunk_mask, cur_chunk_mask = torch.split(chunk_mask, [q_len - remainder_tokens, remainder_tokens], dim=-2)
                # fold completed windows into a dedicated window axis
                bsz, num_heads, _, head_dim = prev_chunk_mask.shape
                prev_chunk_mask = prev_chunk_mask.reshape(bsz, num_heads, -1, window_size, head_dim)

                assert prev_s_mask is not None
                if prev_s_mask.shape[-3] == 1 and prev_chunk_mask.shape[-3] > 1:
                    # need to expand the singleton window axis to match
                    prev_s_mask = prev_s_mask.expand(-1, -1, prev_chunk_mask.shape[-3], -1, -1)
            # store the trailing row for the next decoding step
            self.chunk_mask.append(cur_chunk_mask[..., -1:, :] if cur_chunk_mask is not None else prev_chunk_mask[..., -1, -1:, :])
        else:
            # decoding stage
            prev_chunk_mask = None
            cur_chunk_mask = self.chunk_mask[layer_idx]

            # if the current sequence length reaches a multiple of <chunk_size>,
            # we append a new entry to the end of chunk_mask
            seen_seq_len = self.get_seq_length(layer_idx)
            if seen_seq_len > 0 and seen_seq_len % chunk_size == 0:
                past_chunk_mask = self.chunk_mask[layer_idx]
                if past_chunk_mask is not None:
                    # when decoding tokens, we always assume the
                    # incoming token mask is 0 (not masked)
                    cur_chunk_mask = torch.cat([past_chunk_mask, chunk_mask], dim=-1)
                else:
                    cur_chunk_mask = chunk_mask
                self.chunk_mask[layer_idx] = cur_chunk_mask

            # if the len of current sequence reaches <window_size> + 1,
            # we turn on the mask for the most recent chunks
            if seen_seq_len > 0 and seen_seq_len % window_size == 1:
                cur_chunk_mask = self.chunk_mask[layer_idx]
                num_chunks_per_window = window_size // chunk_size
                cur_chunk_mask[..., -num_chunks_per_window:] = False
                self.chunk_mask[layer_idx] = cur_chunk_mask

        return (prev_s_mask, cur_s_mask, prev_chunk_mask, cur_chunk_mask, dump_rf_mask)
234
+
235
    def update_singletons(
        self,
        q,
        k,
        v,
        layer_idx,
        window_size,
    ):
        """Split q/k/v into completed windows and the current partial window, caching the latter.

        Returns ``(past_w_q, past_w_k, past_w_v), (w_q, w_k, w_v)``: the
        ``past_*`` tensors cover fully completed windows (reshaped to
        ``[b, h, n_windows, window_size, d]``); the ``w_*`` tensors hold the
        trailing partial window.
        """
        if len(self.w_k) <= layer_idx:
            # prefill stage
            # q is of shape [b, h, n, d]
            q_len = q.shape[-2]
            if q_len < window_size:
                # everything fits in one (partial) window
                w_q = q
                w_k = k
                w_v = v
                past_w_q = past_w_k = past_w_v = None
            else:
                if q_len % window_size == 0:
                    # input length divisible by window_size: all windows completed
                    w_q = None
                    w_k = None
                    w_v = None
                    past_w_q = q
                    past_w_k = k
                    past_w_v = v
                else:
                    remainder_tokens = q_len % window_size
                    # [b, h, n-r, d] [b, h, r, d]
                    past_w_q, w_q = torch.split(q, [q_len - remainder_tokens, remainder_tokens], dim=-2)
                    past_w_k, w_k = torch.split(k, [q_len - remainder_tokens, remainder_tokens], dim=-2)
                    past_w_v, w_v = torch.split(v, [q_len - remainder_tokens, remainder_tokens], dim=-2)
                # fold completed windows into a dedicated window axis
                bsz, num_heads, _, head_dim = past_w_q.shape
                past_w_q = past_w_q.reshape(bsz, num_heads, -1, window_size, head_dim)
                past_w_k = past_w_k.reshape(bsz, num_heads, -1, window_size, head_dim)
                past_w_v = past_w_v.reshape(bsz, num_heads, -1, window_size, head_dim)
            # w_q = q[..., None, -window_size:, :] # [b, h, 1, j, d]
            # w_k = # [b, h, 1, j, d]
            # w_v = # [b, h, 1, j, d]
            # store the past window-wise key-value pairs
            # if w_k is None, it means we happen to pass in a sequence that is divisible by window_size
            # we leave the cache with window_size-sized kv pairs to be cleared next iteration
            self.w_k.append(w_k if w_k is not None else past_w_k[..., -1, :, :])
            self.w_v.append(w_v if w_v is not None else past_w_v[..., -1, :, :])
        else:
            # decoding stage
            past_w_q = past_w_k = past_w_v = None
            # this is implemented as either a sliding window or fixed window
            w_q = q # [b, h, 1, d]
            w_k = k # [b, h, 1, d]
            w_v = v # [b, h, 1, d]

            cached_w_k = self.w_k[layer_idx]
            assert cached_w_k is not None # [b, h, j, d]
            # a full cached window means a new window starts at this token
            if cached_w_k.shape[-2] == window_size:
                w_k = w_k
            else:
                w_k = torch.cat([cached_w_k, w_k], dim=-2)

            cached_w_v = self.w_v[layer_idx]
            assert cached_w_v is not None
            if cached_w_v.shape[-2] == window_size:
                w_v = w_v
            else:
                w_v = torch.cat([cached_w_v, w_v], dim=-2)

            # store the past window-wise key-value pairs
            self.w_k[layer_idx] = w_k
            self.w_v[layer_idx] = w_v
        return (past_w_q, past_w_k, past_w_v), (w_q, w_k, w_v)
304
+
305
    def update_chunks(
        self,
        q,
        k,
        v,
        layer_idx,
        chunk_size
    ):
        """Accumulate q/k/v into the current chunk; emit completed chunks for RFA.

        Returns ``(dump_q, dump_k, dump_v)``: tensors covering whole completed
        chunks (a multiple of ``chunk_size`` tokens), or ``None`` while the
        current chunk is still partial.
        """
        q_len = q.shape[-2]
        dump_q = None
        dump_k = None
        dump_v = None
        if len(self.rf_q) <= layer_idx:
            # initialize chunk stats
            # prefill stage
            if q_len < chunk_size:
                # everything stays in the (partial) current chunk
                rf_q = q
                rf_k = k
                rf_v = v
            else:
                if q_len % chunk_size == 0:
                    # input length divisible by chunk_size: dump everything
                    rf_q = None
                    rf_k = None
                    rf_v = None
                    dump_q = q
                    dump_k = k
                    dump_v = v
                else:
                    remainder_tokens = q_len % chunk_size
                    # [b, h, n-r, d] [b, h, r, d]
                    dump_q, rf_q = torch.split(q, [q_len - remainder_tokens, remainder_tokens], dim=-2)
                    dump_k, rf_k = torch.split(k, [q_len - remainder_tokens, remainder_tokens], dim=-2)
                    dump_v, rf_v = torch.split(v, [q_len - remainder_tokens, remainder_tokens], dim=-2)
            self.rf_q.append(rf_q)
            self.rf_k.append(rf_k)
            self.rf_v.append(rf_v)
        else:
            # decode tokens
            # add query, key & value to the current chunk.
            past_rf_q = self.rf_q[layer_idx]
            if past_rf_q is not None:
                rf_q = torch.cat([past_rf_q, q], dim=-2)
            else:
                rf_q = q

            past_rf_k = self.rf_k[layer_idx]
            if past_rf_k is not None:
                rf_k = torch.cat([past_rf_k, k], dim=-2)
            else:
                rf_k = k

            past_rf_v = self.rf_v[layer_idx]
            if past_rf_v is not None:
                rf_v = torch.cat([past_rf_v, v], dim=-2)
            else:
                rf_v = v

            # We need to store rf_k_bar and RFA-results that
            # compute the per-chunk RFA.

            # Dump the chunk if the len of current chunk reaches <chunk_size>.
            if rf_q.shape[-2] == chunk_size:
                dump_q = rf_q
                dump_k = rf_k
                dump_v = rf_v
                # clear the chunk
                rf_q = None
                rf_k = None
                rf_v = None

            self.rf_q[layer_idx] = rf_q
            self.rf_k[layer_idx] = rf_k
            self.rf_v[layer_idx] = rf_v

        return dump_q, dump_k, dump_v
380
+
381
+ def update_chunk_rfas(
382
+ self,
383
+ softmax_phi_k_v,
384
+ log_sum_phi_k,
385
+ rf_k_bar,
386
+ layer_idx,
387
+ random_feature_dim
388
+ ):
389
+ if len(self.softmax_phi_k_v) <= layer_idx:
390
+ # prefill stage
391
+ self.softmax_phi_k_v.append(softmax_phi_k_v)
392
+ self.log_sum_phi_k.append(log_sum_phi_k)
393
+ self.rf_k_bar.append(rf_k_bar)
394
+ else:
395
+ # token decoding
396
+ past_softmax_phi_k_v = self.softmax_phi_k_v[layer_idx]
397
+ past_log_sum_phi_k = self.log_sum_phi_k[layer_idx]
398
+ past_rf_k_bar = self.rf_k_bar[layer_idx]
399
+
400
+ if past_softmax_phi_k_v is not None:
401
+ if random_feature_dim == 1:
402
+ dim = -2
403
+ else:
404
+ dim = -3
405
+ softmax_phi_k_v = torch.cat([past_softmax_phi_k_v, softmax_phi_k_v], dim=dim)
406
+
407
+ if past_log_sum_phi_k is not None:
408
+ if random_feature_dim == 1:
409
+ dim = -2
410
+ else:
411
+ dim = -3
412
+ log_sum_phi_k = torch.cat([past_log_sum_phi_k, log_sum_phi_k], dim=dim)
413
+
414
+ if past_rf_k_bar is not None:
415
+ rf_k_bar = torch.cat([past_rf_k_bar, rf_k_bar], dim=-2)
416
+
417
+ self.softmax_phi_k_v[layer_idx] = softmax_phi_k_v
418
+ self.log_sum_phi_k[layer_idx] = log_sum_phi_k
419
+ self.rf_k_bar[layer_idx] = rf_k_bar
420
+
421
+ return softmax_phi_k_v, log_sum_phi_k, rf_k_bar
422
+
423
+ def get_chunk_rfas(self, layer_idx):
424
+ if len(self.softmax_phi_k_v) <= layer_idx:
425
+ return (
426
+ None,
427
+ None,
428
+ None
429
+ )
430
+ else:
431
+ return (
432
+ self.softmax_phi_k_v[layer_idx],
433
+ self.log_sum_phi_k[layer_idx],
434
+ self.rf_k_bar[layer_idx]
435
+ )
436
+
437
+ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
438
+ """Returns the sequence length of the cached states. A layer index can be optionally passed."""
439
+ if len(self.w_k) <= layer_idx:
440
+ return 0
441
+ return self._seen_tokens
442
+
443
+ def get_max_length(self) -> Optional[int]:
444
+ """Returns the maximum sequence length of the cached states. DynamicCache does not have a maximum length."""
445
+ return None
446
+
447
+ def update(
448
+ self,
449
+ layer_idx: int,
450
+ cache_kwargs: Optional[Dict[str, Any]] = None,
451
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
452
+ raise NotImplementedError("`update` is not used in Eva Cache.")
453
+
454
class EvaStaticCacheForTriton(Cache):
    """
    A variant of EvaCache for eva's triton kernels
    """

    def __init__(
        self,
        batch_size,
        num_key_value_heads,
        window_size,
        head_dim,
        num_layers,
        dtype,
        device
    ) -> None:
        # Pre-allocated per-layer ring buffers for the current attention window.
        cache_shape = (batch_size, num_key_value_heads, window_size, head_dim)
        self.past_window_k: List[torch.Tensor] = [
            torch.zeros(cache_shape, dtype=dtype, device=device) for _ in range(num_layers)
        ]
        self.past_window_v: List[torch.Tensor] = [
            torch.zeros(cache_shape, dtype=dtype, device=device) for _ in range(num_layers)
        ]

        # Current write position inside each layer's window buffer.
        self.past_window_pos: List[int] = []

        # Per-chunk RFA keys/values.
        self.rfa_k: List[torch.Tensor] = []
        self.rfa_v: List[torch.Tensor] = []
        # self.rfa_mask: List[torch.Tensor] = []

        # Running count of tokens seen; `generate` relies on this tally.
        self._seen_tokens = 0

        # Temporary buffers for the attention masks.
        self.rf_mask: List[Optional[torch.Tensor]] = []
        self.s_mask: List[torch.Tensor] = []
490
+
491
+ def __len__(self):
492
+ """
493
+ Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value corresponds
494
+ to the number of layers in the model.
495
+ """
496
+ return len(self.past_window_pos)
497
+
498
+ def get_usable_length(self, new_seq_length: int, layer_idx: Optional[int] = 0) -> int:
499
+ """Given the sequence length of the new inputs, returns the usable length of the cache."""
500
+ # Cache without size limit -> all cache is usable
501
+ # Cache with size limit -> if the length cache plus the length of the new inputs is larger the maximum cache
502
+ # length, we will need to evict part of the cache (and thus not all cache is usable)
503
+ max_length = self.get_max_length()
504
+ previous_seq_length = self.get_seq_length(layer_idx)
505
+ if max_length is not None and previous_seq_length + new_seq_length > max_length:
506
+ return max_length - new_seq_length
507
+ return previous_seq_length
508
+
509
+ def reorder_cache(self, beam_idx: torch.LongTensor):
510
+ """Reorders the cache for beam search, given the selected beam indices."""
511
+ for layer_idx in range(len(self.past_window_k)):
512
+ device = self.past_window_k[layer_idx].device
513
+ self.past_window_k[layer_idx] = self.past_window_k[layer_idx].index_select(0, beam_idx.to(device))
514
+
515
+ device = self.past_window_v[layer_idx].device
516
+ self.past_window_v[layer_idx] = self.past_window_v[layer_idx].index_select(0, beam_idx.to(device))
517
+
518
+ device = self.rfa_k[layer_idx].device
519
+ self.rfa_k[layer_idx] = self.rfa_k[layer_idx].index_select(0, beam_idx.to(device))
520
+
521
+ device = self.rfa_v[layer_idx].device
522
+ self.rfa_v[layer_idx] = self.rfa_v[layer_idx].index_select(0, beam_idx.to(device))
523
+
524
+ # device = self.rfa_mask[layer_idx].device
525
+ # self.rfa_mask[layer_idx] = self.rfa_mask[layer_idx].index_select(0, beam_idx.to(device))
526
+
527
+ device = self.rf_mask[layer_idx].device
528
+ self.rf_mask[layer_idx] = self.rf_mask[layer_idx].index_select(0, beam_idx.to(device))
529
+
530
+ device = self.s_mask[layer_idx].device
531
+ self.s_mask[layer_idx] = self.s_mask[layer_idx].index_select(0, beam_idx.to(device))
532
+
533
+ @property
534
+ def seen_tokens(self):
535
+ if hasattr(self, "_seen_tokens"):
536
+ return self._seen_tokens
537
+ else:
538
+ return None
539
+
540
+ def update_past_len(
541
+ self,
542
+ cur_q_len: int,
543
+ layer_idx: int
544
+ ):
545
+ # Update the number of seen tokens
546
+ if layer_idx == 0:
547
+ self._seen_tokens += cur_q_len
548
+ return self._seen_tokens
549
+
550
    def update_mask(
        self,
        s_mask,
        rf_mask,
        layer_idx,
        window_size,
    ):
        """Update the cached singleton (`s_mask`) and intra-chunk (`rf_mask`) masks.

        Returns ``(dump_s_mask, dump_rf_mask)``: the mask to use for window
        attention at this step, and the mask for chunk(s) completed by this
        step (``None`` when nothing completed).
        """
        ############################################
        # compute masks for singletons
        ############################################
        if len(self.s_mask) <= layer_idx:
            # prefill stage
            # q is of shape [b, h, n, d]
            if s_mask is None:
                cur_s_mask = None
            else:
                q_len = s_mask.shape[-2]
                # s_mask is of shape [b, h, n, w]
                # let r = q_len % window_size
                # if r == 0, the mask to be appended is of shape [b, h, 1, w]
                # otherwise, r < w, the mask to be appended is of shape [b, h, 1, r]
                remainder_tokens = q_len % window_size
                if remainder_tokens == 0:
                    cur_s_mask = None
                else:
                    # keep only the trailing row, restricted to the partial window
                    cur_s_mask = s_mask[..., -1:, :remainder_tokens]
            self.s_mask.append(cur_s_mask)
            # we use the passed s_mask for subsequent computations
            dump_s_mask = s_mask
        else:
            # decoding stage
            past_s_mask = self.s_mask[layer_idx]
            if past_s_mask is None:
                assert s_mask is None
                cur_s_mask = None
            else:
                assert s_mask is not None
                cur_s_mask = torch.cat([past_s_mask, s_mask], dim=-1)

            dump_s_mask = cur_s_mask
            # a full window resets the cached mask for the next window
            if cur_s_mask is not None and cur_s_mask.shape[-1] == window_size:
                cur_s_mask = None
            # store the past window-wise mask
            self.s_mask[layer_idx] = cur_s_mask

        ############################################
        # compute masks for intra-chunks
        ############################################
        dump_rf_mask = None
        if len(self.rf_mask) <= layer_idx:
            # initialize chunk stats
            # prefill stage
            if rf_mask is None:
                cur_rf_mask = None
            else:
                q_len = rf_mask.shape[-2]
                if q_len < window_size:
                    dump_rf_mask = None
                    cur_rf_mask = rf_mask
                else:
                    if q_len % window_size == 0:
                        # input length divisible by window_size: dump everything
                        dump_rf_mask = rf_mask
                        cur_rf_mask = None
                    else:
                        # split completed portion (dumped) from the trailing partial window
                        remainder_tokens = q_len % window_size
                        dump_rf_mask, cur_rf_mask = torch.split(rf_mask, [q_len - remainder_tokens, remainder_tokens], dim=-2)
            self.rf_mask.append(cur_rf_mask)
        else:
            past_rf_mask = self.rf_mask[layer_idx]
            if past_rf_mask is not None:
                # when decoding tokens, we always assume the
                # incoming token mask is 0 (not masked)
                cur_rf_mask = torch.cat([past_rf_mask, rf_mask], dim=-2)
            else:
                cur_rf_mask = None

            # dump once the accumulated mask spans a full window
            if cur_rf_mask is not None and cur_rf_mask.shape[-2] == window_size:
                dump_rf_mask = cur_rf_mask
                cur_rf_mask = None

            self.rf_mask[layer_idx] = cur_rf_mask

        return dump_s_mask, dump_rf_mask
635
+
636
    def update_singletons_and_chunks(
        self,
        k,
        v,
        layer_idx,
        window_size,
    ):
        """Write k/v into the layer's window ring buffer and emit completed windows.

        Returns ``(s_k, s_v, dump_k, dump_v)``: the key/value tokens visible to
        window attention at this step, plus tokens of completed windows to be
        summarised into chunk RFAs (``None`` when no window completed).
        """
        if len(self.past_window_pos) <= layer_idx:
            # prefill stage
            s_k = k
            s_v = v
            input_len = k.shape[-2]
            window_pos = 0
            if input_len <= window_size:
                new_window_pos = window_pos + input_len

                cached_window_k = k
                cached_window_v = v
                dump_k = None
                dump_v = None
            else:
                remainder_tokens = input_len % window_size
                # a perfectly divisible input leaves one full window resident
                if remainder_tokens == 0:
                    remainder_tokens = window_size
                new_window_pos = window_pos + remainder_tokens

                # [b, h, n-r, d] [b, h, r, d]
                cached_window_k = k[..., -remainder_tokens:, :]
                cached_window_v = v[..., -remainder_tokens:, :]
                dump_k = k[..., :-remainder_tokens, :]
                dump_v = v[..., :-remainder_tokens, :]
            # store the past window-wise key-value pairs
            self.past_window_k[layer_idx][:, :, window_pos : new_window_pos, :] = cached_window_k
            self.past_window_v[layer_idx][:, :, window_pos : new_window_pos, :] = cached_window_v
            self.past_window_pos.append(new_window_pos)
        else:
            # decoding stage
            # if the previous cache has full tokens,
            # roll back to the first elements
            if self.past_window_pos[layer_idx] == window_size:
                self.past_window_pos[layer_idx] = 0
                # clone: these buffer slots are about to be overwritten in place
                dump_k = self.past_window_k[layer_idx].clone()
                dump_v = self.past_window_v[layer_idx].clone()
            else:
                dump_k = None
                dump_v = None

            input_len = k.shape[-2]
            window_pos = self.past_window_pos[layer_idx]
            new_window_pos = window_pos + input_len

            self.past_window_k[layer_idx][:, :, window_pos : new_window_pos, :] = k
            self.past_window_v[layer_idx][:, :, window_pos : new_window_pos, :] = v

            s_k = self.past_window_k[layer_idx][:, :, : new_window_pos, :]
            s_v = self.past_window_v[layer_idx][:, :, : new_window_pos, :]

            self.past_window_pos[layer_idx] = new_window_pos

        return s_k, s_v, dump_k, dump_v
696
+
697
+ def update_chunk_rfas(
698
+ self,
699
+ rfa_k,
700
+ rfa_v,
701
+ layer_idx,
702
+ ):
703
+ if len(self.rfa_k) <= layer_idx:
704
+ # prefill stage
705
+ self.rfa_k.append(rfa_k)
706
+ self.rfa_v.append(rfa_v)
707
+ else:
708
+ # token decoding
709
+ past_rfa_k = self.rfa_k[layer_idx]
710
+ past_rfa_v = self.rfa_v[layer_idx]
711
+
712
+ if past_rfa_k is not None:
713
+ rfa_k = torch.cat([past_rfa_k, rfa_k], dim=-2)
714
+
715
+ if past_rfa_v is not None:
716
+ rfa_v = torch.cat([past_rfa_v, rfa_v], dim=-2)
717
+
718
+ self.rfa_k[layer_idx] = rfa_k
719
+ self.rfa_v[layer_idx] = rfa_v
720
+
721
+ return rfa_k, rfa_v
722
+
723
+ def get_past_window_pos(self, layer_idx):
724
+ if len(self.past_window_pos) <= layer_idx:
725
+ return None
726
+ else:
727
+ return self.past_window_pos[layer_idx]
728
+
729
+ def get_past_window_kv(self, layer_idx):
730
+ if len(self.past_window_pos) <= layer_idx:
731
+ return None, None
732
+ else:
733
+ return (
734
+ self.past_window_k[layer_idx][:, :, : self.past_window_pos[layer_idx], :],
735
+ self.past_window_v[layer_idx][:, :, : self.past_window_pos[layer_idx], :]
736
+ )
737
+
738
+ def get_chunk_rfas(self, layer_idx):
739
+ if len(self.rfa_k) <= layer_idx:
740
+ return None, None
741
+ else:
742
+ return self.rfa_k[layer_idx], self.rfa_v[layer_idx]
743
+
744
+ def get_seq_length(self, layer_idx = 0) -> int:
745
+ """Returns the sequence length of the cached states. A layer index can be optionally passed."""
746
+ # layer_idx must be provided since otherwise
747
+ # any layer > 0 can only get the updated _seen_tokens
748
+ if len(self.past_window_pos) <= layer_idx:
749
+ return 0
750
+ return self._seen_tokens
751
+
752
+ def get_max_length(self) -> Optional[int]:
753
+ """Returns the maximum sequence length of the cached states. DynamicCache does not have a maximum length."""
754
+ return None
755
+
756
+ def update(
757
+ self,
758
+ layer_idx: int,
759
+ cache_kwargs: Optional[Dict[str, Any]] = None,
760
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
761
+ raise NotImplementedError("`update` is not used in Eva Cache.")
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/eva_prep_kv_kernel.py ADDED
@@ -0,0 +1,1017 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import math
3
+ import torch
4
+ import triton
5
+ import triton.language as tl
6
+
7
# Forward kernel: pools keys/values into per-chunk "RFA" summaries.
# Each program handles one (BLOCK_N positions) x (batch*head) tile; the BLOCK_N
# positions are viewed as [CHUNKS_PER_BLOCK, CHUNK_SIZE] so each chunk gets an
# independent softmax pooling:
#   Out_RFA_K[c] = softmax_w( k·PARAM_MU ) weighted sum of k over the chunk
#   Out_RFA_V[c] = softmax_w( scale * (k·PARAM_PHI - 0.5*||k||^2) ) weighted sum of v
# Positions where Mask is True (MASK_TYPE == 1) are excluded from both softmaxes.
@triton.heuristics(
    {
        "EVEN_N": lambda args: args["seqlen"] % args["BLOCK_N"] == 0,
        "EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"],
    }
)
@triton.jit
def _fwd_eva_prep_kv_kernel(
    K, # [b, h, n, d]
    V, # [b, h, n, d]
    PARAM_MU, # [1, h, 1, 1, d]
    PARAM_PHI, # [1, h, 1, 1, d]
    Mask, # [b, h, n, 1]
    Out_RFA_K, # [b, h, c, d]
    Out_RFA_V, # [b, h, c, d]
    softmax_scale,
    stride_kb, stride_kh, stride_kn,
    stride_vb, stride_vh, stride_vn,
    stride_mu_h,
    stride_phi_h,
    stride_mb, stride_mn,
    stride_ok_b, stride_ok_h, stride_ok_c,
    stride_ov_b, stride_ov_h, stride_ov_c,
    nheads,
    seqlen,
    nchunks,
    headdim,
    CHUNKS_PER_BLOCK: tl.constexpr,
    CHUNK_SIZE: tl.constexpr,
    MASK_TYPE: tl.constexpr,
    BLOCK_HEADDIM: tl.constexpr,
    EVEN_N: tl.constexpr,
    EVEN_HEADDIM: tl.constexpr,
    BLOCK_N: tl.constexpr,
):
    # grid axis 0 walks blocks of positions; axis 1 is flattened (batch, head)
    start_n = tl.program_id(0)
    offs_bh = tl.program_id(1)
    offs_h = offs_bh % nheads
    offs_b = offs_bh // nheads
    # initialize offsets
    # we load BLOCK_N keys and values each time, and
    # reshape it to [CHUNKS_PER_BLOCK, CHUNK_SIZE]
    offs_c = tl.arange(0, CHUNKS_PER_BLOCK)
    offs_m = tl.arange(0, CHUNK_SIZE)
    offs_d = tl.arange(0, BLOCK_HEADDIM)

    # 3-D pointer grids: [chunk, position-in-chunk, head-dim]
    k_ptrs = (
        K +
        offs_b * stride_kb +
        offs_h * stride_kh +
        (
            (
                start_n * BLOCK_N +
                offs_c[:, None, None] * CHUNK_SIZE +
                offs_m[None, :, None]
            ) * stride_kn +
            offs_d[None, None, :]
        )
    )
    v_ptrs = (
        V +
        offs_b * stride_vb +
        offs_h * stride_vh +
        (
            (
                start_n * BLOCK_N +
                offs_c[:, None, None] * CHUNK_SIZE +
                offs_m[None, :, None]
            ) * stride_vn +
            offs_d[None, None, :]
        )
    )
    # per-head pooling parameters (broadcast over chunk/position dims)
    param_mu_ptrs = (
        PARAM_MU +
        offs_h * stride_mu_h +
        offs_d[None, None, :]
    )
    param_phi_ptrs = (
        PARAM_PHI +
        offs_h * stride_phi_h +
        offs_d[None, None, :]
    )
    # log2(e): logits are scaled so exp2 can be used instead of exp
    log2e = 1.4426950408889634
    if MASK_TYPE == 1:
        m_ptrs = (
            Mask +
            offs_b * stride_mb +
            (
                (
                    start_n * BLOCK_N +
                    offs_c[:, None] * CHUNK_SIZE +
                    offs_m[None, :]
                ) * stride_mn
            )
        )
    # load keys; EVEN_N / EVEN_HEADDIM select the cheapest masking variant
    if EVEN_N:
        if EVEN_HEADDIM:
            k = tl.load(
                k_ptrs
            )
        else:
            k = tl.load(
                k_ptrs,
                mask=offs_d[None, None, :] < headdim,
                other=0.0
            )
    else:
        if EVEN_HEADDIM:
            k = tl.load(
                k_ptrs,
                mask=(
                    start_n * BLOCK_N +
                    offs_c[:, None, None] * CHUNK_SIZE +
                    offs_m[None, :, None]
                ) < seqlen,
                other=0.0
            )
        else:
            k = tl.load(
                k_ptrs,
                mask=(
                    (
                        start_n * BLOCK_N +
                        offs_c[:, None, None] * CHUNK_SIZE +
                        offs_m[None, :, None]
                    ) < seqlen
                ) & (offs_d[None, None, :] < headdim),
                other=0.0
            )

    # ---- RFA-K: softmax(k . mu) pooling over each chunk ----
    param_mu = tl.load(param_mu_ptrs).to(k.dtype)
    rfa_k_c_w = tl.zeros([CHUNKS_PER_BLOCK, CHUNK_SIZE], dtype=tl.float32)
    rfa_k_c_w += tl.sum(k * param_mu, axis=-1)
    rfa_k_c_w *= log2e
    if MASK_TYPE == 1:
        if EVEN_N:
            mask = tl.load(
                m_ptrs
            )
        else:
            mask = tl.load(
                m_ptrs,
                mask=(
                    start_n * BLOCK_N +
                    offs_c[:, None] * CHUNK_SIZE +
                    offs_m[None, :]
                ) < seqlen,
                other=1,
            )
        # mask == True means "padding": excluded from the softmax
        rfa_k_c_w = tl.where(mask, float("-inf"), rfa_k_c_w)

    # numerically stable softmax; fully-masked rows get denom forced to 1
    # so they produce zeros instead of NaNs
    m_rfa_k_c_w = tl.max(rfa_k_c_w, axis=-1)
    masked_out_rows_rfa_k = (m_rfa_k_c_w == float("-inf"))
    m_rfa_k_c_w_masked = tl.where(masked_out_rows_rfa_k, 0, m_rfa_k_c_w)
    rfa_k_c_w = tl.exp2(rfa_k_c_w - m_rfa_k_c_w_masked[:, None])
    denom_k = tl.sum(rfa_k_c_w, axis=-1)
    denom_k = tl.where(denom_k == 0.0, 1.0, denom_k)
    rfa_k_c_w = rfa_k_c_w / denom_k[:, None]
    rfa_k_c = tl.sum(k * rfa_k_c_w[:, :, None].to(k.dtype), axis=-2)
    # TODO: understand why rematerialize offsets to save registers?
    offs_out_c = start_n * CHUNKS_PER_BLOCK + tl.arange(0, CHUNKS_PER_BLOCK)
    out_rfa_k_ptrs = (
        Out_RFA_K +
        offs_b * stride_ok_b +
        offs_h * stride_ok_h +
        (offs_out_c[:, None] * stride_ok_c + offs_d[None, :])
    )

    if EVEN_N:
        if EVEN_HEADDIM:
            tl.store(
                out_rfa_k_ptrs, rfa_k_c
            )
        else:
            tl.store(
                out_rfa_k_ptrs, rfa_k_c,
                mask=offs_d[None, :] < headdim
            )
    else:
        if EVEN_HEADDIM:
            tl.store(
                out_rfa_k_ptrs, rfa_k_c,
                mask=offs_out_c[:, None] < nchunks
            )
        else:
            tl.store(
                out_rfa_k_ptrs, rfa_k_c,
                mask=(offs_out_c[:, None] < nchunks) & (offs_d[None, :] < headdim)
            )

    # ---- RFA-V: softmax(scale * (k . phi - 0.5*||k||^2)) pooling of v ----
    param_phi = tl.load(param_phi_ptrs).to(k.dtype)
    rfa_v_c_w = tl.zeros([CHUNKS_PER_BLOCK, CHUNK_SIZE], dtype=tl.float32)
    rfa_v_c_w += tl.sum(k * param_phi, axis=-1)
    rfa_v_c_w -= (0.5 * tl.sum(k * k, axis=-1))
    rfa_v_c_w *= log2e * softmax_scale
    if not EVEN_N:  # Need to mask out otherwise the softmax is wrong
        rfa_v_c_w += tl.where(
            (
                start_n * BLOCK_N +
                offs_c[:, None] * CHUNK_SIZE +
                offs_m[None, :]
            ) < seqlen,
            0,
            float("-inf")
        )

    if MASK_TYPE == 1:
        rfa_v_c_w = tl.where(mask, float("-inf"), rfa_v_c_w)

    if EVEN_N:
        if EVEN_HEADDIM:
            v = tl.load(
                v_ptrs
            )
        else:
            v = tl.load(
                v_ptrs,
                mask=offs_d[None, None, :] < headdim,
                other=0.0
            )
    else:
        if EVEN_HEADDIM:
            v = tl.load(
                v_ptrs,
                mask=(
                    start_n * BLOCK_N +
                    offs_c[:, None, None] * CHUNK_SIZE +
                    offs_m[None, :, None]
                ) < seqlen,
                other=0.0
            )
        else:
            v = tl.load(
                v_ptrs,
                mask=(
                    (
                        start_n * BLOCK_N +
                        offs_c[:, None, None] * CHUNK_SIZE +
                        offs_m[None, :, None]
                    ) < seqlen
                ) & (offs_d[None, None, :] < headdim),
                other=0.0
            )

    # same stable-softmax treatment as the K branch
    m_rfa_v_c_w = tl.max(rfa_v_c_w, axis=-1)
    masked_out_rows_rfa_v = (m_rfa_v_c_w == float("-inf"))
    m_rfa_v_c_w_masked = tl.where(masked_out_rows_rfa_v, 0, m_rfa_v_c_w)
    rfa_v_c_w = tl.exp2(rfa_v_c_w - m_rfa_v_c_w_masked[:, None])
    denom_v = tl.sum(rfa_v_c_w, axis=-1)
    denom_v = tl.where(denom_v == 0.0, 1.0, denom_v)
    rfa_v_c_w = rfa_v_c_w / denom_v[:, None]
    rfa_v_c = tl.sum(v * rfa_v_c_w[:, :, None].to(v.dtype), axis=-2)

    offs_out_c = start_n * CHUNKS_PER_BLOCK + tl.arange(0, CHUNKS_PER_BLOCK)
    out_rfa_v_ptrs = (
        Out_RFA_V +
        offs_b * stride_ov_b +
        offs_h * stride_ov_h +
        (offs_out_c[:, None] * stride_ov_c + offs_d[None, :])
    )
    if EVEN_N:
        if EVEN_HEADDIM:
            tl.store(
                out_rfa_v_ptrs, rfa_v_c
            )
        else:
            tl.store(
                out_rfa_v_ptrs, rfa_v_c,
                mask=offs_d[None, :] < headdim
            )
    else:
        if EVEN_HEADDIM:
            tl.store(
                out_rfa_v_ptrs, rfa_v_c,
                mask=offs_out_c[:, None] < nchunks
            )
        else:
            tl.store(
                out_rfa_v_ptrs, rfa_v_c,
                mask=(offs_out_c[:, None] < nchunks) & (offs_d[None, :] < headdim)
            )
290
+
291
+
292
+
293
# Backward kernel for the RFA KV-prep forward above. Recomputes the per-chunk
# softmax weights (mu for keys, phi for values) from K rather than saving them,
# then backpropagates through "weighted-sum of a softmax":
#   d_w   = (d_out . x  -  d_out . out) * w          (softmax Jacobian)
#   d_x   = w * d_out + d_w * d(logit)/d_x
# Per-parameter grads are accumulated per grid block into the *_PARTIAL buffers
# ([b, h, g, d]); the host-side wrapper reduces them over batch and groups.
@triton.heuristics(
    {
        "EVEN_N": lambda args: args["seqlen"] % args["BLOCK_N"] == 0,
        "EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"],
    }
)
@triton.jit
def _bwd_eva_prep_kv_kernel(
    RFA_K, # [b, h, c, d]
    RFA_V, # [b, h, c, d]
    K, # [b, h, n, d]
    V, # [b, h, n, d]
    PARAM_MU, # [1, h, 1, 1, d]
    PARAM_PHI, # [1, h, 1, 1, d]
    Mask, # [b, h, n, 1]
    D_RFA_K, # [b, h, c, d]
    D_RFA_V, # [b, h, c, d]
    D_K, # [b, h, n, d]
    D_V, # [b, h, n, d]
    D_PARAM_MU_PARTIAL, # [b, h, g, d]
    D_PARAM_PHI_PARTIAL, # [b, h, g, d]
    softmax_scale,
    stride_rfa_k_b, stride_rfa_k_h, stride_rfa_k_c,
    stride_rfa_v_b, stride_rfa_v_h, stride_rfa_v_c,
    stride_kb, stride_kh, stride_kn,
    stride_vb, stride_vh, stride_vn,
    stride_mu_h,
    stride_phi_h,
    stride_mb, stride_mn,
    stride_d_rfa_k_b, stride_d_rfa_k_h, stride_d_rfa_k_c,
    stride_d_rfa_v_b, stride_d_rfa_v_h, stride_d_rfa_v_c,
    stride_d_k_b, stride_d_k_h, stride_d_k_n,
    stride_d_v_b, stride_d_v_h, stride_d_v_n,
    stride_d_mu_b, stride_d_mu_h, stride_d_mu_g,
    stride_d_phi_b, stride_d_phi_h, stride_d_phi_g,
    nheads,
    seqlen,
    nchunks,
    headdim,
    CHUNKS_PER_BLOCK: tl.constexpr,
    CHUNK_SIZE: tl.constexpr,
    MASK_TYPE: tl.constexpr,
    BLOCK_HEADDIM: tl.constexpr,
    EVEN_N: tl.constexpr,
    EVEN_HEADDIM: tl.constexpr,
    BLOCK_N: tl.constexpr,
):
    start_n = tl.program_id(0)
    offs_bh = tl.program_id(1)
    offs_h = offs_bh % nheads
    offs_b = offs_bh // nheads
    # initialize offsets
    # we load BLOCK_N keys and values each time, and
    # reshape it to [CHUNKS_PER_BLOCK, CHUNK_SIZE]
    offs_c = tl.arange(0, CHUNKS_PER_BLOCK)
    offs_m = tl.arange(0, CHUNK_SIZE)
    offs_d = tl.arange(0, BLOCK_HEADDIM)

    # chunk indices in the global [0, nchunks) space handled by this program
    offs_rfa_c = start_n * CHUNKS_PER_BLOCK + offs_c

    k_ptrs = (
        K +
        offs_b * stride_kb +
        offs_h * stride_kh +
        (
            (
                start_n * BLOCK_N +
                offs_c[:, None, None] * CHUNK_SIZE +
                offs_m[None, :, None]
            ) * stride_kn +
            offs_d[None, None, :]
        )
    )
    rfa_k_ptrs = (
        RFA_K +
        offs_b * stride_rfa_k_b +
        offs_h * stride_rfa_k_h +
        (offs_rfa_c[:, None] * stride_rfa_k_c + offs_d[None, :])
    )
    rfa_v_ptrs = (
        RFA_V +
        offs_b * stride_rfa_v_b +
        offs_h * stride_rfa_v_h +
        (offs_rfa_c[:, None] * stride_rfa_v_c + offs_d[None, :])
    )

    d_rfa_k_ptrs = (
        D_RFA_K +
        offs_b * stride_d_rfa_k_b +
        offs_h * stride_d_rfa_k_h +
        (offs_rfa_c[:, None] * stride_d_rfa_k_c + offs_d[None, :])
    )
    d_rfa_v_ptrs = (
        D_RFA_V +
        offs_b * stride_d_rfa_v_b +
        offs_h * stride_d_rfa_v_h +
        (offs_rfa_c[:, None] * stride_d_rfa_v_c + offs_d[None, :])
    )

    param_mu_ptrs = (
        PARAM_MU +
        offs_h * stride_mu_h +
        offs_d[None, None, :]
    )
    param_phi_ptrs = (
        PARAM_PHI +
        offs_h * stride_phi_h +
        offs_d[None, None, :]
    )

    log2e = 1.4426950408889634
    if MASK_TYPE == 1:
        m_ptrs = (
            Mask +
            offs_b * stride_mb +
            (
                (
                    start_n * BLOCK_N +
                    offs_c[:, None] * CHUNK_SIZE +
                    offs_m[None, :]
                ) * stride_mn
            )
        )
    # load K (same 4-way masking pattern as the forward kernel)
    if EVEN_N:
        if EVEN_HEADDIM:
            k = tl.load(
                k_ptrs
            )
        else:
            k = tl.load(
                k_ptrs,
                mask=offs_d[None, None, :] < headdim,
                other=0.0
            )
    else:
        if EVEN_HEADDIM:
            k = tl.load(
                k_ptrs,
                mask=(
                    start_n * BLOCK_N +
                    offs_c[:, None, None] * CHUNK_SIZE +
                    offs_m[None, :, None]
                ) < seqlen,
                other=0.0
            )
        else:
            k = tl.load(
                k_ptrs,
                mask=(
                    (
                        start_n * BLOCK_N +
                        offs_c[:, None, None] * CHUNK_SIZE +
                        offs_m[None, :, None]
                    ) < seqlen
                ) & (offs_d[None, None, :] < headdim),
                other=0.0
            )

    # forward outputs and incoming grads for the K side
    if EVEN_N:
        if EVEN_HEADDIM:
            rfa_k = tl.load(
                rfa_k_ptrs
            )
        else:
            rfa_k = tl.load(
                rfa_k_ptrs,
                mask=offs_d[None, :] < headdim,
                other=0.0
            )
    else:
        if EVEN_HEADDIM:
            rfa_k = tl.load(
                rfa_k_ptrs,
                mask=offs_rfa_c[:, None] < nchunks,
                other=0.0
            )
        else:
            rfa_k = tl.load(
                rfa_k_ptrs,
                mask=(offs_rfa_c[:, None] < nchunks) & (offs_d[None, :] < headdim),
                other=0.0
            )

    if EVEN_N:
        if EVEN_HEADDIM:
            d_rfa_k = tl.load(
                d_rfa_k_ptrs
            )
        else:
            d_rfa_k = tl.load(
                d_rfa_k_ptrs,
                mask=offs_d[None, :] < headdim,
                other=0.0
            )
    else:
        if EVEN_HEADDIM:
            d_rfa_k = tl.load(
                d_rfa_k_ptrs,
                mask=offs_rfa_c[:, None] < nchunks,
                other=0.0
            )
        else:
            d_rfa_k = tl.load(
                d_rfa_k_ptrs,
                mask=(offs_rfa_c[:, None] < nchunks) & (offs_d[None, :] < headdim),
                other=0.0
            )

    # recompute the mu-softmax weights exactly as the forward did
    param_mu = tl.load(param_mu_ptrs).to(k.dtype)
    mu_c_w = tl.zeros([CHUNKS_PER_BLOCK, CHUNK_SIZE], dtype=tl.float32)
    mu_c_w += tl.sum(k * param_mu, axis=-1)
    mu_c_w *= log2e

    if not EVEN_N:  # Need to mask out otherwise the softmax is wrong
        mu_c_w += tl.where(
            (
                start_n * BLOCK_N +
                offs_c[:, None] * CHUNK_SIZE +
                offs_m[None, :]
            ) < seqlen,
            0,
            float("-inf")
        )

    if MASK_TYPE == 1:
        if EVEN_N:
            mask = tl.load(
                m_ptrs
            )
        else:
            mask = tl.load(
                m_ptrs,
                mask=(
                    start_n * BLOCK_N +
                    offs_c[:, None] * CHUNK_SIZE +
                    offs_m[None, :]
                ) < seqlen,
                other=1,
            )
        mu_c_w = tl.where(mask, float("-inf"), mu_c_w)

    # [c, w]
    m_mu_c_w = tl.max(mu_c_w, axis=-1)
    masked_out_rows_mu = (m_mu_c_w == float("-inf"))
    m_mu_c_w_masked = tl.where(masked_out_rows_mu, 0, m_mu_c_w)
    mu_c_w = tl.exp2(mu_c_w - m_mu_c_w_masked[:, None])
    denom_mu = tl.sum(mu_c_w, axis=-1)
    denom_mu = tl.where(denom_mu == 0.0, 1.0, denom_mu)
    mu_tilde_c_w = mu_c_w / denom_mu[:, None]
    mu_tilde_c_w = mu_tilde_c_w.to(k.dtype)
    # [c, d] [c, w, d] -> [c, w]
    d_mu_tilde_c_w = tl.sum(d_rfa_k[:, None, :] * k, axis=-1)
    # [c, d] [c, d] -> [c]
    d_out_rfa_k_t_rfa_k = tl.sum(d_rfa_k * rfa_k, axis=-1)[:, None]
    # softmax backward: (d_softmax_in - <d_out, out>) * softmax_out
    d_mu_c_w = (d_mu_tilde_c_w - d_out_rfa_k_t_rfa_k) * mu_tilde_c_w

    # [c, w] [c, w, d] -> [d]
    d_param_mu = tl.sum(tl.sum(d_mu_c_w[:, :, None] * k, axis=0), axis=0)
    # [c, w] [c, d] + [c, w] [1, 1, d] -> [c, w, d]
    d_k = mu_tilde_c_w[:, :, None] * d_rfa_k[:, None, :] + d_mu_c_w[:, :, None] * param_mu

    # per-block partial grad of mu; reduced over (batch, group) on the host
    d_param_mu_partial_ptrs = (
        D_PARAM_MU_PARTIAL +
        offs_b * stride_d_mu_b +
        offs_h * stride_d_mu_h +
        start_n * stride_d_mu_g +
        offs_d
    )
    if EVEN_HEADDIM:
        tl.store(
            d_param_mu_partial_ptrs, d_param_mu
        )
    else:
        tl.store(
            d_param_mu_partial_ptrs, d_param_mu,
            mask=offs_d < headdim
        )

    v_ptrs = (
        V +
        offs_b * stride_vb +
        offs_h * stride_vh +
        (
            (
                start_n * BLOCK_N +
                offs_c[:, None, None] * CHUNK_SIZE +
                offs_m[None, :, None]
            ) * stride_vn +
            offs_d[None, None, :]
        )
    )
    if EVEN_N:
        if EVEN_HEADDIM:
            v = tl.load(
                v_ptrs
            )
        else:
            v = tl.load(
                v_ptrs,
                mask=offs_d[None, None, :] < headdim,
                other=0.0
            )
    else:
        if EVEN_HEADDIM:
            v = tl.load(
                v_ptrs,
                mask=(
                    start_n * BLOCK_N +
                    offs_c[:, None, None] * CHUNK_SIZE +
                    offs_m[None, :, None]
                ) < seqlen,
                other=0.0
            )
        else:
            v = tl.load(
                v_ptrs,
                mask=(
                    (
                        start_n * BLOCK_N +
                        offs_c[:, None, None] * CHUNK_SIZE +
                        offs_m[None, :, None]
                    ) < seqlen
                ) & (offs_d[None, None, :] < headdim),
                other=0.0
            )

    # forward outputs and incoming grads for the V side
    if EVEN_N:
        if EVEN_HEADDIM:
            rfa_v = tl.load(
                rfa_v_ptrs
            )
        else:
            rfa_v = tl.load(
                rfa_v_ptrs,
                mask=offs_d[None, :] < headdim,
                other=0.0
            )
    else:
        if EVEN_HEADDIM:
            rfa_v = tl.load(
                rfa_v_ptrs,
                mask=offs_rfa_c[:, None] < nchunks,
                other=0.0
            )
        else:
            rfa_v = tl.load(
                rfa_v_ptrs,
                mask=(offs_rfa_c[:, None] < nchunks) & (offs_d[None, :] < headdim),
                other=0.0
            )

    if EVEN_N:
        if EVEN_HEADDIM:
            d_rfa_v = tl.load(
                d_rfa_v_ptrs
            )
        else:
            d_rfa_v = tl.load(
                d_rfa_v_ptrs,
                mask=offs_d[None, :] < headdim,
                other=0.0
            )
    else:
        if EVEN_HEADDIM:
            d_rfa_v = tl.load(
                d_rfa_v_ptrs,
                mask=offs_rfa_c[:, None] < nchunks,
                other=0.0
            )
        else:
            d_rfa_v = tl.load(
                d_rfa_v_ptrs,
                mask=(offs_rfa_c[:, None] < nchunks) & (offs_d[None, :] < headdim),
                other=0.0
            )

    # recompute phi-softmax weights: scale * (k . phi - 0.5 * ||k||^2)
    param_phi = tl.load(param_phi_ptrs).to(k.dtype)
    phi_c_w = tl.zeros([CHUNKS_PER_BLOCK, CHUNK_SIZE], dtype=tl.float32)
    phi_c_w += tl.sum(k * param_phi, axis=-1)
    phi_c_w -= (0.5 * tl.sum(k * k, axis=-1))
    phi_c_w *= log2e * softmax_scale
    if not EVEN_N:  # Need to mask out otherwise the softmax is wrong
        phi_c_w += tl.where(
            (
                start_n * BLOCK_N +
                offs_c[:, None] * CHUNK_SIZE +
                offs_m[None, :]
            ) < seqlen,
            0,
            float("-inf")
        )

    if MASK_TYPE == 1:
        phi_c_w = tl.where(mask, float("-inf"), phi_c_w)

    m_phi_c_w = tl.max(phi_c_w, axis=-1)
    masked_out_rows_phi = (m_phi_c_w == float("-inf"))
    m_phi_c_w_masked = tl.where(masked_out_rows_phi, 0, m_phi_c_w)
    phi_c_w = tl.exp2(phi_c_w - m_phi_c_w_masked[:, None])
    denom_phi = tl.sum(phi_c_w, axis=-1)
    denom_phi = tl.where(denom_phi == 0.0, 1.0, denom_phi)
    phi_tilde_c_w = phi_c_w / denom_phi[:, None]
    # phi_c_w = tl.exp2(phi_c_w - tl.max(phi_c_w, axis=-1)[:, None])
    # phi_tilde_c_w = phi_c_w / tl.sum(phi_c_w, axis=-1)[:, None]
    phi_tilde_c_w = phi_tilde_c_w.to(k.dtype)
    d_phi_tilde_c_w = tl.sum(d_rfa_v[:, None, :] * v, axis=-1)
    d_out_rfa_v_t_rfa_v = tl.sum(d_rfa_v * rfa_v, axis=-1)[:, None]
    d_phi_c_w = (d_phi_tilde_c_w.to(tl.float32) - d_out_rfa_v_t_rfa_v.to(tl.float32)) * phi_tilde_c_w

    d_param_phi = tl.sum(tl.sum(d_phi_c_w[:, :, None] * k * softmax_scale, axis=0), axis=0)
    d_v = phi_tilde_c_w[:, :, None] * d_rfa_v[:, None, :]
    # [c, w, d] + [c, w] * [1, 1, d] - [c, w, d]
    # d(logit)/d_k = scale * (phi - k), accumulated on top of the mu-side grad
    d_k = d_k + softmax_scale * d_phi_c_w[:, :, None] * (param_phi - k)

    d_k_ptrs = (
        D_K +
        offs_b * stride_d_k_b +
        offs_h * stride_d_k_h +
        (
            (
                start_n * BLOCK_N +
                offs_c[:, None, None] * CHUNK_SIZE +
                offs_m[None, :, None]
            ) * stride_d_k_n +
            offs_d[None, None, :]
        )
    )
    d_v_ptrs = (
        D_V +
        offs_b * stride_d_v_b +
        offs_h * stride_d_v_h +
        (
            (
                start_n * BLOCK_N +
                offs_c[:, None, None] * CHUNK_SIZE +
                offs_m[None, :, None]
            ) * stride_d_v_n +
            offs_d[None, None, :]
        )
    )
    if EVEN_N:
        if EVEN_HEADDIM:
            tl.store(
                d_k_ptrs, d_k
            )
            tl.store(
                d_v_ptrs, d_v
            )
        else:
            tl.store(
                d_k_ptrs, d_k,
                mask=offs_d[None, None, :] < headdim
            )
            tl.store(
                d_v_ptrs, d_v,
                mask=offs_d[None, None, :] < headdim
            )
    else:
        if EVEN_HEADDIM:
            tl.store(
                d_k_ptrs, d_k,
                mask=(
                    (
                        start_n * BLOCK_N +
                        offs_c[:, None, None] * CHUNK_SIZE +
                        offs_m[None, :, None]
                    ) < seqlen
                ),
            )
            tl.store(
                d_v_ptrs, d_v,
                mask=(
                    (
                        start_n * BLOCK_N +
                        offs_c[:, None, None] * CHUNK_SIZE +
                        offs_m[None, :, None]
                    ) < seqlen
                ),
            )
        else:
            tl.store(
                d_k_ptrs, d_k,
                mask=(
                    (
                        start_n * BLOCK_N +
                        offs_c[:, None, None] * CHUNK_SIZE +
                        offs_m[None, :, None]
                    ) < seqlen
                ) & (offs_d[None, None, :] < headdim),
            )
            tl.store(
                d_v_ptrs, d_v,
                mask=(
                    (
                        start_n * BLOCK_N +
                        offs_c[:, None, None] * CHUNK_SIZE +
                        offs_m[None, :, None]
                    ) < seqlen
                ) & (offs_d[None, None, :] < headdim),
            )
    d_param_phi_partial_ptrs = (
        D_PARAM_PHI_PARTIAL +
        offs_b * stride_d_phi_b +
        offs_h * stride_d_phi_h +
        start_n * stride_d_phi_g +
        offs_d
    )
    if EVEN_HEADDIM:
        tl.store(
            d_param_phi_partial_ptrs, d_param_phi
        )
    else:
        tl.store(
            d_param_phi_partial_ptrs, d_param_phi,
            mask=offs_d < headdim
        )
812
+
813
def triton_eva_prep_kv_fwd(k, v, param_mu, param_phi, mask, softmax_scale, chunksize):
    """Launch the forward RFA KV-prep kernel.

    Pools keys/values into per-chunk summaries: within each contiguous chunk of
    `chunksize` positions, keys are combined with softmax weights from
    ``k @ param_mu`` and values with softmax weights from
    ``softmax_scale * (k @ param_phi - 0.5 * ||k||^2)``.

    Args:
        k, v: ``[batch, nheads, seqlen, head_dim]`` CUDA tensors (bf16 or fp32).
        param_mu, param_phi: ``[1, nheads, 1, 1, head_dim]`` per-head parameters.
        mask: optional ``[batch, 1, seqlen, 1]`` boolean mask; True marks
            positions excluded from the pooling softmaxes.
        softmax_scale: value-side logit scale; defaults to ``1/sqrt(head_dim)``
            when falsy.
        chunksize: positions pooled per summary; must divide seqlen and BLOCK.

    Returns:
        ``(out_rfa_k, out_rfa_v)``, each ``[batch, nheads, seqlen // chunksize, head_dim]``.
    """
    k, v, param_mu, param_phi = [
        x if x.stride(-1) == 1 else x.contiguous()
        for x in [k, v, param_mu, param_phi]
    ]

    # shape constraints
    batch, nheads, seqlen, head_dim = k.shape
    assert seqlen % chunksize == 0, "seqlen must be divisible by chunksize"
    nchunks = seqlen // chunksize
    assert k.shape == (batch, nheads, seqlen, head_dim)
    assert v.shape == (batch, nheads, seqlen, head_dim)
    assert param_mu.shape == (1, nheads, 1, 1, head_dim)
    assert param_phi.shape == (1, nheads, 1, 1, head_dim)
    assert head_dim <= 128, "We only test head dimensions up to 128"
    assert k.dtype == v.dtype == param_mu.dtype == param_phi.dtype, "All tensors must have the same type"
    assert k.dtype in [torch.bfloat16, torch.float], "Only support bf16 and fp32 for now"
    assert k.is_cuda and v.is_cuda
    softmax_scale = softmax_scale or 1.0 / math.sqrt(head_dim)

    mask_type = 0
    if mask is not None:
        mask_type = 1
        assert mask.dtype == torch.bool
        assert mask.is_cuda
        assert mask.dim() == 4
        assert mask.shape == (batch, 1, seqlen, 1)
        if mask.stride(-1) != 1:
            mask = mask.contiguous()
    mask_strides = (
        (mask.stride(0), mask.stride(2))
        if mask_type == 1 else
        (0, 0)
    )
    out_rfa_k = torch.empty((batch, nheads, nchunks, head_dim), dtype=k.dtype, device=k.device)
    out_rfa_v = torch.empty((batch, nheads, nchunks, head_dim), dtype=v.dtype, device=v.device)

    BLOCK_HEADDIM = max(triton.next_power_of_2(head_dim), 16)
    BLOCK = 128
    num_warps = 4 if head_dim <= 64 else 8

    # FIX: the original `(BLOCK > chunksize) & (BLOCK % chunksize) == 0` parsed
    # as `((BLOCK > chunksize) & (BLOCK % chunksize)) == 0` because `&` binds
    # tighter than `==`; that expression is vacuously True whenever
    # BLOCK <= chunksize (False & x == 0), so the guard never fired.
    assert BLOCK > chunksize and BLOCK % chunksize == 0, "BLOCK must be divisible by chunksize"
    chunks_per_block = BLOCK // chunksize

    grid = lambda META: (triton.cdiv(seqlen, META["BLOCK_N"]), batch * nheads)
    _fwd_eva_prep_kv_kernel[grid](
        k,
        v,
        param_mu,
        param_phi,
        mask,
        out_rfa_k,
        out_rfa_v,
        softmax_scale,
        k.stride(0), k.stride(1), k.stride(2),
        v.stride(0), v.stride(1), v.stride(2),
        param_mu.stride(1),
        param_phi.stride(1),
        mask_strides[0], mask_strides[1],
        out_rfa_k.stride(0), out_rfa_k.stride(1), out_rfa_k.stride(2),
        out_rfa_v.stride(0), out_rfa_v.stride(1), out_rfa_v.stride(2),
        nheads,
        seqlen,
        nchunks,
        head_dim,
        chunks_per_block,
        chunksize,
        mask_type,
        BLOCK_HEADDIM,
        BLOCK_N=BLOCK,
        num_warps=num_warps,
        num_stages=1,
    )
    return out_rfa_k, out_rfa_v
887
+
888
def triton_eva_prep_kv_bwd(
    d_rfa_k, d_rfa_v,
    k, v, param_mu, param_phi,
    mask,
    rfa_k, rfa_v,
    d_k, d_v, d_param_mu, d_param_phi,
    softmax_scale,
    mask_type,
    chunksize
):
    """Launch the backward RFA KV-prep kernel and reduce parameter gradients.

    Writes gradients in place: `d_k`/`d_v` (same shapes as `k`/`v`) via the
    kernel, and `d_param_mu`/`d_param_phi` via `copy_` after summing the
    per-(batch, block-group) partial buffers produced by the kernel.

    Args:
        d_rfa_k, d_rfa_v: upstream grads, ``[batch, nheads, nchunks, head_dim]``.
        k, v, param_mu, param_phi, mask: forward inputs (see the fwd wrapper).
        rfa_k, rfa_v: forward outputs (needed for the softmax backward).
        d_k, d_v, d_param_mu, d_param_phi: preallocated output gradients.
        softmax_scale: value-side logit scale; defaults to ``1/sqrt(head_dim)``
            when falsy.
        mask_type: 1 when `mask` is present, else 0.
        chunksize: positions pooled per summary; must divide seqlen and BLOCK.
    """
    d_rfa_k, d_rfa_v = [
        x if x.stride(-1) == 1 else x.contiguous()
        for x in [d_rfa_k, d_rfa_v]
    ]

    # shape constraints
    batch, nheads, seqlen, head_dim = k.shape
    assert seqlen % chunksize == 0, "seqlen must be divisible by chunksize"
    nchunks = seqlen // chunksize
    softmax_scale = softmax_scale or 1.0 / math.sqrt(head_dim)

    mask_strides = (
        (mask.stride(0), mask.stride(2))
        if mask_type == 1 else
        (0, 0)
    )

    BLOCK_HEADDIM = max(triton.next_power_of_2(head_dim), 16)
    BLOCK = 128
    num_warps = 4 if head_dim <= 64 else 8

    # FIX: the original `(BLOCK > chunksize) & (BLOCK % chunksize) == 0` parsed
    # as `((BLOCK > chunksize) & (BLOCK % chunksize)) == 0` because `&` binds
    # tighter than `==`; that expression is vacuously True whenever
    # BLOCK <= chunksize (False & x == 0), so the guard never fired.
    assert BLOCK > chunksize and BLOCK % chunksize == 0, "BLOCK must be divisible by chunksize"
    chunks_per_block = BLOCK // chunksize

    # per-block partial parameter grads, reduced over (batch, groups) below;
    # accumulated in fp32 for accuracy regardless of the model dtype
    partial_groups = triton.cdiv(seqlen, BLOCK)
    d_param_mu_partial = torch.zeros((batch, nheads, partial_groups, head_dim), dtype=torch.float32, device=d_rfa_k.device)
    d_param_phi_partial = torch.zeros((batch, nheads, partial_groups, head_dim), dtype=torch.float32, device=d_rfa_k.device)
    grid = lambda META: (partial_groups, batch * nheads)
    _bwd_eva_prep_kv_kernel[grid](
        rfa_k, # [b, h, c, d]
        rfa_v, # [b, h, c, d]
        k, # [b, h, n, d]
        v, # [b, h, n, d]
        param_mu, # [1, h, 1, 1, d]
        param_phi, # [1, h, 1, 1, d]
        mask, # [b, h, n, 1]
        d_rfa_k, # [b, h, c, d]
        d_rfa_v, # [b, h, c, d]
        d_k, # [b, h, n, d]
        d_v, # [b, h, n, d]
        d_param_mu_partial, # [b, h, g, d]
        d_param_phi_partial, # [b, h, g, d]
        softmax_scale,
        rfa_k.stride(0), rfa_k.stride(1), rfa_k.stride(2),
        rfa_v.stride(0), rfa_v.stride(1), rfa_v.stride(2),
        k.stride(0), k.stride(1), k.stride(2),
        v.stride(0), v.stride(1), v.stride(2),
        param_mu.stride(1),
        param_phi.stride(1),
        mask_strides[0], mask_strides[1],
        d_rfa_k.stride(0), d_rfa_k.stride(1), d_rfa_k.stride(2),
        d_rfa_v.stride(0), d_rfa_v.stride(1), d_rfa_v.stride(2),
        d_k.stride(0), d_k.stride(1), d_k.stride(2),
        d_v.stride(0), d_v.stride(1), d_v.stride(2),
        d_param_mu_partial.stride(0), d_param_mu_partial.stride(1), d_param_mu_partial.stride(2),
        d_param_phi_partial.stride(0), d_param_phi_partial.stride(1), d_param_phi_partial.stride(2),
        nheads,
        seqlen,
        nchunks,
        head_dim,
        chunks_per_block,
        chunksize,
        mask_type,
        BLOCK_HEADDIM,
        BLOCK_N=BLOCK,
        num_warps=num_warps,
        num_stages=1,
    )
    # [b, h, g, d] -> sum over batch and groups -> [1, h, 1, d] -> [1, h, 1, 1, d]
    d_param_mu.copy_(d_param_mu_partial.sum(dim=(0, -2), keepdim=True).unsqueeze(-2).to(d_param_mu.dtype))
    d_param_phi.copy_(d_param_phi_partial.sum(dim=(0, -2), keepdim=True).unsqueeze(-2).to(d_param_phi.dtype))
968
+
969
+
970
+
971
class EvaPrepKVFunc(torch.autograd.Function):
    """Autograd bridge for the Triton RFA KV-prep kernels.

    Forward pools (k, v) into per-chunk summaries (rfa_k, rfa_v) via
    `triton_eva_prep_kv_fwd`; backward recomputes the pooling softmaxes and
    returns gradients for k, v, param_mu, and param_phi (None for mask,
    softmax_scale, and chunksize, which are non-differentiable).
    """
    @staticmethod
    def forward(ctx, k, v, param_mu, param_phi, mask, softmax_scale=None, chunksize=None):
        # mask_type mirrors the wrappers' convention: 1 = boolean mask present
        if mask is not None:
            mask_type = 1
        else:
            mask_type = 0
        rfa_k, rfa_v = triton_eva_prep_kv_fwd(
            k, v, param_mu, param_phi, mask, softmax_scale, chunksize
        )
        # outputs are saved too: the softmax backward needs rfa_k/rfa_v
        ctx.save_for_backward(k, v, param_mu, param_phi, mask, rfa_k, rfa_v)
        ctx.softmax_scale = softmax_scale
        ctx.chunksize = chunksize
        ctx.mask_type = mask_type
        return rfa_k, rfa_v

    @staticmethod
    def backward(ctx, d_rfa_k, d_rfa_v):
        k, v, param_mu, param_phi, mask, rfa_k, rfa_v = ctx.saved_tensors
        # preallocated buffers filled in place by the bwd wrapper
        d_k = torch.empty_like(k)
        d_v = torch.empty_like(v)
        d_param_mu = torch.empty_like(param_mu)
        d_param_phi = torch.empty_like(param_phi)
        triton_eva_prep_kv_bwd(
            d_rfa_k, d_rfa_v,
            k, v, param_mu, param_phi,
            mask,
            rfa_k, rfa_v,
            d_k, d_v, d_param_mu, d_param_phi,
            ctx.softmax_scale,
            ctx.mask_type,
            ctx.chunksize
        )
        # one grad slot per forward argument; mask/softmax_scale/chunksize get None
        return d_k, d_v, d_param_mu, d_param_phi, None, None, None
1005
+
1006
def eva_prep_kv_func_triton(
    k, v,
    param_mu, param_phi,
    mask,
    softmax_scale=None, chunksize=None
):
    """Functional entry point for RFA KV preparation; delegates to EvaPrepKVFunc."""
    return EvaPrepKVFunc.apply(k, v, param_mu, param_phi, mask, softmax_scale, chunksize)
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/eva_pt_ref.py ADDED
@@ -0,0 +1,420 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Tuple, Union
2
+ import torch
3
+ from torch import nn
4
+
5
+ MASK_MIN_VALUE = -10e10
6
+
7
def rotate_half(x: torch.Tensor) -> torch.Tensor:
    """
    Rotates half the hidden dims (last dim) of the input.
    Args:
        x: Rotary embedded tensor
    Return:
        Tensor whose second half is negated and moved in front of the first half.
    """
    lower, upper = torch.split(x, x.shape[-1] // 2, dim=-1)
    return torch.cat((upper.neg(), lower), dim=-1)
17
+
18
def apply_rotary_pos_emb(q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor,
                         position_ids: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Apply rotary embedding (cos, sin) to the query and key tensor on the sequence dimension.

    The legends for dimensions are defined as:
        num_heads: number of attention heads
        current_seq_len: the current batch's sequence length, should be either 1 or max_seq_len
        max_seq_len: the static sequence length, different from current_seq_len in cached inference
            case where it is always the maximum length, e.g. the length of the static KV cache

    Args:
        q: Query tensor, of size (batch_size, num_heads, current_seq_len, head_dim)
        k: Key tensor, of size (batch_size, num_key_value_heads, current_seq_len, head_dim)
        cos: Cosine base of rotary embedding; a 4-D tensor broadcastable against q/k whose
            last dimension equals head_dim (the code below asserts cos.shape[3] == head_dim)
        sin: Sine base of rotary embedding, same layout as `cos`
        position_ids: The position indices of the tokens corresponding to the query and key
            tensors, of shape (batch_size, current_seq_len) or (1, current_seq_len). Used here
            only for shape validation; cos/sin are assumed to be pre-gathered.

    Returns:
        Tuple (q_embed, k_embed): the rotary-embedded query and key tensors, each the same
        size as its input.
    """
    bsz, nheads, cur_seq_len, head_dim = q.shape
    assert len(
        k.shape) == 4, f"k should be of shape (batch_size, num_heads, current_seq_len, head_dim), got {k.shape} instead"
    assert k.shape[0] == bsz, f"k has a different batch_size {k.shape[0]} compared to q {bsz}"
    assert list(k.shape[2:]) == [cur_seq_len,
                                 head_dim], f"k has different current_seq_len and/or head_dim compared to q"
    assert cos.shape[3] == head_dim, f"cos should have dim of head dim {head_dim}, got {cos.shape[3]} instead"
    assert list(position_ids.shape) in [[bsz, cur_seq_len], [1, cur_seq_len]],\
        f"position_ids should be of shape {[bsz, cur_seq_len]} or {[1, cur_seq_len]}, got {position_ids.shape} instead"

    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
55
+
56
def attention_op(
    q,
    k,
    v,
    attn_mask,
    mixedp_attn,
    head_dim_scaling
):
    """Exact scaled-dot-product attention.

    Computes softmax(scale * q k^T) v, optionally upcasting the logits to
    float32 (``mixedp_attn``) and masking positions where ``attn_mask`` is
    True with a large negative value before the softmax.
    """
    scores = q @ k.transpose(-2, -1)
    if mixedp_attn:
        # Upcast logits for numerical stability in mixed precision.
        scores = scores.to(torch.float)
    scores = scores * head_dim_scaling
    if attn_mask is not None:
        scores = scores.masked_fill(attn_mask, MASK_MIN_VALUE)

    probs = torch.softmax(scores, dim=-1).to(q.dtype)
    return probs @ v
74
+
75
def prm_projection(
    x: torch.Tensor,
    projection_matrix: torch.Tensor,
    mixedp_attn: bool = False
):
    """
    Constructs nonnegative kernel features for fast softmax attention.
    Args:
        x: input for which features are computed, shape [..., m, d]
        projection_matrix: random matrix used to compute features, shape [..., r, d]
        mixedp_attn: upcast intermediate results to float32 when True
    Returns:
        Log-domain random features of shape [..., r, m].
    """
    inv_sqrt_d = x.shape[-1] ** -0.5
    # [..., r, d] @ [..., d, m] -> [..., r, m]
    projected = projection_matrix @ x.transpose(-1, -2)
    # Half squared norm per row of x, broadcast over the r dimension.
    half_sq_norm = x.pow(2).sum(dim=-1).unsqueeze(-2) * 0.5
    if mixedp_attn:
        projected = projected.to(torch.float)
        half_sq_norm = half_sq_norm.to(torch.float)
    return inv_sqrt_d * (projected - half_sq_norm)
98
+
99
class EvaAttention(nn.Module):
    """PyTorch reference implementation of EVA attention.

    Combines exact local (windowed) attention over recent tokens with
    chunk-level random-feature attention (RFA) summaries of the remaining
    context; the two are aggregated through a single shared softmax by
    concatenating per-chunk summary keys/values onto the local window.

    Fix vs. previous revision: the fallback error message for an
    unsupported attention-mask tuple said "length 2 or 3" although the
    accepted lengths are 3 (no cache) and 4 (with cache).
    """

    def __init__(self, config, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.head_dim_scaling = self.head_dim ** -0.5

        self.max_position_embeddings = config.max_position_embeddings

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

        self.window_size = config.window_size

        self.num_chunks = config.num_chunks
        self.chunk_size = config.chunk_size
        if self.chunk_size is not None:
            assert self.window_size >= self.chunk_size and self.window_size % self.chunk_size == 0
            # chunk_size overrides the number of landmarks
            self.num_chunks = None

            self.chunks_per_window = int(self.window_size // self.chunk_size)
        # r = 1 random feature per chunk; see _calculate_chunk_rfa for the
        # simplification this enables.
        self.random_feature_dim = 1
        # Learned per-head projection used as the (single) random feature map.
        self.adaptive_phi = nn.Parameter(
            torch.randn(
                1,
                self.num_heads,
                1,
                1,
                self.head_dim
            ).clamp(-1., 1.) * self.head_dim_scaling
        )
        # Learned per-head query vector used to pool each chunk's keys.
        self.adaptive_mu_k = nn.Parameter(
            torch.randn(
                1,
                self.num_heads,
                1,
                1,
                self.head_dim
            ).clamp(-1., 1.) * self.head_dim_scaling
        )

    def _generate_feature_map(self, rf_q, rf_k, rf_v):
        """Pool each chunk's keys into a single landmark key (rf_k_bar) and
        return the learned feature-map weights. rf_q / rf_v are accepted for
        interface symmetry but unused here."""
        rf_k_logits = torch.sum(self.adaptive_mu_k.to(rf_k.dtype) * rf_k, dim=-1, keepdim=True)  # b h c m 1
        if self.config.mixedp_attn:
            rf_k_logits = rf_k_logits.to(torch.float)
        rf_k_weights = torch.softmax(rf_k_logits, dim=-2).to(rf_k.dtype)
        rf_k_bar = torch.sum(rf_k_weights * rf_k, dim=-2)
        weights = self.adaptive_phi.to(rf_k.dtype)
        return weights, rf_k_bar

    def _calculate_chunk_rfa_cache(self, rf_q, rf_k, rf_v, weights, rf_mask=None):
        """Compute the per-chunk softmax-weighted value summary.

        Returns (softmax_phi_k_v, log_sum_phi_k); the latter is None because
        with random_feature_dim == 1 the normalizer cancels out downstream.
        """
        # Log-domain feature of each key under the learned map `weights`.
        proj_x = torch.sum(weights * rf_k, dim=-1, keepdim=True)
        norm = torch.sum(rf_k ** 2, dim=-1, keepdim=True) * 0.5  # [..., 1]
        if self.config.mixedp_attn:
            proj_x = proj_x.to(torch.float)
            norm = norm.to(torch.float)
        log_phi_k = self.head_dim_scaling * (proj_x - norm)

        if rf_mask is not None:
            log_phi_k = log_phi_k.masked_fill(rf_mask, MASK_MIN_VALUE)

        # [b, h, c, m, r]
        softmax_phi_k = torch.softmax(log_phi_k, dim=-2).to(rf_k.dtype)
        softmax_phi_k_v = torch.sum(softmax_phi_k * rf_v, dim=-2)
        # [b, h, c, r, m] [b, h, c, m, d] -> [b, h, c, r, d]
        # softmax_phi_k_v = torch.matmul(softmax_phi_k.transpose(-1, -2), rf_v).squeeze(-2)
        log_sum_phi_k = None
        return softmax_phi_k_v, log_sum_phi_k

    def _calculate_chunk_rfa(self, q, softmax_phi_k_v, log_sum_phi_k, weights):
        """Turn chunk summaries into the per-chunk RFA value estimate."""
        if self.random_feature_dim == 1:
            # when r = 1, the snis weights becomes 1, so this takes no effect
            # [b, h, c, r, d] -> [b, h, c, d]
            return softmax_phi_k_v
        else:
            # Self-normalized importance sampling over r random features.
            # [b, h, c, r, d] [b, h, 1, s, d] -> [b, h, c, r, s]
            log_phi_q = prm_projection(q.unsqueeze(-3), weights, self.config.mixedp_attn)
            # [b, h, c, r, s] [b, h, c, r, 1] -> [b, h, c, r, s]
            sniw = torch.softmax(log_phi_q + log_sum_phi_k, dim=-1).to(q.dtype)
            # [b, h, c, r, s] [b, h, c, r, d] -> [b, h, c, s, d] -> [b, h, s, c, d]
            rfa_per_chunk = torch.matmul(sniw.transpose(-1, -2), softmax_phi_k_v).transpose(-3, -2)
            return rfa_per_chunk

    def window_partition(self, x, window_size=None):
        """Reshape [..., g*w, d] -> [..., g, w, d] (seq len must divide evenly)."""
        window_size = window_size if window_size is not None else self.window_size

        gw, d = x.shape[-2:]
        leading_dims = x.shape[:-2]
        n_groups = gw // window_size
        return x.reshape(*leading_dims, n_groups, window_size, d)

    def window_merge(self, x, window_size=None):
        """Inverse of window_partition: [..., g, w, d] -> [..., g*w, d]."""
        g, w, d = x.shape[-3:]
        leading_dims = x.shape[:-3]
        return x.reshape(*leading_dims, g * w, d)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cos: Optional[torch.Tensor] = None,
        sin: Optional[torch.Tensor] = None,
        multibyte_decoding: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        assert not output_attentions
        bsz, q_len, _ = hidden_states.size()

        ############################################
        # initialize past states if not provided
        ############################################
        if use_cache and past_key_value is None:
            raise ValueError
        if use_cache and multibyte_decoding:
            raise NotImplementedError("Multibyte decoding is not supported for PyTorch native implementation")
        # attention_mask is a tuple of pre-built boolean masks:
        # 4 entries when decoding with a cache, 3 during full-sequence runs.
        if len(attention_mask) == 4:
            assert use_cache
            prev_causal_mask, cur_causal_mask, chunk_causal_mask, intra_chunk_mask = attention_mask
        elif len(attention_mask) == 3:
            assert not use_cache
            window_causal_mask, chunk_causal_mask, intra_chunk_mask = attention_mask
        else:
            raise NotImplementedError("Only attention-mask tuple with length 3 or 4 is supported")

        ############################################
        # compute q, k, v from hidden states
        ############################################
        # [b, h, q_len, d]
        q = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        # [b, h, kv_len, d]
        k = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        # [b, h, kv_len, d]
        v = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)

        if use_cache:
            past_key_value.update_past_len(q.shape[-2], self.layer_idx)

        ############################################
        # apply rotary positional embeddings to q, k
        ############################################
        q, k = apply_rotary_pos_emb(q, k, cos, sin, position_ids)

        ############################################
        # compute q, k, v stats for the local window
        ############################################
        if use_cache:
            (prev_w_q, prev_w_k, prev_w_v), (cur_w_q, cur_w_k, cur_w_v) = past_key_value.update_singletons(
                q,
                k,
                v,
                self.layer_idx,
                self.window_size,
            )
        else:
            prev_w_q = self.window_partition(q)  # [b, h, w, i, d]
            prev_w_k = self.window_partition(k)  # [b, h, w, j, d]
            prev_w_v = self.window_partition(v)  # [b, h, w, j, d]
            # during training, we assume window_size divides seq_len so no remainders
            cur_w_q = cur_w_k = cur_w_v = None

        ############################################
        # compute q, k, v stats for chunk-level RFAs
        ############################################
        if use_cache:
            dump_q, dump_k, dump_v = past_key_value.update_chunks(q, k, v, self.layer_idx, self.chunk_size)
        else:
            dump_q, dump_k, dump_v = q, k, v

        if use_cache:
            prev_s_mask, cur_s_mask, prev_chunk_mask, cur_chunk_mask, dump_rf_mask = past_key_value.update_mask(
                prev_s_mask=prev_causal_mask,
                cur_s_mask=cur_causal_mask,
                chunk_mask=chunk_causal_mask,
                rf_mask=intra_chunk_mask,
                layer_idx=self.layer_idx,
                window_size=self.window_size,
                chunk_size=self.chunk_size,
            )
        else:
            prev_s_mask = self.window_partition(prev_causal_mask)  # [1, 1, w, i, j]
            cur_s_mask = None
            prev_chunk_mask = self.window_partition(chunk_causal_mask)
            cur_chunk_mask = None
            dump_rf_mask = intra_chunk_mask
            if prev_s_mask.shape[-3] == 1:
                # need to expand
                prev_s_mask = prev_s_mask.expand(-1, -1, prev_chunk_mask.shape[-3], -1, -1)

        if (
            dump_q is not None and
            dump_k is not None and
            dump_v is not None
        ):
            # [b, h, c, j, d]
            rf_q = self.window_partition(dump_q, window_size=self.chunk_size)
            # [b, h, c, j, d]
            rf_k = self.window_partition(dump_k, window_size=self.chunk_size)
            # [b, h, c, j, d]
            rf_v = self.window_partition(dump_v, window_size=self.chunk_size)

            if dump_rf_mask is not None:
                rf_mask = self.window_partition(dump_rf_mask, window_size=self.chunk_size)
                # Zero out padded positions so they do not pollute the pooled stats.
                rf_q = rf_q.masked_fill(rf_mask, 0.)
                rf_k = rf_k.masked_fill(rf_mask, 0.)
                rf_v = rf_v.masked_fill(rf_mask, 0.)
            else:
                rf_mask = None
        else:
            rf_q = None
            rf_k = None
            rf_v = None
            rf_mask = None

        if rf_q is not None:
            weights, rf_k_bar = self._generate_feature_map(rf_q, rf_k, rf_v)
            softmax_phi_k_v, log_sum_phi_k = self._calculate_chunk_rfa_cache(rf_q, rf_k, rf_v, weights, rf_mask=rf_mask)
            if use_cache:
                softmax_phi_k_v, log_sum_phi_k, rf_k_bar = past_key_value.update_chunk_rfas(
                    softmax_phi_k_v, log_sum_phi_k, rf_k_bar, self.layer_idx, 1
                )
        elif use_cache:
            # No new full chunk this step; reuse cached chunk summaries.
            weights = None
            softmax_phi_k_v, log_sum_phi_k, rf_k_bar = past_key_value.get_chunk_rfas(self.layer_idx)
        else:
            weights = None
            softmax_phi_k_v = None
            log_sum_phi_k = None
            rf_k_bar = None

        if rf_k_bar is not None:
            rfa_per_chunk = self._calculate_chunk_rfa(q, softmax_phi_k_v, log_sum_phi_k, weights)
        ############################################
        # compute meta-attention weights for
        #   - group-wise RFAs and
        #   - singletons (equivalent to exact local attention)
        ############################################
        if prev_w_k is not None:
            if rf_k_bar is not None:
                num_windows = prev_w_k.shape[-3]
                # rf_k_bar and rfa_per_chunk take the shape [b, h, c, d]
                # -> [b, h, 1, c, d] -> [b, h, w, c, d]
                prev_rf_k_bar = rf_k_bar.unsqueeze(-3).expand(-1, -1, num_windows, -1, -1)
                prev_rfa_per_chunk = rfa_per_chunk.unsqueeze(-3).expand(-1, -1, num_windows, -1, -1)
                prev_agg_k = torch.cat([prev_w_k, prev_rf_k_bar], dim=-2)
                prev_agg_v = torch.cat([prev_w_v, prev_rfa_per_chunk], dim=-2)

                prev_attn_mask = torch.cat([prev_s_mask, prev_chunk_mask], dim=-1)
            else:
                prev_agg_k = prev_w_k
                prev_agg_v = prev_w_v
                prev_attn_mask = prev_s_mask

            prev_attn_output = attention_op(
                q=prev_w_q,
                k=prev_agg_k,
                v=prev_agg_v,
                attn_mask=prev_attn_mask,
                mixedp_attn=self.config.mixedp_attn,
                head_dim_scaling=self.head_dim_scaling
            )
            prev_attn_output = self.window_merge(prev_attn_output)

        if cur_w_k is not None:
            if rf_k_bar is not None:
                # rf_k_bar and rfa_per_chunk take the shape [b, h, c, d]
                # cur_w_k and cur_w_v also has shape [b, h, m, d]
                cur_agg_k = torch.cat([cur_w_k, rf_k_bar], dim=-2)
                cur_agg_v = torch.cat([cur_w_v, rfa_per_chunk], dim=-2)

                cur_attn_mask = torch.cat([cur_s_mask, cur_chunk_mask], dim=-1)
            else:
                cur_agg_k = cur_w_k
                cur_agg_v = cur_w_v
                cur_attn_mask = cur_s_mask

            cur_attn_output = attention_op(
                q=cur_w_q,
                k=cur_agg_k,
                v=cur_agg_v,
                attn_mask=cur_attn_mask,
                mixedp_attn=self.config.mixedp_attn,
                head_dim_scaling=self.head_dim_scaling
            )

        if prev_w_k is not None and cur_w_k is not None:
            attn_output = torch.cat([prev_attn_output, cur_attn_output], dim=-2)
        elif prev_w_k is not None:
            attn_output = prev_attn_output
        elif cur_w_k is not None:
            attn_output = cur_attn_output
        else:
            raise ValueError("There must be some bug")

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).reshape(bsz, q_len, self.hidden_size)
        attn_output = self.o_proj(attn_output)

        attn_weights = None

        return attn_output, attn_weights, past_key_value
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.47.1"
7
+ }
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/image_processing_evabyte.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ """Image processor class for EvaByte."""
3
+
4
+ from typing import Dict, List, Optional, Union, Tuple
5
+
6
+ import io
7
+ from transformers.image_processing_utils import BaseImageProcessor
8
+ from transformers.image_utils import (
9
+ ImageInput,
10
+ PILImageResampling,
11
+ valid_images,
12
+ validate_preprocess_arguments,
13
+ )
14
+ from PIL import Image
15
+
16
def _get_qtable_bytes():
    """Return a mapping from JPEG quality level to a pre-serialized pair of
    DQT (quantization-table) segments wrapped in SOI/EOI markers.

    Each value is the raw bytes of a minimal JPEG stream containing only the
    two quantization tables Pillow emits for that quality; callers splice the
    table bytes into table-less JPEG streams (see jpeg_merge_qtables)."""
    return {
        5: b'\xff\xd8\xff\xdb\x00C\x00\xa0nx\x8cxd\xa0\x8c\x82\x8c\xb4\xaa\xa0\xbe\xf0\xff\xff\xf0\xdc\xdc\xf0\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xdb\x00C\x01\xa0\xb4\xb4\xf0\xd2\xf0\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xd9',
        10: b'\xff\xd8\xff\xdb\x00C\x00P7<F<2PFAFZUP_x\xc8\x82xnnx\xf5\xaf\xb9\x91\xc8\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xdb\x00C\x01PZZxix\xeb\x82\x82\xeb\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xd9',
        15: b'\xff\xd8\xff\xdb\x00C\x005%(/(!5/+/<95?P\x85WPIIP\xa3u{a\x85\xc1\xaa\xcb\xc8\xbe\xaa\xba\xb7\xd5\xf0\xff\xff\xd5\xe2\xff\xe6\xb7\xba\xff\xff\xff\xff\xff\xff\xff\xff\xff\xce\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xdb\x00C\x015<<PFP\x9dWW\x9d\xff\xdc\xba\xdc\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xd9',
        20: b'\xff\xd8\xff\xdb\x00C\x00(\x1c\x1e#\x1e\x19(#!#-+(0<dA<77<{X]Id\x91\x80\x99\x96\x8f\x80\x8c\x8a\xa0\xb4\xe6\xc3\xa0\xaa\xda\xad\x8a\x8c\xc8\xff\xcb\xda\xee\xf5\xff\xff\xff\x9b\xc1\xff\xff\xff\xfa\xff\xe6\xfd\xff\xf8\xff\xdb\x00C\x01(--<5<vAAv\xf8\xa5\x8c\xa5\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xf8\xff\xd9',
        25: b'\xff\xd8\xff\xdb\x00C\x00 \x16\x18\x1c\x18\x14 \x1c\x1a\x1c$" &0P40,,0bFJ:Ptfzxrfpn\x80\x90\xb8\x9c\x80\x88\xae\x8anp\xa0\xda\xa2\xae\xbe\xc4\xce\xd0\xce|\x9a\xe2\xf2\xe0\xc8\xf0\xb8\xca\xce\xc6\xff\xdb\x00C\x01 $$0*0^44^\xc6\x84p\x84\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xc6\xff\xd9',
        30: b'\xff\xd8\xff\xdb\x00C\x00\x1b\x12\x14\x17\x14\x11\x1b\x17\x16\x17\x1e\x1c\x1b (B+(%%(Q:=0B`Ued_U][jx\x99\x81jq\x90s[]\x85\xb5\x86\x90\x9e\xa3\xab\xad\xabg\x80\xbc\xc9\xba\xa6\xc7\x99\xa8\xab\xa4\xff\xdb\x00C\x01\x1b\x1e\x1e(#(N++N\xa4n]n\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xa4\xff\xd9',
        50: b'\xff\xd8\xff\xdb\x00C\x00\x10\x0b\x0c\x0e\x0c\n\x10\x0e\r\x0e\x12\x11\x10\x13\x18(\x1a\x18\x16\x16\x181#%\x1d(:3=<9387@H\\N@DWE78PmQW_bghg>Mqypdx\\egc\xff\xdb\x00C\x01\x10\x12\x12\x18\x15\x18/\x1a\x1a/cB8Bcccccccccccccccccccccccccccccccccccccccccccccccccc\xff\xd9',
        75: b'\xff\xd8\xff\xdb\x00C\x00\x08\x06\x06\x07\x06\x05\x08\x07\x07\x07\t\t\x08\n\x0c\x14\r\x0c\x0b\x0b\x0c\x19\x12\x13\x0f\x14\x1d\x1a\x1f\x1e\x1d\x1a\x1c\x1c $.\' ",#\x1c\x1c(7),01444\x1f\'9=82<.342\xff\xdb\x00C\x01\x08\t\t\x0c\x0b\x0c\x18\r\r\x182!\x1c!22222222222222222222222222222222222222222222222222\xff\xd9',
        95: b'\xff\xd8\xff\xdb\x00C\x00\x02\x01\x01\x01\x01\x01\x02\x01\x01\x01\x02\x02\x02\x02\x02\x04\x03\x02\x02\x02\x02\x05\x04\x04\x03\x04\x06\x05\x06\x06\x06\x05\x06\x06\x06\x07\t\x08\x06\x07\t\x07\x06\x06\x08\x0b\x08\t\n\n\n\n\n\x06\x08\x0b\x0c\x0b\n\x0c\t\n\n\n\xff\xdb\x00C\x01\x02\x02\x02\x02\x02\x02\x05\x03\x03\x05\n\x07\x06\x07\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\xff\xd9',
        100: b'\xff\xd8\xff\xdb\x00C\x00\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\xff\xdb\x00C\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\xff\xd9',
    }
29
+
30
+
31
+ def _resize_if_exceeding_max_len(
32
+ width: int, height: int, min_len: Optional[int] = 16, max_len: Optional[int] = None
33
+ ) -> Tuple[int, int]:
34
+ """
35
+ Get the output size of the image after resizing given a dictionary specifying the max and min sizes.
36
+
37
+ Args:
38
+ height (`int`):
39
+ Height of the input image.
40
+ width (`int`):
41
+ Width of the input image.
42
+ max_len (`Dict[str, int]`, *optional*, defaults to the maximum size of the image):
43
+ Defines the maximum dimensions of the image.
44
+
45
+ Returns:
46
+ The output size of the image after resizing.
47
+ """
48
+ max_len = max(height, width) if max_len is None else max_len
49
+ aspect_ratio = width / height
50
+
51
+ if width >= height and width > max_len:
52
+ width = max_len
53
+ height = int(width / aspect_ratio)
54
+ if height % 2 != 0:
55
+ height += 1
56
+ elif height > width and height > max_len:
57
+ height = max_len
58
+ width = int(height * aspect_ratio)
59
+ if width % 2 != 0:
60
+ width += 1
61
+
62
+ # Avoid resizing to a size smaller than 1
63
+ height = max(height, min_len)
64
+ width = max(width, min_len)
65
+ return width, height
66
+
67
class EvaByteImageProcessor(BaseImageProcessor):
    """Image processor for EvaByte: converts PIL images into raw JPEG byte
    streams (rather than pixel tensors), since the model consumes bytes
    directly. Images are optionally converted to RGB, resized so the longest
    edge fits `size`, then JPEG-encoded with fixed quantization settings."""

    # No tensor inputs are produced, hence no model input names.
    model_input_names = []

    def __init__(
        self,
        do_resize: bool = True,
        resample: PILImageResampling = PILImageResampling.LANCZOS,
        size: Dict[str, int] = None,
        do_convert_rgb: bool = True,
        jpeg_quality: int = 25,
        jpeg_subsampling: str = "4:2:0",
        jpeg_streamtype: int = 2,
        jpeg_restart_marker_blocks: int = 1,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.do_resize = do_resize
        self.resample = resample
        self.size = size if size is not None else {"longest_edge": 384}
        self.do_convert_rgb = do_convert_rgb
        self.jpeg_quality = jpeg_quality
        self.jpeg_subsampling = jpeg_subsampling
        # streamtype=2 asks Pillow for an "abbreviated" stream without
        # quantization tables — presumably re-merged later via
        # jpeg_merge_qtables; TODO confirm against Pillow's JPEG plugin docs.
        self.jpeg_streamtype = jpeg_streamtype
        self.jpeg_restart_marker_blocks = jpeg_restart_marker_blocks

    def jpeg_encode(
        self,
        image,
        jpeg_quality,
        jpeg_subsampling,
        jpeg_streamtype,
        jpeg_restart_marker_blocks,
    ):
        """Encode a single PIL image to JPEG bytes with the given settings."""
        with io.BytesIO() as output:
            image.save(
                output,
                format="JPEG",
                quality=jpeg_quality,
                subsampling=jpeg_subsampling,
                streamtype=jpeg_streamtype,
                restart_marker_blocks=jpeg_restart_marker_blocks
            )
            jpeg_bytes = output.getvalue()
        return jpeg_bytes

    def jpeg_merge_qtables(
        self,
        image_bytes: bytes,
        jpeg_quality: Optional[int] = None,
    ) -> bytes:
        """Splice the pre-serialized quantization tables for `jpeg_quality`
        into a table-less JPEG stream.

        Keeps the leading SOI marker (first 2 bytes) of `image_bytes`, inserts
        the DQT segments (qtable bytes minus their own SOI/EOI markers), then
        appends the rest of the original stream.
        """
        if jpeg_quality is None:
            jpeg_quality = self.jpeg_quality
        qtable_bytes = _get_qtable_bytes()[jpeg_quality]
        return image_bytes[:2] + qtable_bytes[2:-2] + image_bytes[2:]

    def resize(
        self,
        image: Image.Image,
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.LANCZOS,
    ) -> Image.Image:
        """Resize a PIL image either by capping its longest edge
        (`{"longest_edge": n}`) or to an explicit `{"width": w, "height": h}`."""
        if "longest_edge" in size:
            width, height = image.size
            # Find the output size, when rescaling the longest edge to max_len and preserving the aspect ratio
            width, height = _resize_if_exceeding_max_len(width, height, max_len=size["longest_edge"])
            size = (width, height)
        elif "width" in size and "height" in size:
            size = (size["width"], size["height"])
        else:
            raise ValueError("size must be a dictionary with key 'longest_edge' or 'height' and 'width'.")
        resized_image = image.resize(size, resample=resample)
        return resized_image

    def preprocess(
        self,
        images: ImageInput,
        do_resize: bool = None,
        resample = None,
        size: Dict[str, int] = None,
        do_convert_rgb: bool = None,
        jpeg_quality: Optional[int] = None,
        jpeg_subsampling: Optional[str] = None,
        jpeg_streamtype: Optional[int] = None,
        jpeg_restart_marker_blocks: Optional[int] = None,
    ):
        """Convert a batch of image lists into nested lists of JPEG bytes.

        `images` is expected to be a list of lists of PIL images (one inner
        list per sample); the return value mirrors that nesting, with each
        image replaced by its encoded JPEG byte string. Any argument left as
        None falls back to the processor's configured default.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        resample = resample if resample is not None else self.resample
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

        jpeg_quality = jpeg_quality if jpeg_quality is not None else self.jpeg_quality
        jpeg_subsampling = jpeg_subsampling if jpeg_subsampling is not None else self.jpeg_subsampling
        jpeg_streamtype = jpeg_streamtype if jpeg_streamtype is not None else self.jpeg_streamtype
        jpeg_restart_marker_blocks = jpeg_restart_marker_blocks if jpeg_restart_marker_blocks is not None else self.jpeg_restart_marker_blocks

        if images is not None and not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        validate_preprocess_arguments(
            do_resize=do_resize,
            size=size,
            resample=resample,
        )
        images_list = images
        if do_convert_rgb:
            images_list = [
                [
                    image.convert("RGB") for image in images
                ]
                for images in images_list
            ]

        if do_resize:
            images_list = [
                [
                    self.resize(image=image, size=size, resample=resample)
                    for image in images
                ]
                for images in images_list
            ]

        jpeg_bytes = [
            [
                self.jpeg_encode(
                    image,
                    jpeg_quality,
                    jpeg_subsampling,
                    jpeg_streamtype,
                    jpeg_restart_marker_blocks
                ) for image in images
            ]
            for images in images_list
        ]
        return jpeg_bytes
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/model.safetensors.index.json ADDED
@@ -0,0 +1,450 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 57058938880
4
+ },
5
+ "weight_map": {
6
+ "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
7
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
8
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
9
+ "model.layers.1.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
10
+ "model.layers.1.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
11
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
12
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
13
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
14
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
15
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
16
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
17
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
18
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
19
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
20
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
21
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
22
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
23
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
24
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
25
+ "model.layers.13.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
26
+ "model.layers.13.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
27
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
28
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
29
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
30
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
31
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
32
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
33
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
34
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
35
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
36
+ "model.layers.21.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
37
+ "model.layers.21.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
38
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
39
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
40
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
41
+ "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
42
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
43
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
44
+ "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
45
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors",
46
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
47
+ "model.layers.29.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
48
+ "model.layers.29.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
49
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
50
+ "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
51
+ "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
52
+ "model.layers.32.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
53
+ "model.layers.32.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
54
+ "model.layers.34.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
55
+ "model.layers.36.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
56
+ "model.layers.36.input_layernorm.weight": "model-00003-of-00003.safetensors",
57
+ "model.layers.36.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
58
+ "model.layers.37.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
59
+ "model.layers.37.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
60
+ "model.layers.37.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
61
+ "model.layers.37.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
62
+ "model.layers.39.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
63
+ "model.layers.2.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
64
+ "model.layers.26.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
65
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
66
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
67
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
68
+ "model.layers.3.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
69
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
70
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
71
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
72
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
73
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
74
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
75
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
76
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
77
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
78
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
79
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
80
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
81
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
82
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
83
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
84
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
85
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
86
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
87
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
88
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
89
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
90
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
91
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
92
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
93
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
94
+ "model.layers.27.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
95
+ "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
96
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
97
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
98
+ "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
99
+ "model.layers.33.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
100
+ "model.layers.33.input_layernorm.weight": "model-00003-of-00003.safetensors",
101
+ "model.layers.33.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
102
+ "model.layers.34.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
103
+ "model.layers.34.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
104
+ "model.layers.36.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
105
+ "model.layers.37.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
106
+ "model.layers.37.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
107
+ "model.layers.39.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
108
+ "model.layers.3.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
109
+ "model.layers.27.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
110
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
111
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
112
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
113
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
114
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
115
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
116
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
117
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
118
+ "model.layers.4.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
119
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
120
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
121
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
122
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
123
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
124
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
125
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
126
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
127
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
128
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
129
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
130
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
131
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
132
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
133
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
134
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
135
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
136
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
137
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
138
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
139
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
140
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
141
+ "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
142
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
143
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
144
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
145
+ "model.layers.28.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
146
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
147
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors",
148
+ "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
149
+ "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
150
+ "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
151
+ "model.layers.32.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
152
+ "model.layers.33.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
153
+ "model.layers.33.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
154
+ "model.layers.35.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
155
+ "model.layers.37.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
156
+ "model.layers.37.input_layernorm.weight": "model-00003-of-00003.safetensors",
157
+ "model.layers.37.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
158
+ "model.layers.38.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
159
+ "model.layers.38.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
160
+ "model.layers.4.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
161
+ "model.layers.28.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
162
+ "model.layers.5.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
163
+ "model.layers.0.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
164
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
165
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
166
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
167
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
168
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
169
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
170
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
171
+ "model.layers.9.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
172
+ "model.layers.9.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
173
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
174
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
175
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
176
+ "model.layers.12.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
177
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
178
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
179
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
180
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
181
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
182
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
183
+ "model.layers.17.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
184
+ "model.layers.17.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
185
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
186
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
187
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
188
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
189
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
190
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
191
+ "model.layers.23.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
192
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
193
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
194
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
195
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
196
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
197
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
198
+ "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
199
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
200
+ "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
201
+ "model.layers.32.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
202
+ "model.layers.32.input_layernorm.weight": "model-00003-of-00003.safetensors",
203
+ "model.layers.32.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
204
+ "model.layers.33.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
205
+ "model.layers.33.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
206
+ "model.layers.33.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
207
+ "model.layers.33.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
208
+ "model.layers.35.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
209
+ "model.layers.36.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
210
+ "model.layers.36.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
211
+ "model.layers.38.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
212
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
213
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
214
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
215
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
216
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
217
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
218
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
219
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
220
+ "model.layers.5.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
221
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
222
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
223
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
224
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
225
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
226
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
227
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
228
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
229
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
230
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
231
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
232
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
233
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
234
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
235
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
236
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
237
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
238
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
239
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
240
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
241
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
242
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
243
+ "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
244
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
245
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors",
246
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
247
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
248
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
249
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
250
+ "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
251
+ "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
252
+ "model.layers.32.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
253
+ "model.layers.34.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
254
+ "model.layers.34.input_layernorm.weight": "model-00003-of-00003.safetensors",
255
+ "model.layers.34.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
256
+ "model.layers.35.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
257
+ "model.layers.35.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
258
+ "model.layers.37.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
259
+ "model.layers.38.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
260
+ "model.layers.38.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
261
+ "model.layers.6.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
262
+ "model.layers.30.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
263
+ "model.layers.6.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
264
+ "model.layers.30.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
265
+ "model.layers.7.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
266
+ "model.layers.31.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
267
+ "model.layers.7.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
268
+ "model.layers.31.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
269
+ "model.layers.8.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
270
+ "model.layers.32.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
271
+ "model.layers.2.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
272
+ "model.layers.14.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
273
+ "model.layers.14.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
274
+ "model.layers.22.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
275
+ "model.layers.22.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
276
+ "model.layers.38.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
277
+ "model.layers.38.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
278
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
279
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
280
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
281
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
282
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
283
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
284
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
285
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
286
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
287
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
288
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
289
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
290
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
291
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
292
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
293
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
294
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
295
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
296
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
297
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
298
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
299
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
300
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
301
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
302
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
303
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
304
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
305
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
306
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
307
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
308
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors",
309
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
310
+ "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
311
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
312
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
313
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
314
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
315
+ "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
316
+ "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
317
+ "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
318
+ "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
319
+ "model.layers.32.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
320
+ "model.layers.32.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
321
+ "model.layers.34.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
322
+ "model.layers.35.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
323
+ "model.layers.35.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
324
+ "model.layers.37.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
325
+ "model.layers.39.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
326
+ "model.layers.39.input_layernorm.weight": "model-00003-of-00003.safetensors",
327
+ "model.layers.39.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
328
+ "model.norm.weight": "model-00003-of-00003.safetensors",
329
+ "lm_head.weight": "model-00003-of-00003.safetensors",
330
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
331
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
332
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
333
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
334
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
335
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
336
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
337
+ "model.layers.8.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
338
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
339
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
340
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
341
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
342
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
343
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
344
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
345
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
346
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
347
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
348
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
349
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
350
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
351
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
352
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
353
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
354
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
355
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
356
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
357
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
358
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
359
+ "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
360
+ "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
361
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
362
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
363
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
364
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
365
+ "model.layers.32.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
366
+ "model.layers.33.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
367
+ "model.layers.34.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
368
+ "model.layers.34.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
369
+ "model.layers.36.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
370
+ "model.layers.38.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
371
+ "model.layers.38.input_layernorm.weight": "model-00003-of-00003.safetensors",
372
+ "model.layers.38.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
373
+ "model.layers.39.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
374
+ "model.layers.39.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
375
+ "model.layers.10.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
376
+ "model.layers.34.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
377
+ "model.layers.10.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
378
+ "model.layers.34.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
379
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
380
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
381
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
382
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
383
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
384
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
385
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
386
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
387
+ "model.layers.11.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
388
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
389
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors",
390
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
391
+ "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
392
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
393
+ "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
394
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
395
+ "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
396
+ "model.layers.33.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
397
+ "model.layers.35.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
398
+ "model.layers.35.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
399
+ "model.layers.35.input_layernorm.weight": "model-00003-of-00003.safetensors",
400
+ "model.layers.35.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
401
+ "model.layers.36.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
402
+ "model.layers.36.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
403
+ "model.layers.38.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
404
+ "model.layers.39.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
405
+ "model.layers.39.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
406
+ "model.layers.16.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
407
+ "model.layers.16.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
408
+ "model.layers.24.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
409
+ "model.layers.24.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
410
+ "model.layers.11.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
411
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
412
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
413
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
414
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
415
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
416
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
417
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
418
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
419
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
420
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
421
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
422
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
423
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
424
+ "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
425
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
426
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
427
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
428
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
429
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
430
+ "model.layers.35.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
431
+ "model.layers.12.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
432
+ "model.layers.36.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
433
+ "model.layers.36.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
434
+ "model.layers.0.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
435
+ "model.layers.15.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
436
+ "model.layers.20.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
437
+ "model.layers.20.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
438
+ "model.layers.25.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
439
+ "model.layers.25.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
440
+ "model.layers.15.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
441
+ "model.layers.39.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
442
+ "model.layers.39.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
443
+ "model.layers.18.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
444
+ "model.layers.18.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
445
+ "model.layers.23.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
446
+ "model.layers.19.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
447
+ "model.layers.19.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
448
+ "model.layers.26.self_attn.adaptive_phi": "model-00003-of-00003.safetensors"
449
+ }
450
+ }
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/modeling_evabyte.py ADDED
@@ -0,0 +1,912 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional, Tuple, Union
2
+ import math
3
+ import torch
4
+ import torch.nn.functional as F
5
+ import torch.utils.checkpoint
6
+ from torch import nn
7
+ from torch.nn import CrossEntropyLoss
8
+ from transformers.activations import ACT2FN
9
+ from transformers.cache_utils import Cache
10
+ from transformers.modeling_outputs import (
11
+ BaseModelOutputWithPast,
12
+ CausalLMOutputWithPast,
13
+ )
14
+ from transformers.modeling_utils import PreTrainedModel
15
+
16
+ from .configuration_evabyte import EvaByteConfig
17
+ from .multibyte_decoding_evabyte import MultiByteDecodingMixin
18
+ try:
19
+ import triton
20
+ USE_TRITON_IMPL = True
21
+ from .eva import EvaAttention
22
+ from .eva_agg_kernel import triton_eva_agg_fwd
23
+ from .eva_prep_kv_kernel import triton_eva_prep_kv_fwd
24
+ except ImportError:
25
+ USE_TRITON_IMPL = False
26
+ print("WARNING: triton is not installed, using fallback EVA which might be slow and throw errors")
27
+ from .eva_pt_ref import EvaAttention
28
+ from .eva_cache import EvaCache, EvaStaticCacheForTriton
29
+
30
+ MASK_MIN_VALUE = -10e10
31
+
32
def prepare_eva_attention_mask(
    seq_len,
    device,
    chunk_size,
    window_size,
    use_cache=False,
    cache=None
):
    """
    Prepare attention masks for EVA.

    Returns ``(chunk_causal_mask, window_causal_mask)`` as boolean masks in
    which ``True`` marks a position that must be masked OUT:

    - ``chunk_causal_mask``: shape [1, 1, n, num_chunks]; inter-chunk causal
      mask for the chunk-level attention. Chunk pairs that fall inside the
      same window are also masked out here, since those interactions are
      covered by the exact intra-window attention instead.
    - ``window_causal_mask``: shape [1, 1, 1, window_size, window_size];
      strictly upper-triangular mask for the intra-window attention.

    When ``use_cache`` is True, ``cache`` must expose ``get_seq_length()``;
    the chunk mask is then sliced down to the query rows of the newly
    appended tokens only.
    """
    chunk_causal_mask = None
    window_causal_mask = None
    if use_cache:
        cached_seq_len = cache.get_seq_length()
        total_seq_len = seq_len + cached_seq_len
        # cached_seq_len will be 0 during prefilling
        # padded_seq_len = chunk_size * math.ceil(total_seq_len / chunk_size)
        padded_seq_len = window_size * math.ceil(total_seq_len / window_size)
        num_chunks = padded_seq_len // chunk_size
    else:
        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        assert seq_len % chunk_size == 0
        num_chunks = seq_len // chunk_size

        assert seq_len % window_size == 0

    # create causal mask
    ################################
    # generate chunked causal masks
    ################################
    # [b, h, j, c, c]
    chunks_per_window = window_size // chunk_size
    if num_chunks >= chunks_per_window:
        # Start from a causal pattern over chunks (triu incl. diagonal),
        # replicated for every intra-chunk offset.
        chunk_causal_mask = torch.ones(
            (chunk_size, num_chunks, num_chunks),
            device=device,
            dtype=torch.bool
        ).triu(0)

        # Group chunks into windows: [c, blocks, cpw, blocks, cpw].
        num_blocks = num_chunks // chunks_per_window
        chunk_causal_mask = chunk_causal_mask.reshape(
            chunk_size,
            num_blocks,
            chunks_per_window,
            num_blocks,
            chunks_per_window
        ).transpose(-2, -3)

        block_diag_zero = (
            torch.eye(num_blocks, device=device, dtype=torch.bool)
            .unsqueeze(-1)
            .unsqueeze(-1)
            .unsqueeze(0)
        )

        # Mask out (fill with True) the diagonal window blocks: same-window
        # chunk pairs are handled by the exact intra-window attention.
        chunk_causal_mask = chunk_causal_mask.masked_fill(block_diag_zero, True)

        # Reshape back to original size
        chunk_causal_mask = (
            chunk_causal_mask
            .transpose(-2, -3)
            .reshape(chunk_size, num_chunks, num_chunks)
            .transpose(-2, -3)
            .reshape(chunk_size * num_chunks, num_chunks)
            .unsqueeze(0)
            .unsqueeze(0)
        )
    else:
        # Fewer chunks than one window: a plain chunk-causal mask suffices.
        chunk_causal_mask = torch.ones(
            (1, 1, chunk_size, num_chunks, num_chunks),
            device=device,
            dtype=torch.bool,
        ).triu(0).transpose(-2, -3)  # [1, 1, c, j, c]
        chunk_causal_mask = chunk_causal_mask.reshape(
            1, 1, chunk_size * num_chunks, num_chunks
        )  # [1, 1, n, c]

    if use_cache:
        # Keep only the query rows belonging to the freshly appended tokens.
        chunk_causal_mask = chunk_causal_mask[..., cached_seq_len : cached_seq_len + seq_len, :]

    window_causal_mask = torch.ones(
        (1, 1, 1, window_size, window_size),
        device=device
    ).triu(1).to(torch.bool)
    return (chunk_causal_mask, window_causal_mask)
120
+
121
def pad_to_multiple(tensor, multiple, dim=-2, value=0, create_mask=False, left_padding=False):
    """Pad ``tensor`` along ``dim`` up to the next multiple of ``multiple``.

    Args:
        tensor: input tensor; dim 0 is assumed to be the batch dimension
            when ``create_mask`` is requested.
        multiple: target divisor for the padded length.
        dim: dimension to pad; must be given as a negative index.
        value: fill value for the padded entries.
        create_mask: if True, also return a ``(batch, padded_len)`` bool mask
            that is True on real positions and False on padded ones.
        left_padding: pad at the front of ``dim`` instead of the back.

    Returns:
        The padded tensor, or ``(padded_tensor, mask)`` when ``create_mask``.
    """
    assert dim < 0  # only accept ``dim'' index in a reverse manner
    length = int(tensor.shape[dim])
    remainder = (-length) % multiple
    if remainder == 0:
        # Already a multiple: nothing to do (mask, if any, is all-True).
        if not create_mask:
            return tensor
        full_mask = torch.ones(
            (tensor.shape[0], length), dtype=torch.bool, device=tensor.device
        )
        return tensor, full_mask
    # F.pad takes (before, after) pairs starting from the LAST dimension, so
    # prepend zero-pairs for every dimension that comes after ``dim``.
    leading = (0, 0) * (-1 - dim)
    pad_spec = (*leading, remainder, 0) if left_padding else (*leading, 0, remainder)
    padded = F.pad(tensor, pad_spec, value=value)
    if not create_mask:
        return padded
    mask = torch.ones(
        (padded.shape[0], padded.shape[dim]), dtype=torch.bool, device=padded.device
    )
    if left_padding:
        mask[:, :remainder] = False
    else:
        mask[:, -remainder:] = False
    return padded, mask
146
+
147
class EvaByteRMSNorm(nn.Module):
    """RMS layer norm with an optional "1 + weight" (unit-offset) scale.

    Normalization statistics are always computed in float32 (``fp32_ln``) and
    the result is cast back to the input dtype.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.fp32_ln = True
        self.variance_epsilon = config.rms_norm_eps
        self.add_unit_offset = config.norm_add_unit_offset
        # With the unit offset the effective scale is (1 + weight), so a
        # zero-initialized weight corresponds to an identity scale.
        init = torch.zeros if self.add_unit_offset else torch.ones
        self.weight = nn.Parameter(init(config.hidden_size))

    def forward(self, hidden_states):
        compute_dtype = torch.float32 if self.fp32_ln else torch.bfloat16
        x = hidden_states.to(compute_dtype)
        inv_rms = torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.variance_epsilon)
        x = x * inv_rms
        scale = (1 + self.weight) if self.add_unit_offset else self.weight
        return (scale * x).type_as(hidden_states)
168
+
169
class EvaByteRotaryEmbedding(torch.nn.Module):
    """Standard rotary position embedding with a lazily-grown cos/sin cache."""

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        exponents = torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim
        self.register_buffer("inv_freq", 1.0 / (base ** exponents), persistent=False)
        # Pre-build the cache for the configured maximum context length.
        self._set_cos_sin_cache(seq_len=max_position_embeddings,
                                device=self.inv_freq.device,
                                dtype=torch.get_default_dtype())

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        # Rebuild the (seq_len, dim) cos/sin tables for positions [0, seq_len).
        self.max_seq_len_cached = seq_len
        positions = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype)
        freqs = torch.einsum("i,j->ij", positions, self.inv_freq)
        # Duplicate the half-dim frequencies to cover the full head dimension.
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        # x: [bs, num_attention_heads, seq_len, head_size]; only its dtype and
        # device are consulted here.
        if seq_len > self.max_seq_len_cached:
            # Grow the cache on demand for longer sequences.
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        if seq_len < self.max_seq_len_cached:
            cos_slice = self.cos_cached.split(seq_len, dim=0)[0]
            sin_slice = self.sin_cached.split(seq_len, dim=0)[0]
        else:
            cos_slice = self.cos_cached
            sin_slice = self.sin_cached

        return cos_slice.to(dtype=x.dtype), sin_slice.to(dtype=x.dtype)
213
+
214
+
215
+
216
class EvaByteLinearScalingRotaryEmbedding(EvaByteRotaryEmbedding):
    """EvaByteRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        # scaling_factor must exist before the parent constructor builds the cache.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len
        # Positions are compressed by the scaling factor before the angles are
        # computed, stretching the usable context linearly.
        positions = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype) / self.scaling_factor
        freqs = torch.einsum("i,j->ij", positions, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
232
+
233
+
234
class EvaByteDynamicNTKScalingRotaryEmbedding(EvaByteRotaryEmbedding):
    """EvaByteRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        # scaling_factor must exist before the parent constructor builds the cache.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len

        if seq_len > self.max_position_embeddings:
            # NTK-aware rescaling: enlarge the RoPE base once the sequence
            # exceeds the trained context, then rebuild the inverse frequencies.
            growth = (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
            base = self.base * growth ** (self.dim / (self.dim - 2))
            inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
            self.register_buffer("inv_freq", inv_freq, persistent=False)

        positions = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype)
        freqs = torch.einsum("i,j->ij", positions, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
256
+
257
+
258
class EvaByteMLP(nn.Module):
    """Gated (SwiGLU-style) feed-forward block: ``down(act(gate(x)) * up(x))``."""

    def __init__(self, config, layer_idx: int = None):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]
        self.layer_idx = layer_idx
        self.config = config

    def forward(self, x):
        gated = self.act_fn(self.gate_proj(x)) * self.up_proj(x)
        return self.down_proj(gated)
273
+
274
class EvaByteDecoderLayer(nn.Module):
    """One transformer block: pre-norm EVA self-attention followed by a pre-norm MLP."""

    def __init__(self, config: EvaByteConfig, layer_idx: int = None):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.self_attn = EvaAttention(config=config, layer_idx=layer_idx)
        self.mlp = EvaByteMLP(config, layer_idx=layer_idx)
        self.input_layernorm = EvaByteRMSNorm(config)
        self.post_attention_layernorm = EvaByteRMSNorm(config)

    def _residual(self, x):
        # Optionally carry the skip connection in float32 for numerical stability.
        return x.float() if self.config.fp32_skip_add else x

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cos: Optional[torch.Tensor] = None,
        sin: Optional[torch.Tensor] = None,
        multibyte_decoding: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        # --- attention sub-block (pre-norm + residual) ---
        skip = self._residual(hidden_states)
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states, attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cos=cos,
            sin=sin,
            multibyte_decoding=multibyte_decoding,
        )
        hidden_states = (skip + hidden_states).to(hidden_states.dtype)

        # --- feed-forward sub-block (pre-norm + residual) ---
        skip = self._residual(hidden_states)
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = (skip + hidden_states).to(hidden_states.dtype)

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (attn_weights,)
        if use_cache:
            outputs += (present_key_value,)
        return outputs
330
+
331
class EvaBytePreTrainedModel(PreTrainedModel):
    """Glue between EvaByte modules and HF's PreTrainedModel machinery
    (weight init, checkpoint sharding, gradient-checkpointing toggles)."""
    config_class = EvaByteConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["EvaByteDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"

    def _init_weights(self, module):
        """Initialize Linear/Embedding weights with N(0, initializer_range)."""
        std = getattr(self.config, "initializer_range", 0.02)
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            # keep the padding-token embedding at exactly zero
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    def _set_gradient_checkpointing(self, module, value=False):
        # Invoked by HF's gradient_checkpointing_enable()/disable().
        if isinstance(module, EvaByteModel):
            module.gradient_checkpointing = value
352
+
353
class EvaByteModel(EvaBytePreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`EvaByteDecoderLayer`]

    Args:
        config: EvaByteConfig

    Fix applied: ``forward`` (and the private mask helpers / cache setup)
    previously read ``input_ids.shape`` and ``input_ids.device``
    unconditionally, crashing when only ``inputs_embeds`` was supplied even
    though the input validation explicitly allows that. Shapes and device are
    now derived from whichever input is present, and ``inputs_embeds`` is
    threaded into the mask helpers via a backward-compatible keyword.
    """
    def __init__(self, config: EvaByteConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.max_position_embeddings = self.config.max_position_embeddings

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [EvaByteDecoderLayer(config, layer_idx=layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = EvaByteRMSNorm(config)

        self.gradient_checkpointing = False
        self.rope = config.rope_theta
        # Initialize weights and apply final processing
        self.post_init()
        self._init_rope()

    def _init_rope(self):
        """Instantiate the rotary embedding, honoring optional RoPE scaling config."""
        if self.config.rope_scaling is None:
            self.rotary_emb = EvaByteRotaryEmbedding(self.head_dim,
                                                     max_position_embeddings=self.max_position_embeddings,
                                                     base=self.rope)
        else:
            scaling_type = self.config.rope_scaling["type"]
            scaling_factor = self.config.rope_scaling["factor"]
            if scaling_type == "linear":
                self.rotary_emb = EvaByteLinearScalingRotaryEmbedding(
                    self.head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope)
            elif scaling_type == "dynamic":
                self.rotary_emb = EvaByteDynamicNTKScalingRotaryEmbedding(
                    self.head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope)
            else:
                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def _helper_padding_mask(
        self,
        padding_mask,
        causal_mask
    ):
        # Make the padding mask symmetric (mask a pair if EITHER side is
        # padding) and merge with the causal mask. True == masked out.
        padding_mask = torch.logical_or(padding_mask, padding_mask.transpose(-1, -2))
        return torch.logical_or(padding_mask, causal_mask)

    def _prepare_eva_generation_attn_mask_triton(
        self,
        attention_mask,
        input_ids,
        use_cache,
        past_key_values,
        inputs_embeds=None,
    ):
        """Build padding/causal masks for the Triton EVA kernels.

        Returns ``(s_mask, rf_mask, rfa_chunks_dummy_mask)`` during incremental
        decoding and ``(s_mask, rf_mask)`` during prefill; the attention
        implementation distinguishes the two cases. True == masked out.
        """
        # Fix: derive shape/device from whichever input was provided.
        ref = input_ids if input_ids is not None else inputs_embeds
        batch_size, seq_len = ref.shape[:2]
        device = ref.device
        if use_cache and past_key_values.get_seq_length() > 0:
            # decoding phase
            if past_key_values.rf_mask[0] is not None:
                cur_rf_mask = torch.zeros(
                    (batch_size, 1, seq_len, 1),
                    dtype=past_key_values.rf_mask[0].dtype,
                    device=past_key_values.rf_mask[0].device
                )
            else:
                cur_rf_mask = None

            if past_key_values.s_mask[0] is not None:
                cur_s_mask = torch.zeros(
                    (batch_size, 1, seq_len, 1),
                    dtype=past_key_values.s_mask[0].dtype,
                    device=past_key_values.s_mask[0].device
                )
            else:
                cur_s_mask = None

            seen_tokens = past_key_values.get_seq_length()
            if seen_tokens <= self.config.window_size:
                rfa_chunks_dummy_mask = None
            else:
                if cur_s_mask is not None:
                    chunks_per_window = int(self.config.window_size // self.config.chunk_size)
                    # the ongoing decoding step would be (seen_seq_len + 1)-th token
                    num_windows_seen_so_far = seen_tokens // self.config.window_size
                    rfa_chunks_dummy_mask = torch.zeros(
                        (batch_size, 1, seq_len, num_windows_seen_so_far * chunks_per_window),
                        dtype=past_key_values.s_mask[0].dtype,
                        device=past_key_values.s_mask[0].device
                    )
                else:
                    rfa_chunks_dummy_mask = None
            # rf_mask and cur_mask are 0s because we do not want to mask them
            return (cur_s_mask, cur_rf_mask, rfa_chunks_dummy_mask)

        if attention_mask is not None and torch.any(attention_mask == 0.0):
            # Prefill with real padding present: build window-level masks.
            padded_attention_mask = pad_to_multiple(
                attention_mask,
                self.config.window_size,
                dim=-1,
                value=0,
                create_mask=False,
                left_padding=False
            )
            # convert 0 -> padding to 1 -> padding
            padded_rf_mask = ~padded_attention_mask.unsqueeze(1).unsqueeze(-1).to(torch.bool)  # [b, 1, n, 1]
            # [b, 1, w, j, 1]
            padded_w_attn_mask = padded_rf_mask.reshape(batch_size, 1, -1, self.config.window_size, 1).to(torch.bool)
            # [b, 1, w, j, 1] [b, 1, w, 1, j] -> [b, 1, w, j, j]
            w_padding_mask = torch.logical_or(padded_w_attn_mask, padded_w_attn_mask.transpose(-1, -2))
            w_causal_mask = torch.ones(
                (1, 1, 1, self.config.window_size, self.config.window_size),
                device=device
            ).triu(1).to(torch.bool)
            s_mask = torch.logical_or(w_padding_mask, w_causal_mask)
            s_mask = s_mask.reshape(batch_size, 1, -1, self.config.window_size)
            s_mask = s_mask[..., :seq_len, :]
            # negate the attention mask to get the padding mask
            rf_mask = ~attention_mask.unsqueeze(1).unsqueeze(-1).to(torch.bool)  # [b, 1, n, 1]
            return (s_mask, rf_mask)
        else:
            return (None, None)

    def _prepare_eva_generation_attn_mask(
        self,
        attention_mask,
        input_ids,
        use_cache,
        past_key_values,
        inputs_embeds=None,
    ):
        """Build masks for the pure-PyTorch EVA reference attention.

        Returns ``(prev_window_mask, cur_window_mask, chunk_causal_mask,
        rf_mask)``; True == masked out.
        """
        # Fix: derive shape/device from whichever input was provided.
        ref = input_ids if input_ids is not None else inputs_embeds
        batch_size, seq_len = ref.shape[:2]
        device = ref.device
        if use_cache and past_key_values.get_seq_length() > 0:
            # decoding phase
            if past_key_values.rf_mask[0] is not None:
                rf_mask = torch.zeros(
                    (batch_size, 1, seq_len, 1),
                    dtype=past_key_values.rf_mask[0].dtype,
                    device=past_key_values.rf_mask[0].device
                )
            else:
                rf_mask = None

            cur_causal_mask = torch.zeros(
                (batch_size, 1, seq_len, 1),
                dtype=torch.bool,
                device=device
            )

            chunk_causal_mask = torch.ones(
                (batch_size, 1, seq_len, 1),
                dtype=torch.bool,
                device=device
            )
            # chunk_causal_mask are 1s because we will mask them by default and
            # will be unmasked when the current singleton attention is processed over
            return (None, cur_causal_mask, chunk_causal_mask, rf_mask)

        true_num_chunks = seq_len // self.config.chunk_size
        chunk_causal_mask, _ = prepare_eva_attention_mask(
            seq_len,
            device,
            self.config.chunk_size,
            self.config.window_size,
            use_cache=use_cache,
            cache=past_key_values
        )
        chunk_causal_mask = chunk_causal_mask[..., :seq_len, :true_num_chunks]
        if attention_mask is not None and torch.any(attention_mask == 0.0):
            # convert 0 -> padding to 1 -> padding
            rf_mask = ~attention_mask.unsqueeze(1).unsqueeze(-1).to(torch.bool)  # [b, 1, n, 1]
        else:
            rf_mask = None

        if seq_len < self.config.window_size:
            # Whole prompt fits inside one (partial) window.
            cur_window_mask = torch.ones(
                (1, 1, seq_len, seq_len),
                device=device
            ).triu(1).to(torch.bool)
            if rf_mask is not None:
                cur_window_mask = self._helper_padding_mask(rf_mask, cur_window_mask)
            prev_window_mask = None
        else:
            if seq_len % self.config.window_size == 0:
                # Exact number of full windows; no trailing partial window.
                num_windows = seq_len // self.config.window_size
                cur_window_mask = None
                prev_window_mask = torch.ones(
                    (1, 1, num_windows, self.config.window_size, self.config.window_size),
                    device=device
                ).triu(1).to(torch.bool)
                if rf_mask is not None:
                    prev_rf_mask = rf_mask.reshape(batch_size, 1, -1, self.config.window_size, 1)
                    prev_window_mask = self._helper_padding_mask(prev_rf_mask, prev_window_mask)
            else:
                # Full windows plus a trailing partial window.
                num_windows = seq_len // self.config.window_size
                remainder_tokens = seq_len % self.config.window_size
                cur_window_mask = torch.ones(
                    (1, 1, remainder_tokens, remainder_tokens),
                    device=device
                ).triu(1).to(torch.bool)
                prev_window_mask = torch.ones(
                    (1, 1, num_windows, self.config.window_size, self.config.window_size),
                    device=device
                ).triu(1).to(torch.bool)
                if rf_mask is not None:
                    prev_rf_mask, cur_rf_mask = torch.split(rf_mask, [seq_len - remainder_tokens, remainder_tokens], dim=-2)
                    cur_window_mask = self._helper_padding_mask(cur_rf_mask, cur_window_mask)
                    prev_rf_mask = prev_rf_mask.reshape(batch_size, 1, -1, self.config.window_size, 1)
                    prev_window_mask = self._helper_padding_mask(prev_rf_mask, prev_window_mask)

        return (prev_window_mask, cur_window_mask, chunk_causal_mask, rf_mask)

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        multibyte_decoding: Optional[bool] = None,
    ) -> Tuple:
        """Run the decoder stack. Exactly one of ``input_ids`` /
        ``inputs_embeds`` must be provided."""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (output_hidden_states
                                if output_hidden_states is not None else self.config.output_hidden_states)
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # XOR check: raises when both or neither of the inputs are given.
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
            )

        if self.gradient_checkpointing and self.training and use_cache:
            raise ValueError("`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...")

        # Fix: previously read input_ids.shape unconditionally, which crashed
        # on the inputs_embeds-only path permitted by the check above.
        if input_ids is not None:
            batch_size, seq_len = input_ids.shape
            device = input_ids.device
        else:
            batch_size, seq_len = inputs_embeds.shape[:2]
            device = inputs_embeds.device

        #### Step 0. Hack
        if (not self.training) and (not use_cache) and (not multibyte_decoding):
            # forward-only inference mode.
            # We tweak use_cache to be True to reuse code for generation
            use_cache = True
        if position_ids is None:
            # NOTE(review): this default (positions 0..seq_len) shadows the
            # offset-aware default further below, so cached decoding must pass
            # explicit position_ids — consistent with the assert down there.
            position_ids = torch.arange(0, seq_len, device=device, dtype=int).reshape(1, -1).expand(batch_size, -1)

        #### Step 1. Prepare caches if in inference mode
        if use_cache:
            if past_key_values is not None:
                assert isinstance(past_key_values, Cache)
            else:
                if not USE_TRITON_IMPL:
                    past_key_values = EvaCache()
                else:
                    past_key_values = EvaStaticCacheForTriton(
                        batch_size,
                        self.config.num_attention_heads,
                        self.config.window_size,
                        self.config.hidden_size // self.config.num_attention_heads,
                        self.config.num_hidden_layers,
                        self.embed_tokens.weight.dtype,
                        self.embed_tokens.weight.device,
                    )

        #### Step 2. Build the attention masks for this call.
        if not multibyte_decoding:
            if use_cache:
                if USE_TRITON_IMPL:
                    causal_mask = self._prepare_eva_generation_attn_mask_triton(
                        attention_mask,
                        input_ids,
                        use_cache,
                        past_key_values,
                        inputs_embeds=inputs_embeds,
                    )
                else:
                    causal_mask = self._prepare_eva_generation_attn_mask(
                        attention_mask,
                        input_ids,
                        use_cache,
                        past_key_values,
                        inputs_embeds=inputs_embeds,
                    )
            else:
                assert self.training
                assert seq_len % self.config.window_size == 0, "Training is only tested for sequences that are a multiple of window_size"
                # for training, we need to pass in the attention mask
                # usually calculated by _prepare_training_attn_mask()
                causal_mask = attention_mask
        else:
            assert use_cache
            causal_mask = attention_mask

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        max_seq_length = past_seen_tokens + inputs_embeds.shape[1]

        hidden_states = inputs_embeds

        if position_ids is None:
            assert not use_cache, "during decoding we must explicitly pass position_ids to the model call"
            position_ids = torch.arange(past_seen_tokens, max_seq_length, device=device, dtype=int).reshape(1, -1).expand(batch_size, -1)

        #### Step 3. Gather rotary tables for the positions actually used.
        cos, sin = self.rotary_emb(hidden_states, seq_len=max_seq_length)
        assert len(cos.shape) == 2, f"cos should be of shape (max_seq_len, head_dim), got {cos.shape} instead"
        assert sin.shape == cos.shape, f"sin should be of shape (max_seq_len, head_dim), got {sin.shape} instead"
        assert len(position_ids.shape) == 2, f"position_ids should be of 2D, got {position_ids.shape} instead"
        cos = cos[position_ids, :]
        sin = sin[position_ids, :]
        cos = cos.unsqueeze(1)
        sin = sin.unsqueeze(1)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states, )

            if self.gradient_checkpointing and self.training:
                layer_outputs = torch.utils.checkpoint.checkpoint(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cos,
                    sin,
                    multibyte_decoding,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cos=cos,
                    sin=sin,
                    multibyte_decoding=multibyte_decoding,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1], )

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states, )

        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
741
+
742
+
743
+ class EvaByteForCausalLM(EvaBytePreTrainedModel, MultiByteDecodingMixin):
744
+ _tied_weights_keys = ["lm_head.weight"]
745
+
746
    def __init__(self, config):
        # Explicit base-class call (rather than super()) because of the
        # MultiByteDecodingMixin co-parent in the MRO.
        EvaBytePreTrainedModel.__init__(self, config)

        self.model = EvaByteModel(config)
        self.vocab_size = config.vocab_size
        # define multibyte prediction heads
        # With num_pred_heads > 1 the LM head predicts several future bytes at
        # once; logits are laid out as [..., num_pred_heads * vocab_size].
        if hasattr(config, "num_pred_heads") and config.num_pred_heads > 1:
            self.lm_head = nn.Linear(config.hidden_size, config.vocab_size * config.num_pred_heads, bias=False)
        else:
            self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.post_init()
758
+
759
    def get_input_embeddings(self):
        # Delegate to the backbone's byte-embedding table.
        return self.model.embed_tokens
761
+
762
    def set_input_embeddings(self, value):
        # Replace the backbone's byte-embedding table.
        self.model.embed_tokens = value
764
+
765
    def get_output_embeddings(self):
        # The (possibly multi-head) LM projection.
        return self.lm_head
767
+
768
    def set_output_embeddings(self, new_embeddings):
        # Replace the (possibly multi-head) LM projection.
        self.lm_head = new_embeddings
770
+
771
    def set_decoder(self, decoder):
        # Swap out the decoder backbone.
        self.model = decoder
773
+
774
    def get_decoder(self):
        # The decoder backbone (EvaByteModel).
        return self.model
776
+
777
+ def forward(
778
+ self,
779
+ input_ids: torch.LongTensor = None,
780
+ attention_mask: Optional[torch.Tensor] = None,
781
+ position_ids: Optional[torch.LongTensor] = None,
782
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
783
+ inputs_embeds: Optional[torch.FloatTensor] = None,
784
+ labels: Optional[torch.LongTensor] = None,
785
+ use_cache: Optional[bool] = None,
786
+ output_attentions: Optional[bool] = None,
787
+ output_hidden_states: Optional[bool] = None,
788
+ return_dict: Optional[bool] = None,
789
+ return_all_pred_logits: Optional[bool] = None,
790
+ multibyte_decoding: Optional[bool] = None) -> Union[Tuple, CausalLMOutputWithPast]:
791
+
792
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
793
+ output_hidden_states = (output_hidden_states
794
+ if output_hidden_states is not None else self.config.output_hidden_states)
795
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
796
+
797
+ if input_ids is None:
798
+ assert past_key_values is None
799
+
800
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
801
+ outputs = self.model(
802
+ input_ids=input_ids,
803
+ attention_mask=attention_mask,
804
+ position_ids=position_ids,
805
+ past_key_values=past_key_values,
806
+ inputs_embeds=inputs_embeds,
807
+ use_cache=use_cache,
808
+ output_attentions=output_attentions,
809
+ output_hidden_states=output_hidden_states,
810
+ return_dict=return_dict,
811
+ multibyte_decoding=multibyte_decoding,
812
+ )
813
+
814
+ hidden_states = outputs[0]
815
+
816
+ logits = self.lm_head(hidden_states)
817
+ if self.config.fp32_logits:
818
+ logits = logits.float()
819
+
820
+ loss = None
821
+ if labels is not None:
822
+ loss_fct = CrossEntropyLoss(reduction="none")
823
+ if hasattr(self.config, "num_pred_heads") and self.config.num_pred_heads > 1:
824
+ shift_logits = logits.view(logits.shape[0], logits.shape[1], self.config.num_pred_heads, self.config.vocab_size)
825
+ # shift_logits = shift_logits.view(-1, logits.shape[1] * self.config.num_pred_heads, self.config.vocab_size)
826
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
827
+ else:
828
+ shift_logits = logits.view(-1, self.config.vocab_size)
829
+ shift_labels = labels.view(-1)
830
+ # Enable model parallelism
831
+ shift_labels = shift_labels.to(shift_logits.device)
832
+ loss = loss_fct(shift_logits, shift_labels)
833
+
834
+ if hasattr(self.config, "num_pred_heads") and self.config.num_pred_heads > 1:
835
+ all_pred_logits = logits.reshape(logits.shape[0], logits.shape[1], self.config.num_pred_heads, self.config.vocab_size)
836
+
837
+ if return_all_pred_logits:
838
+ logits = all_pred_logits
839
+ else:
840
+ logits = all_pred_logits[..., 0, :]
841
+
842
+ if not return_dict:
843
+ output = (logits, ) + outputs[1:]
844
+ return (loss, ) + output if loss is not None else output
845
+
846
+ return CausalLMOutputWithPast(
847
+ loss=loss,
848
+ logits=logits,
849
+ past_key_values=outputs.past_key_values,
850
+ hidden_states=outputs.hidden_states,
851
+ attentions=outputs.attentions,
852
+ )
853
+
854
+
855
+ def prepare_inputs_for_generation(self,
856
+ input_ids,
857
+ past_key_values=None,
858
+ attention_mask=None,
859
+ inputs_embeds=None,
860
+ use_cache=True,
861
+ **kwargs):
862
+ # prefill phase:
863
+ # input_ids: b x s
864
+ # attention_mask: None if no padding or b x s
865
+ # position_ids : b x s
866
+
867
+ # token gen phase:
868
+ # input_ids : b x 1
869
+ # attention_mask: b x 1 x s
870
+ # position_ids: b x 1
871
+ past_length = 0
872
+ if past_key_values is not None:
873
+ assert isinstance(past_key_values, Cache)
874
+ past_length = past_key_values.get_seq_length()
875
+
876
+ if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
877
+ input_ids = input_ids[:, -(attention_mask.shape[1] - past_length):]
878
+ elif past_length < input_ids.shape[1]:
879
+ input_ids = input_ids[:, past_length:]
880
+
881
+ position_ids = kwargs.get("position_ids", None)
882
+ if attention_mask is not None and position_ids is None:
883
+ position_ids = attention_mask.long().cumsum(-1) - 1
884
+ position_ids.masked_fill_(attention_mask == 0, 1)
885
+ if past_key_values:
886
+ position_ids = position_ids[:, -input_ids.shape[1]:]
887
+
888
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
889
+ if inputs_embeds is not None and past_key_values is None:
890
+ model_inputs = {"inputs_embeds": inputs_embeds}
891
+ else:
892
+ model_inputs = {"input_ids": input_ids}
893
+
894
+ # must initialize position_ids at each step during GPU inference
895
+ assert position_ids is not None
896
+ model_inputs.update(
897
+ {
898
+ "position_ids": position_ids,
899
+ "past_key_values": past_key_values,
900
+ "use_cache": use_cache,
901
+ "attention_mask": attention_mask,
902
+ }
903
+ )
904
+ return model_inputs
905
+
906
+ @staticmethod
907
+ def _reorder_cache(past_key_values, beam_idx):
908
+ reordered_past = ()
909
+ for layer_past in past_key_values:
910
+ reordered_past += (tuple(
911
+ past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), )
912
+ return reordered_past
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/multibyte_decoding_evabyte.py ADDED
@@ -0,0 +1,881 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # The implementation of multibyte deocidng is largely adapted from
3
+ # Medusa decoding: https://github.com/FasterDecoding/Medusa
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from transformers.generation.stopping_criteria import (
7
+ MaxLengthCriteria,
8
+ StoppingCriteriaList,
9
+ )
10
+ from typing import Union, List
11
+ from .eva_cache import EvaStaticCacheForTriton
12
+ from .eva_prep_kv_kernel import triton_eva_prep_kv_fwd
13
+
14
class MultibyteEosTokenCriteria:
    """Stop generation once an end-of-sequence token shows up among the last
    `new_tokens` generated tokens.

    Adapted from
    https://github.com/huggingface/transformers/blob/main/src/transformers/generation/stopping_criteria.py#L446
    Unlike a per-step check, this scans the whole freshly appended suffix,
    which is needed when several tokens are committed per decoding step.

    Args:
        eos_token_ids (`Union[int, List[int]]`):
            The id(s) of the *end-of-sequence* token.
    """

    def __init__(self, eos_token_ids: Union[int, List[int]]):
        # normalize a single id to a one-element list
        self.eos_token_ids = [eos_token_ids] if isinstance(eos_token_ids, int) else eos_token_ids

    def __call__(self, input_ids: torch.LongTensor, new_tokens: int) -> bool:
        # only the suffix appended in the latest step needs inspection
        suffix_start = input_ids.shape[-1] - new_tokens
        suffix = input_ids[:, suffix_start:]
        return any(bool(torch.any(suffix == eos_id)) for eos_id in self.eos_token_ids)
40
+
41
def build_tree(spec):
    """Expand a per-depth fan-out spec into a flat list of tree-node paths.

    `spec[d][i]` is the number of children given to the i-th node (in creation
    order) at depth d; nodes with no entry in the list get zero children.
    Each node is encoded as the tuple of child indices on the path from the
    root. The root (empty tuple) is excluded from the returned list, which is
    ordered depth by depth.
    """
    levels = [[()]]  # depth 0 holds only the root
    for fanouts in spec:
        frontier = []
        for pos, parent in enumerate(levels[-1]):
            width = fanouts[pos] if pos < len(fanouts) else 0
            frontier.extend(parent + (child,) for child in range(width))
        levels.append(frontier)
    # flatten, dropping the falsy root tuple
    return [node for level in levels for node in level if node]
62
+
63
# Pre-built speculative-decoding trees (lists of node paths from `build_tree`).
# Each inner list gives, per depth, the fan-out of every node created so far.
# The suffixes presumably denote the total node count of each tree (95 / 31
# nodes) — TODO confirm.
evabyte_7b_95 = build_tree(
    [
        [10],
        [10, 8, 2, 2, 1, 1],
        [10, 4, 2, 1, 0, 0, 0, 0, 0, 0, 2, 1, 1, 0, 0, 0, 0, 0, 1],
        [8, 2, 2, 1, 0, 0, 0, 0, 0, 0, 1],
        [6, 2, 1, 1],
        [4, 2, 1, 1],
        [4, 2, 1],
    ]
)
# Smaller, cheaper tree for the same model family.
evabyte_7b_31 = build_tree(
    [
        [4],
        [3, 2, 1, 1],
        [3, 2, 1, 1],
        [2, 1, 1],
        [2, 1],
        [2, 1],
        [2, 1],
    ]
)
TOPK = 10 # topk for sparse tree (10 is a placeholder and it is sufficient)
86
+
87
def pad_path(path, length, pad_value=-2):
    """Return a copy of `path` right-padded with `pad_value` up to `length`.

    Parameters:
    - path (list): The original list that needs padding.
    - length (int): The desired length of the padded list.
    - pad_value (optional, default=-2): The value to use for padding.

    Returns:
    - list: A new list based on the original path, padded to `length`.

    Example:
    >>> pad_path([1,2,3], 5)
    [1, 2, 3, -2, -2]

    Note:
    If the given path is already at least `length` long, no padding occurs
    and an unmodified copy is returned — there is never any truncation.
    """
    padded = list(path)
    padded.extend([pad_value] * (length - len(padded)))
    return padded
108
+
109
def reset_past_key_values(passed_key_values):
    """Zero the `current_length` counter of each layer's cached key and value.

    Used when evaluating a baseline model: rather than reallocating the cache,
    the per-layer length counters are reset in place so the buffers can be
    reused from scratch.

    Args:
    - passed_key_values: per-layer pairs of cache objects, each exposing a
      mutable `current_length` tensor.

    Returns:
    - The same `passed_key_values` object, mutated in place.
    """
    for layer_cache in passed_key_values:
        for kv in (layer_cache[0], layer_cache[1]):
            kv.current_length.fill_(0)
    return passed_key_values
127
+
128
def get_nucleus_one_token(logit, temperature, top_p):
    """Sample one token per row using nucleus (top-p) sampling.

    Args:
        logit (torch.Tensor): 2D logits (B x C); the caller's tensor is not
            modified (scaling creates a copy).
        temperature (float): softmax temperature; higher is more diverse.
        top_p (float): cumulative-probability cutoff; `top_p >= 1` degenerates
            to plain temperature sampling.

    Returns:
        torch.Tensor: sampled token indices, shape (B, 1).
    """
    if top_p >= 1:
        return torch.multinomial(F.softmax(logit / temperature, dim=-1), 1)
    scaled = logit / temperature
    probs = torch.softmax(scaled, dim=-1)
    ranked_probs, ranked_idx = torch.sort(probs, descending=True)
    cumulative = torch.cumsum(ranked_probs, dim=-1)
    # drop everything after the cumulative mass first exceeds top_p,
    # but always keep the single most likely token
    drop_ranked = cumulative > top_p
    drop_ranked[..., 1:] = drop_ranked[..., :-1].clone()
    drop_ranked[..., 0] = 0
    # map the drop mask back from sorted order to vocabulary order
    drop = drop_ranked.scatter(dim=1, index=ranked_idx, src=drop_ranked)
    scaled[drop] = float('-inf')
    return torch.multinomial(F.softmax(scaled, dim=-1), 1)
158
+
159
def get_typical_one_token(logit, temperature, posterior_threshold, posterior_alpha):
    """Sample one token per row using typical sampling.

    Tokens whose probability falls below an entropy-adaptive cutoff
    `min(posterior_threshold, exp(-H) * posterior_alpha)` are masked out
    before sampling, trading off diversity against likelihood.

    Args:
        logit (torch.Tensor): 2D logits; the caller's tensor is left intact.
        temperature (float): softmax temperature.
        posterior_threshold (float): hard lower bound on the cutoff.
        posterior_alpha (float): scale of the entropy-based adaptive cutoff.

    Returns:
        torch.Tensor: sampled token indices, shape (B, 1).
    """
    scaled = logit / temperature
    probs = torch.softmax(scaled, dim=-1)
    # Shannon entropy per row (epsilon keeps log finite at p == 0)
    entropy = -torch.sum(probs * torch.log(probs + 1e-5), dim=-1)
    cutoff = torch.minimum(
        torch.ones_like(entropy) * posterior_threshold,
        torch.exp(-entropy) * posterior_alpha,
    )
    scaled[probs < cutoff.unsqueeze(-1)] = float('-inf')
    return torch.multinomial(F.softmax(scaled, dim=-1), 1)
189
+
190
+
191
+
192
def generate_medusa_buffers(medusa_choices, device="cuda"):
    """
    Generate buffers for the Medusa structure based on the provided choices.

    Parameters:
    - medusa_choices (list): A nested list representing tree in the Medusa structure;
      each element is a path of per-depth child indices (as built by `build_tree`).
    - device (str): Device to which the tensors should be moved. Default is "cuda".

    Returns:
    - dict: A dictionary containing buffers related to the Medusa structure:
      * "medusa_attn_mask": (1, 1, L, L) mask letting each node attend only to
        its ancestors (and the root), L = number of nodes + 1.
      * "tree_indices": flat indices into the [root; head-wise top-K] candidate
        pool for every tree node.
      * "medusa_position_ids": (1, L) depth of each node, used as position ids.
      * "retrieve_indices": per-root-to-leaf-path node indices, padded, with a
        leading 0 column for the root.
    """

    # Sort the medusa_choices based on their lengths and then their values
    sorted_medusa_choices = sorted(medusa_choices, key=lambda x: (len(x), x))
    medusa_len = len(sorted_medusa_choices) + 1  # +1 for the root slot

    # Initialize depth_counts to keep track of how many choices have a particular depth
    depth_counts = [0] * max([len(path) for path in sorted_medusa_choices])
    for path in sorted_medusa_choices:
        depth_counts[len(path) - 1] += 1

    # Create the attention mask for Medusa: identity (self) plus root column,
    # then each node additionally attends to every node on its ancestor path.
    medusa_attn_mask = torch.eye(medusa_len, medusa_len)
    medusa_attn_mask[:, 0] = 1
    start = 0
    for i in range(len(depth_counts)):
        for j in range(depth_counts[i]):
            cur_medusa_choice = sorted_medusa_choices[start + j]
            # retrieve ancestor position
            if len(cur_medusa_choice) == 1:
                # depth-1 nodes have only the root as ancestor (already set)
                continue
            ancestor_idx = []
            for c in range(len(cur_medusa_choice) - 1):
                # +1 shifts past the root slot at index 0
                ancestor_idx.append(sorted_medusa_choices.index(cur_medusa_choice[:c+1]) + 1)
            medusa_attn_mask[j + start + 1, ancestor_idx] = 1
        start += depth_counts[i]

    # Generate tree indices for the Medusa structure: node at depth i+1 with
    # child index k maps to slot k + TOPK*i + 1 of the flattened candidate pool
    # (slot 0 is the base-model token; each head contributes TOPK slots).
    medusa_tree_indices = torch.zeros(medusa_len, dtype=torch.long)
    medusa_tree_indices[0] = 0
    start = 0
    for i in range(len(depth_counts)):
        for j in range(depth_counts[i]):
            cur_medusa_choice = sorted_medusa_choices[start + j]
            medusa_tree_indices[start + j + 1] = cur_medusa_choice[-1] + TOPK * i + 1
        start += depth_counts[i]

    # Generate position IDs for the Medusa structure (position == tree depth)
    medusa_position_ids = torch.zeros(medusa_len, dtype=torch.long)
    start = 0
    for i in range(len(depth_counts)):
        medusa_position_ids[start + 1: start + depth_counts[i] + 1] = i + 1
        start += depth_counts[i]

    # Generate retrieval indices for Medusa structure verification: iterate
    # leaves (longest paths first) and record the node indices along each
    # root-to-leaf path exactly once.
    retrieve_indices_nest = []
    retrieve_paths = []
    for i in range(len(sorted_medusa_choices)):
        cur_medusa_choice = sorted_medusa_choices[-i-1]
        retrieve_indice = []
        if cur_medusa_choice in retrieve_paths:
            continue
        else:
            for c in range(len(cur_medusa_choice)):
                retrieve_indice.append(sorted_medusa_choices.index(cur_medusa_choice[:c+1]))
                retrieve_paths.append(cur_medusa_choice[:c+1])
        retrieve_indices_nest.append(retrieve_indice)
    max_length = max([len(x) for x in retrieve_indices_nest])
    # pad shorter paths with -2, shift by +1 for the root, and prepend the
    # root column (index 0); padded entries become -1, i.e. the last element
    # of the (sentinel-extended) candidate sequence at use time.
    retrieve_indices = [pad_path(path, max_length) for path in retrieve_indices_nest]
    retrieve_indices = torch.tensor(retrieve_indices, dtype=torch.long)
    retrieve_indices = retrieve_indices + 1
    retrieve_indices = torch.cat([torch.zeros((retrieve_indices.shape[0], 1), dtype=torch.long), retrieve_indices], dim=1)

    # Aggregate the generated buffers into a dictionary
    medusa_buffers = {
        "medusa_attn_mask": medusa_attn_mask.unsqueeze(0).unsqueeze(0),
        "tree_indices": medusa_tree_indices,
        "medusa_position_ids": medusa_position_ids.unsqueeze(0),
        "retrieve_indices": retrieve_indices,
    }

    # Move the tensors in the dictionary to the specified device
    medusa_buffers = {
        k: v.clone().to(device)
        if isinstance(v, torch.Tensor)
        else torch.tensor(v, device=device)
        for k, v in medusa_buffers.items()
    }
    return medusa_buffers
281
+
282
def generate_candidates(
    medusa_logits,
    logits,
    tree_indices,
    retrieve_indices,
    temperature = 0,
    posterior_threshold=0.3,
    posterior_alpha = 0.09,
    top_p=0.8,
    sampling = 'typical',
    fast = False
):
    """Build the flattened candidate token tree for one speculative step.

    The base model contributes a single root token (greedy or sampled); each
    extra prediction head contributes its top-K token ids. `tree_indices`
    gathers this pool into the flat tree layout, and `retrieve_indices`
    unflattens it into one row per root-to-leaf path.

    Example with 3 heads whose top-4 ids are
        [10, 3, 8, 4] / [9, 5, 1, 6] / [7, 16, 3, 2]:
    the pool is [10, 9, 5, 1, 6, 7, 16, 3, 2, ...] and, picking top-2 / top-3
    for the two extra heads, the per-path view looks like
        [[10, 9, 7], [10, 9, 16], [10, 9, 3], [10, 5, 7], ...].

    Returns:
        (tree_candidate_ids, unflattened_candidate_ids): the flat tree token
        ids with a leading batch dim, and the per-path candidate matrix.
    """
    last_logits = logits[:, -1]
    # root token from the base head
    if temperature == 0 or fast:
        root_ids = torch.argmax(last_logits).unsqueeze(0)
    elif sampling == 'typical':
        root_ids = get_typical_one_token(last_logits, temperature, posterior_threshold, posterior_alpha).squeeze(0)
    elif sampling == 'nucleus':
        root_ids = get_nucleus_one_token(last_logits, temperature, top_p).squeeze(0)
    else:
        raise NotImplementedError

    # top-K continuations from every extra prediction head
    head_topk_ids = torch.topk(medusa_logits[:, 0, -1], TOPK, dim=-1).indices

    # candidate pool: [root, head-1 top-K, head-2 top-K, ...]
    pooled_ids = torch.cat([root_ids, head_topk_ids.view(-1)], dim=-1)

    # lay the pool out according to the pre-defined tree structure
    flat_tree_ids = pooled_ids[tree_indices]

    # append a sentinel 0 so padded retrieve indices (-1 after shifting) read
    # a harmless dummy token instead of wrapping around
    padded_tree_ids = torch.cat(
        [
            flat_tree_ids,
            torch.zeros((1), dtype=torch.long, device=flat_tree_ids.device)
        ],
        dim=0
    )
    per_path_ids = padded_tree_ids[retrieve_indices]

    return flat_tree_ids.unsqueeze(0), per_path_ids
341
+
342
def get_nucleus_posterior_mask(logits, candidates, temperature, top_p):
    """Verify candidate tokens against nucleus (top-p) samples.

    For every position of every candidate path (the last position is excluded
    — it has no prediction to check against), one token is drawn via nucleus
    sampling and compared with the proposed candidate token.

    Args:
        logits (torch.Tensor): (n_candidates, seq, vocab) logits.
        candidates (torch.Tensor): (n_candidates, seq) proposed token ids.
        temperature (float): softmax temperature.
        top_p (float): cumulative-probability cutoff; `>= 1` disables filtering.

    Returns:
        torch.Tensor: int mask (n_candidates, seq-1); 1 where the candidate
        matches the sampled token.
    """
    # adapted from https://github.com/huggingface/transformers/blob/18a879f47576822aa1a5c49aecb27d89bfa5fa69/examples/run_generation.py#L79

    # drop the last position and flatten to (n_candidates*positions, vocab)
    scaled = logits[:, :-1] / temperature
    n_cand, n_pos = scaled.shape[0], scaled.shape[1]
    flat = scaled.reshape(n_cand * n_pos, -1)

    if top_p >= 1:
        draws = torch.multinomial(F.softmax(flat, dim=-1), 1).view(n_cand, n_pos)
        return (candidates[:, 1:] == draws).int()

    probs = F.softmax(flat, dim=-1)
    ranked_probs, ranked_idx = torch.sort(probs, descending=True)
    cumulative = torch.cumsum(ranked_probs, dim=-1)

    # mask everything outside the top-p nucleus, always keeping the top token
    drop_ranked = cumulative > top_p
    drop_ranked[..., 1:] = drop_ranked[..., :-1].clone()
    drop_ranked[..., 0] = 0
    drop = drop_ranked.scatter(dim=1, index=ranked_idx, src=drop_ranked)

    flat[drop] = float('-inf')
    draws = torch.multinomial(F.softmax(flat, dim=-1), 1).view(n_cand, n_pos)
    return (candidates[:, 1:] == draws).int()
395
+
396
def get_typical_posterior_mask(logits, candidates, temperature, posterior_threshold, posterior_alpha):
    """Verify candidate tokens against typical-sampling draws.

    Args:
        logits (torch.Tensor): (n_candidates, seq, vocab) logits; the final
            position has nothing to verify and is dropped.
        candidates (torch.Tensor): (n_candidates, seq) proposed token ids.
        temperature (float): softmax temperature.
        posterior_threshold (float): hard lower bound on the adaptive cutoff.
        posterior_alpha (float): scale of the entropy-based adaptive cutoff.

    Returns:
        torch.Tensor: int mask (n_candidates, seq-1); 1 where the candidate
        matches the sampled token.
    """
    scaled = logits[:, :-1] / temperature
    n_cand, n_pos = scaled.shape[0], scaled.shape[1]
    flat = scaled.reshape(n_cand * n_pos, -1)
    probs = F.softmax(flat, dim=-1)
    # entropy-adaptive probability cutoff, as in get_typical_one_token
    entropy = -torch.sum(probs * torch.log(probs + 1e-5), dim=-1)
    cutoff = torch.minimum(
        torch.ones_like(entropy) * posterior_threshold,
        torch.exp(-entropy) * posterior_alpha,
    )
    flat[probs < cutoff.unsqueeze(-1)] = float('-inf')
    draws = torch.multinomial(F.softmax(flat, dim=-1), 1).view(n_cand, n_pos)
    return (candidates[:, 1:] == draws).int()
425
+
426
+
427
+
428
def evaluate_posterior(
    logits,
    candidates,
    temperature,
    posterior_threshold=0.3,
    posterior_alpha=0.09,
    top_p=0.8,
    sampling='typical',
    fast=True
):
    """Pick the best candidate path and how many of its tokens to accept.

    A candidate's accept length is the length of its longest verified prefix
    (cumulative product of per-position accept flags). Verification is greedy
    when `temperature == 0`, otherwise typical or nucleus sampling based.

    Returns:
        (best_candidate, accept_length): a long tensor index into `candidates`
        and a Python int; candidate 0 is returned when nothing is accepted.
    """
    # nothing to verify beyond the root token
    if logits.shape[1] <= 1:
        return torch.tensor(0, dtype=torch.long, device=candidates.device), 0

    if temperature == 0:
        # Greedy: accept a token iff it equals the argmax prediction.
        matches = (
            candidates[:, 1:] == torch.argmax(logits[:, :-1], dim=-1)
        ).int()
        accepted = torch.cumprod(matches, dim=1).sum(dim=1)
        longest = accepted.max().item()
        if longest == 0:
            # Default to the first candidate if none are accepted
            winner = torch.tensor(0, dtype=torch.long, device=candidates.device)
        else:
            winner = torch.argmax(accepted).to(torch.long)
        return winner, longest

    if sampling == 'typical':
        if fast:
            # Deterministic approximation: accept tokens whose model
            # probability clears the entropy-adaptive cutoff.
            probs = torch.softmax(logits[:, :-1] / temperature, dim=-1)
            cand_probs = torch.gather(
                probs, dim=-1, index=candidates[:, 1:].unsqueeze(-1)
            ).squeeze(-1)
            entropy = -torch.sum(
                probs * torch.log(probs + 1e-5), dim=-1
            )  # torch.sum(torch.log(*)) is faster than torch.prod
            cutoff = torch.minimum(
                torch.ones_like(entropy) * posterior_threshold,
                torch.exp(-entropy) * posterior_alpha,
            )
            accepted = torch.cumprod(cand_probs > cutoff, dim=1).sum(dim=1)
            longest = accepted.max().item()
            if longest == 0:
                # If no candidates are accepted, just choose the first one
                winner = torch.tensor(0, dtype=torch.long, device=candidates.device)
            else:
                # break ties among the longest paths by total log-likelihood
                tied = torch.where(accepted == longest)[0]
                log_lik = torch.sum(
                    torch.log(cand_probs[tied, :longest]), dim=-1
                )
                winner = tied[torch.argmax(log_lik)]
            return winner, longest
        # Exact variant: sample and compare token by token.
        mask = get_typical_posterior_mask(logits, candidates, temperature, posterior_threshold, posterior_alpha)
        accepted = torch.cumprod(mask, dim=1).sum(dim=1)
        longest = accepted.max().item()
        if longest == 0:
            winner = torch.tensor(0, dtype=torch.long, device=candidates.device)
        else:
            winner = torch.argmax(accepted).to(torch.long)
        return winner, longest

    if sampling == 'nucleus':
        assert top_p < 1.0 + 1e-6, "top_p should between 0 and 1"
        mask = get_nucleus_posterior_mask(logits, candidates, temperature, top_p)
        accepted = torch.cumprod(mask, dim=1).sum(dim=1)
        longest = accepted.max().item()
        if longest == 0:
            winner = torch.tensor(0, dtype=torch.long, device=candidates.device)
        else:
            winner = torch.argmax(accepted).to(torch.long)
        return winner, longest

    raise NotImplementedError
511
+
512
def update_inference_inputs(
    input_ids,
    medusa_logits,
    logits,
    candidate_ids,
    best_candidate,
    accept_length,
):
    """Commit the winning candidate path and slice out the follow-up logits.

    Appends the accepted tokens (plus the root token, hence `accept_length + 1`
    ids) of the best candidate to `input_ids`, and extracts the logits at the
    last accepted position to seed the next speculative step.

    Returns:
        (input_ids, medusa_logits, logits, new_token) where `new_token` is the
        number of tokens committed this step.
    """
    accepted_span = candidate_ids[None, best_candidate, : accept_length + 1]
    input_ids = torch.cat([input_ids, accepted_span], dim=-1)
    # logits at the frontier position of the accepted path
    next_logits = logits[
        None, best_candidate, accept_length : accept_length + 1
    ]
    next_medusa_logits = medusa_logits[
        :, None, best_candidate, accept_length : accept_length + 1
    ]
    # Update the new token counter
    return input_ids, next_medusa_logits, next_logits, accept_length + 1
536
+
537
def split_logits(full_logits):
    """Split fused multi-head logits [b, n, heads, vocab] into two views.

    Returns:
        (medusa_logits, logits): the extra heads rearranged to
        [heads-1, b, n, vocab], and head 0's logits [b, n, vocab].
    """
    base = full_logits[..., 0, :]
    extra = full_logits[..., 1:, :].permute(2, 0, 1, 3)
    return extra, base
542
+
543
+ class MultiByteDecodingMixin:
544
    def multi_byte_pred_update_cache(
        self,
        past_key_values,
        retrieve_indices,
        best_candidate,
        new_tokens,
    ):
        """Compact the KV cache after a tree-decoding step.

        The speculative step wrote K/V for the whole candidate tree into the
        sliding window; this keeps only the entries on the accepted path
        (`retrieve_indices[best_candidate, :new_tokens]`), moves them to the
        window's tail, and — when the window overflows — dumps a full window
        into chunk-level RFA summaries via the triton kernel.

        Mutates and returns `past_key_values`.
        """
        # NOTE(review): assumes every layer shares the same window position
        # (layer 0 is used as the reference) — confirm against EvaStaticCacheForTriton.
        prev_window_len = past_key_values.get_past_window_pos(0)
        # positions (within the window) of the accepted path's K/V entries
        select_indices = (
            retrieve_indices[best_candidate, : new_tokens] + prev_window_len
        )
        for layer_idx in range(self.config.num_hidden_layers):

            past_key_values.update_past_len(new_tokens, layer_idx)

            past_window_k = past_key_values.past_window_k[layer_idx]
            past_window_v = past_key_values.past_window_v[layer_idx]

            # gather the accepted entries ...
            tgt_window_k = past_window_k[..., select_indices, :]
            tgt_window_v = past_window_v[..., select_indices, :]

            # ... and pack them contiguously right after the previous window end
            dst_window_k = past_window_k[..., prev_window_len : prev_window_len + new_tokens, :]
            dst_window_v = past_window_v[..., prev_window_len : prev_window_len + new_tokens, :]

            dst_window_k.copy_(tgt_window_k, non_blocking=True)
            dst_window_v.copy_(tgt_window_v, non_blocking=True)

            new_window_len = prev_window_len + new_tokens
            if new_window_len >= self.config.window_size:
                # a step never spans more than one window boundary
                assert new_window_len < 2 * self.config.window_size

                # snapshot the full window before it gets summarized
                dump_k = past_window_k[..., :self.config.window_size, :].clone()
                dump_v = past_window_v[..., :self.config.window_size, :].clone()

                # tokens spilling past the boundary start the next window
                _window_len = new_window_len - self.config.window_size

                if _window_len > 0:
                    new_window_k = past_window_k[..., self.config.window_size : new_window_len, :]
                    new_window_v = past_window_v[..., self.config.window_size : new_window_len, :]

                    _dst_window_k = past_window_k[..., : _window_len, :]
                    _dst_window_v = past_window_v[..., : _window_len, :]

                    # shift the overflow back to the front of the buffer
                    _dst_window_k.copy_(new_window_k, non_blocking=True)
                    _dst_window_v.copy_(new_window_v, non_blocking=True)

                past_key_values.past_window_pos[layer_idx] = _window_len
            else:
                # window not full yet: nothing to summarize
                dump_k = None
                dump_v = None
                past_key_values.past_window_pos[layer_idx] = new_window_len

            if dump_k is not None and dump_v is not None:
                # fold the completed window into chunk-level RFA summaries
                rfa_k, rfa_v = triton_eva_prep_kv_fwd(
                    dump_k, dump_v,
                    self.model.layers[layer_idx].self_attn.adaptive_mu_k,
                    self.model.layers[layer_idx].self_attn.adaptive_phi,
                    None,
                    self.model.layers[layer_idx].self_attn.head_dim_scaling,
                    self.model.layers[layer_idx].self_attn.chunk_size
                )
                rfa_k, rfa_v = past_key_values.update_chunk_rfas(
                    rfa_k, rfa_v, layer_idx
                )
        return past_key_values
609
+
610
    def _multi_byte_pred_update_cache_when_prefil_len_eq_window_size(
        self,
        past_key_values,
    ):
        """Handle the edge case where prefill exactly fills one window.

        When the prompt length equals `window_size`, the window is already
        complete after prefill: summarize it into chunk-level RFA state and
        reset the window position to 0, so tree decoding starts on a fresh
        window. Mutates and returns `past_key_values`.
        """
        # NOTE(review): layer 0's window position is assumed representative of
        # all layers — TODO confirm.
        prev_window_len = past_key_values.get_past_window_pos(0)
        for layer_idx in range(self.config.num_hidden_layers):

            past_window_k = past_key_values.past_window_k[layer_idx]
            past_window_v = past_key_values.past_window_v[layer_idx]

            new_window_len = prev_window_len
            if new_window_len == self.config.window_size:
                # exactly one full window: snapshot it and start over at 0
                dump_k = past_window_k[..., :self.config.window_size, :].clone()
                dump_v = past_window_v[..., :self.config.window_size, :].clone()
                past_key_values.past_window_pos[layer_idx] = 0

                if dump_k is not None and dump_v is not None:
                    # fold the window into chunk-level RFA summaries
                    rfa_k, rfa_v = triton_eva_prep_kv_fwd(
                        dump_k, dump_v,
                        self.model.layers[layer_idx].self_attn.adaptive_mu_k,
                        self.model.layers[layer_idx].self_attn.adaptive_phi,
                        None,
                        self.model.layers[layer_idx].self_attn.head_dim_scaling,
                        self.model.layers[layer_idx].self_attn.chunk_size
                    )
                    rfa_k, rfa_v = past_key_values.update_chunk_rfas(
                        rfa_k, rfa_v, layer_idx
                    )
        return past_key_values
639
+
640
    def multi_byte_pred_update_attn_mask(
        self,
        last_iter_new_tokens,
        tree_candidate_ids,
        past_attn_mask,
        medusa_attn_mask,
        past_key_values,
    ):
        """Build the attention mask for one tree-decoding step.

        Extends (or re-initializes) the mask over already-cached positions and
        concatenates the Medusa tree mask for the candidate tokens.

        Args:
            last_iter_new_tokens: number of tokens accepted in the previous
                tree iteration (already inserted into the cache).
            tree_candidate_ids: (batch, tree_candidate_len) candidate ids.
            past_attn_mask: running mask over cached positions, or None to
                force re-initialization.
            medusa_attn_mask: tree-structured mask for the candidates
                (converted to bool here; 1 = attend).
            past_key_values: cache; only ``get_seq_length()`` is read.

        Returns:
            (tree_attn_mask, past_attn_mask): the full mask for this forward
            pass and the updated running past mask to reuse next iteration.
        """
        batch_size, tree_candidate_len = tree_candidate_ids.shape
        seen_tokens = past_key_values.get_seq_length()
        # NOTE: past_key_values has already been updated, so seen_tokens
        # includes the new tokens accepted in the last tree iteration.
        assert seen_tokens > 0
        # One iteration must not cross two windows.
        assert last_iter_new_tokens < self.config.window_size

        if past_attn_mask is not None and seen_tokens < self.config.window_size:
            # Still inside the first window: just append all-visible columns
            # for the tokens accepted last iteration.
            past_attn_mask = torch.cat(
                [
                    past_attn_mask,
                    torch.ones(
                        [batch_size, 1, tree_candidate_len, last_iter_new_tokens],
                        dtype=torch.bool,
                        device=self.device
                    )
                ],
                dim=-1
            )
        else:
            # Re-initialize the attention mask each time:
            # 1. the model crosses a window boundary, or
            # 2. right after prefilling (past_attn_mask is None).
            chunks_per_window = int(self.config.window_size // self.config.chunk_size)

            # Fully-seen windows are represented by their RFA chunk summaries
            # (chunks_per_window columns each); the current partial window
            # contributes one column per token.
            window_tokens = seen_tokens % self.config.window_size
            num_windows_seen_so_far = seen_tokens // self.config.window_size
            attn_mask_len = num_windows_seen_so_far * chunks_per_window + window_tokens
            past_attn_mask = torch.ones(
                (batch_size, 1, tree_candidate_len, attn_mask_len),
                dtype=torch.bool,
                device=self.device
            )

        # Note that 1 indicates the position is NOT masked.
        tree_attn_mask = torch.cat(
            [
                past_attn_mask,
                medusa_attn_mask.to(torch.bool)
            ],
            dim=-1
        )
        return tree_attn_mask, past_attn_mask
692
+
693
    @torch.no_grad()
    def multi_byte_generate(
        self,
        input_ids,
        attention_mask=None,
        temperature=0.0,
        max_length=None,
        max_new_tokens=None,
        stopping_criteria=None,
        posterior_threshold=0.09,
        posterior_alpha=0.3,
        top_p=0.8,
        sampling='typical',
        fast=True,
        do_sample=False,
        medusa_choices=None,
        return_acc_lengths=False
    ):
        """Generate with Medusa-style multi-byte (tree) speculative decoding.

        Each iteration: (1) draft a tree of candidate continuations from the
        Medusa heads, (2) score the whole tree in one forward pass, (3) accept
        the longest verified candidate path, (4) update the EVA window/RFA
        cache accordingly.

        Args:
            input_ids: (1, seq_len) prompt ids; batch size 1 only.
            attention_mask: must be None (unsupported here).
            temperature / do_sample: > 0 or True disables the `fast` greedy path.
            max_length / max_new_tokens: stopping lengths; max_new_tokens wins.
            stopping_criteria: optional extra StoppingCriteriaList entries.
            posterior_threshold / posterior_alpha / top_p / sampling / fast:
                candidate acceptance knobs passed through to
                generate_candidates / evaluate_posterior.
            medusa_choices: tree topology; defaults to `evabyte_7b_95`.
            return_acc_lengths: also return per-iteration accepted lengths.

        Returns:
            input_ids with generated tokens appended, or
            (input_ids, acc_lengths) when return_acc_lengths is True.
        """
        if do_sample or temperature > 0.0:
            fast = False

        ### Prepare `max_length` depending on other stopping criteria.
        if max_new_tokens is not None:
            max_length = max_new_tokens + input_ids.shape[-1]
        elif max_new_tokens is None and max_length is None:
            max_length = getattr(self.config, "max_position_embeddings", 32768)

        ### Set up stopping criteria
        eos_stop_criteria = MultibyteEosTokenCriteria(self.generation_config.eos_token_id)
        stop_criteria = StoppingCriteriaList()
        if max_length is not None:
            max_position_embeddings = getattr(self.config, "max_position_embeddings", None)
            stop_criteria.append(
                MaxLengthCriteria(
                    max_length=max_length,
                    max_position_embeddings=max_position_embeddings,
                )
            )
        if stopping_criteria is not None and len(stopping_criteria) > 0:
            stop_criteria.extend(stopping_criteria)

        assert input_ids.shape[0] == 1, "Only support batch size 1 for now"
        assert attention_mask is None, "Only support attention mask None for now"
        # Avoid modifying the input_ids in-place
        input_ids = input_ids.clone()
        position_ids = torch.arange(0, input_ids.shape[1], device=self.device, dtype=int).reshape(1, -1)

        ####################################################
        # 0. initialize the medusa buffers
        ####################################################
        if medusa_choices is None:
            medusa_choices = evabyte_7b_95
        medusa_buffers = generate_medusa_buffers(
            medusa_choices, device=self.device
        )

        past_key_values = EvaStaticCacheForTriton(
            input_ids.shape[0],
            self.config.num_attention_heads,
            # we add 256 to allow tree ids
            self.config.window_size + 256,
            self.config.hidden_size // self.config.num_attention_heads,
            self.config.num_hidden_layers,
            self.lm_head.weight.dtype,
            self.lm_head.weight.device,
        )
        # prefill to get medusa logits and logits
        full_logits, past_key_values = self.forward(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            use_cache=True,
            past_key_values=past_key_values,
            return_all_pred_logits=True,
            multibyte_decoding=False,
        )
        # handles an edge case where the prefill length == window_size:
        # we force the previous window to be dumped into RFA chunks
        past_key_values = self._multi_byte_pred_update_cache_when_prefil_len_eq_window_size(
            past_key_values
        )
        medusa_logits, logits = split_logits(full_logits)

        past_attn_mask = None
        last_iter_new_tokens = 0
        # hard upper bound on tree iterations; stop_criteria normally fires first
        max_iters = 32768
        if return_acc_lengths:
            acc_lengths = []
        for _ in range(max_iters):
            ####################################################
            # 1. generate candidate_ids with topk predictions from Medusa heads
            ####################################################
            tree_candidate_ids, unflattened_candidate_ids = generate_candidates(
                medusa_logits,
                logits,
                medusa_buffers["tree_indices"],
                medusa_buffers["retrieve_indices"],
                temperature=temperature,
                posterior_alpha=posterior_alpha,
                posterior_threshold=posterior_threshold,
                top_p=top_p,
                sampling=sampling,
                fast=fast,
            )

            ####################################################
            # 2. Build the medusa attention mask and position ids
            ####################################################
            # NOTE: 1 indicates the position is not masked
            medusa_attn_mask, past_attn_mask = self.multi_byte_pred_update_attn_mask(
                last_iter_new_tokens,
                tree_candidate_ids,
                past_attn_mask,
                medusa_buffers["medusa_attn_mask"],
                past_key_values,
            )
            medusa_position_ids = medusa_buffers["medusa_position_ids"] + input_ids.shape[1]

            ####################################################
            # 3. tree decoding
            ####################################################
            tree_full_logits, past_key_values = self.forward(
                tree_candidate_ids,
                past_key_values=past_key_values,
                attention_mask=medusa_attn_mask,
                position_ids=medusa_position_ids,
                return_all_pred_logits=True,
                multibyte_decoding=True,
            )
            _medusa_logits, _logits = split_logits(tree_full_logits)
            # gather per-candidate-path logits from the flattened tree outputs
            medusa_logits = _medusa_logits[..., 0, medusa_buffers["retrieve_indices"], :]
            logits = _logits[..., 0, medusa_buffers["retrieve_indices"], :]

            ####################################################
            # 4. candidate selection
            ####################################################
            # if the current iteration, with tree tokens, crosses window
            # boundaries, trim the candidate_ids to be within the window
            # so that those exceeded tokens (which will be inaccurate)
            # will not be considered
            tree_depth = unflattened_candidate_ids.shape[-1]
            if tree_depth + past_key_values.get_past_window_pos(0) > self.config.window_size:
                max_acc_len = self.config.window_size - past_key_values.get_past_window_pos(0)
                _trimmed_unflattened_candidate_ids = unflattened_candidate_ids[:, :max_acc_len]
                _trimmed_logits = logits[:, :max_acc_len]
            else:
                _trimmed_unflattened_candidate_ids = unflattened_candidate_ids
                _trimmed_logits = logits
            best_candidate, accept_length = evaluate_posterior(
                _trimmed_logits,
                _trimmed_unflattened_candidate_ids,
                temperature,
                posterior_threshold,
                posterior_alpha,
                top_p=top_p,
                sampling=sampling,
                fast=fast
            )

            ####################################################
            # 5. update model inputs and caches
            ####################################################
            input_ids, medusa_logits, logits, last_iter_new_tokens = update_inference_inputs(
                input_ids,
                medusa_logits,
                logits,
                unflattened_candidate_ids,
                best_candidate,
                accept_length,
            )

            past_key_values = self.multi_byte_pred_update_cache(
                past_key_values,
                medusa_buffers["retrieve_indices"],
                best_candidate,
                last_iter_new_tokens,
            )

            if return_acc_lengths:
                acc_lengths.append(last_iter_new_tokens)
            if stop_criteria(input_ids, None) or eos_stop_criteria(input_ids, last_iter_new_tokens):
                if return_acc_lengths:
                    return input_ids, acc_lengths
                else:
                    return input_ids
        # max_iters exhausted without a stopping criterion firing
        if return_acc_lengths:
            return input_ids, acc_lengths
        else:
            return input_ids
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/preprocessor_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoImageProcessor": "image_processing_evabyte.EvaByteImageProcessor",
4
+ "AutoProcessor": "processing_evabyte.EvaByteProcessor"
5
+ },
6
+ "do_convert_rgb": true,
7
+ "do_resize": true,
8
+ "image_processor_type": "EvaByteImageProcessor",
9
+ "jpeg_quality": 25,
10
+ "jpeg_restart_marker_blocks": 1,
11
+ "jpeg_streamtype": 2,
12
+ "jpeg_subsampling": "4:2:0",
13
+ "processor_class": "EvaByteProcessor",
14
+ "resample": 1,
15
+ "size": {
16
+ "longest_edge": 384
17
+ }
18
+ }
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/processing_evabyte.py ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ """
3
+ Processor class for EvaByte.
4
+ """
5
+ import base64
6
+ from io import BytesIO
7
+
8
+ import requests
9
+ import os
10
+ import PIL
11
+ from PIL import Image
12
+
13
+ from typing import List, Optional, Union
14
+
15
+ from transformers.feature_extraction_utils import BatchFeature
16
+ from transformers.image_utils import ImageInput, is_valid_image
17
+ from transformers.processing_utils import ProcessorMixin
18
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
19
+ from transformers.utils import TensorType, to_py_obj
20
+
21
+ def fetch_image(image: Union[str, "PIL.Image.Image"]) -> Image.Image:
22
+ image_obj = None
23
+ if isinstance(image, Image.Image):
24
+ image_obj = image
25
+ elif image.startswith("http://") or image.startswith("https://"):
26
+ image_obj = Image.open(BytesIO(requests.get(image, timeout=None).content))
27
+ elif os.path.isfile(image):
28
+ image_obj = Image.open(image)
29
+ elif image.startswith("data:image/"):
30
+ image = image.split(",")[1]
31
+ # Try to load as base64
32
+ try:
33
+ b64 = base64.decodebytes(image.encode())
34
+ image = PIL.Image.open(BytesIO(b64))
35
+ except Exception as e:
36
+ raise ValueError(
37
+ f"Incorrect image source. Must be a valid URL starting with `http://` or `https://`, a valid path to an image file, or a base64 encoded string. Got {image}. Failed with {e}"
38
+ )
39
+ else:
40
+ image_obj = Image.open(image)
41
+ if image_obj is None:
42
+ raise ValueError(f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}")
43
+
44
+ return image_obj
45
+
46
+ def is_url(val) -> bool:
47
+ return isinstance(val, str) and val.startswith("http")
48
+
49
+ def is_file(val) -> bool:
50
+ return isinstance(val, str) and os.path.isfile(val)
51
+
52
+ def is_image_or_image_url(elem):
53
+ return is_url(elem) or is_valid_image(elem) or is_file(elem)
54
+
55
+ vl_chat_template = """
56
+ {{- bos_token }}
57
+ {%- if messages[0]['role'] == 'system' %}
58
+ {%- set system_message = messages[0]['content'] %}
59
+ {%- set messages = messages[1:] %}
60
+ {%- else %}
61
+ {%- set system_message = "" %}
62
+ {%- endif %}
63
+
64
+ {{- '<|start_header_id|>system<|end_header_id|>\n\n' + system_message + '<|eot_id|>'}}
65
+
66
+ {%- for message in messages %}
67
+ {%- if (message['role'] != 'user') and (message['role'] != 'assistant') %}
68
+ {{- raise_exception('Conversation roles must be user or assistant') }}
69
+ {%- endif %}
70
+
71
+ {%- if message['content'] is string %}
72
+ {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }}
73
+ {%- else %}
74
+ {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }}
75
+ {%- for content in message['content'] %}
76
+ {%- if content['type'] == 'image' %}
77
+ {{- '<image_placeholder>\n' }}
78
+ {%- elif content['type'] == 'text' %}
79
+ {{- content['text'] }}
80
+ {%- endif %}
81
+ {%- endfor %}
82
+ {{- '<|eot_id|>' }}
83
+ {%- endif %}
84
+ {%- endfor %}
85
+
86
+ {%- if add_generation_prompt %}
87
+ {{- '<|start_header_id|>' + 'assistant' + '<|end_header_id|>\n\n' }}
88
+ {%- endif %}
89
+ """
90
+
91
+ class EvaByteProcessor(ProcessorMixin):
92
+ r"""
93
+ Constructs a EvaByte processor which wraps a EvaByte image processor and a EvaByte tokenizer into a single processor.
94
+
95
+ [`EvaByteProcessor`] offers all the functionalities of [`EvaByteImageProcessor`] and [`EvaByteTokenizer`]. See the
96
+ [`~EvaByteProcessor.__call__`] and [`~EvaByteProcessor.decode`] for more information.
97
+
98
+ Args:
99
+ image_processor ([`EvaByteImageProcessor`], *optional*):
100
+ The image processor is a required input.
101
+ tokenizer ([`EvaByteTokenizer`], *optional*):
102
+ The tokenizer is a required input.
103
+ """
104
+
105
+ attributes = ["image_processor", "tokenizer"]
106
+ image_processor_class = "AutoImageProcessor"
107
+ tokenizer_class = "AutoTokenizer"
108
+
109
+ def __init__(self, image_processor=None, tokenizer=None, **kwargs):
110
+ if image_processor is None:
111
+ raise ValueError("You need to specify an `image_processor`.")
112
+ if tokenizer is None:
113
+ raise ValueError("You need to specify a `tokenizer`.")
114
+
115
+ super().__init__(image_processor, tokenizer)
116
+ self.t2v_token_id = self.tokenizer.convert_tokens_to_ids("<t2v_token>")
117
+ self.v2t_token_id = self.tokenizer.convert_tokens_to_ids("<v2t_token>")
118
+ self.image_placeholder = "<image_placeholder>"
119
+ self.vl_chat_template = vl_chat_template
120
+
121
+ def __call__(
122
+ self,
123
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
124
+ images: ImageInput = None,
125
+ return_tensors: Optional[Union[str, TensorType]] = None,
126
+ strip_ending_sentinel: bool = False,
127
+ encode_only: bool = False,
128
+ **kwargs
129
+ ) -> Union[BatchFeature, List[List[int]]]:
130
+ # processing pipeline:
131
+ # 1. read images or videos from paths
132
+ # 2. use image_processor to convert images / videos to byte streams
133
+ if images is not None:
134
+ if isinstance(images, bytes):
135
+ image_bytes_list = [[images]]
136
+ elif isinstance(images, list) and isinstance(images[0], bytes):
137
+ image_bytes_list = [images]
138
+ elif isinstance(images, list) and isinstance(images[0], list) and isinstance(images[0][0], bytes):
139
+ image_bytes_list = images
140
+ else:
141
+ if is_image_or_image_url(images):
142
+ images = [[images]]
143
+ elif isinstance(images, list) and is_image_or_image_url(images[0]):
144
+ images = [images]
145
+ elif (
146
+ not isinstance(images, list)
147
+ and not isinstance(images[0], list)
148
+ and not is_image_or_image_url(images[0][0])
149
+ ):
150
+ raise ValueError(
151
+ "Invalid input images. Please provide a single image or a list of images or a list of list of images."
152
+ )
153
+ # Load images if they are URLs
154
+ images = [[fetch_image(im) if is_url(im) or is_file(im) else im for im in sample] for sample in images]
155
+ image_bytes_list = self.image_processor(images=images, **kwargs)
156
+
157
+ if not isinstance(text, list):
158
+ text = [text]
159
+ assert len(text) == 1, "Only support batch size 1 for now"
160
+ assert len(text) == len(image_bytes_list), "text and image_bytes_list must have the same length"
161
+ # TODO: invoke SequenceFeatureExtractor to get batched inputs
162
+
163
+ # 3. tokenize the text and put images / videos byte streams into the placeholders
164
+ # surrounded by special tokens like "<image>" and "</image>"
165
+ batch_input_ids = []
166
+ if not encode_only:
167
+ batch_attention_mask = []
168
+ else:
169
+ batch_attention_mask = None
170
+
171
+ for t, image_bytes in zip(text, image_bytes_list):
172
+ text_splits = t.split(self.image_placeholder)
173
+ if len(text_splits) != len(image_bytes) + 1:
174
+ raise ValueError(
175
+ f"The number of image tokens should be equal to the number of images, "
176
+ f"but got {len(text_splits)} and {len(image_bytes) + 1}"
177
+ )
178
+
179
+ input_ids = [self.tokenizer.bos_token_id]
180
+ for i, text_part in enumerate(text_splits):
181
+ # each text part must be non-empty because we added markers around placeholders
182
+ split_tokens = self.tokenizer.encode(text_part, add_special_tokens=False)
183
+ input_ids.extend(split_tokens)
184
+ # Add image bytes after each text part except the last one
185
+ if i < len(image_bytes):
186
+ input_ids.append(self.t2v_token_id)
187
+ input_ids.extend([b + self.tokenizer.offset for b in image_bytes[i]])
188
+ input_ids.append(self.v2t_token_id)
189
+
190
+ if strip_ending_sentinel and (input_ids[-1] in [self.t2v_token_id, self.v2t_token_id]):
191
+ input_ids = input_ids[:-1]
192
+
193
+ batch_input_ids.append(input_ids)
194
+ if not encode_only:
195
+ batch_attention_mask.append([1] * len(input_ids))
196
+
197
+ if not encode_only:
198
+ # 4. return batch of features
199
+ inputs = BatchFeature({
200
+ "input_ids": batch_input_ids,
201
+ "attention_mask": batch_attention_mask
202
+ }, tensor_type=return_tensors)
203
+ return inputs
204
+ # # Pad sequences
205
+ # padded_inputs = self.tokenizer.pad(
206
+ # {"input_ids": batch_input_ids},
207
+ # padding=True,
208
+ # return_attention_mask=True,
209
+ # return_tensors=return_tensors,
210
+ # )
211
+ # return BatchFeature(data=padded_inputs)
212
+ else:
213
+ return batch_input_ids
214
+
215
+ def image_tokens_to_bytes(self, image_token_ids, jpeg_quality=None):
216
+ image_bytes = bytes([token_id - self.tokenizer.offset for token_id in image_token_ids])
217
+ image_bytes = self.image_processor.jpeg_merge_qtables(image_bytes, jpeg_quality)
218
+ return image_bytes
219
+
220
+ def batch_decode(self, sequences, **kwargs):
221
+ """
222
+ This method forwards all its arguments to EvaByteTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
223
+ refer to the docstring of this method for more information.
224
+ """
225
+ rets = [self.decode(seq, **kwargs) for seq in sequences]
226
+ return tuple(map(list, zip(*rets)))
227
+
228
+ def decode(self, token_ids, **kwargs):
229
+ """
230
+ Decodes a sequence of input_ids, handling image tokens separately.
231
+ Returns a tuple of (decoded_text, images), where images is a list of bytes.
232
+ """
233
+ if kwargs and "jpeg_quality" in kwargs:
234
+ kwargs = kwargs.copy()
235
+ jpeg_quality = kwargs.pop("jpeg_quality")
236
+ else:
237
+ jpeg_quality = None
238
+
239
+ token_ids = to_py_obj(token_ids)
240
+ # Find indices of t2v_token_id and v2t_token_id
241
+ t2v_indices = [i for i, token_id in enumerate(token_ids) if token_id == self.t2v_token_id]
242
+ v2t_indices = [i for i, token_id in enumerate(token_ids) if token_id == self.v2t_token_id]
243
+
244
+ # Check for correct pairing of t2v and v2t tokens
245
+ if len(t2v_indices) != len(v2t_indices):
246
+ raise ValueError("Mismatched number of t2v and v2t tokens in token_ids: {} and {}".format(t2v_indices, v2t_indices))
247
+
248
+ # Ensure t2v and v2t tokens are in the correct order
249
+ for t2v_idx, v2t_idx in zip(t2v_indices, v2t_indices):
250
+ if t2v_idx >= v2t_idx:
251
+ raise ValueError("Found t2v_token_id after v2t_token_id in token_ids")
252
+
253
+ # Initialize the start index
254
+ images = []
255
+ decoded_text = ""
256
+
257
+ start = 0
258
+ # Iterate over pairs of t2v and v2t indices
259
+ for t2v_idx, v2t_idx in zip(t2v_indices, v2t_indices):
260
+ # Decode text tokens before the image
261
+ text_token_ids = token_ids[start:t2v_idx]
262
+ if len(text_token_ids) > 0:
263
+ decoded_text += self.tokenizer.decode(text_token_ids, **kwargs)
264
+
265
+ # Insert image placeholder
266
+ decoded_text += self.image_placeholder
267
+
268
+ # Extract image tokens and convert them to bytes
269
+ image_token_ids = token_ids[t2v_idx + 1 : v2t_idx]
270
+ image_bytes = self.image_tokens_to_bytes(image_token_ids, jpeg_quality)
271
+ images.append(image_bytes)
272
+
273
+ # Update the start index to the token after v2t_token_id
274
+ start = v2t_idx + 1
275
+
276
+ # Decode any remaining text tokens after the last image
277
+ if start < len(token_ids):
278
+ text_token_ids = token_ids[start:]
279
+ decoded_text += self.tokenizer.decode(text_token_ids, **kwargs)
280
+
281
+ return decoded_text, images
282
+
283
+ @property
284
+ def model_input_names(self):
285
+ tokenizer_input_names = self.tokenizer.model_input_names
286
+ image_processor_input_names = self.image_processor.model_input_names
287
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/processor_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_evabyte.EvaByteProcessor"
4
+ },
5
+ "processor_class": "EvaByteProcessor"
6
+ }
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/special_tokens_map.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<repo_name>",
4
+ "<file_sep>",
5
+ "<t2v_token>",
6
+ "<v2t_token>",
7
+ "<|start_header_id|>",
8
+ "<|end_header_id|>",
9
+ "<|eot_id|>",
10
+ "<extra_id_12>",
11
+ "<extra_id_13>",
12
+ "<extra_id_14>",
13
+ "<extra_id_15>",
14
+ "<extra_id_16>",
15
+ "<extra_id_17>",
16
+ "<extra_id_18>",
17
+ "<extra_id_19>",
18
+ "<extra_id_20>",
19
+ "<extra_id_21>",
20
+ "<extra_id_22>",
21
+ "<extra_id_23>",
22
+ "<extra_id_24>",
23
+ "<extra_id_25>",
24
+ "<extra_id_26>",
25
+ "<extra_id_27>",
26
+ "<extra_id_28>",
27
+ "<extra_id_29>",
28
+ "<extra_id_30>",
29
+ "<extra_id_31>",
30
+ "<extra_id_32>",
31
+ "<extra_id_33>",
32
+ "<extra_id_34>",
33
+ "<extra_id_35>",
34
+ "<extra_id_36>",
35
+ "<extra_id_37>",
36
+ "<extra_id_38>",
37
+ "<extra_id_39>",
38
+ "<extra_id_40>",
39
+ "<extra_id_41>",
40
+ "<extra_id_42>",
41
+ "<extra_id_43>",
42
+ "<extra_id_44>",
43
+ "<extra_id_45>",
44
+ "<extra_id_46>",
45
+ "<extra_id_47>",
46
+ "<extra_id_48>",
47
+ "<extra_id_49>",
48
+ "<extra_id_50>",
49
+ "<extra_id_51>",
50
+ "<extra_id_52>",
51
+ "<extra_id_53>",
52
+ "<extra_id_54>",
53
+ "<extra_id_55>",
54
+ "<extra_id_56>",
55
+ "<extra_id_57>",
56
+ "<extra_id_58>",
57
+ "<extra_id_59>",
58
+ "<extra_id_60>",
59
+ "<extra_id_61>",
60
+ "<extra_id_62>",
61
+ "<extra_id_63>"
62
+ ],
63
+ "bos_token": {
64
+ "content": "<bos>",
65
+ "lstrip": false,
66
+ "normalized": true,
67
+ "rstrip": false,
68
+ "single_word": false
69
+ },
70
+ "eos_token": {
71
+ "content": "<eos>",
72
+ "lstrip": false,
73
+ "normalized": true,
74
+ "rstrip": false,
75
+ "single_word": false
76
+ },
77
+ "pad_token": {
78
+ "content": "<pad>",
79
+ "lstrip": false,
80
+ "normalized": true,
81
+ "rstrip": false,
82
+ "single_word": false
83
+ },
84
+ "sep_token": {
85
+ "content": "<sep>",
86
+ "lstrip": false,
87
+ "normalized": true,
88
+ "rstrip": false,
89
+ "single_word": false
90
+ },
91
+ "unk_token": {
92
+ "content": "<unk>",
93
+ "lstrip": false,
94
+ "normalized": true,
95
+ "rstrip": false,
96
+ "single_word": false
97
+ }
98
+ }
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/tokenization_evabyte.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+
3
+ """ Tokenization class for model EvaByte."""
4
+
5
+
6
+ from typing import List, Optional, Tuple
7
+
8
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
9
+ from transformers.utils import logging
10
+
11
+
12
+ logger = logging.get_logger(__name__)
13
+
14
+
15
# Jinja chat template for text-only conversations: an optional leading
# system message, then alternating user/assistant turns wrapped in
# <|start_header_id|>/<|end_header_id|>/<|eot_id|> markers.
chat_template = """
{{- bos_token }}
{%- if messages[0]['role'] == 'system' %}
    {%- set system_message = messages[0]['content'] %}
    {%- set messages = messages[1:] %}
{%- else %}
    {%- set system_message = "" %}
{%- endif %}

{{- '<|start_header_id|>system<|end_header_id|>\n\n' + system_message + '<|eot_id|>'}}

{%- for message in messages %}
    {%- if (message['role'] != 'user') and (message['role'] != 'assistant') %}
        {{- raise_exception('Conversation roles must be user or assistant') }}
    {%- endif %}

    {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }}
{%- endfor %}

{%- if add_generation_prompt %}
    {{- '<|start_header_id|>' + 'assistant' + '<|end_header_id|>\n\n' }}
{%- endif %}
"""
38
+
39
class EvaByteTokenizer(PreTrainedTokenizer):
    """Byte-level tokenizer: each UTF-8 byte maps to one id, shifted by the
    number of special tokens (``self.offset``). Ids [0, offset) are special
    tokens; ids [offset, offset + 256) are raw byte values."""

    def __init__(
        self,
        bos_token="<bos>",
        eos_token="<eos>",
        unk_token="<unk>",
        sep_token="<sep>",
        pad_token="<pad>",
        extra_ids=59,
        additional_special_tokens=None,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ) -> None:
        # Ids 0-4 are pad/bos/eos/unk/sep; extra ids start right after.
        num_base_special_tokens = 5
        # Add extra_ids to the special token list
        if extra_ids > 0 and additional_special_tokens is None:
            additional_special_tokens = [f"<extra_id_{i}>" for i in range(num_base_special_tokens, extra_ids + num_base_special_tokens)]
        elif extra_ids > 0 and additional_special_tokens is not None and len(additional_special_tokens) > 0:
            # Check that we have the right number of extra_id special tokens
            extra_tokens = len(set(filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens)))
            if extra_tokens != extra_ids:
                raise ValueError(
                    f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
                    " provided to EvaByteTokenizer. In this case the additional_special_tokens must include the"
                    " extra_ids tokens"
                )

        #### override some reserved tokens to support chat template
        # <extra_id_5>..<extra_id_11> are repurposed as named special tokens.
        for i, token in enumerate(additional_special_tokens):
            if token == "<extra_id_5>":
                token = "<repo_name>"
            elif token == "<extra_id_6>":
                token = "<file_sep>"
            elif token == "<extra_id_7>":
                token = "<t2v_token>"
            elif token == "<extra_id_8>":
                token = "<v2t_token>"
            elif token == "<extra_id_9>":
                token = "<|start_header_id|>"
            elif token == "<extra_id_10>":
                token = "<|end_header_id|>"
            elif token == "<extra_id_11>":
                token = "<|eot_id|>"
            additional_special_tokens[i] = token

        # lstrip and rstrip are set to False because we don't want to strip the whitespace from the special tokens
        # this would be important for the byte tokenizer
        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token

        self._added_tokens_decoder = {
            0: pad_token,
            1: bos_token,
            2: eos_token,
            3: unk_token,  # unk_token is a placeholder
            4: sep_token,
            **{i: AddedToken(t, lstrip=False, rstrip=False) for i, t in enumerate(additional_special_tokens, start=num_base_special_tokens)},
        }
        # Byte ids start after all special tokens.
        self.offset = len(self._added_tokens_decoder)
        self._utf_vocab_size = 2**8  # utf is 8 bits
        self.add_bos_token = True
        self.add_eos_token = False
        super().__init__(
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            extra_ids=0,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )
        self.chat_template = chat_template


    @property
    def vocab_size(self):
        # Only the 256 byte tokens; special tokens are counted separately.
        return self._utf_vocab_size

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size + self.offset)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.build_inputs_with_special_tokens
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = bos_token_id + token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        return output

    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.get_special_tokens_mask
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        bos_token_id = [1] if self.add_bos_token else []
        eos_token_id = [1] if self.add_eos_token else []

        if token_ids_1 is None:
            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
        return (
            bos_token_id
            + ([0] * len(token_ids_0))
            + eos_token_id
            + bos_token_id
            + ([0] * len(token_ids_1))
            + eos_token_id
        )

    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.create_token_type_ids_from_sequences
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence | second sequence |
        ```

        if token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)

        if token_ids_1 is not None:
            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)

        return output

    def _tokenize(self, text: str) -> List[str]:
        """Take as input a string and return a list of strings (tokens) for words/sub-words.

        Here each token is a single UTF-8 byte, represented as chr(byte)."""
        tokens = [chr(i) for i in text.encode("utf-8")]
        return tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        # Byte tokens are single characters; anything longer is unknown here
        # (multi-char special tokens are resolved by the added-tokens machinery).
        if len(token) != 1:
            token_id = None
        else:
            token_id = ord(token) + self.offset

        return token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a byte (str) using the vocab."""
        token = chr(index - self.offset)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of bytes (string) to a single string."""
        bstring = b""
        for token in tokens:
            # NOTE(review): added_tokens_decoder is keyed by int ids while
            # `token` is a str, so this first branch likely never matches;
            # the encoder branch below is what handles special tokens —
            # confirm before relying on it.
            if token in self.added_tokens_decoder:
                tok_string = self.added_tokens_decoder[token].encode("utf-8")
            elif token in self.added_tokens_encoder:
                tok_string = token.encode("utf-8")
            else:
                tok_string = bytes([ord(token)])
            bstring += tok_string
        # Invalid UTF-8 byte sequences are silently dropped.
        string = bstring.decode("utf-8", errors="ignore")
        return string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        # Byte-level vocab is implicit; there is nothing to save.
        return ()
ckpts/ocpython_14b_bsz-2m_seq16k_100raw_docmask_100B_2m_step-50000/tokenizer_config.json ADDED
@@ -0,0 +1,596 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<pad>",
5
+ "lstrip": false,
6
+ "normalized": true,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<bos>",
13
+ "lstrip": false,
14
+ "normalized": true,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "<eos>",
21
+ "lstrip": false,
22
+ "normalized": true,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "<sep>",
37
+ "lstrip": false,
38
+ "normalized": true,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "5": {
44
+ "content": "<repo_name>",
45
+ "lstrip": false,
46
+ "normalized": true,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": false
50
+ },
51
+ "6": {
52
+ "content": "<file_sep>",
53
+ "lstrip": false,
54
+ "normalized": true,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": false
58
+ },
59
+ "7": {
60
+ "content": "<t2v_token>",
61
+ "lstrip": false,
62
+ "normalized": true,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": false
66
+ },
67
+ "8": {
68
+ "content": "<v2t_token>",
69
+ "lstrip": false,
70
+ "normalized": true,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": false
74
+ },
75
+ "9": {
76
+ "content": "<|start_header_id|>",
77
+ "lstrip": false,
78
+ "normalized": true,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": false
82
+ },
83
+ "10": {
84
+ "content": "<|end_header_id|>",
85
+ "lstrip": false,
86
+ "normalized": true,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": false
90
+ },
91
+ "11": {
92
+ "content": "<|eot_id|>",
93
+ "lstrip": false,
94
+ "normalized": true,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": false
98
+ },
99
+ "12": {
100
+ "content": "<extra_id_12>",
101
+ "lstrip": false,
102
+ "normalized": true,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": false
106
+ },
107
+ "13": {
108
+ "content": "<extra_id_13>",
109
+ "lstrip": false,
110
+ "normalized": true,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": false
114
+ },
115
+ "14": {
116
+ "content": "<extra_id_14>",
117
+ "lstrip": false,
118
+ "normalized": true,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": false
122
+ },
123
+ "15": {
124
+ "content": "<extra_id_15>",
125
+ "lstrip": false,
126
+ "normalized": true,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": false
130
+ },
131
+ "16": {
132
+ "content": "<extra_id_16>",
133
+ "lstrip": false,
134
+ "normalized": true,
135
+ "rstrip": false,
136
+ "single_word": false,
137
+ "special": false
138
+ },
139
+ "17": {
140
+ "content": "<extra_id_17>",
141
+ "lstrip": false,
142
+ "normalized": true,
143
+ "rstrip": false,
144
+ "single_word": false,
145
+ "special": false
146
+ },
147
+ "18": {
148
+ "content": "<extra_id_18>",
149
+ "lstrip": false,
150
+ "normalized": true,
151
+ "rstrip": false,
152
+ "single_word": false,
153
+ "special": false
154
+ },
155
+ "19": {
156
+ "content": "<extra_id_19>",
157
+ "lstrip": false,
158
+ "normalized": true,
159
+ "rstrip": false,
160
+ "single_word": false,
161
+ "special": false
162
+ },
163
+ "20": {
164
+ "content": "<extra_id_20>",
165
+ "lstrip": false,
166
+ "normalized": true,
167
+ "rstrip": false,
168
+ "single_word": false,
169
+ "special": false
170
+ },
171
+ "21": {
172
+ "content": "<extra_id_21>",
173
+ "lstrip": false,
174
+ "normalized": true,
175
+ "rstrip": false,
176
+ "single_word": false,
177
+ "special": false
178
+ },
179
+ "22": {
180
+ "content": "<extra_id_22>",
181
+ "lstrip": false,
182
+ "normalized": true,
183
+ "rstrip": false,
184
+ "single_word": false,
185
+ "special": false
186
+ },
187
+ "23": {
188
+ "content": "<extra_id_23>",
189
+ "lstrip": false,
190
+ "normalized": true,
191
+ "rstrip": false,
192
+ "single_word": false,
193
+ "special": false
194
+ },
195
+ "24": {
196
+ "content": "<extra_id_24>",
197
+ "lstrip": false,
198
+ "normalized": true,
199
+ "rstrip": false,
200
+ "single_word": false,
201
+ "special": false
202
+ },
203
+ "25": {
204
+ "content": "<extra_id_25>",
205
+ "lstrip": false,
206
+ "normalized": true,
207
+ "rstrip": false,
208
+ "single_word": false,
209
+ "special": false
210
+ },
211
+ "26": {
212
+ "content": "<extra_id_26>",
213
+ "lstrip": false,
214
+ "normalized": true,
215
+ "rstrip": false,
216
+ "single_word": false,
217
+ "special": false
218
+ },
219
+ "27": {
220
+ "content": "<extra_id_27>",
221
+ "lstrip": false,
222
+ "normalized": true,
223
+ "rstrip": false,
224
+ "single_word": false,
225
+ "special": false
226
+ },
227
+ "28": {
228
+ "content": "<extra_id_28>",
229
+ "lstrip": false,
230
+ "normalized": true,
231
+ "rstrip": false,
232
+ "single_word": false,
233
+ "special": false
234
+ },
235
+ "29": {
236
+ "content": "<extra_id_29>",
237
+ "lstrip": false,
238
+ "normalized": true,
239
+ "rstrip": false,
240
+ "single_word": false,
241
+ "special": false
242
+ },
243
+ "30": {
244
+ "content": "<extra_id_30>",
245
+ "lstrip": false,
246
+ "normalized": true,
247
+ "rstrip": false,
248
+ "single_word": false,
249
+ "special": false
250
+ },
251
+ "31": {
252
+ "content": "<extra_id_31>",
253
+ "lstrip": false,
254
+ "normalized": true,
255
+ "rstrip": false,
256
+ "single_word": false,
257
+ "special": false
258
+ },
259
+ "32": {
260
+ "content": "<extra_id_32>",
261
+ "lstrip": false,
262
+ "normalized": true,
263
+ "rstrip": false,
264
+ "single_word": false,
265
+ "special": false
266
+ },
267
+ "33": {
268
+ "content": "<extra_id_33>",
269
+ "lstrip": false,
270
+ "normalized": true,
271
+ "rstrip": false,
272
+ "single_word": false,
273
+ "special": false
274
+ },
275
+ "34": {
276
+ "content": "<extra_id_34>",
277
+ "lstrip": false,
278
+ "normalized": true,
279
+ "rstrip": false,
280
+ "single_word": false,
281
+ "special": false
282
+ },
283
+ "35": {
284
+ "content": "<extra_id_35>",
285
+ "lstrip": false,
286
+ "normalized": true,
287
+ "rstrip": false,
288
+ "single_word": false,
289
+ "special": false
290
+ },
291
+ "36": {
292
+ "content": "<extra_id_36>",
293
+ "lstrip": false,
294
+ "normalized": true,
295
+ "rstrip": false,
296
+ "single_word": false,
297
+ "special": false
298
+ },
299
+ "37": {
300
+ "content": "<extra_id_37>",
301
+ "lstrip": false,
302
+ "normalized": true,
303
+ "rstrip": false,
304
+ "single_word": false,
305
+ "special": false
306
+ },
307
+ "38": {
308
+ "content": "<extra_id_38>",
309
+ "lstrip": false,
310
+ "normalized": true,
311
+ "rstrip": false,
312
+ "single_word": false,
313
+ "special": false
314
+ },
315
+ "39": {
316
+ "content": "<extra_id_39>",
317
+ "lstrip": false,
318
+ "normalized": true,
319
+ "rstrip": false,
320
+ "single_word": false,
321
+ "special": false
322
+ },
323
+ "40": {
324
+ "content": "<extra_id_40>",
325
+ "lstrip": false,
326
+ "normalized": true,
327
+ "rstrip": false,
328
+ "single_word": false,
329
+ "special": false
330
+ },
331
+ "41": {
332
+ "content": "<extra_id_41>",
333
+ "lstrip": false,
334
+ "normalized": true,
335
+ "rstrip": false,
336
+ "single_word": false,
337
+ "special": false
338
+ },
339
+ "42": {
340
+ "content": "<extra_id_42>",
341
+ "lstrip": false,
342
+ "normalized": true,
343
+ "rstrip": false,
344
+ "single_word": false,
345
+ "special": false
346
+ },
347
+ "43": {
348
+ "content": "<extra_id_43>",
349
+ "lstrip": false,
350
+ "normalized": true,
351
+ "rstrip": false,
352
+ "single_word": false,
353
+ "special": false
354
+ },
355
+ "44": {
356
+ "content": "<extra_id_44>",
357
+ "lstrip": false,
358
+ "normalized": true,
359
+ "rstrip": false,
360
+ "single_word": false,
361
+ "special": false
362
+ },
363
+ "45": {
364
+ "content": "<extra_id_45>",
365
+ "lstrip": false,
366
+ "normalized": true,
367
+ "rstrip": false,
368
+ "single_word": false,
369
+ "special": false
370
+ },
371
+ "46": {
372
+ "content": "<extra_id_46>",
373
+ "lstrip": false,
374
+ "normalized": true,
375
+ "rstrip": false,
376
+ "single_word": false,
377
+ "special": false
378
+ },
379
+ "47": {
380
+ "content": "<extra_id_47>",
381
+ "lstrip": false,
382
+ "normalized": true,
383
+ "rstrip": false,
384
+ "single_word": false,
385
+ "special": false
386
+ },
387
+ "48": {
388
+ "content": "<extra_id_48>",
389
+ "lstrip": false,
390
+ "normalized": true,
391
+ "rstrip": false,
392
+ "single_word": false,
393
+ "special": false
394
+ },
395
+ "49": {
396
+ "content": "<extra_id_49>",
397
+ "lstrip": false,
398
+ "normalized": true,
399
+ "rstrip": false,
400
+ "single_word": false,
401
+ "special": false
402
+ },
403
+ "50": {
404
+ "content": "<extra_id_50>",
405
+ "lstrip": false,
406
+ "normalized": true,
407
+ "rstrip": false,
408
+ "single_word": false,
409
+ "special": false
410
+ },
411
+ "51": {
412
+ "content": "<extra_id_51>",
413
+ "lstrip": false,
414
+ "normalized": true,
415
+ "rstrip": false,
416
+ "single_word": false,
417
+ "special": false
418
+ },
419
+ "52": {
420
+ "content": "<extra_id_52>",
421
+ "lstrip": false,
422
+ "normalized": true,
423
+ "rstrip": false,
424
+ "single_word": false,
425
+ "special": false
426
+ },
427
+ "53": {
428
+ "content": "<extra_id_53>",
429
+ "lstrip": false,
430
+ "normalized": true,
431
+ "rstrip": false,
432
+ "single_word": false,
433
+ "special": false
434
+ },
435
+ "54": {
436
+ "content": "<extra_id_54>",
437
+ "lstrip": false,
438
+ "normalized": true,
439
+ "rstrip": false,
440
+ "single_word": false,
441
+ "special": false
442
+ },
443
+ "55": {
444
+ "content": "<extra_id_55>",
445
+ "lstrip": false,
446
+ "normalized": true,
447
+ "rstrip": false,
448
+ "single_word": false,
449
+ "special": false
450
+ },
451
+ "56": {
452
+ "content": "<extra_id_56>",
453
+ "lstrip": false,
454
+ "normalized": true,
455
+ "rstrip": false,
456
+ "single_word": false,
457
+ "special": false
458
+ },
459
+ "57": {
460
+ "content": "<extra_id_57>",
461
+ "lstrip": false,
462
+ "normalized": true,
463
+ "rstrip": false,
464
+ "single_word": false,
465
+ "special": false
466
+ },
467
+ "58": {
468
+ "content": "<extra_id_58>",
469
+ "lstrip": false,
470
+ "normalized": true,
471
+ "rstrip": false,
472
+ "single_word": false,
473
+ "special": false
474
+ },
475
+ "59": {
476
+ "content": "<extra_id_59>",
477
+ "lstrip": false,
478
+ "normalized": true,
479
+ "rstrip": false,
480
+ "single_word": false,
481
+ "special": false
482
+ },
483
+ "60": {
484
+ "content": "<extra_id_60>",
485
+ "lstrip": false,
486
+ "normalized": true,
487
+ "rstrip": false,
488
+ "single_word": false,
489
+ "special": false
490
+ },
491
+ "61": {
492
+ "content": "<extra_id_61>",
493
+ "lstrip": false,
494
+ "normalized": true,
495
+ "rstrip": false,
496
+ "single_word": false,
497
+ "special": false
498
+ },
499
+ "62": {
500
+ "content": "<extra_id_62>",
501
+ "lstrip": false,
502
+ "normalized": true,
503
+ "rstrip": false,
504
+ "single_word": false,
505
+ "special": false
506
+ },
507
+ "63": {
508
+ "content": "<extra_id_63>",
509
+ "lstrip": false,
510
+ "normalized": true,
511
+ "rstrip": false,
512
+ "single_word": false,
513
+ "special": false
514
+ }
515
+ },
516
+ "additional_special_tokens": [
517
+ "<repo_name>",
518
+ "<file_sep>",
519
+ "<t2v_token>",
520
+ "<v2t_token>",
521
+ "<|start_header_id|>",
522
+ "<|end_header_id|>",
523
+ "<|eot_id|>",
524
+ "<extra_id_12>",
525
+ "<extra_id_13>",
526
+ "<extra_id_14>",
527
+ "<extra_id_15>",
528
+ "<extra_id_16>",
529
+ "<extra_id_17>",
530
+ "<extra_id_18>",
531
+ "<extra_id_19>",
532
+ "<extra_id_20>",
533
+ "<extra_id_21>",
534
+ "<extra_id_22>",
535
+ "<extra_id_23>",
536
+ "<extra_id_24>",
537
+ "<extra_id_25>",
538
+ "<extra_id_26>",
539
+ "<extra_id_27>",
540
+ "<extra_id_28>",
541
+ "<extra_id_29>",
542
+ "<extra_id_30>",
543
+ "<extra_id_31>",
544
+ "<extra_id_32>",
545
+ "<extra_id_33>",
546
+ "<extra_id_34>",
547
+ "<extra_id_35>",
548
+ "<extra_id_36>",
549
+ "<extra_id_37>",
550
+ "<extra_id_38>",
551
+ "<extra_id_39>",
552
+ "<extra_id_40>",
553
+ "<extra_id_41>",
554
+ "<extra_id_42>",
555
+ "<extra_id_43>",
556
+ "<extra_id_44>",
557
+ "<extra_id_45>",
558
+ "<extra_id_46>",
559
+ "<extra_id_47>",
560
+ "<extra_id_48>",
561
+ "<extra_id_49>",
562
+ "<extra_id_50>",
563
+ "<extra_id_51>",
564
+ "<extra_id_52>",
565
+ "<extra_id_53>",
566
+ "<extra_id_54>",
567
+ "<extra_id_55>",
568
+ "<extra_id_56>",
569
+ "<extra_id_57>",
570
+ "<extra_id_58>",
571
+ "<extra_id_59>",
572
+ "<extra_id_60>",
573
+ "<extra_id_61>",
574
+ "<extra_id_62>",
575
+ "<extra_id_63>"
576
+ ],
577
+ "auto_map": {
578
+ "AutoProcessor": "processing_evabyte.EvaByteProcessor",
579
+ "AutoTokenizer": [
580
+ "tokenization_evabyte.EvaByteTokenizer",
581
+ null
582
+ ]
583
+ },
584
+ "bos_token": "<bos>",
585
+ "chat_template": "\n{{- bos_token }}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{{- '<|start_header_id|>system<|end_header_id|>\n\n' + system_message + '<|eot_id|>'}}\n\n{%- for message in messages %}\n {%- if (message['role'] != 'user') and (message['role'] != 'assistant') %}\n {{- raise_exception('Conversation roles must be user or assistant') }}\n {%- endif %}\n\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }}\n{%- endfor %}\n\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>' + 'assistant' + '<|end_header_id|>\n\n' }}\n{%- endif %}\n",
586
+ "clean_up_tokenization_spaces": false,
587
+ "eos_token": "<eos>",
588
+ "extra_ids": 0,
589
+ "extra_special_tokens": {},
590
+ "model_max_length": 1000000000000000019884624838656,
591
+ "pad_token": "<pad>",
592
+ "processor_class": "EvaByteProcessor",
593
+ "sep_token": "<sep>",
594
+ "tokenizer_class": "EvaByteTokenizer",
595
+ "unk_token": "<unk>"
596
+ }
ckpts/ocpython_14b_bsz-2m_seq16k_docmask_multipredc2r8_90dynamic-10raw_transsentinel_minsize0ent98line16ow16pack_100B_2m_new_2_step-10000/README.md ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
4
+ # EvaByte Model Card
5
+
6
+ **EvaByte** is a 6.5B **byte-level language model** built upon an improved architecture with multibyte prediction and EVA -- an efficient attention mechanism designed for scalability and performance. Trained on 1.5T bytes spanning natural language text, math, and code, EvaByte demonstrates the viability of efficient byte-level processing at scale -- rivaling top open-source tokenizer-based LMs using 5x less training data, excelling in coding tasks, and decoding up to 2x faster.
7
+
8
+ ## Model Resources
9
+
10
+ - **Repository:** https://github.com/openevabyte/evabyte
11
+ - **Blog:** https://hkunlp.github.io/blog/2025/evabyte and https://sambanova.ai/blog/evabyte-efficient-byte-level-language-models-at-scale
12
+ - **Paper:** Coming soon
13
+
14
+ ## Model Details
15
+
16
+ EvaByte is trained using the performant SambaNova SN30 RDU system with a batch size of 8M bytes and 32K context length. The training process consists of 3 phases: after pre-training on 1.2T bytes (yielding **EvaByte-Phase1**), two independent annealing runs (100B and 200B bytes respectively) are conducted with learning rate linearly decayed from 1e-4 to 0. The resulting checkpoints are merged via model soup (**EvaByte**), which then undergoes supervised fine-tuning (**EvaByte-SFT**).
17
+
18
+ | Stage | Model |
19
+ |:----- |:-----|
20
+ | Base (before annealing) | [EvaByte-Phase1](https://huggingface.co/evabyte/EvaByte-Phase1) |
21
+ | Base | [EvaByte](https://huggingface.co/evabyte/EvaByte) <-- you are here |
22
+ | SFT | [EvaByte-SFT](https://huggingface.co/evabyte/EvaByte-SFT) |
23
+
24
+
25
+ ## Usage
26
+
27
+ **Note:** Make sure to set `trust_remote_code=True` when loading the model (or tokenizer), as our implementation includes custom code.
28
+
29
+ The code snippet below demonstrates EvaByte-6.5B for completion:
30
+
31
+ ```python
32
+ from transformers import AutoTokenizer, AutoModelForCausalLM
33
+ import torch
34
+
35
+ # Load model and tokenizer
36
+ tokenizer = AutoTokenizer.from_pretrained("evabyte/EvaByte", trust_remote_code=True)
37
+ model = AutoModelForCausalLM.from_pretrained("evabyte/EvaByte", torch_dtype=torch.bfloat16, trust_remote_code=True).eval().to("cuda")
38
+
39
+ prompt = "The quick brown fox jumps "
40
+
41
+ # Tokenize input
42
+ # Option 1: standard HF tokenizer interface
43
+ input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
44
+
45
+ # Option 2: Direct UTF-8 byte encoding with offset
46
+ # Note: Each byte is offset by 64 with <bos> prepended.
47
+ input_ids = torch.tensor([[1] + [b + 64 for b in prompt.encode("utf-8")]]).to("cuda")
48
+
49
+ # byte-by-byte generation (default)
50
+ generation_output = model.generate(
51
+ input_ids=input_ids,
52
+ max_new_tokens=32
53
+ )
54
+ # alternatively, use faster multibyte generation
55
+ generation_output = model.multi_byte_generate(
56
+ input_ids=input_ids,
57
+ max_new_tokens=32
58
+ )
59
+
60
+ # Decode and print the output
61
+ response = tokenizer.decode(
62
+ generation_output[0][input_ids.shape[1]:],
63
+ skip_special_tokens=False,
64
+ clean_up_tokenization_spaces=False
65
+ )
66
+ print(response)
67
+ # Sample output:
68
+ # over the lazy dog.\n\nThe quick
69
+ ```
70
+
71
+ ### ⚙️ Generation Modes
72
+
73
+ EvaByte supports two generation interfaces:
74
+ - `model.generate()`: The default generation method compatible with Huggingface `transformers` library. This approach generates one byte at a time and might be slow.
75
+ - `model.multi_byte_generate()`: A faster alternative that generates multiple bytes per step and usually yields the same result as `model.generate()` under greedy decoding, with the implementation adapted from [Medusa](https://github.com/FasterDecoding/Medusa). `model.multi_byte_generate()` supports a subset of arguments in `model.generate()`:
76
+ - `input_ids`: the input byte ids.
77
+ - `temperature`: the temperature for sampling.
78
+ - `max_length`: the maximum length of the generated sequence.
79
+ - `max_new_tokens`: the maximum number of new bytes to generate.
80
+ - `stopping_criteria`: the [stopping criteria](https://huggingface.co/docs/transformers/v4.47.1/en/internal/generation_utils#transformers.StoppingCriteria) for generation.
81
+ - `top_p`: the top-p parameter for sampling.
82
+ - `do_sample`: greedy decoding or sampling.
83
+
84
+ **Notes and Limitations:**
85
+ - `device_map="auto"` is not supported for >2 GPUs.
86
+ - Only batch size of 1 (with `attention_mask=None`) is supported for decoding.
87
+ - `torch_dtype=torch.bfloat16` is required.
88
+ - The multibyte generation `model.multi_byte_generate()` might return extra bytes after the end-of-sequence sentinel, due to the nature of the multibyte decoding. Manual truncation or cleaning may be needed.
89
+
90
+ ## Bias, Risks, and Limitations
91
+ As a pretrained base model, **EvaByte** has not been fine-tuned for chat or instruction following, so users should not expect reliable performance in conversational or instruction-based tasks. Like other base models, it does not incorporate any moderation mechanisms, making it possible to generate potentially harmful or inappropriate content.
92
+
93
+ ## Evaluation
94
+
95
+ For detailed evaluation results, check out our blog post at [SambaNova](https://sambanova.ai/blog/evabyte-efficient-byte-level-language-models-at-scale) or [HKUNLP](https://hkunlp.github.io/blog/2025/evabyte).
96
+
97
+ ## Citation
98
+ ```bibtex
99
+ @misc{evabyte,
100
+ title = {EvaByte: Efficient Byte-level Language Models at Scale},
101
+ url = {https://hkunlp.github.io/blog/2025/evabyte},
102
+ author = {Lin Zheng and Xueliang Zhao and Guangtao Wang and Chen Wu and David Dong and Angela Wang and Mingran Wang and Yun Du and Haige Bo and Amol Sharma and Bo Li and Kejie Zhang and Changran Hu and Urmish Thakker and Lingpeng Kong},
103
+ year = {2025}
104
+ }
105
+ ```