Instructions to use OwlMaster/cog_vlm_2_windows with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use OwlMaster/cog_vlm_2_windows with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="OwlMaster/cog_vlm_2_windows", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("OwlMaster/cog_vlm_2_windows", trust_remote_code=True, device_map="auto")

Notebooks
Google Colab
Kaggle
Local Apps Settings

vLLM

How to use OwlMaster/cog_vlm_2_windows with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "OwlMaster/cog_vlm_2_windows"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "OwlMaster/cog_vlm_2_windows",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/OwlMaster/cog_vlm_2_windows

SGLang

How to use OwlMaster/cog_vlm_2_windows with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "OwlMaster/cog_vlm_2_windows" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "OwlMaster/cog_vlm_2_windows",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "OwlMaster/cog_vlm_2_windows" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "OwlMaster/cog_vlm_2_windows",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use OwlMaster/cog_vlm_2_windows with Docker Model Runner:
```
docker model run hf.co/OwlMaster/cog_vlm_2_windows
```

OwlMaster commited on Jul 27, 2024

Commit

c19926b

verified ·

1 Parent(s): df106cf

Update util.py

Browse files

Files changed (1) hide show

util.py +631 -565

util.py CHANGED Viewed

@@ -1,565 +1,631 @@
-from typing import Optional, Tuple, Union
-import torch
-from einops import rearrange, repeat
-import torch.nn.functional as F
-#import triton
-#import triton.language as tl
-# @triton.autotune(
-#     configs=[
-#         triton.Config({"BLOCK_M": 2}),
-#         triton.Config({"BLOCK_M": 4}),
-#         triton.Config({"BLOCK_M": 8}),
-#         triton.Config({"BLOCK_M": 16}),
-#     ],
-#     key=["CACHE_KEY_SEQLEN", "BLOCK_K", "INTERLEAVED"],
-# )
-#@triton.jit
-# def rotary_kernel(
-#         OUT,  # Pointers to matrices
-#         X,
-#         COS,
-#         SIN,
-#         CU_SEQLENS,
-#         SEQLEN_OFFSETS,  # this could be int or a pointer
-#         # Matrix dimensions
-#         seqlen,
-#         nheads,
-#         rotary_dim,
-#         seqlen_ro,
-#         CACHE_KEY_SEQLEN,
-#         # strides
-#         stride_out_batch,
-#         stride_out_nheads,
-#         stride_out_seqlen,
-#         stride_out_headdim,
-#         stride_x_batch,
-#         stride_x_nheads,
-#         stride_x_seqlen,
-#         stride_x_headdim,
-#         # Meta-parameters
-#         BLOCK_K: tl.constexpr,
-#         IS_SEQLEN_OFFSETS_TENSOR: tl.constexpr,
-#         IS_VARLEN: tl.constexpr,
-#         INTERLEAVED: tl.constexpr,
-#         CONJUGATE: tl.constexpr,
-#         BLOCK_M: tl.constexpr,
-# ):
-#     pid_m = tl.program_id(axis=0)
-#     pid_batch = tl.program_id(axis=1)
-#     pid_head = tl.program_id(axis=2)
-#     rotary_dim_half = rotary_dim // 2
-#     if not IS_VARLEN:
-#         X = X + pid_batch * stride_x_batch + pid_head * stride_x_nheads
-#         OUT = OUT + pid_batch * stride_out_batch + pid_head * stride_out_nheads
-#         COS = COS + pid_batch * seqlen_ro * rotary_dim_half
-#         SIN = SIN + pid_batch * seqlen_ro * rotary_dim_half
-#     else:
-#         start_idx = tl.load(CU_SEQLENS + pid_batch)
-#         seqlen = tl.load(CU_SEQLENS + pid_batch + 1) - start_idx
-#         X = X + start_idx * stride_x_seqlen + pid_head * stride_x_nheads
-#         OUT = OUT + start_idx * stride_out_seqlen + pid_head * stride_out_nheads
-#     if pid_m * BLOCK_M >= seqlen:
-#         return
-#     rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
-#     if not IS_SEQLEN_OFFSETS_TENSOR:
-#         rm_cs = rm + SEQLEN_OFFSETS
-#     else:
-#         rm_cs = rm + tl.load(SEQLEN_OFFSETS + pid_batch)
-#     rk = tl.arange(0, BLOCK_K)
-#     rk_half = tl.arange(0, BLOCK_K // 2)
-#     if not INTERLEAVED:
-#         # Load the 1st and 2nd halves of X, do calculation, then store to 1st and 2nd halves of OUT
-#         X = X + (rm[:, None] * stride_x_seqlen + rk_half[None, :] * stride_x_headdim)
-#         COS = COS + (rm_cs[:, None] * rotary_dim_half + rk_half[None, :])
-#         SIN = SIN + (rm_cs[:, None] * rotary_dim_half + rk_half[None, :])
-#         cos = tl.load(
-#             COS, mask=(rm_cs[:, None] < seqlen_ro) & (rk_half[None, :] < rotary_dim_half), other=1.0
-#         )
-#         sin = tl.load(
-#             SIN, mask=(rm_cs[:, None] < seqlen_ro) & (rk_half[None, :] < rotary_dim_half), other=0.0
-#         )
-#         x0 = tl.load(
-#             X, mask=(rm[:, None] < seqlen) & (rk_half[None, :] < rotary_dim_half), other=0.0
-#         )
-#         x1 = tl.load(
-#             X + rotary_dim_half * stride_x_headdim,
-#             mask=(rm[:, None] < seqlen) & (rk_half[None, :] < rotary_dim_half),
-#             other=0.0,
-#         )
-#         if CONJUGATE:
-#             sin = -sin
-#         o0 = x0 * cos - x1 * sin
-#         o1 = x0 * sin + x1 * cos
-#         # write back result
-#         OUT = OUT + (rm[:, None] * stride_out_seqlen + rk_half[None, :] * stride_out_headdim)
-#         tl.store(OUT, o0, mask=(rm[:, None] < seqlen) & (rk_half[None, :] < rotary_dim_half))
-#         tl.store(
-#             OUT + rotary_dim_half * stride_out_headdim,
-#             o1,
-#             mask=(rm[:, None] < seqlen) & (rk_half[None, :] < rotary_dim_half),
-#         )
-#     else:
-#         # We don't want to load X[0, 2, 4, ...] and X[1, 3, 5, ...] separately since both are slow.
-#         # Instead, we load x0 = X[0, 1, 2, 3, ...] and x1 = X[1, 0, 3, 2, ...].
-#         # Loading x0 will be fast but x1 will be slow.
-#         # Then we load cos = COS[0, 0, 1, 1, ...] and sin = SIN[0, 0, 1, 1, ...].
-#         # Then we do the calculation and use tl.where to pick put the right outputs for the even
-#         # and for the odd indices.
-#         rk_swap = rk + ((rk + 1) % 2) * 2 - 1  # 1, 0, 3, 2, 5, 4, ...
-#         rk_repeat = tl.arange(0, BLOCK_K) // 2
-#         X0 = X + (rm[:, None] * stride_x_seqlen + rk[None, :] * stride_x_headdim)
-#         X1 = X + (rm[:, None] * stride_x_seqlen + rk_swap[None, :] * stride_x_headdim)
-#         COS = COS + (rm_cs[:, None] * rotary_dim_half + rk_repeat[None, :])
-#         SIN = SIN + (rm_cs[:, None] * rotary_dim_half + rk_repeat[None, :])
-#         cos = tl.load(
-#             COS,
-#             mask=(rm_cs[:, None] < seqlen_ro) & (rk_repeat[None, :] < rotary_dim_half),
-#             other=1.0,
-#         ).to(tl.float32)
-#         sin = tl.load(
-#             SIN,
-#             mask=(rm_cs[:, None] < seqlen_ro) & (rk_repeat[None, :] < rotary_dim_half),
-#             other=0.0,
-#         ).to(tl.float32)
-#         x0 = tl.load(X0, mask=(rm[:, None] < seqlen) & (rk[None, :] < rotary_dim), other=0.0).to(
-#             tl.float32
-#         )
-#         x1 = tl.load(
-#             X1, mask=(rm[:, None] < seqlen) & (rk_swap[None, :] < rotary_dim), other=0.0
-#         ).to(tl.float32)
-#         if CONJUGATE:
-#             sin = -sin
-#         x0_cos = x0 * cos
-#         x1_sin = x1 * sin
-#         out = tl.where(rk[None, :] % 2 == 0, x0_cos - x1_sin, x0_cos + x1_sin)
-#         OUT = OUT + (rm[:, None] * stride_out_seqlen + rk[None, :] * stride_out_headdim)
-#         tl.store(OUT, out, mask=(rm[:, None] < seqlen) & (rk[None, :] < rotary_dim))
-# def apply_rotary(
-#         x: torch.Tensor,
-#         cos: torch.Tensor,
-#         sin: torch.Tensor,
-#         seqlen_offsets: Union[int, torch.Tensor] = 0,
-#         cu_seqlens: Optional[torch.Tensor] = None,
-#         max_seqlen: Optional[int] = None,
-#         interleaved=False,
-#         inplace=False,
-#         conjugate=False,
-# ) -> torch.Tensor:
-#     """
-#     Arguments:
-#         x: (batch, seqlen, nheads, headdim) if cu_seqlens is None
-#             else (total_seqlen, nheads, headdim).
-#         cos: (seqlen_ro, rotary_dim / 2)
-#         sin: (seqlen_ro, rotary_dim / 2)
-#         seqlen_offsets: integer or integer tensor of size (batch,)
-#         cu_seqlens: (batch + 1,) or None
-#         max_seqlen: int
-#     Returns:
-#         y: (batch, seqlen, nheads, headdim)
-#     """
-#     batch, nheads, seqlen, headdim = x.shape
-#     batch_ro, seqlen_ro, rotary_dim = cos.shape
-#     assert batch == batch_ro
-#     assert sin.shape == cos.shape
-#     rotary_dim *= 2
-#     assert rotary_dim <= headdim, "rotary_dim must be <= headdim"
-#     assert headdim <= 256, "Only support headdim <= 256"
-#     assert seqlen_ro >= seqlen, "seqlen_ro must be >= seqlen"
-#     assert (
-#             cos.dtype == sin.dtype
-#     ), f"cos and sin must have the same dtype, got {cos.dtype} and {sin.dtype}"
-#     assert (
-#             x.dtype == cos.dtype
-#     ), f"Input and cos/sin must have the same dtype, got {x.dtype} and {cos.dtype}"
-#     cos, sin = cos.contiguous(), sin.contiguous()
-#     if isinstance(seqlen_offsets, torch.Tensor):
-#         assert seqlen_offsets.shape == (batch,)
-#         assert seqlen_offsets.dtype in [torch.int32, torch.int64]
-#         seqlen_offsets = seqlen_offsets.contiguous()
-#     else:
-#         assert seqlen_offsets + seqlen <= seqlen_ro
-#     output = torch.empty_like(x) if not inplace else x
-#     if rotary_dim < headdim and not inplace:
-#         output[..., rotary_dim:].copy_(x[..., rotary_dim:])
-#     BLOCK_K = (
-#         32
-#         if rotary_dim <= 32
-#         else (64 if rotary_dim <= 64 else (128 if rotary_dim <= 128 else 256))
-#     )
-#     grid = lambda META: (triton.cdiv(seqlen, META["BLOCK_M"]), batch, nheads)  # noqa
-#     BLOCK_M = 4 if interleaved else (8 if rotary_dim <= 64 else 4)
-#     # Need this, otherwise Triton tries to launch from cuda:0 and we get
-#     # ValueError: Pointer argument (at 0) cannot be accessed from Triton (cpu tensor?)
-#     with torch.cuda.device(x.device.index):
-#         rotary_kernel[grid](
-#             output,  # data ptrs
-#             x,
-#             cos,
-#             sin,
-#             cu_seqlens,
-#             seqlen_offsets,
-#             seqlen,  # shapes
-#             nheads,
-#             rotary_dim,
-#             seqlen_ro,
-#             seqlen // 128,  # key for triton cache (limit number of compilations)
-#             output.stride(0),  # batch_strides
-#             output.stride(-3),  # nheads_stride
-#             output.stride(-2),  # seqlen_stride
-#             output.stride(-1),  # headdim_stride
-#             x.stride(0),  # batch_strides
-#             x.stride(-3),  # nheads stride
-#             x.stride(-2),  # seqlen stride
-#             x.stride(-1),  # headdim stride
-#             BLOCK_K,
-#             isinstance(seqlen_offsets, torch.Tensor),
-#             False,
-#             interleaved,
-#             conjugate,
-#             BLOCK_M,
-#         )
-#     return output
-def apply_rotary(
-        x: torch.Tensor,
-        cos: torch.Tensor,
-        sin: torch.Tensor,
-        seqlen_offsets: Union[int, torch.Tensor] = 0,
-        cu_seqlens: Optional[torch.Tensor] = None,
-        max_seqlen: Optional[int] = None,
-        interleaved=False,
-        inplace=False,
-        conjugate=False,
-) -> torch.Tensor:
-    """
-    Arguments:
-        x: (batch, seqlen, nheads, headdim) if cu_seqlens is None
-            else (total_seqlen, nheads, headdim).
-        cos: (seqlen_ro, rotary_dim / 2)
-        sin: (seqlen_ro, rotary_dim / 2)
-        seqlen_offsets: integer or integer tensor of size (batch,)
-        cu_seqlens: (batch + 1,) or None
-        max_seqlen: int
-    Returns:
-        y: (batch, seqlen, nheads, headdim)
-    """
-    batch, nheads, seqlen, headdim = x.shape
-    batch_ro, seqlen_ro, rotary_dim = cos.shape
-    assert batch == batch_ro
-    assert sin.shape == cos.shape
-    rotary_dim *= 2
-    assert rotary_dim <= headdim, "rotary_dim must be <= headdim"
-    assert headdim <= 256, "Only support headdim <= 256"
-    assert seqlen_ro >= seqlen, "seqlen_ro must be >= seqlen"
-    assert (
-            cos.dtype == sin.dtype
-    ), f"cos and sin must have the same dtype, got {cos.dtype} and {sin.dtype}"
-    assert (
-            x.dtype == cos.dtype
-    ), f"Input and cos/sin must have the same dtype, got {x.dtype} and {cos.dtype}"
-    cos, sin = cos.contiguous(), sin.contiguous()
-    if isinstance(seqlen_offsets, torch.Tensor):
-        assert seqlen_offsets.shape == (batch,)
-        assert seqlen_offsets.dtype in [torch.int32, torch.int64]
-        seqlen_offsets = seqlen_offsets.contiguous()
-    else:
-        assert seqlen_offsets + seqlen <= seqlen_ro
-    output = torch.empty_like(x) if not inplace else x
-    if rotary_dim < headdim and not inplace:
-        output[..., rotary_dim:].copy_(x[..., rotary_dim:])
-    rotary_dim_half = rotary_dim // 2
-    for b in range(batch):
-        for h in range(nheads):
-            for s in range(seqlen):
-                idx = s + seqlen_offsets if isinstance(seqlen_offsets, int) else s + seqlen_offsets[b]
-                if idx >= seqlen_ro:
-                    continue
-                cos_idx = cos[b, idx, :rotary_dim_half]
-                sin_idx = sin[b, idx, :rotary_dim_half]
-                if conjugate:
-                    sin_idx = -sin_idx
-                if not interleaved:
-                    x0 = x[b, h, s, :rotary_dim_half]
-                    x1 = x[b, h, s, rotary_dim_half:rotary_dim]
-                    o0 = x0 * cos_idx - x1 * sin_idx
-                    o1 = x0 * sin_idx + x1 * cos_idx
-                    output[b, h, s, :rotary_dim_half] = o0
-                    output[b, h, s, rotary_dim_half:rotary_dim] = o1
-                else:
-                    for i in range(rotary_dim):
-                        if i % 2 == 0:
-                            output[b, h, s, i] = x[b, h, s, i] * cos_idx[i // 2] - x[b, h, s, i + 1] * sin_idx[i // 2]
-                        else:
-                            output[b, h, s, i] = x[b, h, s, i - 1] * sin_idx[i // 2] + x[b, h, s, i] * cos_idx[i // 2]
-    return output
-class ApplyRotaryEmb(torch.autograd.Function):
-    @staticmethod
-    def forward(
-            ctx,
-            x,
-            cos,
-            sin,
-            interleaved=False,
-            inplace=False,
-            seqlen_offsets: Union[int, torch.Tensor] = 0,
-            cu_seqlens: Optional[torch.Tensor] = None,
-            max_seqlen: Optional[int] = None,
-    ):
-        out = apply_rotary(
-            x,
-            cos,
-            sin,
-            seqlen_offsets=seqlen_offsets,
-            cu_seqlens=cu_seqlens,
-            interleaved=interleaved,
-            inplace=inplace,
-        )
-        if isinstance(seqlen_offsets, int):
-            ctx.save_for_backward(cos, sin, cu_seqlens)  # Can't save int with save_for_backward
-            ctx.seqlen_offsets = seqlen_offsets
-        else:
-            ctx.save_for_backward(cos, sin, cu_seqlens, seqlen_offsets)
-            ctx.seqlen_offsets = None
-        ctx.interleaved = interleaved
-        ctx.inplace = inplace
-        ctx.max_seqlen = max_seqlen
-        return out if not inplace else x
-    @staticmethod
-    def backward(ctx, do):
-        seqlen_offsets = ctx.seqlen_offsets
-        if seqlen_offsets is None:
-            cos, sin, cu_seqlens, seqlen_offsets = ctx.saved_tensors
-        else:
-            cos, sin, cu_seqlens = ctx.saved_tensors
-        # TD [2023-09-02]: For some reason Triton (2.0.0.post1) errors with
-        # "[CUDA]: invalid device context", and cloning makes it work. Idk why. Triton 2.1.0 works.
-        if not ctx.interleaved and not ctx.inplace:
-            do = do.clone()
-        dx = apply_rotary(
-            do,
-            cos,
-            sin,
-            seqlen_offsets=seqlen_offsets,
-            cu_seqlens=cu_seqlens,
-            max_seqlen=ctx.max_seqlen,
-            interleaved=ctx.interleaved,
-            inplace=ctx.inplace,
-            conjugate=True,
-        )
-        return dx, None, None, None, None, None, None, None
-def apply_rotary_emb(
-        x,
-        cos,
-        sin,
-        interleaved=False,
-        inplace=False,
-        seqlen_offsets: Union[int, torch.Tensor] = 0,
-        cu_seqlens: Optional[torch.Tensor] = None,
-        max_seqlen: Optional[int] = None,
-):
-    """
-    Arguments:
-        x: (batch_size, seqlen, nheads, headdim) if cu_seqlens is None
-            else (total_seqlen, nheads, headdim)
-        cos, sin: (seqlen_rotary, rotary_dim / 2)
-        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
-            of 1st half and 2nd half (GPT-NeoX style).
-        inplace: if True, apply rotary embedding in-place.
-        seqlen_offsets: (batch_size,) or int. Each sequence in x is shifted by this amount.
-            Most commonly used in inference when we have KV cache.
-        cu_seqlens: (batch + 1,) or None
-        max_seqlen: int
-    Return:
-        out: (batch_size, seqlen, nheads, headdim) if cu_seqlens is None
-            else (total_seqlen, nheads, headdim)
-    rotary_dim must be <= headdim
-    Apply rotary embedding to the first rotary_dim of x.
-    """
-    return ApplyRotaryEmb.apply(
-        x, cos, sin, interleaved, inplace, seqlen_offsets, cu_seqlens, max_seqlen
-    )
-# For backward compatibility
-apply_rotary_emb_func = apply_rotary_emb
-class FastRotaryEmbedding(torch.nn.Module):
-    """
-    The rotary position embeddings from RoFormer_ (Su et. al).
-    A crucial insight from the method is that the query and keys are
-    transformed by rotation matrices which depend on the relative positions.
-    Other implementations are available in the Rotary Transformer repo_ and in
-    GPT-NeoX_, GPT-NeoX was an inspiration
-    .. _RoFormer: https://arxiv.org/abs/2104.09864
-    .. _repo: https://github.com/ZhuiyiTechnology/roformer
-    .. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox
-    If scale_base is not None, this implements XPos (Sun et al., https://arxiv.org/abs/2212.10554).
-    A recommended value for scale_base is 512: https://github.com/HazyResearch/flash-attention/issues/96
-    Reference: https://github.com/sunyt32/torchscale/blob/main/torchscale/component/xpos_relative_position.py
-    """
-    def __init__(
-            self,
-            dim: int,
-            base=10000,
-            interleaved=False,
-            scale_base=None,
-            pos_idx_in_fp32=True,
-            device=None,
-    ):
-        """
-        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
-            of 1st half and 2nd half (GPT-NeoX style).
-        pos_idx_in_fp32: if True, the position indices [0.0, ..., seqlen - 1] are in fp32,
-            otherwise they might be in lower precision.
-            This option was added because previously (before 2023-07-02), when we construct
-            the position indices, we use the dtype of self.inv_freq. In most cases this would
-            be fp32, but if the model is trained in pure bf16 (not mixed precision), then
-            self.inv_freq would be bf16, and the position indices are also in bf16.
-            Because of the limited precision of bf16 (e.g. 1995.0 is rounded to 2000.0), the
-            embeddings for some positions will coincide.
-            To maintain compatibility with models previously trained in pure bf16,
-            we add this option.
-        """
-        super().__init__()
-        self.dim = dim
-        self.base = base
-        self.pos_idx_in_fp32 = pos_idx_in_fp32
-        # Generate and save the inverse frequency buffer (non trainable)
-        inv_freq = self._compute_inv_freq(device)
-        self.register_buffer("inv_freq", inv_freq)
-        self.interleaved = interleaved
-        self.scale_base = scale_base
-        scale = (
-            (torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim)
-            if scale_base is not None
-            else None
-        )
-        self.register_buffer("scale", scale, persistent=False)
-        self._seq_len_cached = 0
-        self._cos_cached = None
-        self._sin_cached = None
-        self._cos_k_cached = None
-        self._sin_k_cached = None
-        self.cos = None
-        self.sin = None
-    def _compute_inv_freq(self, device=None):
-        return 1.0 / (
-                self.base
-                ** (torch.arange(0, self.dim, 2, device=device) / self.dim)
-                # ** (torch.arange(0, self.dim, 2, device=device).float() / self.dim)
-        )
-    def _update_cos_sin_cache(self, seqlen, position_id, device=None, dtype=None):
-        if (
-                seqlen > self._seq_len_cached
-        ):
-            self._seq_len_cached = seqlen
-            # We want fp32 here, not self.inv_freq.dtype, since the model could be loaded in bf16
-            # And the output of arange can be quite large, so bf16 would lose a lot of precision.
-            # However, for compatibility reason, we add an option to use the dtype of self.inv_freq.
-            if self.pos_idx_in_fp32:
-                t = torch.arange(seqlen, device=device, dtype=torch.float32)
-                # We want fp32 here as well since inv_freq will be multiplied with t, and the output
-                # will be large. Having it in bf16 will lose a lot of precision and cause the
-                # cos & sin output to change significantly.
-                # We want to recompute self.inv_freq if it was not loaded in fp32
-                if self.inv_freq.dtype != torch.float32:
-                    inv_freq = self._compute_inv_freq(device=device)
-                else:
-                    inv_freq = self.inv_freq
-            else:
-                t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
-                inv_freq = self.inv_freq
-            freqs = torch.einsum("i,j->ij", t, inv_freq)
-            if self.scale is None:
-                self._cos_cached = torch.cos(freqs).to(dtype)
-                self._sin_cached = torch.sin(freqs).to(dtype)
-            else:
-                power = (
-                                torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device)
-                                - seqlen // 2
-                        ) / self.scale_base
-                scale = self.scale.to(device=power.device) ** rearrange(power, "s -> s 1")
-                # We want the multiplication by scale to happen in fp32
-                self._cos_cached = (torch.cos(freqs) * scale).to(dtype)
-                self._sin_cached = (torch.sin(freqs) * scale).to(dtype)
-                self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype)
-                self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype)
-    def forward(
-            self,
-            q: torch.Tensor,
-            k: torch.Tensor,
-            position_ids: torch.Tensor,
-            max_seqlen,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        q: (batch, nheads, seqlen, headdim)
-        k: (batch, nheads, seqlen, headdim)
-        position_id: (batch, seqlen)
-        max_seqlen: int
-        layer_id: int
-            only if layer_id == 0, then update cons and sin
-        Apply rotary embedding *inplace* to q k.
-        """
-        self._update_cos_sin_cache(max_seqlen, position_ids, device=q.device, dtype=q.dtype)
-        cos, sin = F.embedding(position_ids, self._cos_cached), F.embedding(position_ids, self._sin_cached)
-        q = apply_rotary_emb_func(
-            q,
-            cos,
-            sin,
-            interleaved=self.interleaved,
-            inplace=True
-        )
-        k = apply_rotary_emb_func(
-            k,
-            cos,
-            sin,
-            interleaved=self.interleaved,
-            inplace=True
-        )
-        return q, k

+from typing import Optional, Tuple, Union
+import torch
+from einops import rearrange, repeat
+import torch.nn.functional as F
+#import triton
+#import triton.language as tl
+# @triton.autotune(
+#     configs=[
+#         triton.Config({"BLOCK_M": 2}),
+#         triton.Config({"BLOCK_M": 4}),
+#         triton.Config({"BLOCK_M": 8}),
+#         triton.Config({"BLOCK_M": 16}),
+#     ],
+#     key=["CACHE_KEY_SEQLEN", "BLOCK_K", "INTERLEAVED"],
+# )
+#@triton.jit
+# def rotary_kernel(
+#         OUT,  # Pointers to matrices
+#         X,
+#         COS,
+#         SIN,
+#         CU_SEQLENS,
+#         SEQLEN_OFFSETS,  # this could be int or a pointer
+#         # Matrix dimensions
+#         seqlen,
+#         nheads,
+#         rotary_dim,
+#         seqlen_ro,
+#         CACHE_KEY_SEQLEN,
+#         # strides
+#         stride_out_batch,
+#         stride_out_nheads,
+#         stride_out_seqlen,
+#         stride_out_headdim,
+#         stride_x_batch,
+#         stride_x_nheads,
+#         stride_x_seqlen,
+#         stride_x_headdim,
+#         # Meta-parameters
+#         BLOCK_K: tl.constexpr,
+#         IS_SEQLEN_OFFSETS_TENSOR: tl.constexpr,
+#         IS_VARLEN: tl.constexpr,
+#         INTERLEAVED: tl.constexpr,
+#         CONJUGATE: tl.constexpr,
+#         BLOCK_M: tl.constexpr,
+# ):
+#     pid_m = tl.program_id(axis=0)
+#     pid_batch = tl.program_id(axis=1)
+#     pid_head = tl.program_id(axis=2)
+#     rotary_dim_half = rotary_dim // 2
+#     if not IS_VARLEN:
+#         X = X + pid_batch * stride_x_batch + pid_head * stride_x_nheads
+#         OUT = OUT + pid_batch * stride_out_batch + pid_head * stride_out_nheads
+#         COS = COS + pid_batch * seqlen_ro * rotary_dim_half
+#         SIN = SIN + pid_batch * seqlen_ro * rotary_dim_half
+#     else:
+#         start_idx = tl.load(CU_SEQLENS + pid_batch)
+#         seqlen = tl.load(CU_SEQLENS + pid_batch + 1) - start_idx
+#         X = X + start_idx * stride_x_seqlen + pid_head * stride_x_nheads
+#         OUT = OUT + start_idx * stride_out_seqlen + pid_head * stride_out_nheads
+#     if pid_m * BLOCK_M >= seqlen:
+#         return
+#     rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+#     if not IS_SEQLEN_OFFSETS_TENSOR:
+#         rm_cs = rm + SEQLEN_OFFSETS
+#     else:
+#         rm_cs = rm + tl.load(SEQLEN_OFFSETS + pid_batch)
+#     rk = tl.arange(0, BLOCK_K)
+#     rk_half = tl.arange(0, BLOCK_K // 2)
+#     if not INTERLEAVED:
+#         # Load the 1st and 2nd halves of X, do calculation, then store to 1st and 2nd halves of OUT
+#         X = X + (rm[:, None] * stride_x_seqlen + rk_half[None, :] * stride_x_headdim)
+#         COS = COS + (rm_cs[:, None] * rotary_dim_half + rk_half[None, :])
+#         SIN = SIN + (rm_cs[:, None] * rotary_dim_half + rk_half[None, :])
+#         cos = tl.load(
+#             COS, mask=(rm_cs[:, None] < seqlen_ro) & (rk_half[None, :] < rotary_dim_half), other=1.0
+#         )
+#         sin = tl.load(
+#             SIN, mask=(rm_cs[:, None] < seqlen_ro) & (rk_half[None, :] < rotary_dim_half), other=0.0
+#         )
+#         x0 = tl.load(
+#             X, mask=(rm[:, None] < seqlen) & (rk_half[None, :] < rotary_dim_half), other=0.0
+#         )
+#         x1 = tl.load(
+#             X + rotary_dim_half * stride_x_headdim,
+#             mask=(rm[:, None] < seqlen) & (rk_half[None, :] < rotary_dim_half),
+#             other=0.0,
+#         )
+#         if CONJUGATE:
+#             sin = -sin
+#         o0 = x0 * cos - x1 * sin
+#         o1 = x0 * sin + x1 * cos
+#         # write back result
+#         OUT = OUT + (rm[:, None] * stride_out_seqlen + rk_half[None, :] * stride_out_headdim)
+#         tl.store(OUT, o0, mask=(rm[:, None] < seqlen) & (rk_half[None, :] < rotary_dim_half))
+#         tl.store(
+#             OUT + rotary_dim_half * stride_out_headdim,
+#             o1,
+#             mask=(rm[:, None] < seqlen) & (rk_half[None, :] < rotary_dim_half),
+#         )
+#     else:
+#         # We don't want to load X[0, 2, 4, ...] and X[1, 3, 5, ...] separately since both are slow.
+#         # Instead, we load x0 = X[0, 1, 2, 3, ...] and x1 = X[1, 0, 3, 2, ...].
+#         # Loading x0 will be fast but x1 will be slow.
+#         # Then we load cos = COS[0, 0, 1, 1, ...] and sin = SIN[0, 0, 1, 1, ...].
+#         # Then we do the calculation and use tl.where to pick put the right outputs for the even
+#         # and for the odd indices.
+#         rk_swap = rk + ((rk + 1) % 2) * 2 - 1  # 1, 0, 3, 2, 5, 4, ...
+#         rk_repeat = tl.arange(0, BLOCK_K) // 2
+#         X0 = X + (rm[:, None] * stride_x_seqlen + rk[None, :] * stride_x_headdim)
+#         X1 = X + (rm[:, None] * stride_x_seqlen + rk_swap[None, :] * stride_x_headdim)
+#         COS = COS + (rm_cs[:, None] * rotary_dim_half + rk_repeat[None, :])
+#         SIN = SIN + (rm_cs[:, None] * rotary_dim_half + rk_repeat[None, :])
+#         cos = tl.load(
+#             COS,
+#             mask=(rm_cs[:, None] < seqlen_ro) & (rk_repeat[None, :] < rotary_dim_half),
+#             other=1.0,
+#         ).to(tl.float32)
+#         sin = tl.load(
+#             SIN,
+#             mask=(rm_cs[:, None] < seqlen_ro) & (rk_repeat[None, :] < rotary_dim_half),
+#             other=0.0,
+#         ).to(tl.float32)
+#         x0 = tl.load(X0, mask=(rm[:, None] < seqlen) & (rk[None, :] < rotary_dim), other=0.0).to(
+#             tl.float32
+#         )
+#         x1 = tl.load(
+#             X1, mask=(rm[:, None] < seqlen) & (rk_swap[None, :] < rotary_dim), other=0.0
+#         ).to(tl.float32)
+#         if CONJUGATE:
+#             sin = -sin
+#         x0_cos = x0 * cos
+#         x1_sin = x1 * sin
+#         out = tl.where(rk[None, :] % 2 == 0, x0_cos - x1_sin, x0_cos + x1_sin)
+#         OUT = OUT + (rm[:, None] * stride_out_seqlen + rk[None, :] * stride_out_headdim)
+#         tl.store(OUT, out, mask=(rm[:, None] < seqlen) & (rk[None, :] < rotary_dim))
+# def apply_rotary(
+#         x: torch.Tensor,
+#         cos: torch.Tensor,
+#         sin: torch.Tensor,
+#         seqlen_offsets: Union[int, torch.Tensor] = 0,
+#         cu_seqlens: Optional[torch.Tensor] = None,
+#         max_seqlen: Optional[int] = None,
+#         interleaved=False,
+#         inplace=False,
+#         conjugate=False,
+# ) -> torch.Tensor:
+#     """
+#     Arguments:
+#         x: (batch, seqlen, nheads, headdim) if cu_seqlens is None
+#             else (total_seqlen, nheads, headdim).
+#         cos: (seqlen_ro, rotary_dim / 2)
+#         sin: (seqlen_ro, rotary_dim / 2)
+#         seqlen_offsets: integer or integer tensor of size (batch,)
+#         cu_seqlens: (batch + 1,) or None
+#         max_seqlen: int
+#     Returns:
+#         y: (batch, seqlen, nheads, headdim)
+#     """
+#     batch, nheads, seqlen, headdim = x.shape
+#     batch_ro, seqlen_ro, rotary_dim = cos.shape
+#     assert batch == batch_ro
+#     assert sin.shape == cos.shape
+#     rotary_dim *= 2
+#     assert rotary_dim <= headdim, "rotary_dim must be <= headdim"
+#     assert headdim <= 256, "Only support headdim <= 256"
+#     assert seqlen_ro >= seqlen, "seqlen_ro must be >= seqlen"
+#     assert (
+#             cos.dtype == sin.dtype
+#     ), f"cos and sin must have the same dtype, got {cos.dtype} and {sin.dtype}"
+#     assert (
+#             x.dtype == cos.dtype
+#     ), f"Input and cos/sin must have the same dtype, got {x.dtype} and {cos.dtype}"
+#     cos, sin = cos.contiguous(), sin.contiguous()
+#     if isinstance(seqlen_offsets, torch.Tensor):
+#         assert seqlen_offsets.shape == (batch,)
+#         assert seqlen_offsets.dtype in [torch.int32, torch.int64]
+#         seqlen_offsets = seqlen_offsets.contiguous()
+#     else:
+#         assert seqlen_offsets + seqlen <= seqlen_ro
+#     output = torch.empty_like(x) if not inplace else x
+#     if rotary_dim < headdim and not inplace:
+#         output[..., rotary_dim:].copy_(x[..., rotary_dim:])
+#     BLOCK_K = (
+#         32
+#         if rotary_dim <= 32
+#         else (64 if rotary_dim <= 64 else (128 if rotary_dim <= 128 else 256))
+#     )
+#     grid = lambda META: (triton.cdiv(seqlen, META["BLOCK_M"]), batch, nheads)  # noqa
+#     BLOCK_M = 4 if interleaved else (8 if rotary_dim <= 64 else 4)
+#     # Need this, otherwise Triton tries to launch from cuda:0 and we get
+#     # ValueError: Pointer argument (at 0) cannot be accessed from Triton (cpu tensor?)
+#     with torch.cuda.device(x.device.index):
+#         rotary_kernel[grid](
+#             output,  # data ptrs
+#             x,
+#             cos,
+#             sin,
+#             cu_seqlens,
+#             seqlen_offsets,
+#             seqlen,  # shapes
+#             nheads,
+#             rotary_dim,
+#             seqlen_ro,
+#             seqlen // 128,  # key for triton cache (limit number of compilations)
+#             output.stride(0),  # batch_strides
+#             output.stride(-3),  # nheads_stride
+#             output.stride(-2),  # seqlen_stride
+#             output.stride(-1),  # headdim_stride
+#             x.stride(0),  # batch_strides
+#             x.stride(-3),  # nheads stride
+#             x.stride(-2),  # seqlen stride
+#             x.stride(-1),  # headdim stride
+#             BLOCK_K,
+#             isinstance(seqlen_offsets, torch.Tensor),
+#             False,
+#             interleaved,
+#             conjugate,
+#             BLOCK_M,
+#         )
+#     return output
+def apply_rotary(
+        x: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+        seqlen_offsets: Union[int, torch.Tensor] = 0,
+        cu_seqlens: Optional[torch.Tensor] = None,
+        max_seqlen: Optional[int] = None,
+        interleaved=False,
+        inplace=False,
+        conjugate=False,
+) -> torch.Tensor:
+    """
+    Arguments:
+        x: (batch, seqlen, nheads, headdim) if cu_seqlens is None
+            else (total_seqlen, nheads, headdim).
+        cos: (seqlen_ro, rotary_dim / 2)
+        sin: (seqlen_ro, rotary_dim / 2)
+        seqlen_offsets: integer or integer tensor of size (batch,)
+        cu_seqlens: (batch + 1,) or None
+        max_seqlen: int
+    Returns:
+        y: (batch, seqlen, nheads, headdim)
+    """
+    batch, nheads, seqlen, headdim = x.shape
+    batch_ro, seqlen_ro, rotary_dim = cos.shape
+    assert batch == batch_ro
+    assert sin.shape == cos.shape
+    rotary_dim *= 2
+    assert rotary_dim <= headdim, "rotary_dim must be <= headdim"
+    assert headdim <= 256, "Only support headdim <= 256"
+    assert seqlen_ro >= seqlen, "seqlen_ro must be >= seqlen"
+    assert (
+            cos.dtype == sin.dtype
+    ), f"cos and sin must have the same dtype, got {cos.dtype} and {sin.dtype}"
+    assert (
+            x.dtype == cos.dtype
+    ), f"Input and cos/sin must have the same dtype, got {x.dtype} and {cos.dtype}"
+    cos, sin = cos.contiguous(), sin.contiguous()
+    if isinstance(seqlen_offsets, torch.Tensor):
+        assert seqlen_offsets.shape == (batch,)
+        assert seqlen_offsets.dtype in [torch.int32, torch.int64]
+        seqlen_offsets = seqlen_offsets.contiguous()
+    else:
+        assert seqlen_offsets + seqlen <= seqlen_ro
+    output = torch.empty_like(x) if not inplace else x
+    if rotary_dim < headdim and not inplace:
+        output[..., rotary_dim:].copy_(x[..., rotary_dim:])
+    rotary_dim_half = rotary_dim // 2
+    for b in range(batch):
+        for h in range(nheads):
+            for s in range(seqlen):
+                idx = s + seqlen_offsets if isinstance(seqlen_offsets, int) else s + seqlen_offsets[b]
+                if idx >= seqlen_ro:
+                    continue
+                cos_idx = cos[b, idx, :rotary_dim_half]
+                sin_idx = sin[b, idx, :rotary_dim_half]
+                if conjugate:
+                    sin_idx = -sin_idx
+                if not interleaved:
+                    x0 = x[b, h, s, :rotary_dim_half]
+                    x1 = x[b, h, s, rotary_dim_half:rotary_dim]
+                    o0 = x0 * cos_idx - x1 * sin_idx
+                    o1 = x0 * sin_idx + x1 * cos_idx
+                    output[b, h, s, :rotary_dim_half] = o0
+                    output[b, h, s, rotary_dim_half:rotary_dim] = o1
+                else:
+                    for i in range(rotary_dim):
+                        if i % 2 == 0:
+                            output[b, h, s, i] = x[b, h, s, i] * cos_idx[i // 2] - x[b, h, s, i + 1] * sin_idx[i // 2]
+                        else:
+                            output[b, h, s, i] = x[b, h, s, i - 1] * sin_idx[i // 2] + x[b, h, s, i] * cos_idx[i // 2]
+    return output
+def apply_rotary_optimized(
+    x: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    seqlen_offsets: Union[int, torch.Tensor] = 0,
+    cu_seqlens: Optional[torch.Tensor] = None,
+    max_seqlen: Optional[int] = None,
+    interleaved=False,
+    inplace=False,
+    conjugate=False,
+) -> torch.Tensor:
+    batch, nheads, seqlen, headdim = x.shape
+    batch_ro, seqlen_ro, rotary_dim = cos.shape
+    assert batch == batch_ro
+    assert sin.shape == cos.shape
+    rotary_dim *= 2
+    assert rotary_dim <= headdim, "rotary_dim must be <= headdim"
+    assert headdim <= 256, "Only support headdim <= 256"
+    assert seqlen_ro >= seqlen, "seqlen_ro must be >= seqlen"
+    assert cos.dtype == sin.dtype, f"cos and sin must have the same dtype, got {cos.dtype} and {sin.dtype}"
+    assert x.dtype == cos.dtype, f"Input and cos/sin must have the same dtype, got {x.dtype} and {cos.dtype}"
+    cos, sin = cos.contiguous(), sin.contiguous()
+    if isinstance(seqlen_offsets, torch.Tensor):
+        assert seqlen_offsets.shape == (batch,)
+        assert seqlen_offsets.dtype in [torch.int32, torch.int64]
+        seqlen_offsets = seqlen_offsets.contiguous()
+    else:
+        assert seqlen_offsets + seqlen <= seqlen_ro
+        seqlen_offsets = torch.full((batch,), seqlen_offsets, device=x.device, dtype=torch.long)
+    output = torch.empty_like(x) if not inplace else x
+    if rotary_dim < headdim and not inplace:
+        output[..., rotary_dim:].copy_(x[..., rotary_dim:])
+    rotary_dim_half = rotary_dim // 2
+    # Create indices for gathering
+    seq_indices = torch.arange(seqlen, device=x.device).unsqueeze(0) + seqlen_offsets.unsqueeze(1)
+    seq_indices = seq_indices.clamp(max=seqlen_ro - 1)
+    # Gather cos and sin values
+    cos_gathered = cos.gather(1, seq_indices.unsqueeze(-1).expand(-1, -1, rotary_dim_half))
+    sin_gathered = sin.gather(1, seq_indices.unsqueeze(-1).expand(-1, -1, rotary_dim_half))
+    if conjugate:
+        sin_gathered = -sin_gathered
+    if not interleaved:
+        x_rotary = x[..., :rotary_dim].view(batch, nheads, seqlen, 2, -1)
+        x0, x1 = x_rotary.unbind(dim=-2)
+        o0 = x0 * cos_gathered.unsqueeze(1) - x1 * sin_gathered.unsqueeze(1)
+        o1 = x0 * sin_gathered.unsqueeze(1) + x1 * cos_gathered.unsqueeze(1)
+        output[..., :rotary_dim] = torch.stack([o0, o1], dim=-2).view(batch, nheads, seqlen, -1)
+    else:
+        x_rotary = x[..., :rotary_dim].view(batch, nheads, seqlen, rotary_dim // 2, 2)
+        x0, x1 = x_rotary.unbind(dim=-1)
+        o0 = x0 * cos_gathered.unsqueeze(1) - x1 * sin_gathered.unsqueeze(1)
+        o1 = x0 * sin_gathered.unsqueeze(1) + x1 * cos_gathered.unsqueeze(1)
+        output[..., :rotary_dim] = torch.stack([o0, o1], dim=-1).view(batch, nheads, seqlen, -1)
+    return output
+class ApplyRotaryEmb(torch.autograd.Function):
+    @staticmethod
+    def forward(
+            ctx,
+            x,
+            cos,
+            sin,
+            interleaved=False,
+            inplace=False,
+            seqlen_offsets: Union[int, torch.Tensor] = 0,
+            cu_seqlens: Optional[torch.Tensor] = None,
+            max_seqlen: Optional[int] = None,
+    ):
+        out = apply_rotary_optimized(
+            x,
+            cos,
+            sin,
+            seqlen_offsets=seqlen_offsets,
+            cu_seqlens=cu_seqlens,
+            interleaved=interleaved,
+            inplace=inplace,
+        )
+        if isinstance(seqlen_offsets, int):
+            ctx.save_for_backward(cos, sin, cu_seqlens)  # Can't save int with save_for_backward
+            ctx.seqlen_offsets = seqlen_offsets
+        else:
+            ctx.save_for_backward(cos, sin, cu_seqlens, seqlen_offsets)
+            ctx.seqlen_offsets = None
+        ctx.interleaved = interleaved
+        ctx.inplace = inplace
+        ctx.max_seqlen = max_seqlen
+        return out if not inplace else x
+    @staticmethod
+    def backward(ctx, do):
+        seqlen_offsets = ctx.seqlen_offsets
+        if seqlen_offsets is None:
+            cos, sin, cu_seqlens, seqlen_offsets = ctx.saved_tensors
+        else:
+            cos, sin, cu_seqlens = ctx.saved_tensors
+        # TD [2023-09-02]: For some reason Triton (2.0.0.post1) errors with
+        # "[CUDA]: invalid device context", and cloning makes it work. Idk why. Triton 2.1.0 works.
+        if not ctx.interleaved and not ctx.inplace:
+            do = do.clone()
+        dx = apply_rotary(
+            do,
+            cos,
+            sin,
+            seqlen_offsets=seqlen_offsets,
+            cu_seqlens=cu_seqlens,
+            max_seqlen=ctx.max_seqlen,
+            interleaved=ctx.interleaved,
+            inplace=ctx.inplace,
+            conjugate=True,
+        )
+        return dx, None, None, None, None, None, None, None
+def apply_rotary_emb(
+        x,
+        cos,
+        sin,
+        interleaved=False,
+        inplace=False,
+        seqlen_offsets: Union[int, torch.Tensor] = 0,
+        cu_seqlens: Optional[torch.Tensor] = None,
+        max_seqlen: Optional[int] = None,
+):
+    """
+    Arguments:
+        x: (batch_size, seqlen, nheads, headdim) if cu_seqlens is None
+            else (total_seqlen, nheads, headdim)
+        cos, sin: (seqlen_rotary, rotary_dim / 2)
+        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
+            of 1st half and 2nd half (GPT-NeoX style).
+        inplace: if True, apply rotary embedding in-place.
+        seqlen_offsets: (batch_size,) or int. Each sequence in x is shifted by this amount.
+            Most commonly used in inference when we have KV cache.
+        cu_seqlens: (batch + 1,) or None
+        max_seqlen: int
+    Return:
+        out: (batch_size, seqlen, nheads, headdim) if cu_seqlens is None
+            else (total_seqlen, nheads, headdim)
+    rotary_dim must be <= headdim
+    Apply rotary embedding to the first rotary_dim of x.
+    """
+    return ApplyRotaryEmb.apply(
+        x, cos, sin, interleaved, inplace, seqlen_offsets, cu_seqlens, max_seqlen
+    )
+# For backward compatibility
+apply_rotary_emb_func = apply_rotary_emb
+class FastRotaryEmbedding(torch.nn.Module):
+    """
+    The rotary position embeddings from RoFormer_ (Su et. al).
+    A crucial insight from the method is that the query and keys are
+    transformed by rotation matrices which depend on the relative positions.
+    Other implementations are available in the Rotary Transformer repo_ and in
+    GPT-NeoX_, GPT-NeoX was an inspiration
+    .. _RoFormer: https://arxiv.org/abs/2104.09864
+    .. _repo: https://github.com/ZhuiyiTechnology/roformer
+    .. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox
+    If scale_base is not None, this implements XPos (Sun et al., https://arxiv.org/abs/2212.10554).
+    A recommended value for scale_base is 512: https://github.com/HazyResearch/flash-attention/issues/96
+    Reference: https://github.com/sunyt32/torchscale/blob/main/torchscale/component/xpos_relative_position.py
+    """
+    def __init__(
+            self,
+            dim: int,
+            base=10000,
+            interleaved=False,
+            scale_base=None,
+            pos_idx_in_fp32=True,
+            device=None,
+    ):
+        """
+        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
+            of 1st half and 2nd half (GPT-NeoX style).
+        pos_idx_in_fp32: if True, the position indices [0.0, ..., seqlen - 1] are in fp32,
+            otherwise they might be in lower precision.
+            This option was added because previously (before 2023-07-02), when we construct
+            the position indices, we use the dtype of self.inv_freq. In most cases this would
+            be fp32, but if the model is trained in pure bf16 (not mixed precision), then
+            self.inv_freq would be bf16, and the position indices are also in bf16.
+            Because of the limited precision of bf16 (e.g. 1995.0 is rounded to 2000.0), the
+            embeddings for some positions will coincide.
+            To maintain compatibility with models previously trained in pure bf16,
+            we add this option.
+        """
+        super().__init__()
+        self.dim = dim
+        self.base = base
+        self.pos_idx_in_fp32 = pos_idx_in_fp32
+        # Generate and save the inverse frequency buffer (non trainable)
+        inv_freq = self._compute_inv_freq(device)
+        self.register_buffer("inv_freq", inv_freq)
+        self.interleaved = interleaved
+        self.scale_base = scale_base
+        scale = (
+            (torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim)
+            if scale_base is not None
+            else None
+        )
+        self.register_buffer("scale", scale, persistent=False)
+        self._seq_len_cached = 0
+        self._cos_cached = None
+        self._sin_cached = None
+        self._cos_k_cached = None
+        self._sin_k_cached = None
+        self.cos = None
+        self.sin = None
+    def _compute_inv_freq(self, device=None):
+        return 1.0 / (
+                self.base
+                ** (torch.arange(0, self.dim, 2, device=device) / self.dim)
+                # ** (torch.arange(0, self.dim, 2, device=device).float() / self.dim)
+        )
+    def _update_cos_sin_cache(self, seqlen, position_id, device=None, dtype=None):
+        if (
+                seqlen > self._seq_len_cached
+        ):
+            self._seq_len_cached = seqlen
+            # We want fp32 here, not self.inv_freq.dtype, since the model could be loaded in bf16
+            # And the output of arange can be quite large, so bf16 would lose a lot of precision.
+            # However, for compatibility reason, we add an option to use the dtype of self.inv_freq.
+            if self.pos_idx_in_fp32:
+                t = torch.arange(seqlen, device=device, dtype=torch.float32)
+                # We want fp32 here as well since inv_freq will be multiplied with t, and the output
+                # will be large. Having it in bf16 will lose a lot of precision and cause the
+                # cos & sin output to change significantly.
+                # We want to recompute self.inv_freq if it was not loaded in fp32
+                if self.inv_freq.dtype != torch.float32:
+                    inv_freq = self._compute_inv_freq(device=device)
+                else:
+                    inv_freq = self.inv_freq
+            else:
+                t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
+                inv_freq = self.inv_freq
+            freqs = torch.einsum("i,j->ij", t, inv_freq)
+            if self.scale is None:
+                self._cos_cached = torch.cos(freqs).to(dtype)
+                self._sin_cached = torch.sin(freqs).to(dtype)
+            else:
+                power = (
+                                torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device)
+                                - seqlen // 2
+                        ) / self.scale_base
+                scale = self.scale.to(device=power.device) ** rearrange(power, "s -> s 1")
+                # We want the multiplication by scale to happen in fp32
+                self._cos_cached = (torch.cos(freqs) * scale).to(dtype)
+                self._sin_cached = (torch.sin(freqs) * scale).to(dtype)
+                self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype)
+                self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype)
+    def forward(
+            self,
+            q: torch.Tensor,
+            k: torch.Tensor,
+            position_ids: torch.Tensor,
+            max_seqlen,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        q: (batch, nheads, seqlen, headdim)
+        k: (batch, nheads, seqlen, headdim)
+        position_id: (batch, seqlen)
+        max_seqlen: int
+        layer_id: int
+            only if layer_id == 0, then update cons and sin
+        Apply rotary embedding *inplace* to q k.
+        """
+        self._update_cos_sin_cache(max_seqlen, position_ids, device=q.device, dtype=q.dtype)
+        cos, sin = F.embedding(position_ids, self._cos_cached), F.embedding(position_ids, self._sin_cached)
+        q = apply_rotary_emb_func(
+            q,
+            cos,
+            sin,
+            interleaved=self.interleaved,
+            inplace=True
+        )
+        k = apply_rotary_emb_func(
+            k,
+            cos,
+            sin,
+            interleaved=self.interleaved,
+            inplace=True
+        )
+        return q, k