msj19 commited on
Commit
65775f0
·
verified ·
1 Parent(s): 7652cf9

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. build/lib/opencompass/tasks/fla2/ops/generalized_delta_rule/dplr/chunk_h_bwd.py +173 -0
  2. build/lib/opencompass/tasks/fla2/ops/generalized_delta_rule/dplr/chunk_h_fwd.py +173 -0
  3. build/lib/opencompass/tasks/fla2/ops/generalized_delta_rule/dplr/chunk_o_bwd.py +428 -0
  4. build/lib/opencompass/tasks/fla2/ops/generalized_delta_rule/dplr/chunk_o_fwd.py +123 -0
  5. build/lib/opencompass/tasks/fla2/ops/generalized_delta_rule/dplr/fused_recurrent.py +273 -0
  6. build/lib/opencompass/tasks/fla2/ops/generalized_delta_rule/dplr/naive.py +96 -0
  7. build/lib/opencompass/tasks/fla2/ops/generalized_delta_rule/dplr/wy_fast_bwd.py +164 -0
  8. build/lib/opencompass/tasks/fla2/ops/generalized_delta_rule/dplr/wy_fast_fwd.py +284 -0
  9. build/lib/opencompass/tasks/fla2/ops/generalized_delta_rule/iplr/__init__.py +7 -0
  10. build/lib/opencompass/tasks/fla2/ops/generalized_delta_rule/iplr/chunk.py +500 -0
  11. build/lib/opencompass/tasks/fla2/ops/generalized_delta_rule/iplr/fused_recurrent.py +452 -0
  12. build/lib/opencompass/tasks/fla2/ops/generalized_delta_rule/iplr/naive.py +69 -0
  13. build/lib/opencompass/tasks/fla2/ops/generalized_delta_rule/iplr/wy_fast.py +300 -0
  14. docs/en/.readthedocs.yaml +17 -0
  15. docs/en/Makefile +20 -0
  16. docs/en/_static/css/readthedocs.css +62 -0
  17. docs/en/_static/image/logo.svg +79 -0
  18. docs/en/_static/image/logo_icon.svg +31 -0
  19. docs/en/_static/js/custom.js +20 -0
  20. docs/en/_templates/404.html +18 -0
  21. docs/en/_templates/autosummary/class.rst +13 -0
  22. docs/en/_templates/callable.rst +14 -0
  23. docs/en/advanced_guides/accelerator_intro.md +142 -0
  24. docs/en/advanced_guides/circular_eval.md +113 -0
  25. docs/en/advanced_guides/code_eval.md +104 -0
  26. docs/en/advanced_guides/code_eval_service.md +224 -0
  27. docs/en/advanced_guides/contamination_eval.md +124 -0
  28. docs/en/advanced_guides/custom_dataset.md +267 -0
  29. docs/en/advanced_guides/evaluation_lightllm.md +71 -0
  30. docs/en/advanced_guides/evaluation_lmdeploy.md +88 -0
  31. docs/en/advanced_guides/llm_judge.md +370 -0
  32. docs/en/advanced_guides/longeval.md +169 -0
  33. docs/en/advanced_guides/math_verify.md +190 -0
  34. docs/en/advanced_guides/needleinahaystack_eval.md +138 -0
  35. docs/en/advanced_guides/new_dataset.md +105 -0
  36. docs/en/advanced_guides/new_model.md +73 -0
  37. docs/en/advanced_guides/objective_judgelm_evaluation.md +186 -0
  38. docs/en/advanced_guides/persistence.md +65 -0
  39. docs/en/advanced_guides/prompt_attack.md +108 -0
  40. docs/en/advanced_guides/subjective_evaluation.md +171 -0
  41. docs/en/conf.py +234 -0
  42. docs/en/docutils.conf +2 -0
  43. docs/en/get_started/faq.md +128 -0
  44. docs/en/get_started/installation.md +142 -0
  45. docs/en/get_started/quick_start.md +300 -0
  46. docs/en/index.rst +99 -0
  47. docs/en/notes/academic.md +106 -0
  48. docs/en/notes/contribution_guide.md +158 -0
  49. docs/en/notes/news.md +40 -0
  50. docs/en/prompt/chain_of_thought.md +127 -0
build/lib/opencompass/tasks/fla2/ops/generalized_delta_rule/dplr/chunk_h_bwd.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ from typing import Optional, Tuple
5
+
6
+ import torch
7
+ import triton
8
+ import triton.language as tl
9
+
10
+ from ....ops.utils import prepare_chunk_indices, prepare_chunk_offsets
11
+ from ....ops.utils.op import exp
12
+ from ....utils import check_shared_mem, use_cuda_graph
13
+
14
+
15
@triton.heuristics({
    'USE_FINAL_STATE_GRADIENT': lambda args: args['dht'] is not None,
    'USE_INITIAL_STATE': lambda args: args['dh0'] is not None,
    'IS_VARLEN': lambda args: args['cu_seqlens'] is not None,
})
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=num_warps, num_stages=num_stages)
        for num_warps in [2, 4, 8, 16, 32]
        for num_stages in [2, 3, 4]
    ],
    key=['BT', 'BK', 'BV', "V"],
    use_cuda_graph=use_cuda_graph,
)
@triton.jit(do_not_specialize=['T'])
def chunk_dplr_bwd_kernel_dhu(
    qg,
    bg,
    w,
    gk,
    dht,
    dh0,
    do,
    dh,
    dv,
    dv2,
    cu_seqlens,
    chunk_offsets,
    T,
    H: tl.constexpr,
    K: tl.constexpr,
    V: tl.constexpr,
    BT: tl.constexpr,
    BC: tl.constexpr,
    BK: tl.constexpr,
    BV: tl.constexpr,
    USE_FINAL_STATE_GRADIENT: tl.constexpr,
    USE_INITIAL_STATE: tl.constexpr,
    IS_VARLEN: tl.constexpr,
):
    # Backward recurrence over the chunked DPLR hidden state:
    # walks chunks in reverse time order, writing the state gradient seen at
    # the *start* of each chunk into `dh` and the corrected value gradients
    # into `dv2`. One program handles one (K-tile, V-tile, sequence*head).
    i_k, i_v, i_nh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
    i_n, i_h = i_nh // H, i_nh % H
    if IS_VARLEN:
        # Variable-length batch: per-sequence bounds come from cu_seqlens,
        # per-sequence chunk base offset from chunk_offsets.
        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
        T = eos - bos
        NT = tl.cdiv(T, BT)
        boh = tl.load(chunk_offsets + i_n).to(tl.int32)
    else:
        bos, eos = i_n * T, i_n * T + T
        NT = tl.cdiv(T, BT)
        boh = i_n * NT

    # [BK, BV] running state gradient, accumulated in fp32.
    b_dh = tl.zeros([BK, BV], dtype=tl.float32)
    if USE_FINAL_STATE_GRADIENT:
        # Seed with the gradient flowing in from the final state.
        p_dht = tl.make_block_ptr(dht + i_nh * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
        b_dh += tl.load(p_dht, boundary_check=(0, 1))

    mask_k = tl.arange(0, BK) < K
    for i_t in range(NT - 1, -1, -1):
        # The gradient carried into chunk i_t is exactly the current b_dh.
        p_dh = tl.make_block_ptr(dh + ((boh+i_t) * H + i_h) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
        tl.store(p_dh, b_dh.to(p_dh.dtype.element_ty), boundary_check=(0, 1))
        b_dh_tmp = tl.zeros([BK, BV], dtype=tl.float32)
        # Sub-chunk in reverse to bound SRAM usage.
        for i_c in range(tl.cdiv(BT, BC) - 1, -1, -1):
            p_qg = tl.make_block_ptr(qg+(bos*H+i_h)*K, (K, T), (1, H*K), (i_k * BK, i_t * BT + i_c * BC), (BK, BC), (0, 1))
            p_bg = tl.make_block_ptr(bg+(bos*H+i_h)*K, (T, K), (H*K, 1), (i_t * BT + i_c * BC, i_k * BK), (BC, BK), (1, 0))
            p_w = tl.make_block_ptr(w+(bos*H+i_h)*K, (K, T), (1, H*K), (i_k * BK, i_t * BT + i_c * BC), (BK, BC), (0, 1))
            p_dv = tl.make_block_ptr(dv+(bos*H+i_h)*V, (T, V), (H*V, 1), (i_t*BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
            p_do = tl.make_block_ptr(do+(bos*H+i_h)*V, (T, V), (H*V, 1), (i_t*BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
            p_dv2 = tl.make_block_ptr(dv2+(bos*H+i_h)*V, (T, V), (H*V, 1), (i_t*BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
            # [BK, BT]
            b_qg = tl.load(p_qg, boundary_check=(0, 1))
            # [BT, BK]
            b_bg = tl.load(p_bg, boundary_check=(0, 1))
            b_w = tl.load(p_w, boundary_check=(0, 1))
            # [BT, V]
            b_do = tl.load(p_do, boundary_check=(0, 1))
            b_dv = tl.load(p_dv, boundary_check=(0, 1))
            # Correct dv with the contribution routed through the state gradient.
            b_dv2 = b_dv + tl.dot(b_bg, b_dh.to(b_bg.dtype))
            tl.store(p_dv2, b_dv2.to(p_dv.dtype.element_ty), boundary_check=(0, 1))
            # [BK, BV] — accumulate this chunk's contribution to the state gradient.
            b_dh_tmp += tl.dot(b_qg, b_do.to(b_qg.dtype))
            b_dh_tmp += tl.dot(b_w, b_dv2.to(b_qg.dtype))
        # Decay the carried gradient by the per-key gate at the chunk boundary
        # (gk is exponentiated here, so it is presumably stored in log space —
        # TODO(review): confirm against the forward pass).
        last_idx = min((i_t + 1) * BT, T) - 1
        bg_last = tl.load(gk + ((bos + last_idx) * H + i_h) * K + tl.arange(0, BK), mask=mask_k)
        b_dh *= exp(bg_last)[:, None]
        b_dh += b_dh_tmp

    if USE_INITIAL_STATE:
        # Remaining gradient is the gradient w.r.t. the initial state.
        p_dh0 = tl.make_block_ptr(dh0 + i_nh * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
        tl.store(p_dh0, b_dh.to(p_dh0.dtype.element_ty), boundary_check=(0, 1))
106
+
107
+
108
def chunk_dplr_bwd_dhu(
    qg: torch.Tensor,
    bg: torch.Tensor,
    w: torch.Tensor,
    gk: torch.Tensor,
    h0: torch.Tensor,
    dht: Optional[torch.Tensor],
    do: torch.Tensor,
    dv: torch.Tensor,
    cu_seqlens: Optional[torch.LongTensor] = None,
    chunk_size: int = 64
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Launch the chunked DPLR state backward kernel.

    Returns the per-chunk state gradients ``dh``, the initial-state gradient
    ``dh0`` (``None`` when ``h0`` is ``None``) and the corrected value
    gradients ``dv2``.
    """
    B, T, H, K = qg.shape
    V = do.shape[-1]
    BT = min(chunk_size, max(triton.next_power_of_2(T), 16))
    BK = triton.next_power_of_2(K)
    assert BK <= 256, "current kernel does not support head dimension being larger than 256."

    # Pick tile sizes by how much shared memory the device class offers.
    if check_shared_mem('hopper', qg.device.index):      # H100
        BV, BC = 64, (64 if K <= 128 else 32)
    elif check_shared_mem('ampere', qg.device.index):    # A100
        BV, BC = 32, 32
    else:                                                # e.g. 4090
        BV, BC = 16, 16
    BC = min(BT, BC)

    # N: the actual number of sequences in the batch (equal or variable lengths).
    if cu_seqlens is None:
        chunk_indices = None
        N = B
        NT = triton.cdiv(T, BT)
        chunk_offsets = None
    else:
        chunk_indices = prepare_chunk_indices(cu_seqlens, BT)
        N = len(cu_seqlens) - 1
        NT = len(chunk_indices)
        chunk_offsets = prepare_chunk_offsets(cu_seqlens, BT)

    NK = triton.cdiv(K, BK)
    NV = triton.cdiv(V, BV)
    assert NK == 1, 'NK > 1 is not supported because it involves time-consuming synchronization'

    dh = qg.new_empty(B, NT, H, K, V)
    dh0 = torch.empty_like(h0, dtype=torch.float32) if h0 is not None else None
    dv2 = torch.zeros_like(dv)

    chunk_dplr_bwd_kernel_dhu[(NK, NV, N * H)](
        qg=qg,
        bg=bg,
        w=w,
        gk=gk,
        dht=dht,
        dh0=dh0,
        do=do,
        dh=dh,
        dv=dv,
        dv2=dv2,
        cu_seqlens=cu_seqlens,
        chunk_offsets=chunk_offsets,
        T=T,
        H=H,
        K=K,
        V=V,
        BT=BT,
        BC=BC,
        BK=BK,
        BV=BV,
    )
    return dh, dh0, dv2
build/lib/opencompass/tasks/fla2/ops/generalized_delta_rule/dplr/chunk_h_fwd.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ from typing import Optional, Tuple
5
+
6
+ import torch
7
+ import triton
8
+ import triton.language as tl
9
+
10
+ from ....ops.utils import prepare_chunk_indices, prepare_chunk_offsets
11
+ from ....ops.utils.op import exp
12
+ from ....utils import check_shared_mem, use_cuda_graph
13
+
14
+
15
@triton.heuristics({
    'USE_INITIAL_STATE': lambda args: args['h0'] is not None,
    'STORE_FINAL_STATE': lambda args: args['ht'] is not None,
    'IS_VARLEN': lambda args: args['cu_seqlens'] is not None,
})
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=num_warps, num_stages=num_stages)
        for num_warps in [2, 4, 8, 16, 32]
        for num_stages in [2, 3, 4]
    ],
    key=['BT', 'BK', 'BV'],
    use_cuda_graph=use_cuda_graph,
)
@triton.jit(do_not_specialize=['T'])
def chunk_dplr_fwd_kernel_h(
    kg,
    v,
    w,
    bg,
    u,
    v_new,
    gk,
    h,
    h0,
    ht,
    cu_seqlens,
    chunk_offsets,
    T,
    H: tl.constexpr,
    K: tl.constexpr,
    V: tl.constexpr,
    BT: tl.constexpr,
    BC: tl.constexpr,
    BK: tl.constexpr,
    BV: tl.constexpr,
    USE_INITIAL_STATE: tl.constexpr,
    STORE_FINAL_STATE: tl.constexpr,
    IS_VARLEN: tl.constexpr,
):
    # Forward recurrence over the chunked DPLR hidden state:
    # walks chunks forward in time, storing the state seen at the *start* of
    # each chunk into `h`, writing corrected values into `v_new`, and
    # optionally storing the final state into `ht`.
    i_k, i_v, i_nh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
    i_n, i_h = i_nh // H, i_nh % H
    if IS_VARLEN:
        # Variable-length batch: sequence bounds from cu_seqlens, chunk base
        # offset from chunk_offsets.
        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
        T = eos - bos
        NT = tl.cdiv(T, BT)
        boh = tl.load(chunk_offsets + i_n).to(tl.int32)
    else:
        bos, eos = i_n * T, i_n * T + T
        NT = tl.cdiv(T, BT)
        boh = i_n * NT
    o_k = i_k * BK + tl.arange(0, BK)

    # [BK, BV] running state, accumulated in fp32.
    b_h = tl.zeros([BK, BV], dtype=tl.float32)
    if USE_INITIAL_STATE:
        p_h0 = tl.make_block_ptr(h0 + i_nh * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
        b_h = tl.load(p_h0, boundary_check=(0, 1)).to(tl.float32)

    for i_t in range(NT):
        # The state at the start of chunk i_t is the current b_h.
        p_h = tl.make_block_ptr(h + ((boh + i_t) * H + i_h) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
        tl.store(p_h, b_h.to(p_h.dtype.element_ty), boundary_check=(0, 1))

        b_hc = tl.zeros([BK, BV], dtype=tl.float32)
        # Since we need all DK in SRAM we face severe SRAM memory pressure;
        # sub-chunking (BC-sized tiles) alleviates that burden.
        for i_c in range(tl.cdiv(min(BT, T - i_t * BT), BC)):
            p_kg = tl.make_block_ptr(kg+(bos*H+i_h)*K, (K, T), (1, H*K), (i_k * BK, i_t * BT + i_c * BC), (BK, BC), (0, 1))
            p_bg = tl.make_block_ptr(bg+(bos*H+i_h)*K, (K, T), (1, H*K), (i_k * BK, i_t * BT + i_c * BC), (BK, BC), (0, 1))
            p_w = tl.make_block_ptr(w+(bos*H+i_h)*K, (T, K), (H*K, 1), (i_t * BT + i_c * BC, i_k * BK), (BC, BK), (1, 0))
            p_v = tl.make_block_ptr(v+(bos*H+i_h)*V, (T, V), (H*V, 1), (i_t * BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
            p_u = tl.make_block_ptr(u+(bos*H+i_h)*V, (T, V), (H*V, 1), (i_t * BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
            p_v_new = tl.make_block_ptr(v_new+(bos*H+i_h)*V, (T, V), (H*V, 1), (i_t*BT+i_c*BC, i_v * BV), (BC, BV), (1, 0))
            # [BK, BC]
            b_kg = tl.load(p_kg, boundary_check=(0, 1))
            b_v = tl.load(p_v, boundary_check=(0, 1))
            b_w = tl.load(p_w, boundary_check=(0, 1))
            b_bg = tl.load(p_bg, boundary_check=(0, 1))
            # Corrected values: u plus the projection of the current state by w.
            b_v2 = tl.dot(b_w, b_h.to(b_w.dtype)) + tl.load(p_u, boundary_check=(0, 1))
            b_hc += tl.dot(b_kg, b_v)
            b_hc += tl.dot(b_bg.to(b_hc.dtype), b_v2)
            tl.store(p_v_new, b_v2.to(p_v_new.dtype.element_ty), boundary_check=(0, 1))

        # Decay the state by the per-key gate at the chunk boundary, then add
        # this chunk's contribution (gk exponentiated here — presumably stored
        # in log space; TODO(review) confirm against the producer of gk).
        last_idx = min((i_t + 1) * BT, T) - 1
        b_g_last = tl.load(gk + (bos + last_idx) * H*K + i_h * K + o_k, mask=o_k < K).to(tl.float32)
        b_h *= exp(b_g_last[:, None])
        b_h += b_hc

    if STORE_FINAL_STATE:
        p_ht = tl.make_block_ptr(ht + i_nh * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
        tl.store(p_ht, b_h.to(p_ht.dtype.element_ty, fp_downcast_rounding="rtne"), boundary_check=(0, 1))
105
+
106
+
107
def chunk_dplr_fwd_h(
    kg: torch.Tensor,
    v: torch.Tensor,
    w: torch.Tensor,
    u: torch.Tensor,
    bg: torch.Tensor,
    gk: torch.Tensor,
    initial_state: Optional[torch.Tensor] = None,
    output_final_state: bool = False,
    cu_seqlens: Optional[torch.LongTensor] = None,
    chunk_size: int = 64
) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
    """Launch the chunked DPLR forward state kernel.

    Fix: the return annotation previously declared a 2-tuple while the
    function has always returned three values; it now matches the actual
    ``(h, v_new, final_state)`` return.

    Args:
        kg, w, bg, gk: [B, T, H, K] tensors (K inferred from ``kg.shape``).
        v, u: value tensors; V is inferred from ``u.shape[-1]``.
        initial_state: optional initial state ``h0`` passed to the kernel.
        output_final_state: when True, allocate and fill a float32 final state.
        cu_seqlens: cumulative sequence lengths for variable-length batches.
        chunk_size: upper bound on the time-chunk tile ``BT``.

    Returns:
        h: per-chunk states of shape ``[B, NT, H, K, V]``.
        v_new: corrected values, same shape as ``u``.
        final_state: ``[N, H, K, V]`` float32 tensor, or ``None``.
    """
    B, T, H, K, V = *kg.shape, u.shape[-1]
    BT = min(chunk_size, max(triton.next_power_of_2(T), 16))

    chunk_indices = prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
    # N: the actual number of sequences in the batch with either equal or variable lengths
    if cu_seqlens is None:
        N, NT, chunk_offsets = B, triton.cdiv(T, BT), None
    else:
        N, NT, chunk_offsets = len(cu_seqlens) - 1, len(chunk_indices), prepare_chunk_offsets(cu_seqlens, BT)
    BK = triton.next_power_of_2(K)
    assert BK <= 256, "current kernel does not support head dimension larger than 256."

    # Tile sizes by device shared-memory class; H100 can take larger blocks.
    if check_shared_mem('hopper', kg.device.index):
        BV = 64
        BC = 64 if K <= 128 else 32
    elif check_shared_mem('ampere', kg.device.index):  # A100
        BV = 32
        BC = 32
    else:
        BV = 16
        BC = 16

    BC = min(BT, BC)
    NK = triton.cdiv(K, BK)
    NV = triton.cdiv(V, BV)
    assert NK == 1, 'NK > 1 is not supported because it involves time-consuming synchronization'

    h = kg.new_empty(B, NT, H, K, V)
    final_state = kg.new_empty(N, H, K, V, dtype=torch.float32) if output_final_state else None
    v_new = torch.empty_like(u)
    grid = (NK, NV, N * H)
    chunk_dplr_fwd_kernel_h[grid](
        kg=kg,
        v=v,
        w=w,
        bg=bg,
        u=u,
        v_new=v_new,
        h=h,
        gk=gk,
        h0=initial_state,
        ht=final_state,
        cu_seqlens=cu_seqlens,
        chunk_offsets=chunk_offsets,
        T=T,
        H=H,
        K=K,
        V=V,
        BT=BT,
        BC=BC,
        BK=BK,
        BV=BV,
    )
    return h, v_new, final_state
build/lib/opencompass/tasks/fla2/ops/generalized_delta_rule/dplr/chunk_o_bwd.py ADDED
@@ -0,0 +1,428 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ from typing import Optional, Tuple
5
+
6
+ import torch
7
+ import triton
8
+ import triton.language as tl
9
+
10
+ from ....ops.utils import prepare_chunk_indices
11
+ from ....ops.utils.op import exp
12
+ from ....utils import check_shared_mem, use_cuda_graph
13
+
14
+ BK_LIST = [32, 64, 128] if check_shared_mem() else [16, 32]
15
+
16
+
17
@triton.heuristics({
    'IS_VARLEN': lambda args: args['cu_seqlens'] is not None
})
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=num_warps, num_stages=num_stages)
        for num_warps in [2, 4, 8, 16, 32]
        for num_stages in [2, 3, 4]
    ],
    key=['BV', 'BT'],
    use_cuda_graph=use_cuda_graph,
)
@triton.jit(do_not_specialize=['T'])
def chunk_dplr_bwd_kernel_dAu(
    v,
    do,
    v_new,
    A_qb,
    dA_qk,
    dA_qb,
    dv_new,
    cu_seqlens,
    chunk_indices,
    scale: tl.constexpr,
    T,
    H: tl.constexpr,
    V: tl.constexpr,
    BT: tl.constexpr,
    BV: tl.constexpr,
    IS_VARLEN: tl.constexpr,
):
    # Backward for the intra-chunk attention scores: accumulates dA_qk and
    # dA_qb (both [BT, BT], causally masked and scaled) and the gradient
    # dv_new routed back through the causal A_qb matrix.
    i_t, i_bh = tl.program_id(0), tl.program_id(1)
    i_b, i_h = i_bh // H, i_bh % H
    if IS_VARLEN:
        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
    else:
        bos, eos = i_b * T, i_b * T + T
    T = eos - bos

    b_dA_qk = tl.zeros([BT, BT], dtype=tl.float32)
    b_dA_qb = tl.zeros([BT, BT], dtype=tl.float32)

    p_A_qb = tl.make_block_ptr(A_qb + (bos * H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))

    b_A_qb = tl.load(p_A_qb, boundary_check=(0, 1))
    # causal mask: keep only the lower triangle (including the diagonal)
    b_A_qb = tl.where(tl.arange(0, BT)[:, None] >= tl.arange(0, BT)[None, :], b_A_qb, 0.).to(b_A_qb.dtype)

    # Sweep over V in BV-sized tiles, accumulating both score gradients.
    for i_v in range(tl.cdiv(V, BV)):
        p_do = tl.make_block_ptr(do + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
        p_v = tl.make_block_ptr(v + (bos*H + i_h) * V, (V, T), (1, H*V), (i_v * BV, i_t * BT), (BV, BT), (0, 1))
        p_v_new = tl.make_block_ptr(v_new + (bos*H + i_h) * V, (V, T), (1, H*V), (i_v * BV, i_t * BT), (BV, BT), (0, 1))
        p_dv_new = tl.make_block_ptr(dv_new + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
        b_v = tl.load(p_v, boundary_check=(0, 1))
        b_do = tl.load(p_do, boundary_check=(0, 1))
        b_v_new = tl.load(p_v_new, boundary_check=(0, 1))
        b_dA_qk += tl.dot(b_do, b_v)
        b_dA_qb += tl.dot(b_do, b_v_new)
        b_dv_new = tl.dot(tl.trans(b_A_qb), b_do)
        # for recurrent
        tl.store(p_dv_new, b_dv_new.to(p_dv_new.dtype.element_ty), boundary_check=(0, 1))

    p_dA_qk = tl.make_block_ptr(dA_qk + (bos * H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
    p_dA_qb = tl.make_block_ptr(dA_qb + (bos * H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
    # Apply scale and the causal mask to both accumulated gradients.
    m_s = tl.arange(0, BT)[:, None] >= tl.arange(0, BT)[None, :]
    b_dA_qk = tl.where(m_s, b_dA_qk * scale, 0.)
    tl.store(p_dA_qk, b_dA_qk.to(p_dA_qk.dtype.element_ty), boundary_check=(0, 1))
    b_dA_qb = tl.where(m_s, b_dA_qb * scale, 0.)
    tl.store(p_dA_qb, b_dA_qb.to(p_dA_qb.dtype.element_ty), boundary_check=(0, 1))
87
+
88
+
89
@triton.heuristics({
    'IS_VARLEN': lambda args: args['cu_seqlens'] is not None,
})
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=num_warps, num_stages=num_stages)
        for num_warps in [2, 4, 8, 16, 32]
        for num_stages in [2, 3, 4]
    ],
    key=['BT', 'BK', 'BV'],
    use_cuda_graph=use_cuda_graph,
)
@triton.jit
def chunk_dplr_bwd_o_kernel(
    v,
    v_new,
    h,
    do,
    dh,
    dk,
    db,
    w,
    dq,
    dv,
    dw,
    gk,
    dgk_last,
    k,
    b,
    cu_seqlens,
    chunk_indices,
    T,
    H: tl.constexpr,
    K: tl.constexpr,
    V: tl.constexpr,
    BT: tl.constexpr,
    BK: tl.constexpr,
    BV: tl.constexpr,
    IS_VARLEN: tl.constexpr,
):
    # Backward of the output projection: for one (K-tile, chunk, batch*head)
    # program, produces dq, dk, db, dw for the chunk and the gradient of the
    # chunk-boundary gate dgk_last.
    i_k, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
    i_b, i_h = i_bh // H, i_bh % H

    if IS_VARLEN:
        # chunk_indices stores (sequence id, chunk id within sequence) pairs.
        i_tg = i_t
        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
        T = eos - bos
        NT = tl.cdiv(T, BT)
    else:
        NT = tl.cdiv(T, BT)
        i_tg = i_b * NT + i_t
        bos, eos = i_b * T, i_b * T + T

    # offset calculation — advance every base pointer to this (batch, head);
    # h/dh/dgk_last are per-chunk tensors and use the global chunk id i_tg.
    v += (bos * H + i_h) * V
    v_new += (bos * H + i_h) * V
    do += (bos * H + i_h) * V
    h += (i_tg * H + i_h) * K * V
    dh += (i_tg * H + i_h) * K * V
    dk += (bos * H + i_h) * K
    k += (bos * H + i_h) * K
    db += (bos * H + i_h) * K
    b += (bos * H + i_h) * K
    dw += (bos * H + i_h) * K
    dv += (bos * H + i_h) * V
    dq += (bos * H + i_h) * K
    w += (bos * H + i_h) * K

    dgk_last += (i_tg * H + i_h) * K
    gk += (bos * H + i_h) * K

    stride_qk = H*K
    stride_vo = H*V

    b_dq = tl.zeros([BT, BK], dtype=tl.float32)
    b_dk = tl.zeros([BT, BK], dtype=tl.float32)
    b_dw = tl.zeros([BT, BK], dtype=tl.float32)
    b_db = tl.zeros([BT, BK], dtype=tl.float32)
    b_dgk_last = tl.zeros([BK], dtype=tl.float32)

    # Sweep over V in BV-sized tiles, accumulating all K-side gradients.
    for i_v in range(tl.cdiv(V, BV)):
        p_v = tl.make_block_ptr(v, (T, V), (stride_vo, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
        p_v_new = tl.make_block_ptr(v_new, (T, V), (stride_vo, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
        p_do = tl.make_block_ptr(do, (T, V), (stride_vo, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
        p_h = tl.make_block_ptr(h, (V, K), (1, V), (i_v * BV, i_k * BK), (BV, BK), (0, 1))
        p_dh = tl.make_block_ptr(dh, (V, K), (1, V), (i_v * BV, i_k * BK), (BV, BK), (0, 1))
        # [BT, BV]
        b_v = tl.load(p_v, boundary_check=(0, 1))
        b_v_new = tl.load(p_v_new, boundary_check=(0, 1))
        b_do = tl.load(p_do, boundary_check=(0, 1))
        # [BV, BK]
        b_h = tl.load(p_h, boundary_check=(0, 1))
        b_dh = tl.load(p_dh, boundary_check=(0, 1))
        b_dgk_last += tl.sum((b_h * b_dh).to(tl.float32), axis=0)

        # [BT, BV] @ [BV, BK] -> [BT, BK]
        b_dq += tl.dot(b_do, b_h.to(b_do.dtype))
        # [BT, BV] @ [BV, BK] -> [BT, BK]
        b_dk += tl.dot(b_v, b_dh.to(b_v.dtype))
        b_db += tl.dot(b_v_new, b_dh.to(b_v_new.dtype))
        p_dv = tl.make_block_ptr(dv, (T, V), (stride_vo, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
        b_dv = tl.load(p_dv, boundary_check=(0, 1))
        b_dw += tl.dot(b_dv.to(b_v.dtype), b_h.to(b_v.dtype))

    # Gate gradient at the chunk boundary: decay by exp(gk of the last valid
    # step), then add the direct contributions through k and b.
    m_k = (i_k*BK+tl.arange(0, BK)) < K
    last_idx = min(i_t * BT + BT, T) - 1
    b_gk_last = tl.load(gk + last_idx * stride_qk + i_k*BK + tl.arange(0, BK), mask=m_k, other=float('-inf'))
    b_dgk_last *= exp(b_gk_last)
    p_k = tl.make_block_ptr(k, (T, K), (stride_qk, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
    p_b = tl.make_block_ptr(b, (T, K), (stride_qk, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
    b_k = tl.load(p_k, boundary_check=(0, 1))
    b_b = tl.load(p_b, boundary_check=(0, 1))
    b_dgk_last += tl.sum(b_k * b_dk, axis=0)
    b_dgk_last += tl.sum(b_b * b_db, axis=0)
    tl.store(dgk_last + tl.arange(0, BK) + i_k * BK, b_dgk_last, mask=m_k)

    p_dw = tl.make_block_ptr(dw, (T, K), (stride_qk, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
    p_dk = tl.make_block_ptr(dk, (T, K), (stride_qk, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
    p_db = tl.make_block_ptr(db, (T, K), (stride_qk, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
    p_dq = tl.make_block_ptr(dq, (T, K), (stride_qk, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
    tl.store(p_dw, b_dw.to(p_dw.dtype.element_ty), boundary_check=(0, 1))
    tl.store(p_dk, b_dk.to(p_dk.dtype.element_ty), boundary_check=(0, 1))
    tl.store(p_db, b_db.to(p_db.dtype.element_ty), boundary_check=(0, 1))
    tl.store(p_dq, b_dq.to(p_dq.dtype.element_ty), boundary_check=(0, 1))
214
+
215
+
216
@triton.heuristics({
    'IS_VARLEN': lambda args: args['cu_seqlens'] is not None,
})
@triton.autotune(
    configs=[
        triton.Config({'BK': BK, 'BV': BV}, num_warps=num_warps, num_stages=num_stages)
        for num_warps in [2, 4, 8, 16, 32]
        for num_stages in [2, 3, 4]
        for BK in BK_LIST
        for BV in BK_LIST
    ],
    key=['BT'],
    use_cuda_graph=use_cuda_graph,
)
@triton.jit
def chunk_dplr_bwd_kernel_dv(
    A_qk,
    kg,
    do,
    dv,
    dh,
    cu_seqlens,
    chunk_indices,
    T,
    H: tl.constexpr,
    K: tl.constexpr,
    V: tl.constexpr,
    BT: tl.constexpr,
    BK: tl.constexpr,
    BV: tl.constexpr,
    IS_VARLEN: tl.constexpr,
):
    # Value gradient: dv = kg @ dh (inter-chunk, via the state gradient)
    # plus A_qk^T @ do (intra-chunk, strictly upper-masked transpose).
    i_v, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
    i_b, i_h = i_bh // H, i_bh % H
    if IS_VARLEN:
        i_tg = i_t
        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
        T = eos - bos
        NT = tl.cdiv(T, BT)
    else:
        NT = tl.cdiv(T, BT)
        i_tg = i_b * NT + i_t
        bos, eos = i_b * T, i_b * T + T

    b_dv = tl.zeros([BT, BV], dtype=tl.float32)

    # offset calculation — advance base pointers to this (batch, head);
    # dh is per-chunk and uses the global chunk id i_tg.
    A_qk += (bos * H + i_h) * BT
    do += (bos * H + i_h) * V
    dv += (bos * H + i_h) * V
    kg += (bos * H + i_h) * K
    dh += (i_tg * H + i_h) * K*V

    stride_qk = H*K
    stride_vo = H*V
    stride_A = H*BT

    # Inter-chunk term: sweep K in BK tiles, accumulate kg @ dh.
    for i_k in range(tl.cdiv(K, BK)):
        p_dh = tl.make_block_ptr(dh, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
        p_kg = tl.make_block_ptr(kg, (T, K), (stride_qk, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
        b_dh = tl.load(p_dh, boundary_check=(0, 1))
        b_kg = tl.load(p_kg, boundary_check=(0, 1))
        b_dv += tl.dot(b_kg, b_dh.to(b_kg.dtype))

    # Intra-chunk term: transposed A_qk masked to row <= col (the transpose
    # of the causal mask), applied to do.
    p_Aqk = tl.make_block_ptr(A_qk, (BT, T), (1, stride_A), (0, i_t * BT), (BT, BT), (0, 1))
    b_A = tl.where(tl.arange(0, BT)[:, None] <= tl.arange(0, BT)[None, :], tl.load(p_Aqk, boundary_check=(0, 1)), 0)
    p_do = tl.make_block_ptr(do, (T, V), (stride_vo, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
    p_dv = tl.make_block_ptr(dv, (T, V), (stride_vo, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
    b_do = tl.load(p_do, boundary_check=(0, 1))
    b_dv += tl.dot(b_A.to(b_do.dtype), b_do)
    tl.store(p_dv, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1))
288
+
289
+
290
def chunk_dplr_bwd_dv(
    A_qk: torch.Tensor,
    kg: torch.Tensor,
    do: torch.Tensor,
    dh: torch.Tensor,
    cu_seqlens: Optional[torch.LongTensor] = None,
    chunk_size: int = 64
) -> torch.Tensor:
    """Launch the value-gradient kernel and return ``dv`` (same shape as ``do``)."""
    B, T, H, K = kg.shape
    V = do.shape[-1]
    BT = min(chunk_size, max(16, triton.next_power_of_2(T)))

    if cu_seqlens is None:
        chunk_indices = None
        NT = triton.cdiv(T, BT)
    else:
        chunk_indices = prepare_chunk_indices(cu_seqlens, BT)
        NT = len(chunk_indices)

    dv = torch.empty_like(do)

    # BV is picked by the autotuner, so the grid is resolved lazily from meta.
    chunk_dplr_bwd_kernel_dv[lambda meta: (triton.cdiv(V, meta['BV']), NT, B * H)](
        A_qk=A_qk,
        kg=kg,
        do=do,
        dv=dv,
        dh=dh,
        cu_seqlens=cu_seqlens,
        chunk_indices=chunk_indices,
        T=T,
        H=H,
        K=K,
        V=V,
        BT=BT,
    )
    return dv
322
+
323
+
324
def chunk_dplr_bwd_o(
    k: torch.Tensor,
    b: torch.Tensor,
    v: torch.Tensor,
    v_new: torch.Tensor,
    gk: torch.Tensor,
    do: torch.Tensor,
    h: torch.Tensor,
    dh: torch.Tensor,
    dv: torch.Tensor,
    w: torch.Tensor,
    cu_seqlens: Optional[torch.LongTensor] = None,
    chunk_size: int = 64,
    scale: float = 1.0,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Launch the output-projection backward kernel.

    Fixes:
      * ``BV`` on low-shared-memory devices was computed from ``K``
        (``min(next_power_of_2(K), 32)``); it tiles the V dimension and is
        now computed from ``V``.
      * The return annotation declared a 3-tuple while the function has
        always returned five tensors.

    Note: ``scale`` is accepted but not forwarded to the kernel (the kernel
    takes no scale argument); it is kept for interface compatibility.

    Returns:
        dq, dk, dw, db: gradients shaped like their forward counterparts.
        dgk_last: per-chunk gate gradients of shape [B, NT, H, K] (float32).
    """
    B, T, H, K, V = *w.shape, v.shape[-1]

    BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
    chunk_indices = prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)

    # Larger tiles when the device has enough shared memory.
    if check_shared_mem():
        BK = min(triton.next_power_of_2(K), 64)
        BV = min(triton.next_power_of_2(V), 64)
    else:
        BK = min(triton.next_power_of_2(K), 32)
        # BV tiles the V dimension; previously (incorrectly) derived from K.
        BV = min(triton.next_power_of_2(V), 32)
    NK = triton.cdiv(K, BK)
    dq = torch.empty_like(k)
    dk = torch.empty_like(k)
    dw = torch.empty_like(w)
    db = torch.empty_like(b)
    grid = (NK, NT, B * H)

    dgk_last = torch.empty(B, NT, H, K, dtype=torch.float, device=w.device)

    chunk_dplr_bwd_o_kernel[grid](
        k=k,
        b=b,
        v=v,
        v_new=v_new,
        h=h,
        do=do,
        dh=dh,
        dq=dq,
        dk=dk,
        db=db,
        dgk_last=dgk_last,
        w=w,
        dv=dv,
        dw=dw,
        gk=gk,
        cu_seqlens=cu_seqlens,
        chunk_indices=chunk_indices,
        T=T,
        H=H,
        K=K,
        V=V,
        BT=BT,
        BK=BK,
        BV=BV,
    )
    return dq, dk, dw, db, dgk_last
384
+
385
+
386
def chunk_dplr_bwd_dAu(
    v: torch.Tensor,
    v_new: torch.Tensor,
    do: torch.Tensor,
    A_qb: torch.Tensor,
    scale: float,
    cu_seqlens: Optional[torch.LongTensor] = None,
    chunk_size: int = 64
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Launch the attention-score backward kernel.

    Fix: the return annotation previously declared a single ``torch.Tensor``
    while the function has always returned a 3-tuple; it now matches the
    actual ``(dv_new, dA_qk, dA_qb)`` return.

    Returns:
        dv_new: gradient routed through A_qb, same shape as ``v_new``.
        dA_qk, dA_qb: per-chunk score gradients of shape [B, T, H, BT]
            (float32, causally masked and scaled inside the kernel).
    """
    B, T, H, V = v.shape
    BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
    chunk_indices = prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)

    # V-tile size by device shared-memory class.
    if check_shared_mem('ampere'):  # A100
        BV = min(triton.next_power_of_2(V), 128)
    elif check_shared_mem('ada'):  # 4090
        BV = min(triton.next_power_of_2(V), 64)
    else:
        BV = min(triton.next_power_of_2(V), 32)

    grid = (NT, B * H)
    dA_qk = torch.empty(B, T, H, BT, dtype=torch.float, device=v.device)
    dA_qb = torch.empty(B, T, H, BT, dtype=torch.float, device=v.device)
    dv_new = torch.empty_like(v_new)
    chunk_dplr_bwd_kernel_dAu[grid](
        v=v,
        do=do,
        v_new=v_new,
        A_qb=A_qb,
        dA_qk=dA_qk,
        dA_qb=dA_qb,
        dv_new=dv_new,
        cu_seqlens=cu_seqlens,
        chunk_indices=chunk_indices,
        scale=scale,
        T=T,
        H=H,
        V=V,
        BT=BT,
        BV=BV,
    )
    return dv_new, dA_qk, dA_qb
build/lib/opencompass/tasks/fla2/ops/generalized_delta_rule/dplr/chunk_o_fwd.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ from typing import Optional
5
+
6
+ import torch
7
+ import triton
8
+ import triton.language as tl
9
+
10
+ from ....ops.utils import prepare_chunk_indices
11
+ from ....utils import check_shared_mem, use_cuda_graph
12
+
13
+ BK_LIST = [32, 64, 128] if check_shared_mem() else [16, 32]
14
+
15
+
16
@triton.heuristics({
    'IS_VARLEN': lambda args: args['cu_seqlens'] is not None,
})
@triton.autotune(
    configs=[
        triton.Config({'BK': BK, 'BV': BV}, num_warps=num_warps, num_stages=num_stages)
        for BK in BK_LIST
        for BV in BK_LIST
        for num_warps in [2, 4, 8, 16, 32]
        for num_stages in [2, 3, 4]
    ],
    key=['BT'],
    use_cuda_graph=use_cuda_graph,
)
@triton.jit(do_not_specialize=['T'])
def chunk_dplr_fwd_kernel_o(
    qg,
    v,
    v_new,
    A_qk,
    A_qb,
    h,
    o,
    cu_seqlens,
    chunk_indices,
    T,
    H: tl.constexpr,
    K: tl.constexpr,
    V: tl.constexpr,
    BT: tl.constexpr,
    BK: tl.constexpr,
    BV: tl.constexpr,
    IS_VARLEN: tl.constexpr,
):
    # Forward output kernel. One program per (value tile i_v, chunk i_t,
    # batch*head i_bh). It combines:
    #   * the inter-chunk term  qg @ h      (h = per-chunk recurrent state),
    #   * the intra-chunk terms tril(A_qk) @ v  and  tril(A_qb) @ v_new.
    i_v, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
    i_b, i_h = i_bh // H, i_bh % H

    if IS_VARLEN:
        # Remap the flat chunk id to (sequence, local chunk) and re-derive the
        # per-sequence length T from cu_seqlens.
        i_tg = i_t
        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
        T = eos - bos
        NT = tl.cdiv(T, BT)
    else:
        NT = tl.cdiv(T, BT)
        i_tg = i_b * NT + i_t  # global chunk index into the state tensor h
        bos, eos = i_b * T, i_b * T + T

    # Inter-chunk contribution, accumulated in fp32 over K tiles.
    b_o = tl.zeros([BT, BV], dtype=tl.float32)
    for i_k in range(tl.cdiv(K, BK)):
        p_qg = tl.make_block_ptr(qg + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
        p_h = tl.make_block_ptr(h + (i_tg * H + i_h) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
        b_qg = tl.load(p_qg, boundary_check=(0, 1))
        b_h = tl.load(p_h, boundary_check=(0, 1))
        b_o += tl.dot(b_qg, b_h)

    p_Aqk = tl.make_block_ptr(A_qk + (bos * H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
    p_Aqb = tl.make_block_ptr(A_qb + (bos * H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
    p_v = tl.make_block_ptr(v + (bos * H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
    p_v_new = tl.make_block_ptr(v_new + (bos * H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
    p_o = tl.make_block_ptr(o + (bos * H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))

    # Causal mask (diagonal included): row i may attend to columns j <= i.
    m_s = tl.arange(0, BT)[:, None] >= tl.arange(0, BT)[None, :]
    b_Aqk = tl.load(p_Aqk, boundary_check=(0, 1))
    b_Aqb = tl.load(p_Aqb, boundary_check=(0, 1))
    b_Aqk = tl.where(m_s, b_Aqk, 0)
    b_Aqb = tl.where(m_s, b_Aqb, 0)
    b_v = tl.load(p_v, boundary_check=(0, 1))
    b_v_new = tl.load(p_v_new, boundary_check=(0, 1))
    b_o = b_o + tl.dot(b_Aqk.to(b_v.dtype), b_v) + tl.dot(b_Aqb.to(b_v_new.dtype), b_v_new)
    tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1))
87
+
88
+
89
def chunk_dplr_fwd_o(
    qg: torch.Tensor,
    v: torch.Tensor,
    v_new: torch.Tensor,
    A_qk: torch.Tensor,
    A_qb: torch.Tensor,
    h: torch.Tensor,
    cu_seqlens: Optional[torch.LongTensor] = None,
    chunk_size: int = 64
) -> torch.Tensor:
    """Compute the chunked DPLR forward output by launching the Triton kernel.

    Args:
        qg: decayed queries, `[B, T, H, K]`.
        v: values, `[B, T, H, V]`.
        v_new: pseudo-values from the forward pass, `[B, T, H, V]`.
        A_qk: intra-chunk q/k score matrix, `[B, T, H, BT]`.
        A_qb: intra-chunk q/b score matrix, `[B, T, H, BT]`.
        h: per-chunk recurrent states.
        cu_seqlens: optional `[N + 1]` cumulative lengths for packed batches.
        chunk_size: nominal chunk length (clamped to a power of two >= 16).

    Returns:
        Output tensor with the same shape and dtype as `v`.
    """
    B, T, H, K = qg.shape
    V = v.shape[-1]
    # Effective chunk length: power of two, at least 16, capped by chunk_size.
    BT = min(chunk_size, max(16, triton.next_power_of_2(T)))

    if cu_seqlens is None:
        chunk_indices = None
        NT = triton.cdiv(T, BT)
    else:
        chunk_indices = prepare_chunk_indices(cu_seqlens, BT)
        NT = len(chunk_indices)

    o = torch.empty_like(v)

    # BV is chosen by the autotuner, so the grid is a function of the config.
    def grid(meta):
        return (triton.cdiv(V, meta['BV']), NT, B * H)

    chunk_dplr_fwd_kernel_o[grid](
        qg=qg,
        v=v,
        v_new=v_new,
        A_qk=A_qk,
        A_qb=A_qb,
        h=h,
        o=o,
        cu_seqlens=cu_seqlens,
        chunk_indices=chunk_indices,
        T=T,
        H=H,
        K=K,
        V=V,
        BT=BT,
    )
    return o
build/lib/opencompass/tasks/fla2/ops/generalized_delta_rule/dplr/fused_recurrent.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ from typing import Optional, Tuple
5
+
6
+ import torch
7
+ import triton
8
+ import triton.language as tl
9
+
10
+ from ....ops.utils.op import exp
11
+ from ....utils import autocast_custom_bwd, autocast_custom_fwd, input_guard, use_cuda_graph
12
+
13
+
14
@triton.heuristics({
    'USE_INITIAL_STATE': lambda args: args['h0'] is not None,
    'STORE_FINAL_STATE': lambda args: args['ht'] is not None,
    'IS_VARLEN': lambda args: args['cu_seqlens'] is not None
})
@triton.autotune(
    configs=[
        triton.Config({'BV': BV}, num_warps=num_warps, num_stages=num_stages)
        for BV in [16, 32, 64]
        for num_warps in [2, 4, 8, 16]
        for num_stages in [2, 3, 4]
    ],
    key=['BK'],
    use_cuda_graph=use_cuda_graph,
)
@triton.jit(do_not_specialize=['T'])
def fused_recurrent_dplr_delta_rule_fwd_kernel(
    q,
    k,
    v,
    a,
    b,
    gk,
    o,
    h0,
    ht,
    cu_seqlens,
    scale,
    T,
    B: tl.constexpr,
    H: tl.constexpr,
    K: tl.constexpr,
    V: tl.constexpr,
    BK: tl.constexpr,
    BV: tl.constexpr,
    REVERSE: tl.constexpr,
    USE_INITIAL_STATE: tl.constexpr,
    STORE_FINAL_STATE: tl.constexpr,
    IS_VARLEN: tl.constexpr,
):
    # Token-by-token DPLR recurrence. Per step (see the update of b_h below):
    #   S_t = diag(exp(gk_t)) S_{t-1} + b_t (a_t^T S_{t-1}) + k_t v_t^T
    #   o_t = S_t^T (q_t * scale)
    # b_h holds S transposed as [BV, BK]; one program per (value tile, seq*head).
    i_v, i_nh = tl.program_id(0).to(tl.int64), tl.program_id(1).to(tl.int64)
    i_n, i_h = i_nh // H, i_nh % H

    if IS_VARLEN:
        # Variable-length: sequence boundaries come from cu_seqlens.
        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int64), tl.load(cu_seqlens + i_n + 1).to(tl.int64)
        T = eos - bos
    else:
        bos, eos = i_n * T, i_n * T + T

    o_k = tl.arange(0, BK)
    o_v = i_v * BV + tl.arange(0, BV)
    # Start at the last token when REVERSE, else at the first.
    p_q = q + (bos + ((T-1) if REVERSE else 0)) * H*K + i_h * K + o_k
    p_k = k + (bos + ((T-1) if REVERSE else 0)) * H*K + i_h * K + o_k
    p_a = a + (bos + ((T-1) if REVERSE else 0)) * H*K + i_h * K + o_k
    p_b = b + (bos + ((T-1) if REVERSE else 0)) * H*K + i_h * K + o_k
    p_gk = gk + (bos + ((T-1) if REVERSE else 0)) * H*K + i_h * K + o_k
    p_v = v + (bos + ((T-1) if REVERSE else 0)) * H*V + i_h * V + o_v
    p_o = o + (bos + ((T-1) if REVERSE else 0)) * H*V + i_h * V + o_v

    mask_k = o_k < K
    mask_v = o_v < V
    mask_h = mask_k[None, :] & mask_v[:, None]
    # State tile [BV, BK]: b_h[v, k] == S[k, v], accumulated in fp32.
    b_h = tl.zeros([BV, BK], dtype=tl.float32)

    if USE_INITIAL_STATE:
        # h0 is laid out [K, V]; the transposed indexing loads it into [BV, BK].
        p_h0 = h0 + i_nh * K*V + o_k[None, :] * V + o_v[:, None]
        b_h += tl.load(p_h0, mask=mask_h, other=0).to(tl.float32)

    for _ in range(0, T):
        b_q = tl.load(p_q, mask=mask_k, other=0).to(tl.float32) * scale
        b_k = tl.load(p_k, mask=mask_k, other=0).to(tl.float32)
        b_a = tl.load(p_a, mask=mask_k, other=0).to(tl.float32)
        b_b = tl.load(p_b, mask=mask_k, other=0).to(tl.float32)
        b_gk = tl.load(p_gk, mask=mask_k, other=0).to(tl.float32)
        b_v = tl.load(p_v, mask=mask_v, other=0).to(tl.float32)

        # tmp = a_t^T S_{t-1} (uses the pre-decay state).
        tmp = tl.sum(b_h * b_a[None, :], axis=1)
        b_h = exp(b_gk)[None, :] * b_h + (tmp[:, None] * b_b[None, :] + b_k[None, :] * b_v[:, None])
        b_o = tl.sum(b_h * b_q[None, :], axis=1)

        tl.store(p_o, b_o.to(p_o.dtype.element_ty), mask=mask_v)
        # Advance all pointers by one token (backwards when REVERSE).
        p_q += (-1 if REVERSE else 1) * H*K
        p_k += (-1 if REVERSE else 1) * H*K
        p_a += (-1 if REVERSE else 1) * H*K
        p_b += (-1 if REVERSE else 1) * H*K
        p_gk += (-1 if REVERSE else 1) * H*K
        p_v += (-1 if REVERSE else 1) * H*V
        p_o += (-1 if REVERSE else 1) * H*V

    if STORE_FINAL_STATE:
        # Store back in [K, V] layout (transpose of b_h).
        p_ht = ht + i_nh * K*V + o_k[None, :] * V + o_v[:, None]
        tl.store(p_ht, b_h.to(p_ht.dtype.element_ty), mask=mask_h)
106
+
107
+
108
def fused_recurrent_dplr_delta_rule_fwd(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    a: torch.Tensor,
    b: torch.Tensor,
    gk: torch.Tensor,
    scale: Optional[float] = 1.0,
    initial_state: Optional[torch.Tensor] = None,
    output_final_state: bool = False,
    reverse: bool = False,
    cu_seqlens: Optional[torch.LongTensor] = None,
) -> "tuple[torch.Tensor, Optional[torch.Tensor]]":
    """Host-side launcher for the fused recurrent DPLR forward kernel.

    Returns `(o, ht)` where `o` matches `v` in shape/dtype and `ht` is the
    fp32 final state `[N, H, K, V]` (or `None` if not requested).
    """
    B, T, H, K, V = *k.shape, v.shape[-1]
    # N = number of sequences (batch size, or len(cu_seqlens)-1 when packed).
    N = B if cu_seqlens is None else len(cu_seqlens) - 1
    BK = triton.next_power_of_2(K)

    h0 = initial_state
    if output_final_state:
        ht = q.new_empty(N, H, K, V, dtype=torch.float32)
    else:
        ht = None
    o = torch.empty_like(v)

    # BV comes from the autotuner config, hence the grid closure over meta.
    def grid(meta): return (triton.cdiv(V, meta['BV']), N * H)
    # NOTE: the leading arguments are positional and must stay in the exact
    # order of the kernel signature (q, k, v, a, b, gk, o, h0, ht, cu_seqlens, scale).
    fused_recurrent_dplr_delta_rule_fwd_kernel[grid](
        q,
        k,
        v,
        a,
        b,
        gk,
        o,
        h0,
        ht,
        cu_seqlens,
        scale,
        T=T,
        B=B,
        H=H,
        K=K,
        V=V,
        BK=BK,
        REVERSE=reverse,
    )
    return o, ht
154
+
155
+
156
class FusedRecurrentDPLRDeltaRuleFunction(torch.autograd.Function):
    """Autograd wrapper around the fused recurrent DPLR delta-rule kernel.

    Forward-only by design: `backward` raises `NotImplementedError`, since this
    recurrent kernel is intended for inference; the chunked implementation
    should be used for training (see the error message below).
    """

    @staticmethod
    @input_guard
    @autocast_custom_fwd
    def forward(
        ctx,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        a: torch.Tensor,
        b: torch.Tensor,
        gk: torch.Tensor,
        scale: Optional[float] = 1.0,
        initial_state: Optional[torch.Tensor] = None,
        output_final_state: bool = False,
        reverse: bool = False,
        cu_seqlens: Optional[torch.LongTensor] = None,
    ):
        # Nothing is saved on `ctx`: backward is unsupported on purpose.
        o, ht = fused_recurrent_dplr_delta_rule_fwd(
            q=q,
            k=k,
            v=v,
            a=a,
            b=b,
            gk=gk,
            scale=scale,
            initial_state=initial_state,
            output_final_state=output_final_state,
            reverse=reverse,
            cu_seqlens=cu_seqlens,
        )
        return o, ht

    @staticmethod
    @input_guard
    @autocast_custom_bwd
    def backward(ctx, do, dht):
        raise NotImplementedError(
            "Backward pass for fused_recurrent_dplr_delta_rule is not implemented and will not be supported. "
            "This kernel is only for inference. "
            "For training, please use `chunk_dplr_delta_rule`."
        )
199
+
200
+
201
def fused_recurrent_dplr_delta_rule(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    a: torch.Tensor,
    b: torch.Tensor,
    gk: torch.Tensor,
    scale: Optional[float] = 1.0,
    initial_state: Optional[torch.Tensor] = None,
    output_final_state: bool = False,
    reverse: bool = False,
    cu_seqlens: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    r"""
    Computes the DPLR (diagonal-plus-low-rank) delta-rule recurrence token by
    token. As implemented by the underlying kernel, each step performs

    .. math::
        S_t = \mathrm{diag}(\exp(gk_t))\, S_{t-1} + b_t (a_t^\top S_{t-1}) + k_t v_t^\top,
        \qquad o_t = S_t^\top (q_t \cdot \mathrm{scale}).

    This fused recurrent kernel is inference-only (no backward pass).

    Args:
        q (torch.Tensor):
            queries of shape `[B, T, H, K]`.
        k (torch.Tensor):
            keys of shape `[B, T, H, K]`.
        v (torch.Tensor):
            values of shape `[B, T, H, V]`.
        a (torch.Tensor):
            a of shape `[B, T, H, K]`.
        b (torch.Tensor):
            b of shape `[B, T, H, K]`.
        gk (torch.Tensor):
            gk of shape `[B, T, H, K]`. decay term in log space!
        scale (Optional[int]):
            Scale factor applied to the attention scores.
            If not provided (`None`), it will default to `1 / sqrt(K)`. Default: 1.
        initial_state (Optional[torch.Tensor]):
            Initial state of shape `[N, H, K, V]` for `N` input sequences.
            For equal-length input sequences, `N` equals the batch size `B`.
            Default: `None`.
        output_final_state (Optional[bool]):
            Whether to output the final state of shape `[N, H, K, V]`. Default: `False`.
        reverse (Optional[bool]):
            If `True`, process the state passing in reverse order. Default: `False`.
        cu_seqlens (Optional[torch.Tensor]):
            Cumulative sequence lengths of shape `[N + 1]` used for variable-length training,
            consistent with the FlashAttention API.

    Returns:
        A tuple `(o, final_state)`: the output `[B, T, H, V]` and the final
        state `[N, H, K, V]` (or `None` if `output_final_state` is `False`).
    """
    if cu_seqlens is not None:
        # Packed variable-length inputs must be pre-flattened into batch dim 1.
        if q.shape[0] != 1:
            raise ValueError(
                f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`."
                f"Please flatten variable-length inputs before processing."
            )
        if initial_state is not None and initial_state.shape[0] != len(cu_seqlens) - 1:
            raise ValueError(
                f"The number of initial states is expected to be equal to the number of input sequences, "
                f"i.e., {len(cu_seqlens) - 1} rather than {initial_state.shape[0]}."
            )
    if scale is None:
        scale = q.shape[-1] ** -0.5
    else:
        assert scale > 0, "scale must be positive"
    o, final_state = FusedRecurrentDPLRDeltaRuleFunction.apply(
        q,
        k,
        v,
        a,
        b,
        gk,
        scale,
        initial_state,
        output_final_state,
        reverse,
        cu_seqlens,
    )
    return o, final_state
build/lib/opencompass/tasks/fla2/ops/generalized_delta_rule/dplr/naive.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ import torch
4
+ from einops import rearrange
5
+
6
+ # S_t = S_t @ (I + alpha_t beta_t^T) + v_t k_t^T
7
+ # q, k, alpha, beta [B, H, L, D_K]
8
+ # v [B, H, L, D_V]
9
+
10
+
11
def dplr_recurrence(q, k, v, alpha, beta, gk, initial_state=None, output_final_state=True):
    """Reference (non-fused) DPLR delta-rule recurrence in plain PyTorch.

    Per step: S_t = diag(exp(gk_t)) S_{t-1} + beta_t (alpha_t^T S_{t-1}) + k_t v_t^T,
    with the low-rank term computed from the *pre-decay* state, and
    o_t = q_t^T S_t (queries pre-scaled by d_k**-0.5).

    Inputs are `[B, H, L, D]`; returns `(o, S)` with `o` cast back to the
    input dtype and `S` the final `[B, H, D_K, D_V]` state (or `None`).
    """
    in_dtype = q.dtype
    batch, heads, seq_len, key_dim = q.shape
    # NOTE: alpha is intentionally left in its original dtype, matching the
    # original implementation; it promotes to fp32 inside the update anyway.
    q, k, v, beta, gk = (t.float() for t in (q, k, v, beta, gk))
    val_dim = v.shape[-1]
    out = torch.zeros_like(v)
    state = torch.zeros(batch, heads, key_dim, val_dim).to(v)
    q = q * (key_dim ** -0.5)

    if initial_state is not None:
        state = state + initial_state

    for t in range(seq_len):
        k_t = k[:, :, t]
        q_t = q[:, :, t]
        v_t = v[:, :, t]
        a_t = alpha[:, :, t].clone()
        b_t = beta[:, :, t].clone()
        # rank-1 write: k_t v_t^T, plus the low-rank correction from alpha/beta
        rank1 = k_t[..., None] * v_t[..., None, :]
        low_rank = (state.clone() * a_t[..., None]).sum(-2, keepdim=True) * b_t[..., None]
        # decay the old state, then add both increments
        state = state.clone() * gk[:, :, t].exp()[..., None] + (rank1 + low_rank)
        out[:, :, t] = torch.einsum('bhd,bhdm->bhm', q_t, state)

    final = state if output_final_state is not False else None
    return out.to(in_dtype), final
34
+
35
+
36
def dplr_chunkwise(q, k, v, alpha, beta, gk, initial_state=None, output_final_state=True, chunk_size=32):
    """Reference chunkwise (blocked) DPLR delta-rule computation.

    Mathematically equivalent to `dplr_recurrence`, but processes the sequence
    in chunks of `chunk_size`, carrying the state `S` across chunk boundaries.
    Inputs are `[B, H, L, D]`; `L` must be divisible by `chunk_size`.
    Returns `(o, S)` with `o` of shape `[B, H, L, D_V]`.
    """
    b, h, l, d_k = q.shape
    d_v = v.shape[-1]
    q = q * (d_k ** -0.5)
    v = v  # no-op kept from the original
    assert l % chunk_size == 0

    S = k.new_zeros(b, h, d_k, d_v).to(q)
    if initial_state is not None:
        S += initial_state

    # note that diagonal is masked.
    mask = torch.triu(torch.ones(chunk_size, chunk_size, dtype=torch.bool, device=q.device), diagonal=0)
    # Reshape every input into [B, H, num_chunks, chunk_size, D] in fp32.
    q, k, v, alpha, beta, gk = map(lambda x: rearrange(x, 'b h (n c) d -> b h n c d',
                                                       c=chunk_size).float(), [q, k, v, alpha, beta, gk])

    # Cumulative log-decay within each chunk.
    gk_cumsum = gk.cumsum(-2)

    # v2 = (alpha @ k.transpose(-1, -2)).masked_fill_(mask, 0) @ v
    A_ab = torch.zeros(b, h, l // chunk_size, chunk_size, chunk_size).to(q.device)
    A_qk = torch.zeros(b, h, l // chunk_size, chunk_size, chunk_size).to(q.device)
    A_ak = torch.zeros(b, h, l // chunk_size, chunk_size, chunk_size).to(q.device)
    A_qb = torch.zeros(b, h, l // chunk_size, chunk_size, chunk_size).to(q.device)

    # Build the four intra-chunk score matrices row by row, with the relative
    # decay attn_i = exp(gk_cumsum[i] - gk_cumsum[j]) applied to row i.
    for i in range(chunk_size):
        alpha_i = alpha[:, :, :, i, None]
        q_i = q[:, :, :, i, None]
        gk_i = gk_cumsum[:, :, :, i, None]
        # q-rows may see positions j <= i (diagonal included).
        mask = (torch.arange(chunk_size) <= i).to(q.device)
        attn_i = (gk_i - gk_cumsum).masked_fill(~mask.unsqueeze(-1), float('-inf')).exp()
        A_qk[:, :, :, i, :] = (q_i * k * attn_i).sum(-1).clone()
        A_qb[:, :, :, i, :] = (q_i * beta * attn_i).sum(-1).clone()
        # alpha-rows are strictly causal: positions j < i only.
        mask = (torch.arange(chunk_size) < i).to(q.device)
        # shift by one.
        attn_i = (gk_i - gk[:, :, :, i, None] - gk_cumsum).masked_fill(~mask.unsqueeze(-1), float('-inf')).exp()
        A_ab[:, :, :, i, :] = (alpha_i * beta * attn_i).sum(-1).clone()
        A_ak[:, :, :, i, :] = (alpha_i * k * attn_i).sum(-1).clone()

    A_ab = A_ab  # no-op kept from the original
    # Forward substitution: turn strictly-lower A_ab into (I - A_ab)^{-1} - I ...
    for i in range(1, chunk_size):
        A_ab[..., i, :i] = A_ab[..., i, :i].clone() + (A_ab[..., i, :, None].clone() * A_ab[..., :, :i].clone()).sum(-2)

    # ... then add I to get the full inverse used below.
    A_ab = A_ab + torch.eye(chunk_size, dtype=torch.float, device=q.device)
    u = A_ab @ (A_ak @ v)
    w = A_ab @ ((gk_cumsum-gk).exp() * alpha)

    o = torch.zeros_like(v)
    mask = torch.triu(torch.ones(chunk_size, chunk_size, dtype=torch.bool, device=q.device), diagonal=1)
    # Per-chunk pass: combine intra-chunk terms with the carried state S.
    for i in range(0, l // chunk_size):
        q_i, k_i, v_i, u_i, w_i, beta_i = q[:, :, i], k[:, :, i], v[:, :, i], u[:, :, i], w[:, :, i], beta[:, :, i]
        v2_i = u_i + w_i @ S  # pseudo-values for this chunk

        o_1 = A_qk[:, :, i] @ v_i          # intra-chunk q/k term
        o_2 = A_qb[:, :, i] @ v2_i         # intra-chunk q/beta term
        o_3 = (q_i * gk_cumsum[:, :, i].exp()) @ S  # inter-chunk (state) term
        o[:, :, i] = o_1 + o_2 + o_3
        # Decay from each position to the chunk end, then fold the chunk into S.
        decay = (gk_cumsum[:, :, i, -1, None] - gk_cumsum[:, :, i]).exp()
        S = S*gk_cumsum[:, :, i, -1, :, None].exp() + (k_i * decay).transpose(-1, -2) @ v_i + \
            (beta_i * decay).transpose(-1, -2) @ v2_i
    S = None if output_final_state is False else S
    return rearrange(o, 'b h n c d -> b h (n c) d'), S
build/lib/opencompass/tasks/fla2/ops/generalized_delta_rule/dplr/wy_fast_bwd.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ from typing import Optional, Tuple
5
+
6
+ import torch
7
+ import triton
8
+ import triton.language as tl
9
+
10
+ from ....ops.utils import prepare_chunk_indices
11
+ from ....utils import check_shared_mem, is_intel_alchemist, use_cuda_graph
12
+
13
+ # https://github.com/intel/intel-xpu-backend-for-triton/issues/3449
14
+ triton_config = {'grf_mode': 'large'} if is_intel_alchemist else {}
15
+
16
+
17
@triton.heuristics({
    'IS_VARLEN': lambda args: args['cu_seqlens'] is not None
})
@triton.autotune(
    configs=[
        triton.Config(triton_config, num_warps=num_warps, num_stages=num_stages)
        for num_warps in [2, 4, 8, 16]
        for num_stages in [2, 3, 4]
    ],
    key=['BT', 'BK', 'BV'],
    use_cuda_graph=use_cuda_graph,
)
@triton.jit(do_not_specialize=['T'])
def prepare_wy_repr_bwd_kernel(
    A_ab_inv,
    A_ak,
    ag,
    v,
    dw,
    du,
    dv,
    dv0,
    dag,
    dAak,
    dAab,
    cu_seqlens,
    chunk_indices,
    T,
    H: tl.constexpr,
    K: tl.constexpr,
    V: tl.constexpr,
    BT: tl.constexpr,
    BK: tl.constexpr,
    BV: tl.constexpr,
    IS_VARLEN: tl.constexpr,
):
    # Backward of the WY-representation forward: given dw and du, produce
    # dv, dag, and the score-matrix gradients dAak / dAab.
    # One program per (chunk i_t, batch*head i_bh); matrices are loaded
    # transposed (suffix _t) via the (1, H*BT) stride trick.
    i_t, i_bh = tl.program_id(0), tl.program_id(1)
    i_b, i_h = i_bh // H, i_bh % H
    if IS_VARLEN:
        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
        T = eos - bos
    else:
        bos, eos = i_b * T, i_b * T + T

    p_Aak_t = tl.make_block_ptr(A_ak + (bos*H + i_h) * BT, (BT, T), (1, H*BT), (0, i_t * BT), (BT, BT), (0, 1))
    p_Aab_inv_t = tl.make_block_ptr(A_ab_inv + (bos*H + i_h) * BT, (BT, T), (1, H*BT), (0, i_t * BT), (BT, BT), (0, 1))
    p_dAak = tl.make_block_ptr(dAak + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
    p_dAab = tl.make_block_ptr(dAab + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))

    # Re-apply the triangular masks: A_ak is strictly lower, A_ab_inv is lower
    # incl. diagonal; their transposes are (strictly) upper here.
    b_A_ab_inv_t = tl.load(p_Aab_inv_t, boundary_check=(0, 1))
    b_A_ak_t = tl.load(p_Aak_t, boundary_check=(0, 1))
    b_A_ak_t = tl.where(tl.arange(0, BT)[:, None] < tl.arange(0, BT)[None, :], b_A_ak_t, 0)
    b_A_ab_inv_t = tl.where(tl.arange(0, BT)[:, None] <= tl.arange(0, BT)[None, :], b_A_ab_inv_t, 0)
    b_A_tmp_t = tl.dot(b_A_ak_t, b_A_ab_inv_t).to(v.dtype.element_ty)
    b_dA_tmp = tl.zeros([BT, BT], dtype=tl.float32)

    # Accumulate du @ v^T over value tiles, and write dv = dv0 + (Aak Aab_inv)^T du.
    for i_v in range(tl.cdiv(V, BV)):
        p_v = tl.make_block_ptr(v + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
        p_dv = tl.make_block_ptr(dv + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
        p_dv0 = tl.make_block_ptr(dv0 + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
        p_du = tl.make_block_ptr(du + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
        b_v = tl.load(p_v, boundary_check=(0, 1))
        b_du = tl.load(p_du, boundary_check=(0, 1))
        b_dA_tmp += tl.dot(b_du.to(b_v.dtype), tl.trans(b_v))
        b_dv0 = tl.load(p_dv0, boundary_check=(0, 1))
        b_dv = b_dv0 + tl.dot(b_A_tmp_t, b_du)
        tl.store(p_dv, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1))

    # Strictly-lower mask for all score-matrix gradients.
    m_i = tl.arange(0, BT)[:, None] > tl.arange(0, BT)[None, :]
    b_dA_tmp = tl.where(m_i, b_dA_tmp, 0)
    b_dA_ak = tl.dot(b_A_ab_inv_t, b_dA_tmp)
    b_dA_ak = tl.where(m_i, b_dA_ak, 0)
    tl.store(p_dAak, b_dA_ak, boundary_check=(0, 1))
    b_dA_ab_inv = tl.dot(b_dA_tmp, b_A_ak_t)

    # Accumulate dw @ ag^T into dA_ab_inv and write dag = Aab_inv^T dw.
    for i_k in range(tl.cdiv(K, BK)):
        p_ag = tl.make_block_ptr(ag + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
        p_dag = tl.make_block_ptr(dag + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
        p_dw = tl.make_block_ptr(dw + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
        b_ag = tl.load(p_ag, boundary_check=(0, 1))
        b_dw = tl.load(p_dw, boundary_check=(0, 1))
        b_dA_ab_inv += tl.dot(b_dw, tl.trans(b_ag))
        b_dag = tl.dot(b_A_ab_inv_t.to(b_dw.dtype), b_dw)
        tl.store(p_dag, b_dag.to(p_dag.dtype.element_ty), boundary_check=(0, 1))

    # if we know dL/dA^(-1), for dL/dA, we can use the following formula:
    # dL/dA = -(A^(-1))^T @ (dL/dA^(-1)) @ (A^(-1))^T
    # in the fwd pass we use fwd substitution to calculate (I-lower(A_ab))^-1.
    # denote A = I - lower(A_ab), B = A^-1
    # in the backward pass.
    # dL/dA = -(B)^T @ (dL/dB) @ B^T
    # dL/dA_ab = lower(B^T @ dL/dB @ B^T)
    b_dA_ab_inv = tl.where(tl.arange(0, BT)[:, None] >= tl.arange(0, BT)[None, :], b_dA_ab_inv, 0)
    b_dA_ab_inv = tl.dot(b_A_ab_inv_t, b_dA_ab_inv)
    b_dA_ab_inv = tl.dot(b_dA_ab_inv, b_A_ab_inv_t)
    b_dA_ab_inv = tl.where(m_i, b_dA_ab_inv, 0)
    tl.store(p_dAab, b_dA_ab_inv, boundary_check=(0, 1))
115
+
116
+
117
def chunk_dplr_bwd_wy(
    A_ab_inv: torch.Tensor,
    A_ak: torch.Tensor,
    v: torch.Tensor,
    ag: torch.Tensor,
    dw: torch.Tensor,
    du: torch.Tensor,
    dv0: torch.Tensor,
    cu_seqlens: Optional[torch.LongTensor],
    chunk_size: int,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Backward of the WY representation: launch `prepare_wy_repr_bwd_kernel`.

    Given upstream gradients `dw`, `du` and the partial value gradient `dv0`,
    computes the score-matrix gradients `dA_ab` / `dA_ak` (fp32) together with
    `dv` and `dag`.

    Returns:
        `(dA_ab, dA_ak, dv, dag)`.
    """
    # The kernel assumes densely packed inputs.
    A_ab_inv = A_ab_inv.contiguous()
    A_ak = A_ak.contiguous()
    v = v.contiguous()
    ag = ag.contiguous()
    dw = dw.contiguous()
    du = du.contiguous()

    B, T, H, K = dw.shape
    V = du.shape[-1]
    # Chunk length: power of two, at least 16, capped by chunk_size.
    BT = min(chunk_size, max(triton.next_power_of_2(T), 16))

    if cu_seqlens is None:
        chunk_indices = None
        NT = triton.cdiv(T, BT)
    else:
        chunk_indices = prepare_chunk_indices(cu_seqlens, BT)
        NT = len(chunk_indices)
    BK = min(triton.next_power_of_2(K), 64)
    # Wider value tiles when enough shared memory is available.
    BV = min(triton.next_power_of_2(V), 64 if check_shared_mem() else 32)

    dA_ab = torch.empty_like(A_ab_inv, dtype=torch.float)
    dA_ak = torch.empty_like(A_ak, dtype=torch.float)
    dv = torch.empty_like(v)
    dag = torch.empty_like(ag)

    prepare_wy_repr_bwd_kernel[(NT, B * H)](
        A_ab_inv=A_ab_inv,
        A_ak=A_ak,
        ag=ag,
        v=v,
        dw=dw,
        du=du,
        dv=dv,
        dv0=dv0,
        dag=dag,
        dAak=dA_ak,
        dAab=dA_ab,
        cu_seqlens=cu_seqlens,
        chunk_indices=chunk_indices,
        T=T,
        H=H,
        K=K,
        V=V,
        BT=BT,
        BK=BK,
        BV=BV,
    )
    return dA_ab, dA_ak, dv, dag
build/lib/opencompass/tasks/fla2/ops/generalized_delta_rule/dplr/wy_fast_fwd.py ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ from typing import Optional, Tuple
5
+
6
+ import torch
7
+ import triton
8
+ import triton.language as tl
9
+
10
+ from ....ops.utils import prepare_chunk_indices
11
+ from ....ops.utils.op import gather
12
+ from ....utils import is_gather_supported, use_cuda_graph
13
+
14
+
15
@triton.heuristics({
    'IS_VARLEN': lambda args: args['cu_seqlens'] is not None
})
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=num_warps)
        for num_warps in [1, 2, 4, 8, 16]
    ],
    key=['BT'],
    use_cuda_graph=use_cuda_graph,
)
@triton.jit(do_not_specialize=['T'])
def prepare_wy_repr_fwd_kernel_chunk32(
    A_ab,
    A_ab_inv,
    cu_seqlens,
    chunk_indices,
    T,
    H: tl.constexpr,
    BT: tl.constexpr,
    BC: tl.constexpr,  # placeholder, do not delete
    IS_VARLEN: tl.constexpr,
):
    # Inverts (I - strict-lower(A_ab)) per chunk by forward substitution and
    # stores the result in A_ab_inv. Variant for chunk size <= 32 (single tile).
    i_t, i_bh = tl.program_id(0), tl.program_id(1)
    i_b, i_h = i_bh // H, i_bh % H
    if IS_VARLEN:
        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
        T = eos - bos
    else:
        bos, eos = i_b * T, i_b * T + T
    p_Aab = tl.make_block_ptr(A_ab + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
    p_Aab_inv = tl.make_block_ptr(A_ab_inv + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
    b_A_ab = tl.load(p_Aab, boundary_check=(0, 1))
    # Keep only the strictly-lower part of A_ab.
    b_A_ab = tl.where(tl.arange(0, BT)[:, None] > tl.arange(0, BT)[None, :], b_A_ab, 0)
    # Forward substitution: row i absorbs the already-processed rows < i.
    for i in range(1, BT):
        mask = tl.arange(0, BT) == i
        b_a = tl.sum(tl.where(mask[:, None], b_A_ab, 0), 0)
        b_a = b_a + tl.sum(b_a[:, None] * b_A_ab, 0) * (tl.arange(0, BT) < i)
        b_A_ab = tl.where(mask[:, None], b_a, b_A_ab)
    # Add the identity so the stored matrix is the full inverse.
    b_A_ab += tl.arange(0, BT)[:, None] == tl.arange(0, BT)[None, :]
    tl.store(p_Aab_inv, b_A_ab.to(p_Aab_inv.dtype.element_ty), boundary_check=(0, 1))
57
+
58
+
59
@triton.heuristics({
    'IS_VARLEN': lambda args: args['cu_seqlens'] is not None
})
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=num_warps, num_stages=num_stages)
        for num_warps in [2, 4, 8]
        for num_stages in [2, 3, 4]
    ],
    key=['BC'],
    use_cuda_graph=use_cuda_graph,
)
@triton.jit(do_not_specialize=['T'])
def prepare_wy_repr_fwd_kernel_chunk64(
    A_ab,
    A_ab_inv,
    cu_seqlens,
    chunk_indices,
    T,
    H: tl.constexpr,
    BT: tl.constexpr,
    BC: tl.constexpr,
    IS_VARLEN: tl.constexpr,
    GATHER_SUPPORTED: tl.constexpr = is_gather_supported
):
    # Same job as the chunk32 variant, but for BT = 2*BC: the chunk is split
    # into a 2x2 block-triangular layout [A11, 0; A21, A22], each diagonal
    # block inverted by forward substitution and the off-diagonal block
    # completed via A22^-1 @ A21 @ A11^-1 (sign handled by the masking
    # convention of the caller).
    i_t, i_bh = tl.program_id(0), tl.program_id(1)
    i_b, i_h = i_bh // H, i_bh % H
    if IS_VARLEN:
        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
        T = eos - bos
    else:
        bos, eos = i_b * T, i_b * T + T

    # A1 = top-left (A11), A2 = bottom-right (A22), A3 = bottom-left (A21).
    p_A1 = tl.make_block_ptr(A_ab + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, 0), (BC, BC), (1, 0))
    p_A2 = tl.make_block_ptr(A_ab + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT + BC, BC), (BC, BC), (1, 0))
    p_A3 = tl.make_block_ptr(A_ab + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT + BC, 0), (BC, BC), (1, 0))
    p_A_inv1 = tl.make_block_ptr(A_ab_inv + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, 0), (BC, BC), (1, 0))
    p_A_inv2 = tl.make_block_ptr(A_ab_inv + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT + BC, BC), (BC, BC), (1, 0))
    p_A_inv3 = tl.make_block_ptr(A_ab_inv + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT + BC, 0), (BC, BC), (1, 0))
    p_A_inv4 = tl.make_block_ptr(A_ab_inv + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, BC), (BC, BC), (1, 0))

    b_A = tl.load(p_A1, boundary_check=(0, 1))
    b_A2 = tl.load(p_A2, boundary_check=(0, 1))
    b_A3 = tl.load(p_A3, boundary_check=(0, 1))
    # Keep only the strictly-lower parts of the diagonal blocks.
    b_A = tl.where(tl.arange(0, BC)[:, None] > tl.arange(0, BC)[None, :], b_A, 0)
    b_A2 = tl.where(tl.arange(0, BC)[:, None] > tl.arange(0, BC)[None, :], b_A2, 0)

    # Forward substitution on both diagonal blocks simultaneously; `gather`
    # extracts row i without a full masked reduction where supported.
    for i in range(1, BC):
        if GATHER_SUPPORTED:
            row_idx = tl.full([1, BC], i, dtype=tl.int16)
            # [1, BK] -> [BK]
            b_a = tl.sum(gather(b_A, row_idx, axis=0), 0)
            b_a2 = tl.sum(gather(b_A2, row_idx, axis=0), 0)
        else:
            mask = tl.arange(0, BC) == i
            b_a = tl.sum(tl.where(mask[:, None], b_A, 0), 0)
            b_a2 = tl.sum(tl.where(mask[:, None], b_A2, 0), 0)
        mask = tl.arange(0, BC) == i
        # b_a = tl.sum(tl.where(mask[:, None], b_A, 0), 0)
        # b_a2 = tl.sum(tl.where(mask[:, None], b_A2, 0), 0)
        b_a = b_a + tl.sum(b_a[:, None] * b_A, 0) * (tl.arange(0, BC) < i)
        b_a2 = b_a2 + tl.sum(b_a2[:, None] * b_A2, 0) * (tl.arange(0, BC) < i)
        b_A = tl.where(mask[:, None], b_a, b_A)
        b_A2 = tl.where(mask[:, None], b_a2, b_A2)

    # blockwise computation of lower triangular matrix's inverse
    # i.e., [A11, 0; A21, A22]^-1 = [A11^-1, 0; -A22^-1 A21 A11^-1, A22^-1]
    b_A += tl.arange(0, BC)[:, None] == tl.arange(0, BC)[None, :]
    b_A2 += tl.arange(0, BC)[:, None] == tl.arange(0, BC)[None, :]
    b_A3 = tl.dot(tl.dot(b_A2, b_A3), b_A)
    # tl.debug_barrier()
    tl.store(p_A_inv1, b_A.to(p_A_inv1.dtype.element_ty, fp_downcast_rounding="rtne"), boundary_check=(0, 1))
    tl.store(p_A_inv2, b_A2.to(p_A_inv2.dtype.element_ty, fp_downcast_rounding="rtne"), boundary_check=(0, 1))
    tl.store(p_A_inv3, b_A3.to(p_A_inv3.dtype.element_ty, fp_downcast_rounding="rtne"), boundary_check=(0, 1))
    # causal mask
    tl.store(p_A_inv4, tl.zeros([BC, BC], dtype=tl.float32).to(p_A_inv4.dtype.element_ty), boundary_check=(0, 1))
136
+
137
+
138
# Compute the W/U auxiliary tensors of the WY representation for one
# (chunk, batch*head) program:
#   w[chunk] = A_ab_inv @ ag                (tiled along K)
#   u[chunk] = (A_ab_inv @ A_ak) @ v        (tiled along V)
# Grid: (num_chunks, B * H).
@triton.heuristics({
    'IS_VARLEN': lambda args: args['cu_seqlens'] is not None
})
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=num_warps, num_stages=num_stages)
        for num_warps in [2, 4, 8, 16]
        for num_stages in [2, 3, 4]
    ],
    key=['H', 'K', 'V', 'BT', 'BK', 'BV', 'IS_VARLEN'],
    use_cuda_graph=use_cuda_graph,
)
@triton.jit(do_not_specialize=['T'])
def wu_fwd_kernel(
    w,  # out: same layout as ag, [B, T, H, K]
    u,  # out: same layout as v, [B, T, H, V]
    ag,
    v,
    A_ab_inv,  # per-chunk lower-triangular inverse blocks, [B, T, H, BT]
    A_ak,  # per-chunk strictly-lower-triangular blocks, [B, T, H, BT]
    cu_seqlens,  # varlen cumulative sequence lengths (or None)
    chunk_indices,  # flat chunk id -> (sequence id, chunk-within-sequence)
    T,
    H: tl.constexpr,
    K: tl.constexpr,
    V: tl.constexpr,
    BT: tl.constexpr,  # chunk (time-block) size
    BK: tl.constexpr,
    BV: tl.constexpr,
    IS_VARLEN: tl.constexpr,
):
    i_t, i_bh = tl.program_id(0), tl.program_id(1)
    i_b, i_h = i_bh // H, i_bh % H
    if IS_VARLEN:
        # remap the global chunk id to (sequence, local chunk) and clip T
        # to the length of this sequence
        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
        T = eos - bos
    else:
        bos, eos = i_b * T, i_b * T + T
    o_s = tl.arange(0, BT)

    p_A_ab_inv = tl.make_block_ptr(A_ab_inv + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
    p_A_ak = tl.make_block_ptr(A_ak + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))

    b_Aab_inv = tl.load(p_A_ab_inv, boundary_check=(0, 1))
    b_Aak = tl.load(p_A_ak, boundary_check=(0, 1))
    # enforce causal structure: A_ab_inv is lower triangular including the
    # diagonal, A_ak strictly below it
    b_Aab_inv = tl.where(o_s[:, None] >= o_s[None, :], b_Aab_inv, 0)
    b_Aak = tl.where(o_s[:, None] > o_s[None, :], b_Aak, 0)
    # let's use tf32 here
    b_Aak = tl.dot(b_Aab_inv, b_Aak)
    # (SY 01/04) should be bf16 or tf32? To verify.
    b_Aak = b_Aak.to(v.dtype.element_ty, fp_downcast_rounding="rtne")
    b_Aab_inv = b_Aab_inv.to(ag.dtype.element_ty, fp_downcast_rounding="rtne")

    # w = A_ab_inv @ ag, one BK-wide tile of the K dimension at a time
    for i_k in range(tl.cdiv(K, BK)):
        p_ag = tl.make_block_ptr(ag + (bos*H + i_h) * K, (T, K), (H*K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
        p_w = tl.make_block_ptr(w + (bos*H + i_h) * K, (T, K), (H*K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
        b_ag = tl.load(p_ag, boundary_check=(0, 1))
        b_w = tl.dot(b_Aab_inv, b_ag)  # both bf16 or fp16
        tl.store(p_w, b_w.to(p_w.dtype.element_ty, fp_downcast_rounding="rtne"), boundary_check=(0, 1))

    # u = (A_ab_inv @ A_ak) @ v, one BV-wide tile of the V dimension at a time
    for i_v in range(tl.cdiv(V, BV)):
        p_v = tl.make_block_ptr(v + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
        p_u = tl.make_block_ptr(u + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
        b_v = tl.load(p_v, boundary_check=(0, 1))
        b_u = tl.dot(b_Aak, b_v)  # both bf16 or fp16
        tl.store(p_u, b_u.to(p_u.dtype.element_ty, fp_downcast_rounding="rtne"), boundary_check=(0, 1))
205
+
206
+
207
def wu_fwd(
    ag: torch.Tensor,
    v: torch.Tensor,
    A_ak: torch.Tensor,
    A_ab_inv: torch.Tensor,
    cu_seqlens: Optional[torch.LongTensor],
    chunk_size: int
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Launch `wu_fwd_kernel` to build the (w, u) tensors of the WY representation.

    Args:
        ag: `[B, T, H, K]` input.
        v: `[B, T, H, V]` values.
        A_ak: per-chunk `[B, T, H, BT]` blocks.
        A_ab_inv: per-chunk `[B, T, H, BT]` inverse blocks.
        cu_seqlens: optional `[N+1]` cumulative lengths for varlen batches.
        chunk_size: requested time-chunk length (clamped to a power of two >= 16).

    Returns:
        `(w, u)` with the same shapes/dtypes as `ag` and `v` respectively.
    """
    B, T, H, K = ag.shape
    V = v.shape[-1]
    BT = min(chunk_size, max(triton.next_power_of_2(T), 16))

    if cu_seqlens is None:
        chunk_indices = None
        NT = triton.cdiv(T, BT)
    else:
        chunk_indices = prepare_chunk_indices(cu_seqlens, BT)
        NT = len(chunk_indices)
    BK = min(triton.next_power_of_2(K), 64)
    BV = min(triton.next_power_of_2(V), 64)

    w = torch.empty_like(ag)
    u = torch.empty_like(v)
    grid = (NT, B * H)
    wu_fwd_kernel[grid](
        w=w,
        u=u,
        ag=ag,
        v=v,
        A_ab_inv=A_ab_inv,
        A_ak=A_ak,
        cu_seqlens=cu_seqlens,
        chunk_indices=chunk_indices,
        T=T,
        H=H,
        K=K,
        V=V,
        BT=BT,
        BK=BK,
        BV=BV,
    )
    return w, u
243
+
244
+
245
def prepare_wy_repr_fwd(
    ag: torch.Tensor,
    v: torch.Tensor,
    A_ak: torch.Tensor,
    A_ab: torch.Tensor,
    cu_seqlens: Optional[torch.LongTensor],
    chunk_size: int = 64
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Forward pass of the WY representation.

    First inverts the per-chunk lower-triangular `A_ab` blocks on device,
    then derives `(w, u)` from the inverse via `wu_fwd`.

    Returns:
        `(w, u, A_ab_inv)`.
    """
    B, T, H = ag.shape[:3]
    BT = min(chunk_size, max(triton.next_power_of_2(T), 16))

    if cu_seqlens is None:
        chunk_indices = None
        NT = triton.cdiv(T, BT)
    else:
        chunk_indices = prepare_chunk_indices(cu_seqlens, BT)
        NT = len(chunk_indices)
    BC = min(BT, 32)
    # pick the inversion kernel matching the chunk length
    if BT == 64:
        fwd_fn = prepare_wy_repr_fwd_kernel_chunk64
    else:
        fwd_fn = prepare_wy_repr_fwd_kernel_chunk32

    A_ab_inv = torch.empty_like(A_ab)
    fwd_fn[(NT, B * H)](
        A_ab=A_ab,
        A_ab_inv=A_ab_inv,
        cu_seqlens=cu_seqlens,
        chunk_indices=chunk_indices,
        T=T,
        H=H,
        BT=BT,
        BC=BC,
    )
    w, u = wu_fwd(
        ag=ag,
        v=v,
        A_ak=A_ak,
        A_ab_inv=A_ab_inv,
        cu_seqlens=cu_seqlens,
        chunk_size=BT,
    )
    return w, u, A_ab_inv
280
+
281
+
282
# Backward-compatibility aliases for the legacy public names.
fwd_prepare_wy_repr = prepare_wy_repr_fwd

fwd_wu = wu_fwd
build/lib/opencompass/tasks/fla2/ops/generalized_delta_rule/iplr/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from .chunk import chunk_iplr_delta_rule
2
+ from .fused_recurrent import fused_recurrent_iplr_delta_rule
3
+
4
+ __all__ = [
5
+ 'chunk_iplr_delta_rule',
6
+ 'fused_recurrent_iplr_delta_rule'
7
+ ]
build/lib/opencompass/tasks/fla2/ops/generalized_delta_rule/iplr/chunk.py ADDED
@@ -0,0 +1,500 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ import warnings
5
+ from typing import Optional, Tuple
6
+
7
+ import torch
8
+ import triton
9
+ import triton.language as tl
10
+ from einops import rearrange
11
+
12
+ from ....ops.generalized_delta_rule.iplr.wy_fast import prepare_wy_repr_fwd
13
+ from ....ops.utils import prepare_chunk_indices, prepare_chunk_offsets
14
+ from ....utils import autocast_custom_bwd, autocast_custom_fwd, check_shared_mem, input_guard, use_cuda_graph
15
+
16
+ BKV_LIST = [64, 128] if check_shared_mem() else [32, 64]
17
+
18
+
19
# Recurrently materialize the per-chunk hidden state h and the updated
# values v_new for the generalized IPLR delta rule.
# Per chunk: v_new = d @ h + u, then h += k^T @ v + b^T @ v_new.
# Grid: (NK, NV, N * H); each program owns one [BK, BV] tile of the state.
@triton.heuristics({
    'USE_INITIAL_STATE': lambda args: args['h0'] is not None,
    'STORE_FINAL_STATE': lambda args: args['ht'] is not None,
    'IS_VARLEN': lambda args: args['cu_seqlens'] is not None,
})
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=num_warps)
        for num_warps in [2, 4, 8, 16]
    ],
    key=['BT', 'BK', 'BV'],
    use_cuda_graph=use_cuda_graph,
)
@triton.jit(do_not_specialize=['T'])
def chunk_generalized_iplr_delta_rule_fwd_kernel_h(
    k,  # keys [B, T, H, K]
    v,  # values [B, T, H, V]
    d,  # w tensor of the WY representation [B, T, H, K]
    b,  # b tensor [B, T, H, K]
    u,  # u tensor of the WY representation [B, T, H, V]
    v_new,  # out: updated values [B, T, H, V]
    h,  # out: per-chunk states [B, NT, H, K, V]
    h0,  # optional initial state [N, H, K, V]
    ht,  # optional final state [N, H, K, V]
    cu_seqlens,
    chunk_offsets,
    T,
    H: tl.constexpr,
    K: tl.constexpr,
    V: tl.constexpr,
    BT: tl.constexpr,
    BC: tl.constexpr,  # sub-chunk size (SRAM-saving inner tiling)
    BK: tl.constexpr,
    BV: tl.constexpr,
    USE_INITIAL_STATE: tl.constexpr,
    STORE_FINAL_STATE: tl.constexpr,
    IS_VARLEN: tl.constexpr,
):
    i_k, i_v, i_nh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
    i_n, i_h = i_nh // H, i_nh % H
    if IS_VARLEN:
        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
        T = eos - bos
        NT = tl.cdiv(T, BT)
        boh = tl.load(chunk_offsets + i_n).to(tl.int32)
    else:
        bos, eos = i_n * T, i_n * T + T
        NT = tl.cdiv(T, BT)
        boh = i_n * NT

    # [BK, BV]
    b_h = tl.zeros([BK, BV], dtype=tl.float32)
    if USE_INITIAL_STATE:
        p_h0 = tl.make_block_ptr(h0 + i_nh * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
        b_h = tl.load(p_h0, boundary_check=(0, 1)).to(tl.float32)

    for i_t in range(NT):
        # store the state as seen at the *start* of chunk i_t
        p_h = tl.make_block_ptr(h + ((boh + i_t) * H + i_h) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
        tl.store(p_h, b_h.to(p_h.dtype.element_ty), boundary_check=(0, 1))
        b_hc = tl.zeros([BK, BV], dtype=tl.float32)
        # keeping the whole K dim in SRAM is a severe memory burden;
        # sub-chunking with BC alleviates it
        for i_c in range(tl.cdiv(min(BT, T - i_t * BT), BC)):
            p_k = tl.make_block_ptr(k+(bos*H+i_h)*K, (K, T), (1, H*K), (i_k * BK, i_t * BT + i_c * BC), (BK, BC), (0, 1))
            p_b = tl.make_block_ptr(b+(bos*H+i_h)*K, (K, T), (1, H*K), (i_k * BK, i_t * BT + i_c * BC), (BK, BC), (0, 1))
            p_d = tl.make_block_ptr(d+(bos*H+i_h)*K, (T, K), (H*K, 1), (i_t * BT + i_c * BC, i_k * BK), (BC, BK), (1, 0))
            p_v = tl.make_block_ptr(v+(bos*H+i_h)*V, (T, V), (H*V, 1), (i_t * BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
            p_u = tl.make_block_ptr(u+(bos*H+i_h)*V, (T, V), (H*V, 1), (i_t * BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
            p_v_new = tl.make_block_ptr(v_new+(bos*H+i_h)*V, (T, V), (H*V, 1), (i_t*BT+i_c*BC, i_v * BV), (BC, BV), (1, 0))
            # [BK, BC]
            b_k = tl.load(p_k, boundary_check=(0, 1))
            b_v = tl.load(p_v, boundary_check=(0, 1))
            b_d = tl.load(p_d, boundary_check=(0, 1))
            b_b = tl.load(p_b, boundary_check=(0, 1))
            # v_new = d @ h + u; the state delta accumulates in b_hc so the
            # sub-chunks within this chunk all read the same (stale) b_h
            b_v2 = tl.dot(b_d, b_h.to(b_d.dtype)) + tl.load(p_u, boundary_check=(0, 1))
            b_hc += tl.dot(b_k, b_v)
            b_hc += tl.dot(b_b, b_v2.to(b_k.dtype))
            tl.store(p_v_new, b_v2.to(p_v_new.dtype.element_ty), boundary_check=(0, 1))
        b_h += b_hc

    if STORE_FINAL_STATE:
        p_ht = tl.make_block_ptr(ht + i_nh * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
        tl.store(p_ht, b_h.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
101
+
102
+
103
# Compute the output o for one (V-tile, chunk, batch*head) program:
#   o = (q @ h + causal(q @ k^T) @ v + causal(q @ b^T) @ u) * scale
# where h is the state at the start of the chunk.
@triton.heuristics({
    'IS_VARLEN': lambda args: args['cu_seqlens'] is not None,
})
@triton.autotune(
    configs=[
        triton.Config({'BK': BK, 'BV': BV}, num_warps=num_warps, num_stages=num_stages)
        for BK in BKV_LIST
        for BV in BKV_LIST
        for num_warps in [2, 4, 8]
        for num_stages in [2, 3]
    ],
    key=['BT'],
    use_cuda_graph=use_cuda_graph,
)
@triton.jit(do_not_specialize=['T'])
def chunk_generalized_iplr_delta_rule_fwd_kernel_o(
    q,  # queries [B, T, H, K]
    k,  # keys [B, T, H, K]
    v,  # values [B, T, H, V]
    u,  # updated values (v_new) [B, T, H, V]
    b,  # b tensor [B, T, H, K]
    h,  # per-chunk states [B, NT, H, K, V]
    o,  # out: outputs [B, T, H, V]
    cu_seqlens,
    chunk_indices,
    scale,  # query scaling factor
    T,
    H: tl.constexpr,
    K: tl.constexpr,
    V: tl.constexpr,
    BT: tl.constexpr,
    BK: tl.constexpr,
    BV: tl.constexpr,
    IS_VARLEN: tl.constexpr,
):
    i_v, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
    i_b, i_h = i_bh // H, i_bh % H

    if IS_VARLEN:
        i_tg = i_t
        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
        T = eos - bos
        NT = tl.cdiv(T, BT)
    else:
        NT = tl.cdiv(T, BT)
        i_tg = i_b * NT + i_t
        bos, eos = i_b * T, i_b * T + T

    # offset calculation
    q += (bos * H + i_h) * K
    k += (bos * H + i_h) * K
    b += (bos * H + i_h) * K
    v += (bos * H + i_h) * V
    u += (bos * H + i_h) * V
    o += (bos * H + i_h) * V
    h += (i_tg * H + i_h) * K * V
    stride_qk = H*K
    stride_vo = H*V

    b_o = tl.zeros([BT, BV], dtype=tl.float32)
    b_Aqk = tl.zeros([BT, BT], dtype=tl.float32)
    b_Aqb = tl.zeros([BT, BT], dtype=tl.float32)

    # accumulate the inter-chunk term (q @ h) and the intra-chunk score
    # matrices, one BK-wide slice of K at a time
    for i_k in range(tl.cdiv(K, BK)):
        p_q = tl.make_block_ptr(q, (T, K), (stride_qk, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
        p_k = tl.make_block_ptr(k, (K, T), (1, stride_qk), (i_k * BK, i_t * BT), (BK, BT), (0, 1))
        p_h = tl.make_block_ptr(h, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
        p_b = tl.make_block_ptr(b, (K, T), (1, stride_qk), (i_k * BK, i_t * BT), (BK, BT), (0, 1))
        # [BT, BK]
        b_q = tl.load(p_q, boundary_check=(0, 1))
        # [BK, BT]
        b_k = tl.load(p_k, boundary_check=(0, 1))
        b_b = tl.load(p_b, boundary_check=(0, 1))
        # [BK, BV]
        b_h = tl.load(p_h, boundary_check=(0, 1))
        # [BT, BK] @ [BK, BV] -> [BT, BV]
        b_o += tl.dot(b_q, b_h)
        # [BT, BK] @ [BK, BT] -> [BT, BT]
        b_Aqk += tl.dot(b_q, b_k)
        # [BT, BK] @ [BK, BT] -> [BT, BT]
        b_Aqb += tl.dot(b_q, b_b)

    # causal mask within the chunk (inclusive of the diagonal)
    o_i = tl.arange(0, BT)
    m_A = o_i[:, None] >= o_i[None, :]
    b_Aqk = tl.where(m_A, b_Aqk, 0)
    b_Aqb = tl.where(m_A, b_Aqb, 0)

    p_v = tl.make_block_ptr(v, (T, V), (stride_vo, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
    p_u = tl.make_block_ptr(u, (T, V), (stride_vo, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
    p_o = tl.make_block_ptr(o, (T, V), (stride_vo, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
    b_v = tl.load(p_v, boundary_check=(0, 1))
    b_u = tl.load(p_u, boundary_check=(0, 1))
    b_o = (b_o + tl.dot(b_Aqk.to(b_v.dtype), b_v) + tl.dot(b_Aqb.to(b_u.dtype), b_u)) * scale
    tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1))
198
+
199
+
200
def chunk_generalized_iplr_delta_rule_fwd_o(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    v_new: torch.Tensor,
    b: torch.Tensor,
    h: torch.Tensor,
    scale: Optional[float] = None,
    cu_seqlens: Optional[torch.LongTensor] = None,
    chunk_size: int = 64
) -> torch.Tensor:
    """Launch the output kernel and return `o` (same shape/dtype as `v`).

    `scale` defaults to `K ** -0.5` when not given.
    """
    B, T, H, K = q.shape
    V = v.shape[-1]
    if scale is None:
        scale = k.shape[-1] ** -0.5
    BT = min(chunk_size, max(16, triton.next_power_of_2(T)))

    if cu_seqlens is None:
        chunk_indices = None
        NT = triton.cdiv(T, BT)
    else:
        chunk_indices = prepare_chunk_indices(cu_seqlens, BT)
        NT = len(chunk_indices)

    o = torch.empty_like(v)

    # BV is chosen by the autotuner, so the V-axis grid size depends on meta
    def grid(meta):
        return (triton.cdiv(V, meta['BV']), NT, B * H)

    chunk_generalized_iplr_delta_rule_fwd_kernel_o[grid](
        q=q,
        k=k,
        v=v,
        u=v_new,
        b=b,
        h=h,
        o=o,
        cu_seqlens=cu_seqlens,
        chunk_indices=chunk_indices,
        scale=scale,
        T=T,
        H=H,
        K=K,
        V=V,
        BT=BT,
    )
    return o
244
+
245
+
246
def chunk_generalized_iplr_delta_rule_fwd_h(
    k: torch.Tensor,
    v: torch.Tensor,
    w: torch.Tensor,
    u: torch.Tensor,
    b: torch.Tensor,
    initial_state: Optional[torch.Tensor] = None,
    output_final_state: bool = False,
    cu_seqlens: Optional[torch.LongTensor] = None,
    chunk_size: int = 64
) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
    """Materialize per-chunk hidden states for the generalized IPLR delta rule.

    Args:
        k, v: keys `[B, T, H, K]` and values `[B, T, H, V]`.
        w, u: WY-representation tensors from `prepare_wy_repr_fwd`.
        b: `[B, T, H, K]` tensor of the IPLR parameterization.
        initial_state: optional `[N, H, K, V]` initial state.
        output_final_state: whether to also return the final state.
        cu_seqlens: optional `[N+1]` cumulative lengths for varlen batches.
        chunk_size: requested time-chunk length.

    Returns:
        `(h, v_new, final_state)` where `h` is `[B, NT, H, K, V]`,
        `v_new` matches `u`, and `final_state` is `[N, H, K, V]` in fp32
        (or `None` when `output_final_state=False`).

    Note:
        The original return annotation declared two tensors although the
        function has always returned three values; fixed to match callers.
    """
    B, T, H, K, V = *k.shape, u.shape[-1]
    BT = min(chunk_size, max(triton.next_power_of_2(T), 16))

    chunk_indices = prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
    # N: the actual number of sequences in the batch with either equal or variable lengths
    if cu_seqlens is None:
        N, NT, chunk_offsets = B, triton.cdiv(T, BT), None
    else:
        N, NT, chunk_offsets = len(cu_seqlens) - 1, len(chunk_indices), prepare_chunk_offsets(cu_seqlens, BT)

    BK = triton.next_power_of_2(K)
    assert BK <= 256, "current kernel does not support head dimension larger than 256."
    # H100 can have larger block size

    if check_shared_mem('hopper', k.device.index):
        BV = 64
        BC = 64 if K <= 128 else 32
    elif check_shared_mem('ampere', k.device.index):  # A100
        BV = 32
        BC = 32
    else:
        BV = 16
        BC = 16

    BC = min(BT, BC)
    NK = triton.cdiv(K, BK)
    NV = triton.cdiv(V, BV)

    assert NK == 1, 'NK > 1 is not supported because it involves time-consuming synchronization'

    h = k.new_empty(B, NT, H, K, V)
    final_state = k.new_empty(N, H, K, V, dtype=torch.float32) if output_final_state else None

    v_new = torch.empty_like(u)
    grid = (NK, NV, N * H)

    chunk_generalized_iplr_delta_rule_fwd_kernel_h[grid](
        k=k,
        v=v,
        d=w,
        b=b,
        u=u,
        v_new=v_new,
        h=h,
        h0=initial_state,
        ht=final_state,
        cu_seqlens=cu_seqlens,
        chunk_offsets=chunk_offsets,
        T=T,
        H=H,
        K=K,
        V=V,
        BT=BT,
        BC=BC,
        BK=BK,
        BV=BV,
    )
    return h, v_new, final_state
315
+
316
+
317
def chunk_generalized_iplr_delta_rule_fwd(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    a: torch.Tensor,
    b: torch.Tensor,
    scale: float,
    initial_state: torch.Tensor,
    output_final_state: bool,
    cu_seqlens: Optional[torch.LongTensor] = None,
    chunk_size: int = 64
):
    """Full forward pipeline: WY representation -> chunk states -> outputs.

    Returns `(o, final_state)`.
    """
    BT = min(chunk_size, max(triton.next_power_of_2(q.shape[1]), 16))

    # stage 1: derive the WY representation (w, u) from (a, b, k, v)
    w, u, _ = prepare_wy_repr_fwd(
        a=a,
        b=b,
        k=k,
        v=v,
        cu_seqlens=cu_seqlens,
        chunk_size=BT,
    )

    # stage 2: scan chunks to build per-chunk states and updated values
    h, v_new, final_state = chunk_generalized_iplr_delta_rule_fwd_h(
        k=k,
        v=v,
        b=b,
        w=w,
        u=u,
        initial_state=initial_state,
        output_final_state=output_final_state,
        cu_seqlens=cu_seqlens,
        chunk_size=BT,
    )

    # stage 3: combine inter- and intra-chunk contributions into the output
    o = chunk_generalized_iplr_delta_rule_fwd_o(
        q=q,
        k=k,
        v=v,
        v_new=v_new,
        b=b,
        h=h,
        scale=scale,
        cu_seqlens=cu_seqlens,
        chunk_size=BT,
    )
    return o, final_state
363
+
364
+
365
class ChunkGeneralizedIPLRDeltaRuleFunction(torch.autograd.Function):
    """Autograd entry point for the chunked generalized IPLR delta rule.

    Only the forward pass is implemented; invoking backward raises
    ``NotImplementedError``.
    """

    @staticmethod
    @input_guard
    @autocast_custom_fwd
    def forward(
        ctx,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        a: torch.Tensor,
        b: torch.Tensor,
        scale: float,
        initial_state: torch.Tensor,
        output_final_state: bool,
        cu_seqlens: Optional[torch.LongTensor] = None,
    ):
        # fixed chunk length shared by all forward kernels
        chunk_size = 64

        o, final_state = chunk_generalized_iplr_delta_rule_fwd(
            q=q,
            k=k,
            v=v,
            a=a,
            b=b,
            scale=scale,
            initial_state=initial_state,
            output_final_state=output_final_state,
            cu_seqlens=cu_seqlens,
            chunk_size=chunk_size
        )
        # cast back to the input dtype (kernels may accumulate in fp32)
        return o.to(q.dtype), final_state

    @staticmethod
    @input_guard
    @autocast_custom_bwd
    def backward(
        ctx,
        do: torch.Tensor,
        dht: torch.Tensor
    ):
        # forward-only op: gradients are intentionally unimplemented
        raise NotImplementedError(
            "Backward pass for ChunkGeneralizedIPLRDeltaRuleFunction is not implemented yet. "
            "Stay tuned!"
        )
410
+
411
+
412
@torch.compiler.disable
def chunk_iplr_delta_rule(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    a: torch.Tensor,
    b: torch.Tensor,
    scale: float = None,
    initial_state: torch.Tensor = None,
    output_final_state: bool = False,
    cu_seqlens: Optional[torch.LongTensor] = None,
    head_first: bool = False
):
    r"""
    Args:
        q (torch.Tensor):
            queries of shape `[B, T, H, K]` if `head_first=False` else `[B, H, T, K]`.
        k (torch.Tensor):
            keys of shape `[B, T, H, K]` if `head_first=False` else `[B, H, T, K]`.
        v (torch.Tensor):
            values of shape `[B, T, H, V]` if `head_first=False` else `[B, H, T, V]`.
        a (torch.Tensor):
            activations of shape `[B, T, H, K]` if `head_first=False` else `[B, H, T, K]`.
        b (torch.Tensor):
            betas of shape `[B, T, H, K]` if `head_first=False` else `[B, H, T, K]`.
        scale (Optional[int]):
            Scale factor for the RetNet attention scores.
            If not provided, it will default to `1 / sqrt(K)`. Default: `None`.
        initial_state (Optional[torch.Tensor]):
            Initial state of shape `[N, H, K, V]` for `N` input sequences.
            For equal-length input sequences, `N` equals the batch size `B`.
            Default: `None`.
        output_final_state (Optional[bool]):
            Whether to output the final state of shape `[N, H, K, V]`. Default: `False`.
        cu_seqlens (torch.LongTensor):
            Cumulative sequence lengths of shape `[N+1]` used for variable-length training,
            consistent with the FlashAttention API.
        head_first (Optional[bool]):
            Whether the inputs are in the head-first format, which is not supported for variable-length inputs.
            Default: `False`.

    Returns:
        o (torch.Tensor):
            Outputs of shape `[B, T, H, V]` if `head_first=False` else `[B, H, T, V]`.
        final_state (torch.Tensor):
            Final state of shape `[N, H, K, V]` if `output_final_state=True` else `None`.
    """
    assert q.dtype == k.dtype == v.dtype
    assert q.dtype != torch.float32, "ChunkDeltaRuleFunction does not support float32. Please use bfloat16."

    if head_first:
        raise DeprecationWarning(
            "head_first is deprecated and will be removed in a future version. "
            "Please use head_first=False for now instead."
        )
        # NOTE(review): unreachable after the raise above; kept for parity
        # with the upstream source.
        q, k, v, a, b = map(lambda x: rearrange(x, 'b h t ... -> b t h ...'), (q, k, v, a, b))
    if not head_first and q.shape[1] < q.shape[2]:
        warnings.warn(
            f"Input tensor shape suggests potential format mismatch: seq_len ({q.shape[1]}) < num_heads ({q.shape[2]}). "
            "This may indicate the inputs were passed in head-first format [B, H, T, ...] "
            "when head_first=False was specified. "
            "Please verify your input tensor format matches the expected shape [B, T, H, ...]."
        )
    if cu_seqlens is not None:
        if q.shape[0] != 1:
            # fixed: the second half of this message was garbled
            # ("...tten") in the original; it should read "flatten".
            raise ValueError(
                f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`."
                f"Please flatten variable-length inputs before processing."
            )
        if initial_state is not None and initial_state.shape[0] != len(cu_seqlens) - 1:
            raise ValueError(
                f"The number of initial states is expected to be equal to the number of input sequences, "
                f"i.e., {len(cu_seqlens) - 1} rather than {initial_state.shape[0]}."
            )
    scale = k.shape[-1] ** -0.5 if scale is None else scale
    o, final_state = ChunkGeneralizedIPLRDeltaRuleFunction.apply(
        q,
        k,
        v,
        a,
        b,
        scale,
        initial_state,
        output_final_state,
        cu_seqlens,
    )
    if head_first:
        o = rearrange(o, 'b t h ... -> b h t ...')
    return o, final_state
build/lib/opencompass/tasks/fla2/ops/generalized_delta_rule/iplr/fused_recurrent.py ADDED
@@ -0,0 +1,452 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ from typing import Optional, Tuple
5
+
6
+ import torch
7
+ import triton
8
+ import triton.language as tl
9
+
10
+ from ....utils import input_guard
11
+
12
+
13
# Token-by-token recurrent forward of the IPLR delta rule.
# Per step: ha = h @ a; h += ha b^T + v k^T; o = (h @ q) * scale.
# Each program owns the full K dimension and one BV-wide slice of V.
# Grid: (NV, N * H).
@triton.heuristics({
    'USE_INITIAL_STATE': lambda args: args['h0'] is not None,
    'STORE_FINAL_STATE': lambda args: args['ht'] is not None,
    'IS_VARLEN': lambda args: args['cu_seqlens'] is not None
})
@triton.autotune(
    configs=[
        triton.Config({'BV': BV}, num_warps=num_warps, num_stages=num_stages)
        for BV in [32, 64]
        for num_warps in [2, 4, 8, 16]
        for num_stages in [2, 3, 4]
    ],
    key=["BK"],
)
@triton.jit
def fused_recurrent_fwd_kernel(
    q,  # query [B, H, L, K]
    k,  # key [B, H, L, V]
    v,  # value [B, H, L, V].
    a,  # a [B, H, L, K]
    b,  # b [B, H, L, K]
    o,  # output [B, H, L, V]
    ha,  # tmp variable [B, H, L, V] for storing intermediate results of (h * a[None, :]).sum(0)
    h0,  # initial hidden state [B, H, K, V]
    ht,  # final hidden state [B, H, K, V]
    cu_seqlens,  # varlen cu_seqlens
    scale,  # K ** -0.5
    H,  # n_heads
    T,  # seq_len
    K: tl.constexpr,  # K
    V: tl.constexpr,  # V
    BK: tl.constexpr,  # BLOCK SIZE along the K dimension
    BV: tl.constexpr,  # BLOCK SIZE along the V dimension
    USE_INITIAL_STATE: tl.constexpr,  # whether to use initial state
    STORE_FINAL_STATE: tl.constexpr,  # whether to store final state
    IS_VARLEN: tl.constexpr,
):
    i_v, i_nh = tl.program_id(0), tl.program_id(1)
    i_n, i_h = i_nh // H, i_nh % H

    if IS_VARLEN:
        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int64), tl.load(cu_seqlens + i_n + 1).to(tl.int64)
        T = eos - bos
    else:
        bos, eos = i_n * T, i_n * T + T

    # element pointers for the first timestep; advanced by H*K / H*V per step
    p_q = q + (bos * H + i_h) * K + tl.arange(0, BK)
    p_k = k + (bos * H + i_h) * K + tl.arange(0, BK)
    p_a = a + (bos * H + i_h) * K + tl.arange(0, BK)
    p_b = b + (bos * H + i_h) * K + tl.arange(0, BK)
    p_ha = ha + (bos * H + i_h) * V + i_v * BV + tl.arange(0, BV)
    p_v = v + (bos * H + i_h) * V + i_v * BV + tl.arange(0, BV)
    p_o = o + (bos * H + i_h) * V + i_v * BV + tl.arange(0, BV)

    mask_k = tl.arange(0, BK) < K
    mask_v = (i_v * BV + tl.arange(0, BV)) < V
    mask_h = mask_k[None, :] & mask_v[:, None]

    # running state tile held V-major, i.e. transposed relative to the
    # [K, V] layout of h0/ht
    b_h = tl.zeros([BV, BK], dtype=tl.float32)

    if USE_INITIAL_STATE:
        p_h0 = h0 + i_nh * K * V + (tl.arange(0, BK)[None, :]) * V + ((i_v * BV + tl.arange(0, BV))[:, None])
        b_h += tl.load(p_h0, mask=mask_h, other=0).to(tl.float32)

    for _ in range(0, T):
        b_k = tl.load(p_k, mask=mask_k, other=0).to(tl.float32)
        b_v = tl.load(p_v, mask=mask_v, other=0).to(tl.float32)
        b_q = tl.load(p_q, mask=mask_k, other=0).to(tl.float32) * scale
        b_a = tl.load(p_a, mask=mask_k, other=0).to(tl.float32)
        b_b = tl.load(p_b, mask=mask_k, other=0).to(tl.float32)
        # ha = h @ a, stored out so the backward pass can reuse it
        tmp = tl.sum(b_h * b_a[None, :], axis=1)
        b_h += (tmp[:, None] * b_b[None, :] + b_k[None, :] * b_v[:, None])
        b_o = b_h * b_q[None, :]
        b_o = tl.sum(b_o, axis=1)
        tl.store(p_o, b_o.to(p_o.dtype.element_ty), mask=mask_v)
        tl.store(p_ha, tmp.to(p_ha.dtype.element_ty), mask=mask_v)
        p_q += K*H
        p_k += K*H
        p_o += V*H
        p_v += V*H
        p_ha += V*H
        p_a += K*H
        p_b += K*H

    if STORE_FINAL_STATE:
        p_ht = ht + i_nh * K * V + (tl.arange(0, BK)[None, :]) * V + ((i_v * BV + tl.arange(0, BV))[:, None])
        tl.store(p_ht, b_h.to(p_ht.dtype.element_ty), mask=mask_h)
101
+
102
+
103
+ @triton.heuristics({
104
+ 'USE_INITIAL_STATE': lambda args: args['h0'] is not None,
105
+ 'USE_DHT': lambda args: args['dht'] is not None,
106
+ 'USE_DH0': lambda args: args['dh0'] is not None,
107
+ 'IS_VARLEN': lambda args: args['cu_seqlens'] is not None
108
+ })
109
+ @triton.autotune(
110
+ configs=[
111
+ triton.Config({}, num_warps=num_warps, num_stages=num_stages)
112
+ for num_warps in [2, 4, 8, 16]
113
+ for num_stages in [2, 3]
114
+ ],
115
+ key=["BK", "BV"],
116
+ )
117
+ @triton.jit
118
+ def fused_recurrent_bwd_kernel(
119
+ # B: batch_size, H: n_heads, T: seq_len, D: b_dhead
120
+ # NV: number of split in the V dimension. NK: number of split in the K dimension
121
+ q, # query [B, H, L, K]
122
+ k, # key [B, H, L, V]
123
+ v, # value [B, H, L, V]
124
+ a, # a [B, H, L, K]
125
+ b, # b [B, H, L, K]
126
+ ha, # ha [B, H, L, V]
127
+ dht, # gradient of final state [B, H, K, V]
128
+ dh0, # gradient of initial state [B, H, K, V]
129
+ do, # gradient of output [B, H, L, V]
130
+ dq, # gradient of query [NV, B, H, L, K]
131
+ dk, # gradient of key [NV, B, H, L, K]
132
+ dv, # gradient of value [NK, B, H, L, V]
133
+ da, # gradient of a [NV, B, H, L, K]
134
+ db, # gradient of b [NV, B, H, L, K]
135
+ dha, # gradient of ha [NK, B, H, L, V]
136
+ h0, # initial state [B, H, K, V]
137
+ scale, # K ** -0.5
138
+ cu_seqlens, # cu_seqlens
139
+ B, # batch_size
140
+ H, # n_heads
141
+ T, # seq_len
142
+ K: tl.constexpr, # K
143
+ V: tl.constexpr, # V
144
+ BK: tl.constexpr, # BLOCK SIZE along the K dimension
145
+ BV: tl.constexpr, # BLOCK SIZE along the V dimension
146
+ USE_INITIAL_STATE: tl.constexpr, # whether to use initial state h0
147
+ USE_DH0: tl.constexpr, # whether to use dh0
148
+ USE_DHT: tl.constexpr, # whether to use dht
149
+ IS_VARLEN: tl.constexpr,
150
+ ):
151
+ i_v, i_nh = tl.program_id(0), tl.program_id(1)
152
+ i_n, i_h = i_nh // H, i_nh % H
153
+ dk += i_v * B * H * K * T
154
+ db += i_v * B * H * K * T
155
+ dq += i_v * B * H * K * T
156
+ da += i_v * B * H * K * T
157
+ if IS_VARLEN:
158
+ bos, eos = tl.load(cu_seqlens + i_n).to(tl.int64), tl.load(cu_seqlens + i_n + 1).to(tl.int64)
159
+ T = eos - bos
160
+ else:
161
+ bos, eos = i_n * T, i_n * T + T
162
+ mask_k = tl.arange(0, BK) < K
163
+ mask_v = (tl.arange(0, BV) + i_v * BV) < V
164
+
165
+ q += (bos * H + i_h) * K
166
+ k += (bos * H + i_h) * K
167
+ v += (bos * H + i_h) * V + i_v * BV
168
+ ha += (bos * H + i_h) * V + i_v * BV
169
+ a += (bos * H + i_h) * K
170
+ b += (bos * H + i_h) * K
171
+ do += (bos * H + i_h) * V + i_v * BV
172
+ dq += (bos * H + i_h) * K
173
+ dk += (bos * H + i_h) * K
174
+ dv += (bos * H + i_h) * V + i_v * BV
175
+ da += (bos * H + i_h) * K
176
+ db += (bos * H + i_h) * K
177
+ dha += (bos * H + i_h) * V + i_v * BV
178
+
179
+ p_q = q + tl.arange(0, BK) + (T - 1) * H*K
180
+ p_k = k + tl.arange(0, BK) + (T - 1) * H*K
181
+ p_v = v + tl.arange(0, BV) + (T - 1) * H*V
182
+ p_ha = ha + tl.arange(0, BV) + (T - 1) * H*V
183
+ p_a = a + tl.arange(0, BK) + (T - 1) * H*K
184
+ p_b = b + tl.arange(0, BK) + (T - 1) * H*K
185
+ p_do = do + tl.arange(0, BV) + (T - 1) * H*V
186
+ p_dk = dk + tl.arange(0, BK) + (T - 1) * H*K
187
+ p_dv = dv + tl.arange(0, BV) + (T - 1) * H*V
188
+ p_dha = dha + tl.arange(0, BV) + (T - 1) * H*V
189
+ p_db = db + tl.arange(0, BK) + (T - 1) * H*K
190
+ p_da = da + tl.arange(0, BK) + (T - 1) * H*K
191
+ p_dq = dq + tl.arange(0, BK) + (T - 1) * H*K
192
+
193
+ b_dh = tl.zeros([BK, BV], dtype=tl.float32)
194
+ if USE_DHT:
195
+ p_ht = dht + i_nh * K * V + (tl.arange(0, BK)[:, None]) * V + ((i_v * BV + tl.arange(0, BV))[None, :])
196
+ b_dh += tl.load(p_ht, mask=mask_k[:, None] & mask_v[None, :], other=0).to(tl.float32)
197
+
198
+ for _ in range(T):
199
+ b_q = tl.load(p_q, mask=mask_k, other=0).to(tl.float32) * scale
200
+ b_k = tl.load(p_k, mask=mask_k, other=0).to(tl.float32)
201
+ b_v = tl.load(p_v, mask=mask_v, other=0).to(tl.float32)
202
+ b_do = tl.load(p_do, mask=mask_v, other=0).to(tl.float32)
203
+ b_b = tl.load(p_b, mask=mask_k, other=0).to(tl.float32)
204
+ b_a = tl.load(p_a, mask=mask_k, other=0).to(tl.float32)
205
+ b_ha = tl.load(p_ha, mask=mask_v, other=0).to(tl.float32)
206
+
207
+ b_dh += b_q[:, None] * b_do[None, :]
208
+ d_k = tl.sum(b_dh * b_v[None, :], axis=1)
209
+ d_v = tl.sum(b_dh * b_k[:, None], axis=0)
210
+ tl.store(p_dk, d_k.to(p_dk.dtype.element_ty), mask=mask_k)
211
+ tl.store(p_dv, d_v.to(p_dv.dtype.element_ty), mask=mask_v)
212
+
213
+ b_dha = tl.sum(b_dh * b_b[:, None], axis=0)
214
+ tl.store(p_dha, b_dha.to(p_dha.dtype.element_ty), mask=mask_v)
215
+ b_db = tl.sum(b_dh * b_ha[None, :], axis=1)
216
+ tl.store(p_db, b_db.to(p_db.dtype.element_ty), mask=mask_k)
217
+
218
+ b_dh += b_dha[None, :] * b_a[:, None]
219
+ p_do -= H*V
220
+ p_q -= H*K
221
+ p_k -= H*K
222
+ p_v -= H*V
223
+ p_dk -= H*K
224
+ p_dv -= H*V
225
+ p_b -= H*K
226
+ p_db -= H*K
227
+ p_a -= H*K
228
+ p_dha -= H*V
229
+ p_ha -= H*V
230
+
231
+ if USE_DH0:
232
+ p_dh0 = dh0 + i_nh * K * V + (tl.arange(0, BK)[:, None]) * V + (i_v * BV + tl.arange(0, BV)[None, :])
233
+ tl.store(p_dh0, b_dh.to(p_dh0.dtype.element_ty), mask=mask_k[:, None] & mask_v[None, :])
234
+
235
+ tl.debug_barrier()
236
+
237
+ b_h = tl.zeros([BK, BV], dtype=tl.float32)
238
+
239
+ if USE_INITIAL_STATE:
240
+ mask_kv = mask_k[:, None] & mask_v[None, :]
241
+ p_h0 = h0 + i_nh * K * V + (tl.arange(0, BK)[:, None]) * V + ((i_v * BV + tl.arange(0, BV))[None, :])
242
+ b_h += tl.load(p_h0, mask=mask_kv, other=0).to(tl.float32)
243
+
244
+ p_k = k + tl.arange(0, BK)
245
+ p_v = v + tl.arange(0, BV)
246
+ p_ha = ha + tl.arange(0, BV)
247
+ p_do = do + tl.arange(0, BV)
248
+ p_dha = dha + tl.arange(0, BV)
249
+ p_da = da + tl.arange(0, BK)
250
+ p_dq = dq + tl.arange(0, BK)
251
+ p_b = b + tl.arange(0, BK)
252
+
253
+ for i in range(0, T):
254
+ b_dha = tl.load(p_dha, mask=mask_v, other=0).to(tl.float32)
255
+ d_a = tl.sum(b_dha[None, :] * b_h, axis=1)
256
+ tl.store(p_da, d_a.to(p_da.dtype.element_ty), mask=mask_k)
257
+ b_k = tl.load(p_k, mask=mask_k, other=0).to(tl.float32)
258
+ b_v = tl.load(p_v, mask=mask_v, other=0).to(tl.float32)
259
+ b_do = tl.load(p_do, mask=mask_v, other=0).to(tl.float32)
260
+ b_b = tl.load(p_b, mask=mask_k, other=0).to(tl.float32)
261
+ b_ha = tl.load(p_ha, mask=mask_v, other=0).to(tl.float32)
262
+ b_h += b_k[:, None] * b_v[None, :] + b_b[:, None] * b_ha[None, :]
263
+ _d_q = b_h * b_do[None, :]
264
+ d_q = tl.sum(_d_q, axis=1) * scale
265
+ tl.store(p_dq, d_q.to(p_dq.dtype.element_ty), mask=mask_k)
266
+
267
+ p_k += H*K
268
+ p_do += H*V
269
+ p_v += H*V
270
+ p_da += H*K
271
+ p_dha += H*V
272
+ p_ha += H*V
273
+ p_dq += H*K
274
+ p_b += H*K
275
+
276
+
277
class FusedRecurrentIPLRDeltaRuleFunction(torch.autograd.Function):
    """Autograd bridge for the fused recurrent IPLR (identity-plus-low-rank) delta rule.

    Forward evaluates the recurrence S_t = S_{t-1} @ (I + a_t b_t^T) + v_t k_t^T
    with a Triton kernel and returns the per-step outputs `o` plus (optionally)
    the final state.  Backward replays the recurrence with a second kernel.
    """

    @staticmethod
    @input_guard
    def forward(
        ctx,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        a: torch.Tensor,
        b: torch.Tensor,
        scale: Optional[float] = None,
        initial_state: Optional[torch.Tensor] = None,
        output_final_state: bool = False,
        cu_seqlens: Optional[torch.LongTensor] = None
    ):
        # q/k/a/b share the [B, T, H, K] layout; v contributes the value dim V.
        B, T, H, K, V = *k.shape, v.shape[-1]
        # N: number of sequences — the batch size in fixed-length mode, or the
        # number of variable-length segments when `cu_seqlens` is given.
        N = B if cu_seqlens is None else len(cu_seqlens) - 1

        # The whole key dimension is handled in a single tile.
        BK = triton.next_power_of_2(K)
        if output_final_state:
            # NOTE(review): allocated with leading dim B even in varlen mode,
            # where the kernel is launched over N segments — confirm intended.
            final_state = q.new_empty(B, H, K, V, dtype=torch.float32)
        else:
            final_state = None

        # Per-step auxiliary buffer written by the forward kernel and re-consumed
        # by backward (saved below); presumably holds the S·a_t projections —
        # verify against the forward kernel.
        ha = torch.empty_like(v, dtype=torch.float32)

        # Grid: one program per (value tile, sequence*head); BV is autotuned.
        def grid(meta): return (
            triton.cdiv(V, meta['BV']),
            N * H
        )
        o = torch.empty_like(v)
        fused_recurrent_fwd_kernel[grid](
            q=q,
            k=k,
            v=v,
            a=a,
            b=b,
            o=o,
            ha=ha,
            h0=initial_state,
            ht=final_state,
            scale=scale,
            cu_seqlens=cu_seqlens,
            H=H,
            T=T,
            K=K,
            V=V,
            BK=BK,
        )
        ctx.save_for_backward(q, k, v, a, b, ha, initial_state)
        ctx.scale = scale
        ctx.cu_seqlens = cu_seqlens
        return o, final_state

    @staticmethod
    @input_guard
    def backward(ctx, do, dht):
        q, k, v, a, b, ha, initial_state = ctx.saved_tensors
        B, T, H, K, V = *q.shape, v.shape[-1]
        N = B if ctx.cu_seqlens is None else len(ctx.cu_seqlens) - 1
        # Full K in one tile; V is split into tiles of at most 64 columns.
        BK, BV = triton.next_power_of_2(K), min(triton.next_power_of_2(V), 64)
        NV = triton.cdiv(V, BV)
        scale = ctx.scale

        # K-sided gradients are computed per V-tile; the extra leading NV dim
        # holds partial results that are reduced by `.sum(0)` after the launch.
        dq = q.new_empty(NV, *q.shape)
        dk = k.new_empty(NV, *k.shape)
        da = a.new_empty(NV, *a.shape)
        db = b.new_empty(NV, *b.shape)
        dv = torch.empty_like(v)
        dha = torch.empty_like(ha)
        grid = (NV, N * H)

        if initial_state is not None and initial_state.requires_grad:
            dh0 = torch.empty_like(initial_state, dtype=torch.float32)
        else:
            dh0 = None

        fused_recurrent_bwd_kernel[grid](
            q=q,
            k=k,
            v=v,
            a=a,
            b=b,
            ha=ha,
            dht=dht,
            dh0=dh0,
            do=do,
            dq=dq,
            dk=dk,
            dv=dv,
            da=da,
            db=db,
            dha=dha,
            h0=initial_state,
            scale=scale,
            cu_seqlens=ctx.cu_seqlens,
            B=B,
            H=H,
            T=T,
            K=K,
            V=V,
            BK=BK,
            BV=BV,
        )
        # Reduce the per-V-tile partial gradients.
        dq = dq.sum(0)
        dk = dk.sum(0)
        da = da.sum(0)
        db = db.sum(0)
        # One gradient per forward input:
        # (q, k, v, a, b, scale, initial_state, output_final_state, cu_seqlens).
        return dq.to(q), dk.to(k), dv.to(v), da.to(a), db.to(b), None, dh0, None, None
387
+
388
+
389
def fused_recurrent_iplr_delta_rule(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    a: torch.Tensor,
    b: torch.Tensor,
    scale: float = None,
    initial_state: torch.Tensor = None,
    output_final_state: bool = False,
    cu_seqlens: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    r"""
    This function computes the recurrence S_t = S_t @ (I + a_t b_t^T) + v_t k_t^T in a recurrent manner.

    Args:
        q (torch.Tensor):
            queries of shape `[B, T, H, K]`
        k (torch.Tensor):
            keys of shape `[B, T, H, K]`
        v (torch.Tensor):
            values of shape `[B, T, H, V]`
        a (torch.Tensor):
            as of shape `[B, T, H, K]`
        b (torch.Tensor):
            bs of shape `[B, T, H, K]`
        scale (Optional[float]):
            Scale factor applied to the queries.
            If not provided, it will default to `1 / sqrt(K)`. Default: `None`.
        initial_state (Optional[torch.Tensor]):
            Initial state of shape `[B, H, K, V]`. Default: `None`.
        output_final_state (Optional[bool]):
            Whether to output the final state of shape `[B, H, K, V]`. Default: `False`.
        cu_seqlens (torch.LongTensor):
            Cumulative sequence lengths of shape `[N+1]` used for variable-length training,
            consistent with the FlashAttention API.

    Returns:
        o (torch.Tensor):
            Outputs of shape `[B, T, H, V]`.
        final_state (torch.Tensor):
            Final state of shape `[B, H, K, V]` if `output_final_state=True` else `None`.
    """
    # Variable-length mode requires pre-flattened inputs (batch dim folded into T).
    if cu_seqlens is not None:
        if q.shape[0] != 1:
            raise ValueError(
                f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`."
                f"Please flatten variable-length inputs before processing."
            )
        if initial_state is not None and initial_state.shape[0] != len(cu_seqlens) - 1:
            raise ValueError(
                f"The number of initial states is expected to be equal to the number of input sequences, "
                f"i.e., {len(cu_seqlens) - 1} rather than {initial_state.shape[0]}."
            )
    if scale is None:
        # Default to the usual 1/sqrt(d_k) attention scaling.
        scale = q.shape[-1] ** -0.5
    else:
        assert scale > 0, "scale must be positive"
    o, final_state = FusedRecurrentIPLRDeltaRuleFunction.apply(
        q,
        k,
        v,
        a,
        b,
        scale,
        initial_state,
        output_final_state,
        cu_seqlens
    )
    return o, final_state
build/lib/opencompass/tasks/fla2/ops/generalized_delta_rule/iplr/naive.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ import torch
4
+ from einops import rearrange
5
+
6
+
7
+ # S_t = S_t @ (I + alpha_t beta_t^T) + v_t k_t^T
8
+ # q, k, alpha, beta [B, H, L, D_K]
9
+ # v [B, H, L, D_V]
10
def iplr_recurrence(q, k, v, alpha, beta, initial_state=None, output_final_state=True):
    """Naive step-by-step reference of the IPLR delta rule (for testing).

    Implements S_t = S_{t-1} + beta_t (alpha_t^T S_{t-1}) + k_t v_t^T and
    o_t = (q_t / sqrt(d_k)) @ S_t, i.e. the recurrence
    S_t = S_{t-1} @ (I + alpha_t beta_t^T) + v_t k_t^T written row-wise.

    Args:
        q, k, alpha, beta: tensors of shape `[B, H, L, D_K]`.
        v: tensor of shape `[B, H, L, D_V]`.
        initial_state: optional `[B, H, D_K, D_V]` state added to S_0.
        output_final_state: if False, the returned state is `None`.

    Returns:
        (o, S): outputs `[B, H, L, D_V]` in the input dtype, and the final
        state `[B, H, D_K, D_V]` in float32 (or `None`).
    """
    orig_dtype = q.dtype
    b, h, l, d_k = q.shape
    # Run the whole recurrence in float32 for numerical stability.
    # Bug fix: `alpha` was previously left in its input dtype while every other
    # operand was upcast, yielding mixed-precision accumulation for half inputs.
    q, k, v, alpha, beta = map(lambda x: x.float(), [q, k, v, alpha, beta])
    d_v = v.shape[-1]
    o = torch.zeros_like(v)
    S = torch.zeros(b, h, d_k, d_v).to(v)
    q = q * (d_k ** -0.5)

    if initial_state is not None:
        S += initial_state

    for i in range(l):
        _k = k[:, :, i]
        _q = q[:, :, i]
        _v = v[:, :, i]
        _alpha = alpha[:, :, i]
        _beta = beta[:, :, i]
        # Rank-1 update: k v^T plus the low-rank correction beta (alpha^T S).
        _kv = _k[..., None] * _v[..., None, :] + (S.clone() * _alpha[..., None]).sum(-2, keepdim=True) * _beta[..., None]
        S = S + _kv
        o[:, :, i] = torch.einsum('bhd,bhdm->bhm', _q, S)
    S = None if output_final_state is False else S
    return o.to(orig_dtype), S
33
+
34
+
35
def iplr_chunkwise(q, k, v, alpha, beta, initial_state=None, output_final_state=True, chunk_size=32):
    """Chunk-parallel reference of the IPLR delta rule (for testing).

    Splits the sequence into chunks of `chunk_size`, builds the WY-style
    transform per chunk, then carries the state `S` across chunks.  Matches
    `iplr_recurrence` on identical inputs.
    """
    num_batch, num_head, seq_len, d_k = q.shape
    d_v = v.shape[-1]
    q = q * (d_k ** -0.5)
    assert seq_len % chunk_size == 0
    num_chunks = seq_len // chunk_size

    state = k.new_zeros(num_batch, num_head, d_k, d_v)
    if initial_state is not None:
        state = state + initial_state

    # Mask covering the diagonal and everything above it.
    incl_mask = torch.triu(torch.ones(chunk_size, chunk_size, dtype=torch.bool, device=q.device), diagonal=0)
    # Reshape [B, H, L, D] -> [B, H, N, C, D] with C = chunk_size.
    q, k, v, alpha, beta = (
        x.reshape(num_batch, num_head, num_chunks, chunk_size, x.shape[-1])
        for x in (q, k, v, alpha, beta)
    )

    # Strictly-causal intra-chunk projections (diagonal masked out).
    v2 = (alpha @ k.transpose(-1, -2)).masked_fill_(incl_mask, 0) @ v
    attn = (alpha @ beta.transpose(-1, -2)).masked_fill(incl_mask, 0)
    # Forward substitution: row i absorbs all rows above it.
    for row in range(1, chunk_size):
        attn[..., row, :row] = attn[..., row, :row] + (
            attn[..., row, :, None].clone() * attn[..., :, :row].clone()
        ).sum(-2)

    attn = attn + torch.eye(chunk_size, dtype=torch.float, device=q.device)
    u = attn @ v2
    w = attn @ alpha
    o = torch.zeros_like(v)
    # Mask strictly above the diagonal (diagonal kept for intra-chunk terms).
    strict_mask = torch.triu(torch.ones(chunk_size, chunk_size, dtype=torch.bool, device=q.device), diagonal=1)
    for idx in range(num_chunks):
        q_c, k_c, v_c = q[:, :, idx], k[:, :, idx], v[:, :, idx]
        u_c, w_c, beta_c = u[:, :, idx], w[:, :, idx], beta[:, :, idx]
        intra = (q_c @ k_c.transpose(-1, -2)).masked_fill_(strict_mask, 0) @ v_c
        v2_c = u_c + w_c @ state
        cross = (q_c @ beta_c.transpose(-1, -2)).masked_fill_(strict_mask, 0) @ v2_c
        inter = q_c @ state
        o[:, :, idx] = intra + cross + inter
        state = state + k_c.transpose(-1, -2) @ v_c + beta_c.transpose(-1, -2) @ v2_c
    state = None if output_final_state is False else state
    return o.reshape(num_batch, num_head, seq_len, d_v), state
build/lib/opencompass/tasks/fla2/ops/generalized_delta_rule/iplr/wy_fast.py ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # -*- coding: utf-8 -*-
3
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
4
+
5
+ from typing import Optional, Tuple
6
+
7
+ import torch
8
+ import triton
9
+ import triton.language as tl
10
+
11
+ from ....ops.utils import prepare_chunk_indices
12
+ from ....utils import check_shared_mem, is_nvidia_hopper
13
+
14
+ NUM_WARPS = [2, 4] if is_nvidia_hopper else [2, 4, 8]
15
+
16
+
17
@triton.heuristics({
    'IS_VARLEN': lambda args: args['cu_seqlens'] is not None
})
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=num_warps)
        for num_warps in [1, 2, 4, 8, 16]
    ],
    key=['BK']
)
@triton.jit(do_not_specialize=['T'])
def prepare_wy_repr_fwd_kernel_chunk32(
    a,  # [B, T, H, K] first low-rank factor
    b,  # [B, T, H, K] second low-rank factor
    A,  # [B, T, H, BT] output: per-chunk BTxBT transform matrix
    cu_seqlens,  # optional [N+1] cumulative sequence lengths (varlen mode)
    chunk_indices,  # optional flat (seq idx, chunk idx) pairs for varlen
    T,
    H: tl.constexpr,
    K: tl.constexpr,
    BT: tl.constexpr,  # chunk length handled by this kernel (<= 32)
    BK: tl.constexpr,  # tile width along the key dimension
    BC: tl.constexpr,  # dummy placeholder
    IS_VARLEN: tl.constexpr,
):
    # One program per (chunk, batch*head) pair.
    i_t, i_bh = tl.program_id(0), tl.program_id(1)
    i_b, i_h = i_bh // H, i_bh % H
    if IS_VARLEN:
        # Resolve this program's (sequence, chunk) pair and its time span.
        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
        T = eos - bos
    else:
        bos, eos = i_b * T, i_b * T + T

    # Accumulate a @ b^T for this chunk, tiled along K.
    b_A = tl.zeros([BT, BT], dtype=tl.float32)
    for i_k in range(tl.cdiv(K, BK)):
        p_a = tl.make_block_ptr(a + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
        p_b = tl.make_block_ptr(b + (bos * H + i_h) * K, (K, T), (1, K*H), (i_k * BK, i_t * BT), (BK, BT), (0, 1))
        b_a = tl.load(p_a, boundary_check=(0, 1))
        b_b = tl.load(p_b, boundary_check=(0, 1))
        b_A += tl.dot(b_a, b_b)

    # Keep only the strictly lower-triangular part (diagonal excluded).
    b_A = tl.where(tl.arange(0, BT)[:, None] > tl.arange(0, BT)[None, :], b_A, 0)
    # Row-by-row forward substitution: row i absorbs the rows above it, turning
    # b_A into the strictly-lower part of the inverse of a unit lower-triangular
    # factor (cf. the blockwise-inverse comment in the chunk64 variant).
    # NOTE(review): the sign convention of the factor is not visible here —
    # confirm against the chunk64 kernel / callers.
    for i in range(1, BT):
        mask = tl.arange(0, BT) == i
        b_a = tl.sum(tl.where(mask[:, None], b_A, 0), 0)
        b_a = b_a + tl.sum(b_a[:, None] * b_A, 0) * (tl.arange(0, BT) < i)
        b_A = tl.where(mask[:, None], b_a, b_A)
    # Add the identity so the transform acts as I on the diagonal.
    b_A += tl.arange(0, BT)[:, None] == tl.arange(0, BT)[None, :]

    p_A = tl.make_block_ptr(A + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
    tl.store(p_A, b_A.to(p_A.dtype.element_ty), boundary_check=(0, 1))
69
+
70
+
71
@triton.heuristics({
    'IS_VARLEN': lambda args: args['cu_seqlens'] is not None
})
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=num_warps)
        for num_warps in [1, 2, 4, 8, 16]
    ],
    key=['BK']
)
@triton.jit(do_not_specialize=['T'])
def prepare_wy_repr_fwd_kernel_chunk64(
    a,  # [B, T, H, K] first low-rank factor
    b,  # [B, T, H, K] second low-rank factor
    A,  # [B, T, H, BT] output: per-chunk BTxBT transform matrix
    cu_seqlens,  # optional [N+1] cumulative sequence lengths (varlen mode)
    chunk_indices,  # optional flat (seq idx, chunk idx) pairs for varlen
    T,
    H: tl.constexpr,
    K: tl.constexpr,
    BT: tl.constexpr,  # full chunk length (64); processed as two BC halves
    BK: tl.constexpr,  # tile width along the key dimension
    BC: tl.constexpr,  # half-chunk size (BT // 2)
    IS_VARLEN: tl.constexpr,
):
    # One program per (chunk, batch*head) pair; the 64-wide chunk is processed
    # as a 2x2 grid of BCxBC blocks to keep register pressure down.
    i_t, i_bh = tl.program_id(0), tl.program_id(1)
    i_b, i_h = i_bh // H, i_bh % H
    if IS_VARLEN:
        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
        T = eos - bos
    else:
        bos, eos = i_b * T, i_b * T + T

    # b_A: upper-left diagonal block; b_A2: lower-right diagonal block;
    # b_A3: lower-left off-diagonal block of the chunk's a @ b^T.
    b_A = tl.zeros([BC, BC], dtype=tl.float32)
    b_A2 = tl.zeros([BC, BC], dtype=tl.float32)
    b_A3 = tl.zeros([BC, BC], dtype=tl.float32)

    for i_k in range(tl.cdiv(K, BK)):
        p_a1 = tl.make_block_ptr(a + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT, i_k * BK), (BC, BK), (1, 0))
        p_a2 = tl.make_block_ptr(a + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT + BC, i_k * BK), (BC, BK), (1, 0))
        p_b1 = tl.make_block_ptr(b + (bos * H + i_h) * K, (K, T), (1, K*H), (i_k * BK, i_t * BT), (BK, BC), (0, 1))
        p_b2 = tl.make_block_ptr(b + (bos * H + i_h) * K, (K, T), (1, K*H), (i_k * BK, i_t * BT + BC), (BK, BC), (0, 1))
        b_a1 = tl.load(p_a1, boundary_check=(0, 1))
        b_a2 = tl.load(p_a2, boundary_check=(0, 1))
        b_b1 = tl.load(p_b1, boundary_check=(0, 1))
        b_b2 = tl.load(p_b2, boundary_check=(0, 1))
        b_A += tl.dot(b_a1, b_b1, allow_tf32=False)
        b_A2 += tl.dot(b_a2, b_b2, allow_tf32=False)
        b_A3 += tl.dot(b_a2, b_b1, allow_tf32=False)

    # Strictly lower-triangular masks on the two diagonal blocks.
    b_A = tl.where(tl.arange(0, BC)[:, None] > tl.arange(0, BC)[None, :], b_A, 0)
    b_A2 = tl.where(tl.arange(0, BC)[:, None] > tl.arange(0, BC)[None, :], b_A2, 0)

    # Forward substitution on each diagonal block (same scheme as chunk32).
    for i in range(1, BC):
        mask = tl.arange(0, BC) == i
        b_a = tl.sum(tl.where(mask[:, None], b_A, 0), 0)
        b_a2 = tl.sum(tl.where(mask[:, None], b_A2, 0), 0)
        b_a = b_a + tl.sum(b_a[:, None] * b_A, 0) * (tl.arange(0, BC) < i)
        b_a2 = b_a2 + tl.sum(b_a2[:, None] * b_A2, 0) * (tl.arange(0, BC) < i)
        b_A = tl.where(mask[:, None], b_a, b_A)
        b_A2 = tl.where(mask[:, None], b_a2, b_A2)

    # blockwise computation of lower triangular matrix's inverse
    # i.e., [A11, 0; A21, A22]^-1 = [A11^-1, 0; -A22^-1 A21 A11^-1, A22^-1]
    b_A += tl.arange(0, BC)[:, None] == tl.arange(0, BC)[None, :]
    b_A2 += tl.arange(0, BC)[:, None] == tl.arange(0, BC)[None, :]
    # Off-diagonal block of the inverse: A22^-1 @ A21 @ A11^-1.
    # NOTE(review): the minus sign of the textbook identity is absent here —
    # presumably folded into the factor's sign convention; confirm.
    b_A3 = tl.dot(tl.dot(b_A2, b_A3, allow_tf32=False), b_A, allow_tf32=False)

    # Store the four BCxBC quadrants of the BTxBT transform.
    p_A1 = tl.make_block_ptr(A + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, 0), (BC, BC), (1, 0))
    p_A2 = tl.make_block_ptr(A + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT + BC, BC), (BC, BC), (1, 0))
    p_A3 = tl.make_block_ptr(A + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT + BC, 0), (BC, BC), (1, 0))
    p_A4 = tl.make_block_ptr(A + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, BC), (BC, BC), (1, 0))
    tl.store(p_A1, b_A.to(p_A1.dtype.element_ty), boundary_check=(0, 1))
    tl.store(p_A2, b_A2.to(p_A2.dtype.element_ty), boundary_check=(0, 1))
    tl.store(p_A3, b_A3.to(p_A3.dtype.element_ty), boundary_check=(0, 1))
    # causal mask: the upper-right quadrant is always zero
    tl.store(p_A4, tl.zeros([BC, BC], dtype=tl.float32).to(p_A4.dtype.element_ty), boundary_check=(0, 1))
149
+
150
+
151
@triton.heuristics({
    'IS_VARLEN': lambda args: args['cu_seqlens'] is not None
})
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=num_warps)
        for num_warps in NUM_WARPS
    ],
    key=['BT', 'BK', 'BV']
)
@triton.jit(do_not_specialize=['T'])
def wu_fwd_kernel(
    w,  # [B, T, H, K] output: w = A @ a
    u,  # [B, T, H, V] output: u = A @ (strict_tril(a k^T) @ v)
    a,  # [B, T, H, K] low-rank factor
    k,  # [B, T, H, K] keys
    v,  # [B, T, H, V] values
    A,  # [B, T, H, BT] per-chunk transform from prepare_wy_repr_fwd
    cu_seqlens,  # optional [N+1] cumulative sequence lengths (varlen mode)
    chunk_indices,  # optional flat (seq idx, chunk idx) pairs for varlen
    T,
    H: tl.constexpr,
    K: tl.constexpr,
    V: tl.constexpr,
    BT: tl.constexpr,  # chunk length
    BK: tl.constexpr,  # tile width along K
    BV: tl.constexpr,  # tile width along V
    IS_VARLEN: tl.constexpr,
):
    # One program per (chunk, batch*head) pair.
    i_t, i_bh = tl.program_id(0), tl.program_id(1)
    i_b, i_h = i_bh // H, i_bh % H
    if IS_VARLEN:
        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
        T = eos - bos
    else:
        bos, eos = i_b * T, i_b * T + T

    p_A = tl.make_block_ptr(A + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))

    b_A = tl.load(p_A, boundary_check=(0, 1))
    # b_Aak accumulates the intra-chunk a @ k^T scores across K tiles.
    b_Aak = tl.zeros([BT, BT], dtype=tl.float32)

    # Pass 1 over K tiles: write w = A @ a and build a @ k^T.
    for i_k in range(tl.cdiv(K, BK)):
        p_k = tl.make_block_ptr(k + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
        p_a = tl.make_block_ptr(a + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
        p_w = tl.make_block_ptr(w + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
        b_k = tl.load(p_k, boundary_check=(0, 1))
        b_a = tl.load(p_a, boundary_check=(0, 1))
        b_w = tl.dot(b_A, b_a)
        b_Aak += tl.dot(b_a, tl.trans(b_k))
        tl.store(p_w, b_w.to(p_w.dtype.element_ty), boundary_check=(0, 1))

    # Keep only the strictly causal (below-diagonal) part of a @ k^T.
    b_Aak = tl.where(tl.arange(0, BT)[:, None] > tl.arange(0, BT)[None, :], b_Aak, 0)
    b_Aak = b_Aak.to(k.dtype.element_ty)

    # Pass 2 over V tiles: u = A @ (b_Aak @ v).
    for i_v in range(tl.cdiv(V, BV)):
        p_v = tl.make_block_ptr(v + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
        p_u = tl.make_block_ptr(u + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
        b_v = tl.load(p_v, boundary_check=(0, 1))
        b_v = tl.dot(b_Aak, b_v).to(v.dtype.element_ty)
        b_u = tl.dot(b_A, b_v)
        tl.store(p_u, b_u.to(p_u.dtype.element_ty), boundary_check=(0, 1))
214
+
215
+
216
def prepare_wy_repr_fwd(
    a: torch.Tensor,
    b: torch.Tensor,
    v: torch.Tensor,
    k: torch.Tensor,
    cu_seqlens: Optional[torch.LongTensor],
    chunk_size: int = 64
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Build the per-chunk WY representation for the IPLR delta rule.

    Launches one of the two `prepare_wy_repr_fwd_kernel_chunk*` kernels to
    compute the per-chunk transform `A` from `a` and `b`, then derives
    `w` and `u` via `wu_fwd`.

    Args:
        a: `[B, T, H, K]` first low-rank factor.
        b: `[B, T, H, K]` second low-rank factor.
        v: `[B, T, H, V]` values.
        k: `[B, T, H, K]` keys.
        cu_seqlens: optional `[N+1]` cumulative sequence lengths (varlen mode).
        chunk_size: chunk length; capped below by the sequence length.

    Returns:
        (w, u, A): `w` like `a`, `u` like `v`, and `A` of shape `[B, T, H, BT]`.
    """
    B, T, H, K = a.shape
    # Effective chunk length: no larger than the (padded) sequence, at least 16.
    BT = min(chunk_size, max(triton.next_power_of_2(T), 16))

    chunk_indices = prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
    # BC is the half-chunk used by the chunk64 kernel (dummy for chunk32).
    BC = min(BT, 32)
    BK = min(triton.next_power_of_2(K), 64)

    A = torch.empty(B, T, H, BT, device=a.device, dtype=a.dtype)
    # The 64-wide chunk needs the blockwise (2x2) kernel variant.
    fwd_fn = prepare_wy_repr_fwd_kernel_chunk64 if BT == 64 else prepare_wy_repr_fwd_kernel_chunk32

    fwd_fn[(NT, B * H)](
        a=a,
        b=b,
        A=A,
        cu_seqlens=cu_seqlens,
        chunk_indices=chunk_indices,
        T=T,
        H=H,
        K=K,
        BT=BT,
        BK=BK,
        BC=BC,
    )
    w, u = wu_fwd(
        a=a,
        v=v,
        k=k,
        A=A,
        cu_seqlens=cu_seqlens,
        chunk_size=chunk_size
    )
    return w, u, A
257
+
258
+
259
def wu_fwd(
    a: torch.Tensor,
    v: torch.Tensor,
    k: torch.Tensor,
    A: torch.Tensor,
    cu_seqlens: Optional[torch.LongTensor],
    chunk_size: int
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Compute `w` and `u` from the per-chunk transform `A` (see `wu_fwd_kernel`).

    Args:
        a: `[B, T, H, K]` low-rank factor.
        v: `[B, T, H, V]` values.
        k: `[B, T, H, K]` keys.
        A: `[B, T, H, BT]` per-chunk transform from `prepare_wy_repr_fwd`.
        cu_seqlens: optional `[N+1]` cumulative sequence lengths (varlen mode).
        chunk_size: chunk length used when `A` was built.

    Returns:
        (w, u): `w` with the shape/dtype of `a`, `u` with the shape/dtype of `v`.
    """
    B, T, H, K, V = *a.shape, v.shape[-1]
    BT = min(chunk_size, max(triton.next_power_of_2(T), 16))

    chunk_indices = prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
    # Larger tiles only when the GPU has enough shared memory.
    CONST_TILING = 64 if check_shared_mem() else 32
    BK = min(triton.next_power_of_2(K), CONST_TILING)
    BV = min(triton.next_power_of_2(V), CONST_TILING)

    u = torch.empty_like(v)
    w = torch.empty_like(a)
    wu_fwd_kernel[(NT, B*H)](
        a=a,
        v=v,
        w=w,
        u=u,
        A=A,
        k=k,
        cu_seqlens=cu_seqlens,
        chunk_indices=chunk_indices,
        T=T,
        H=H,
        K=K,
        V=V,
        BT=BT,
        BK=BK,
        BV=BV,
    )
    return w, u
296
+
297
+
298
+ fwd_prepare_wy_repr = prepare_wy_repr_fwd
299
+
300
+ fwd_wu = wu_fwd
docs/en/.readthedocs.yaml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: 2
2
+
3
+ # Set the version of Python and other tools you might need
4
+ build:
5
+ os: ubuntu-22.04
6
+ tools:
7
+ python: "3.8"
8
+
9
+ formats:
10
+ - epub
11
+
12
+ sphinx:
13
+ configuration: docs/en/conf.py
14
+
15
+ python:
16
+ install:
17
+ - requirements: requirements/docs.txt
docs/en/Makefile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Minimal makefile for Sphinx documentation
2
+ #
3
+
4
+ # You can set these variables from the command line, and also
5
+ # from the environment for the first two.
6
+ SPHINXOPTS ?=
7
+ SPHINXBUILD ?= sphinx-build
8
+ SOURCEDIR = .
9
+ BUILDDIR = _build
10
+
11
+ # Put it first so that "make" without argument is like "make help".
12
+ help:
13
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14
+
15
+ .PHONY: help Makefile
16
+
17
+ # Catch-all target: route all unknown targets to Sphinx using the new
18
+ # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19
+ %: Makefile
20
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
docs/en/_static/css/readthedocs.css ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .header-logo {
2
+ background-image: url("../image/logo.svg");
3
+ background-size: 275px 80px;
4
+ height: 80px;
5
+ width: 275px;
6
+ }
7
+
8
+ @media screen and (min-width: 1100px) {
9
+ .header-logo {
10
+ top: -25px;
11
+ }
12
+ }
13
+
14
+ pre {
15
+ white-space: pre;
16
+ }
17
+
18
+ @media screen and (min-width: 2000px) {
19
+ .pytorch-content-left {
20
+ width: 1200px;
21
+ margin-left: 30px;
22
+ }
23
+ article.pytorch-article {
24
+ max-width: 1200px;
25
+ }
26
+ .pytorch-breadcrumbs-wrapper {
27
+ width: 1200px;
28
+ }
29
+ .pytorch-right-menu.scrolling-fixed {
30
+ position: fixed;
31
+ top: 45px;
32
+ left: 1580px;
33
+ }
34
+ }
35
+
36
+
37
+ article.pytorch-article section code {
38
+ padding: .2em .4em;
39
+ background-color: #f3f4f7;
40
+ border-radius: 5px;
41
+ }
42
+
43
+ /* Disable the change in tables */
44
+ article.pytorch-article section table code {
45
+ padding: unset;
46
+ background-color: unset;
47
+ border-radius: unset;
48
+ }
49
+
50
+ table.autosummary td {
51
+ width: 50%
52
+ }
53
+
54
+ img.align-center {
55
+ display: block;
56
+ margin-left: auto;
57
+ margin-right: auto;
58
+ }
59
+
60
+ article.pytorch-article p.rubric {
61
+ font-weight: bold;
62
+ }
docs/en/_static/image/logo.svg ADDED
docs/en/_static/image/logo_icon.svg ADDED
docs/en/_static/js/custom.js ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ var collapsedSections = ['Dataset Statistics'];
2
+
3
+ $(document).ready(function () {
4
+ $('.dataset').DataTable({
5
+ "stateSave": false,
6
+ "lengthChange": false,
7
+ "pageLength": 20,
8
+ "order": [],
9
+ "language": {
10
+ "info": "Show _START_ to _END_ Items(Totally _TOTAL_ )",
11
+ "infoFiltered": "(Filtered from _MAX_ Items)",
12
+ "search": "Search:",
13
+ "zeroRecords": "Item Not Found",
14
+ "paginate": {
15
+ "next": "Next",
16
+ "previous": "Previous"
17
+ },
18
+ }
19
+ });
20
+ });
docs/en/_templates/404.html ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% extends "layout.html" %}
2
+
3
+ {% block body %}
4
+
5
+ <h1>Page Not Found</h1>
6
+ <p>
7
+ The page you are looking for cannot be found.
8
+ </p>
9
+ <p>
10
+ If you just switched documentation versions, it is likely that the page you were on is moved. You can look for it in
11
+ the content table left, or go to <a href="{{ pathto(root_doc) }}">the homepage</a>.
12
+ </p>
13
+ <!-- <p>
14
+ If you cannot find documentation you want, please <a
15
+ href="">open an issue</a> to tell us!
16
+ </p> -->
17
+
18
+ {% endblock %}
docs/en/_templates/autosummary/class.rst ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .. role:: hidden
2
+ :class: hidden-section
3
+ .. currentmodule:: {{ module }}
4
+
5
+
6
+ {{ name | underline}}
7
+
8
+ .. autoclass:: {{ name }}
9
+ :members:
10
+
11
+ ..
12
+ autogenerated from _templates/autosummary/class.rst
13
+ note it does not have :inherited-members:
docs/en/_templates/callable.rst ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .. role:: hidden
2
+ :class: hidden-section
3
+ .. currentmodule:: {{ module }}
4
+
5
+
6
+ {{ name | underline}}
7
+
8
+ .. autoclass:: {{ name }}
9
+ :members:
10
+ :special-members: __call__
11
+
12
+ ..
13
+ autogenerated from _templates/callable.rst
14
+ note it does not have :inherited-members:
docs/en/advanced_guides/accelerator_intro.md ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Accelerate Evaluation Inference with vLLM or LMDeploy
2
+
3
+ ## Background
4
+
5
+ During the OpenCompass evaluation process, the Huggingface transformers library is used for inference by default. While this is a very general solution, there are scenarios where more efficient inference methods are needed to speed up the process, such as leveraging VLLM or LMDeploy.
6
+
7
+ - [LMDeploy](https://github.com/InternLM/lmdeploy) is a toolkit designed for compressing, deploying, and serving large language models (LLMs), developed by the [MMRazor](https://github.com/open-mmlab/mmrazor) and [MMDeploy](https://github.com/open-mmlab/mmdeploy) teams.
8
+ - [vLLM](https://github.com/vllm-project/vllm) is a fast and user-friendly library for LLM inference and serving, featuring advanced serving throughput, efficient PagedAttention memory management, continuous batching of requests, fast model execution via CUDA/HIP graphs, quantization techniques (e.g., GPTQ, AWQ, SqueezeLLM, FP8 KV Cache), and optimized CUDA kernels.
9
+
10
+ ## Preparation for Acceleration
11
+
12
+ First, check whether the model you want to evaluate supports inference acceleration using vLLM or LMDeploy. Additionally, ensure you have installed vLLM or LMDeploy as per their official documentation. Below are the installation methods for reference:
13
+
14
+ ### LMDeploy Installation Method
15
+
16
+ Install LMDeploy using pip (Python 3.8+) or from [source](https://github.com/InternLM/lmdeploy/blob/main/docs/en/build.md):
17
+
18
+ ```bash
19
+ pip install lmdeploy
20
+ ```
21
+
22
+ ### VLLM Installation Method
23
+
24
+ Install vLLM using pip or from [source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
25
+
26
+ ```bash
27
+ pip install vllm
28
+ ```
29
+
30
+ ## Accelerated Evaluation Using VLLM or LMDeploy
31
+
32
+ ### Method 1: Using Command Line Parameters to Change the Inference Backend
33
+
34
+ OpenCompass offers one-click evaluation acceleration. During evaluation, it can automatically convert Huggingface transformer models to VLLM or LMDeploy models for use. Below is an example code for evaluating the GSM8k dataset using the default Huggingface version of the llama3-8b-instruct model:
35
+
36
+ ```python
37
+ # eval_gsm8k.py
38
+ from mmengine.config import read_base
39
+
40
+ with read_base():
41
+ # Select a dataset list
42
+ from .datasets.gsm8k.gsm8k_0shot_gen_a58960 import gsm8k_datasets as datasets
43
+ # Select an interested model
44
+ from ..models.hf_llama.hf_llama3_8b_instruct import models
45
+ ```
46
+
47
+ Here, `hf_llama3_8b_instruct` specifies the original Huggingface model configuration, as shown below:
48
+
49
+ ```python
50
+ from opencompass.models import HuggingFacewithChatTemplate
51
+
52
+ models = [
53
+ dict(
54
+ type=HuggingFacewithChatTemplate,
55
+ abbr='llama-3-8b-instruct-hf',
56
+ path='meta-llama/Meta-Llama-3-8B-Instruct',
57
+ max_out_len=1024,
58
+ batch_size=8,
59
+ run_cfg=dict(num_gpus=1),
60
+ stop_words=['<|end_of_text|>', '<|eot_id|>'],
61
+ )
62
+ ]
63
+ ```
64
+
65
+ To evaluate the GSM8k dataset using the default Huggingface version of the llama3-8b-instruct model, use:
66
+
67
+ ```bash
68
+ python run.py config/eval_gsm8k.py
69
+ ```
70
+
71
+ To accelerate the evaluation using vLLM or LMDeploy, you can use the following script:
72
+
73
+ ```bash
74
+ python run.py config/eval_gsm8k.py -a vllm
75
+ ```
76
+
77
+ or
78
+
79
+ ```bash
80
+ python run.py config/eval_gsm8k.py -a lmdeploy
81
+ ```
82
+
83
+ ### Method 2: Accelerating Evaluation via Deployed Inference Acceleration Service API
84
+
85
+ OpenCompass also supports accelerating evaluation by deploying vLLM or LMDeploy inference acceleration service APIs. Follow these steps:
86
+
87
+ 1. Install the openai package:
88
+
89
+ ```bash
90
+ pip install openai
91
+ ```
92
+
93
+ 2. Deploy the inference acceleration service API for vLLM or LMDeploy. Below is an example for LMDeploy:
94
+
95
+ ```bash
96
+ lmdeploy serve api_server meta-llama/Meta-Llama-3-8B-Instruct --model-name Meta-Llama-3-8B-Instruct --server-port 23333
97
+ ```
98
+
99
+ Parameters for starting the api_server can be checked using `lmdeploy serve api_server -h`, such as --tp for tensor parallelism, --session-len for the maximum context window length, --cache-max-entry-count for adjusting the k/v cache memory usage ratio, etc.
100
+
101
+ 3. Once the service is successfully deployed, modify the evaluation script by changing the model configuration path to the service address, as shown below:
102
+
103
+ ```python
104
+ from opencompass.models import OpenAISDK
105
+
106
+ api_meta_template = dict(
107
+ round=[
108
+ dict(role='HUMAN', api_role='HUMAN'),
109
+ dict(role='BOT', api_role='BOT', generate=True),
110
+ ],
111
+ reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
112
+ )
113
+
114
+ models = [
115
+ dict(
116
+ abbr='Meta-Llama-3-8B-Instruct-LMDeploy-API',
117
+ type=OpenAISDK,
118
+ key='EMPTY', # API key
119
+ openai_api_base='http://0.0.0.0:23333/v1', # Service address
120
+ path='Meta-Llama-3-8B-Instruct', # Model name for service request
121
+ tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', # The tokenizer name or path, if set to `None`, uses the default `gpt-4` tokenizer
122
+ rpm_verbose=True, # Whether to print request rate
123
+ meta_template=api_meta_template, # Service request template
124
+ query_per_second=1, # Service request rate
125
+ max_out_len=1024, # Maximum output length
126
+ max_seq_len=4096, # Maximum input length
127
+ temperature=0.01, # Generation temperature
128
+ batch_size=8, # Batch size
129
+ retry=3, # Number of retries
130
+ )
131
+ ]
132
+ ```
133
+
134
+ ## Acceleration Effect and Performance Comparison
135
+
136
+ Below is a comparison table of the acceleration effect and performance when using VLLM or LMDeploy on a single A800 GPU for evaluating the Llama-3-8B-Instruct model on the GSM8k dataset:
137
+
138
+ | Inference Backend | Accuracy | Inference Time (minutes:seconds) | Speedup (relative to Huggingface) |
139
+ | ----------------- | -------- | -------------------------------- | --------------------------------- |
140
+ | Huggingface | 74.22 | 24:26 | 1.0 |
141
+ | LMDeploy | 73.69 | 11:15 | 2.2 |
142
+ | VLLM | 72.63 | 07:52 | 3.1 |
docs/en/advanced_guides/circular_eval.md ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CircularEval
2
+
3
+ ## Background
4
+
5
+ For multiple-choice questions, when a Language Model (LLM) provides the correct option, it does not necessarily imply a true understanding and reasoning of the question. It could be a guess. To differentiate these scenarios and reduce LLM bias towards options, CircularEval can be utilized. A multiple-choice question is augmented by shuffling its options, and if the LLM correctly answers all variations of the augmented question, it is considered correct under CircularEval.
6
+
7
+ ## Adding Your Own CircularEval Dataset
8
+
9
+ Generally, to evaluate a dataset using CircularEval, both its loading and evaluation methods need to be rewritten. Modifications are required in both the OpenCompass main library and configuration files. We will use C-Eval as an example for explanation.
10
+
11
+ OpenCompass main library:
12
+
13
+ ```python
14
+ from opencompass.datasets.ceval import CEvalDataset
15
+ from opencompass.datasets.circular import CircularDatasetMeta
16
+
17
+ class CircularCEvalDataset(CEvalDataset, metaclass=CircularDatasetMeta):
18
+ # The overloaded dataset class
19
+ dataset_class = CEvalDataset
20
+
21
+ # Splits of the DatasetDict that need CircularEval. For CEvalDataset, which loads [dev, val, test], we only need 'val' and 'test' for CircularEval, not 'dev'
22
+ default_circular_splits = ['val', 'test']
23
+
24
+ # List of keys to be shuffled
25
+ default_option_keys = ['A', 'B', 'C', 'D']
26
+
27
+ # If the content of 'answer_key' is one of ['A', 'B', 'C', 'D'], representing the correct answer. This field indicates how to update the correct answer after shuffling options. Choose either this or default_answer_key_switch_method
28
+ default_answer_key = 'answer'
29
+
30
+ # If 'answer_key' content is not one of ['A', 'B', 'C', 'D'], a function can be used to specify the correct answer after shuffling options. Choose either this or default_answer_key
31
+ # def default_answer_key_switch_method(item, circular_pattern):
32
+ # # 'item' is the original data item
33
+ # # 'circular_pattern' is a tuple indicating the order after shuffling options, e.g., ('D', 'A', 'B', 'C') means the original option A is now D, and so on
34
+ # item['answer'] = circular_pattern['ABCD'.index(item['answer'])]
35
+ # return item
36
+ ```
37
+
38
+ `CircularCEvalDataset` accepts the `circular_pattern` parameter with two values:
39
+
40
+ - `circular`: Indicates a single cycle. It is the default value. ABCD is expanded to ABCD, BCDA, CDAB, DABC, a total of 4 variations.
41
+ - `all_possible`: Indicates all permutations. ABCD is expanded to ABCD, ABDC, ACBD, ACDB, ADBC, ADCB, BACD, ..., a total of 24 variations.
42
+
43
+ Additionally, we provide a `CircularEvaluator` to replace `AccEvaluator`. This Evaluator also accepts `circular_pattern`, and it should be consistent with the above. It produces the following metrics:
44
+
45
+ - `acc_{origin|circular|all_possible}`: Treating each question with shuffled options as separate, calculating accuracy.
46
+ - `perf_{origin|circular|all_possible}`: Following Circular logic, a question is considered correct only if all its variations with shuffled options are answered correctly, calculating accuracy.
47
+ - `more_{num}_{origin|circular|all_possible}`: According to Circular logic, a question is deemed correct if the number of its variations answered correctly is greater than or equal to num, calculating accuracy.
48
+
49
+ OpenCompass configuration file:
50
+
51
+ ```python
52
+ from mmengine.config import read_base
53
+ from opencompass.datasets.circular import CircularCEvalDataset
54
+
55
+ with read_base():
56
+ from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
57
+
58
+ for d in ceval_datasets:
59
+ # Overloading the load method
60
+ d['type'] = CircularCEvalDataset
61
+ # Renaming for differentiation from non-circular evaluation versions
62
+ d['abbr'] = d['abbr'] + '-circular-4'
63
+ # Overloading the evaluation method
64
+ d['eval_cfg']['evaluator'] = {'type': CircularEvaluator}
65
+
66
+ # The dataset after the above operations looks like this:
67
+ # dict(
68
+ # type=CircularCEvalDataset,
69
+ # path='./data/ceval/formal_ceval', # Unchanged
70
+ # name='computer_network', # Unchanged
71
+ # abbr='ceval-computer_network-circular-4',
72
+ # reader_cfg=dict(...), # Unchanged
73
+ # infer_cfg=dict(...), # Unchanged
74
+ # eval_cfg=dict(evaluator=dict(type=CircularEvaluator), ...),
75
+ # )
76
+ ```
77
+
78
+ Additionally, for better presentation of results in CircularEval, consider using the following summarizer:
79
+
80
+ ```python
81
+
82
+
83
+ from mmengine.config import read_base
84
+ from opencompass.summarizers import CircularSummarizer
85
+
86
+ with read_base():
87
+     from ...summarizers.groups.ceval import ceval_summary_groups
88
+
89
+ new_summary_groups = []
90
+ for item in ceval_summary_groups:
91
+ new_summary_groups.append(
92
+ {
93
+ 'name': item['name'] + '-circular-4',
94
+ 'subsets': [i + '-circular-4' for i in item['subsets']],
95
+ }
96
+ )
97
+
98
+ summarizer = dict(
99
+ type=CircularSummarizer,
100
+ # Select specific metrics to view
101
+ metric_types=['acc_origin', 'perf_circular'],
102
+ dataset_abbrs = [
103
+ 'ceval-circular-4',
104
+ 'ceval-humanities-circular-4',
105
+ 'ceval-stem-circular-4',
106
+ 'ceval-social-science-circular-4',
107
+ 'ceval-other-circular-4',
108
+ ],
109
+ summary_groups=new_summary_groups,
110
+ )
111
+ ```
112
+
113
+ For more complex evaluation examples, refer to this sample code: https://github.com/open-compass/opencompass/tree/main/examples/eval_circular.py
docs/en/advanced_guides/code_eval.md ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Code Evaluation Tutorial
2
+
3
+ This tutorial primarily focuses on evaluating a model's coding proficiency, using `humaneval` and `mbpp` as examples.
4
+
5
+ ## pass@1
6
+
7
+ If you only need to generate a single response to evaluate the pass@1 performance, you can directly use [configs/datasets/humaneval/humaneval_gen_8e312c.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/humaneval/humaneval_gen_8e312c.py) and [configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py), referring to the general [quick start tutorial](../get_started/quick_start.md).
8
+
9
+ For multilingual evaluation, please refer to the [Multilingual Code Evaluation Tutorial](./code_eval_service.md).
10
+
11
+ ## pass@k
12
+
13
+ If you need to generate multiple responses for a single example to evaluate the pass@k performance, consider the following two situations. Here we take 10 responses as an example:
14
+
15
+ ### Typical Situation
16
+
17
+ For most models that support the `num_return_sequences` parameter in HF's generation, we can use it directly to obtain multiple responses. Refer to the following configuration file:
18
+
19
+ ```python
20
+ from opencompass.datasets import MBPPDatasetV2, MBPPPassKEvaluator
21
+
22
+ with read_base():
23
+ from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
24
+ from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
25
+
26
+ mbpp_datasets[0]['type'] = MBPPDatasetV2
27
+ mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator
28
+ mbpp_datasets[0]['reader_cfg']['output_column'] = 'test_column'
29
+
30
+ datasets = []
31
+ datasets += humaneval_datasets
32
+ datasets += mbpp_datasets
33
+
34
+ models = [
35
+ dict(
36
+ type=HuggingFaceCausalLM,
37
+ ...,
38
+ generation_kwargs=dict(
39
+ num_return_sequences=10,
40
+ do_sample=True,
41
+ top_p=0.95,
42
+ temperature=0.8,
43
+ ),
44
+ ...,
45
+ )
46
+ ]
47
+ ```
48
+
49
+ For `mbpp`, new changes are needed in the dataset and evaluation, so we simultaneously modify the `type`, `eval_cfg.evaluator.type`, `reader_cfg.output_column` fields to accommodate these requirements.
50
+
51
+ We also need model responses with randomness, thus setting the `generation_kwargs` parameter is necessary. Note that we need to set `num_return_sequences` to get the number of responses.
52
+
53
+ Note: `num_return_sequences` must be greater than or equal to k, as pass@k itself is a probability estimate.
54
+
55
+ You can specifically refer to the following configuration file [examples/eval_code_passk.py](https://github.com/open-compass/opencompass/blob/main/examples/eval_code_passk.py)
56
+
57
+ ### For Models That Do Not Support Multiple Responses
58
+
59
+ This applies to some HF models with poorly designed APIs or missing features. In this case, we need to repeatedly construct datasets to achieve multiple response effects. Refer to the following configuration:
60
+
61
+ ```python
62
+ from opencompass.datasets import MBPPDatasetV2, MBPPPassKEvaluator
63
+
64
+ with read_base():
65
+ from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
66
+ from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
67
+
68
+ humaneval_datasets[0]['abbr'] = 'openai_humaneval_pass10'
69
+ humaneval_datasets[0]['num_repeats'] = 10
70
+ mbpp_datasets[0]['abbr'] = 'mbpp_pass10'
71
+ mbpp_datasets[0]['num_repeats'] = 10
72
+ mbpp_datasets[0]['type'] = MBPPDatasetV2
73
+ mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator
74
+ mbpp_datasets[0]['reader_cfg']['output_column'] = 'test_column'
75
+
76
+ datasets = []
77
+ datasets += humaneval_datasets
78
+ datasets += mbpp_datasets
79
+
80
+ models = [
81
+ dict(
82
+ type=HuggingFaceCausalLM,
83
+ ...,
84
+ generation_kwargs=dict(
85
+ do_sample=True,
86
+ top_p=0.95,
87
+ temperature=0.8,
88
+ ),
89
+ ...,
90
+ )
91
+ ]
92
+ ```
93
+
94
+ Since the dataset's prompt has not been modified, we need to replace the corresponding fields to achieve the purpose of repeating the dataset.
95
+ You need to modify these fields:
96
+
97
+ - `num_repeats`: the number of times the dataset is repeated
98
+ - `abbr`: It's best to modify the dataset abbreviation along with the number of repetitions because the number of datasets will change, preventing potential issues arising from discrepancies with the values in `.cache/dataset_size.json`.
99
+
100
+ For `mbpp`, modify the `type`, `eval_cfg.evaluator.type`, `reader_cfg.output_column` fields as well.
101
+
102
+ We also need model responses with randomness, thus setting the `generation_kwargs` parameter is necessary.
103
+
104
+ You can specifically refer to the following configuration file [examples/eval_code_passk_repeat_dataset.py](https://github.com/open-compass/opencompass/blob/main/examples/eval_code_passk_repeat_dataset.py)
docs/en/advanced_guides/code_eval_service.md ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Code Evaluation Docker Tutorial
2
+
3
+ To complete the LLM code capability evaluation, we need to build a separate evaluation environment to avoid executing erroneous code in the development environment, which would inevitably cause losses. The code evaluation service currently used by OpenCompass can refer to the [code-evaluator](https://github.com/open-compass/code-evaluator) project. The following will introduce evaluation tutorials around the code evaluation service.
4
+
5
+ 1. humaneval-x
6
+
7
+ This is a multi-programming language dataset [humaneval-x](https://huggingface.co/datasets/THUDM/humaneval-x).
8
+    You can download the dataset from this [download link](https://github.com/THUDM/CodeGeeX2/tree/main/benchmark/humanevalx). Please download the language file (xx.jsonl.gz) that needs to be evaluated and place it in the `./data/humanevalx` folder.
9
+
10
+ The currently supported languages are `python`, `cpp`, `go`, `java`, `js`.
11
+
12
+ 2. DS1000
13
+
14
+ This is a Python multi-algorithm library dataset [ds1000](https://github.com/xlang-ai/DS-1000).
15
+ You can download the dataset from this [download link](https://github.com/xlang-ai/DS-1000/blob/main/ds1000_data.zip).
16
+
17
+ The currently supported algorithm libraries are `Pandas`, `Numpy`, `Tensorflow`, `Scipy`, `Sklearn`, `Pytorch`, `Matplotlib`.
18
+
19
+ ## Launching the Code Evaluation Service
20
+
21
+ 1. Ensure you have installed Docker, please refer to [Docker installation document](https://docs.docker.com/engine/install/).
22
+ 2. Pull the source code of the code evaluation service project and build the Docker image.
23
+
24
+ Choose the dockerfile corresponding to the dataset you need, and replace `humanevalx` or `ds1000` in the command below.
25
+
26
+ ```shell
27
+ git clone https://github.com/open-compass/code-evaluator.git
28
+ docker build -t code-eval-{your-dataset}:latest -f docker/{your-dataset}/Dockerfile .
29
+ ```
30
+
31
+ 3. Create a container with the following commands:
32
+
33
+ ```shell
34
+ # Log output format
35
+ docker run -it -p 5000:5000 code-eval-{your-dataset}:latest python server.py
36
+
37
+ # Run the program in the background
38
+ # docker run -itd -p 5000:5000 code-eval-{your-dataset}:latest python server.py
39
+
40
+ # Using different ports
41
+ # docker run -itd -p 5001:5001 code-eval-{your-dataset}:latest python server.py --port 5001
42
+ ```
43
+
44
+ **Note:**
45
+
46
+ - If you encounter a timeout during the evaluation of Go, please use the following command when creating the container.
47
+
48
+ ```shell
49
+ docker run -it -p 5000:5000 -e GO111MODULE=on -e GOPROXY=https://goproxy.io code-eval-{your-dataset}:latest python server.py
50
+ ```
51
+
52
+ 4. To ensure you have access to the service, use the following command to check the inference environment and evaluation service connection status. (If both inferences and code evaluations run on the same host, skip this step.)
53
+
54
+ ```shell
55
+ ping your_service_ip_address
56
+ telnet your_service_ip_address your_service_port
57
+ ```
58
+
59
+ ## Local Code Evaluation
60
+
61
+ When the model inference and code evaluation services are running on the same host or within the same local area network, code inference and evaluation can be performed directly. **Note: DS1000 is currently not supported, please proceed with remote evaluation.**
62
+
63
+ ### Configuration File
64
+
65
+ We provide [the configuration file](https://github.com/open-compass/opencompass/blob/main/examples/eval_codegeex2.py) of using `humanevalx` for evaluation on `codegeex2` as reference.
66
+
67
+ The dataset and related post-processing configurations files can be found at this [link](https://github.com/open-compass/opencompass/tree/main/configs/datasets/humanevalx) with attention paid to the `evaluator` field in the humanevalx_eval_cfg_dict.
68
+
69
+ ```python
70
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
71
+ from opencompass.openicl.icl_retriever import ZeroRetriever
72
+ from opencompass.openicl.icl_inferencer import GenInferencer
73
+ from opencompass.datasets import HumanevalXDataset, HumanevalXEvaluator
74
+
75
+ humanevalx_reader_cfg = dict(
76
+ input_columns=['prompt'], output_column='task_id', train_split='test')
77
+
78
+ humanevalx_infer_cfg = dict(
79
+ prompt_template=dict(
80
+ type=PromptTemplate,
81
+ template='{prompt}'),
82
+ retriever=dict(type=ZeroRetriever),
83
+ inferencer=dict(type=GenInferencer, max_out_len=1024))
84
+
85
+ humanevalx_eval_cfg_dict = {
86
+ lang : dict(
87
+ evaluator=dict(
88
+ type=HumanevalXEvaluator,
89
+ language=lang,
90
+ ip_address="localhost", # replace to your code_eval_server ip_address, port
91
+ port=5000), # refer to https://github.com/open-compass/code-evaluator to launch a server
92
+ pred_role='BOT')
93
+ for lang in ['python', 'cpp', 'go', 'java', 'js'] # do not support rust now
94
+ }
95
+
96
+ humanevalx_datasets = [
97
+ dict(
98
+ type=HumanevalXDataset,
99
+ abbr=f'humanevalx-{lang}',
100
+ language=lang,
101
+ path='./data/humanevalx',
102
+ reader_cfg=humanevalx_reader_cfg,
103
+ infer_cfg=humanevalx_infer_cfg,
104
+ eval_cfg=humanevalx_eval_cfg_dict[lang])
105
+ for lang in ['python', 'cpp', 'go', 'java', 'js']
106
+ ]
107
+ ```
108
+
109
+ ### Task Launch
110
+
111
+ Refer to the [Quick Start](../get_started.html)
112
+
113
+ ## Remote Code Evaluation
114
+
115
+ Model inference and code evaluation services located in different machines which cannot be accessed directly require prior model inference before collecting the code evaluation results. The configuration file and inference process can be reused from the previous tutorial.
116
+
117
+ ### Collect Inference Results(Only for Humanevalx)
118
+
119
+ In OpenCompass's tools folder, there is a script called `collect_code_preds.py` provided to process and collect the inference results after providing the task launch configuration file during startup along with specifying the working directory used corresponding to the task.
120
+ It is the same as the `-r` option in `run.py`. More details can be referred through the [documentation](https://opencompass.readthedocs.io/en/latest/get_started/quick_start.html#launching-evaluation).
121
+
122
+ ```shell
123
+ python tools/collect_code_preds.py [config] [-r latest]
124
+ ```
125
+
126
+ The collected results will be organized as following under the `-r` folder:
127
+
128
+ ```
129
+ workdir/humanevalx
130
+ ├── codegeex2-6b
131
+ │   ├── humanevalx_cpp.json
132
+ │   ├── humanevalx_go.json
133
+ │   ├── humanevalx_java.json
134
+ │   ├── humanevalx_js.json
135
+ │   └── humanevalx_python.json
136
+ ├── CodeLlama-13b
137
+ │   ├── ...
138
+ ├── CodeLlama-13b-Instruct
139
+ │   ├── ...
140
+ ├── CodeLlama-13b-Python
141
+ │   ├── ...
142
+ ├── ...
143
+ ```
144
+
145
+ For DS1000, you just need to obtain the corresponding prediction file generated by `opencompass`.
146
+
147
+ ### Code Evaluation
148
+
149
+ Make sure your code evaluation service is started, and use `curl` to request:
150
+
151
+ #### The following only supports Humanevalx
152
+
153
+ ```shell
154
+ curl -X POST -F 'file=@{result_absolute_path}' -F 'dataset={dataset/language}' {your_service_ip_address}:{your_service_port}/evaluate
155
+ ```
156
+
157
+ For example:
158
+
159
+ ```shell
160
+ curl -X POST -F 'file=@./examples/humanevalx/python.json' -F 'dataset=humanevalx/python' localhost:5000/evaluate
161
+ ```
162
+
163
+ Then we have:
164
+
165
+ ```
166
+ "{\"pass@1\": 37.19512195121951%}"
167
+ ```
168
+
169
+ Additionally, we offer an extra option named `with_prompt`(Defaults to `True`), since some models(like `WizardCoder`) generate complete codes without requiring the form of concatenating prompt and prediction. You may refer to the following commands for evaluation.
170
+
171
+ ```shell
172
+ curl -X POST -F 'file=@./examples/humanevalx/python.json' -F 'dataset=humanevalx/python' -H 'with-prompt: False' localhost:5000/evaluate
173
+ ```
174
+
175
+ #### The following only supports DS1000
176
+
177
+ Make sure the code evaluation service is started, then use `curl` to submit a request:
178
+
179
+ ```shell
180
+ curl -X POST -F 'file=@./internlm-chat-7b-hf-v11/ds1000_Numpy.json' localhost:5000/evaluate
181
+ ```
182
+
183
+ DS1000 supports additional debug parameters. Be aware that a large amount of log will be generated when it is turned on:
184
+
185
+ - `full`: Additional print out of the original prediction for each error sample, post-processing prediction, running program, and final error.
186
+ - `half`: Additional print out of the running program and final error for each error sample.
187
+ - `error`: Additional print out of the final error for each error sample.
188
+
189
+ ```shell
190
+ curl -X POST -F 'file=@./internlm-chat-7b-hf-v11/ds1000_Numpy.json' -F 'debug=error' localhost:5000/evaluate
191
+ ```
192
+
193
+ You can also modify the `num_workers` in the same way to control the degree of parallelism.
194
+
195
+ ## Advanced Tutorial
196
+
197
+ Besides evaluating the supported HumanEval-X dataset, users might also need:
198
+
199
+ ### Support New Dataset
200
+
201
+ Please refer to the [tutorial on supporting new datasets](./new_dataset.md).
202
+
203
+ ### Modify Post-Processing
204
+
205
+ 1. For local evaluation, follow the post-processing section in the tutorial on supporting new datasets to modify the post-processing method.
206
+ 2. For remote evaluation, please modify the post-processing part in the tool's `collect_code_preds.py`.
207
+ 3. Some parts of post-processing could also be modified in the code evaluation service, more information will be available in the next section.
208
+
209
+ ### Debugging Code Evaluation Service
210
+
211
+ When supporting new datasets or modifying post-processors, it is possible that modifications need to be made to the original code evaluation service. Please make changes based on the following steps:
212
+
213
+ 1. Remove the installation of the `code-evaluator` in `Dockerfile`, mount the `code-evaluator` when starting the container instead:
214
+
215
+ ```shell
216
+ docker run -it -p 5000:5000 -v /local/path/of/code-evaluator:/workspace/code-evaluator code-eval:latest bash
217
+ ```
218
+
219
+ 2. Install and start the code evaluation service locally. At this point, any necessary modifications can be made to the local copy of the `code-evaluator`.
220
+
221
+ ```shell
222
+ cd code-evaluator && pip install -r requirements.txt
223
+ python server.py
224
+ ```
docs/en/advanced_guides/contamination_eval.md ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Data Contamination Assessment
2
+
3
+ **Data Contamination** refers to the phenomenon where data intended for downstream testing tasks appear in the training data of large language models (LLMs), resulting in artificially inflated performance metrics in downstream tasks (such as summarization, natural language inference, text classification), which do not accurately reflect the model's true generalization capabilities.
4
+
5
+ Since the source of data contamination lies in the training data used by LLMs, the most direct method to detect data contamination is to collide test data with training data and then report the extent of overlap between the two. The classic GPT-3 [paper](https://arxiv.org/pdf/2005.14165.pdf) reported on this in Table C.1.
6
+
7
+ However, today's open-source community often only publishes model parameters, not training datasets. In such cases, how to determine the presence and extent of data contamination remains unsolved. OpenCompass offers two possible solutions.
8
+
9
+ ## Contamination Data Annotation Based on Self-Built Co-Distribution Data
10
+
11
+ Referencing the method mentioned in Section 5.2 of [Skywork](https://arxiv.org/pdf/2310.19341.pdf), we directly used the dataset [mock_gsm8k_test](https://huggingface.co/datasets/Skywork/mock_gsm8k_test) uploaded to HuggingFace by Skywork.
12
+
13
+ In this method, the authors used GPT-4 to synthesize data similar to the original GSM8K style, and then calculated the perplexity on the GSM8K training set (train), GSM8K test set (test), and GSM8K reference set (ref). Since the GSM8K reference set was newly generated, the authors considered it as clean, not belonging to any training set of any model. They posited:
14
+
15
+ - If the test set's perplexity is significantly lower than the reference set's, the test set might have appeared in the model's training phase;
16
+ - If the training set's perplexity is significantly lower than the test set's, the training set might have been overfitted by the model.
17
+
18
+ The following configuration file can be referenced:
19
+
20
+ ```python
21
+ from mmengine.config import read_base
22
+
23
+ with read_base():
24
+ from .datasets.gsm8k_contamination.gsm8k_contamination_ppl_ecdd22 import gsm8k_datasets # includes training, test, and reference sets
25
+ from .models.qwen.hf_qwen_7b import models as hf_qwen_7b_model # model under review
26
+ from .models.yi.hf_yi_6b import models as hf_yi_6b_model
27
+
28
+ datasets = [*gsm8k_datasets]
29
+ models = [*hf_qwen_7b_model, *hf_yi_6b_model]
30
+ ```
31
+
32
+ An example output is as follows:
33
+
34
+ ```text
35
+ dataset version metric mode internlm-7b-hf qwen-7b-hf yi-6b-hf chatglm3-6b-base-hf qwen-14b-hf baichuan2-13b-base-hf internlm-20b-hf aquila2-34b-hf ...
36
+ --------------- --------- ----------- ------- ---------------- ------------ ---------- --------------------- ------------- ----------------------- ----------------- ---------------- ...
37
+ gsm8k-train-ppl 0b8e46 average_ppl unknown 1.5 0.78 1.37 1.16 0.5 0.76 1.41 0.78 ...
38
+ gsm8k-test-ppl 0b8e46 average_ppl unknown 1.56 1.33 1.42 1.3 1.15 1.13 1.52 1.16 ...
39
+ gsm8k-ref-ppl f729ba average_ppl unknown 1.55 1.2 1.43 1.35 1.27 1.19 1.47 1.35 ...
40
+ ```
41
+
42
+ Currently, this solution only supports the GSM8K dataset. We welcome the community to contribute more datasets.
43
+
44
+ Consider cite the following paper if you find it helpful:
45
+
46
+ ```bibtex
47
+ @misc{2023opencompass,
48
+ title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
49
+ author={OpenCompass Contributors},
50
+ howpublished = {\url{https://github.com/open-compass/opencompass}},
51
+ year={2023}
52
+ }
53
+ @misc{wei2023skywork,
54
+ title={Skywork: A More Open Bilingual Foundation Model},
55
+ author={Tianwen Wei and Liang Zhao and Lichang Zhang and Bo Zhu and Lijie Wang and Haihua Yang and Biye Li and Cheng Cheng and Weiwei Lü and Rui Hu and Chenxia Li and Liu Yang and Xilin Luo and Xuejie Wu and Lunan Liu and Wenjun Cheng and Peng Cheng and Jianhao Zhang and Xiaoyu Zhang and Lei Lin and Xiaokun Wang and Yutuan Ma and Chuanhai Dong and Yanqi Sun and Yifu Chen and Yongyi Peng and Xiaojuan Liang and Shuicheng Yan and Han Fang and Yahui Zhou},
56
+ year={2023},
57
+ eprint={2310.19341},
58
+ archivePrefix={arXiv},
59
+ primaryClass={cs.CL}
60
+ }
61
+ ```
62
+
63
+ ## Contamination Data Annotation Based on Classic Pre-trained Sets
64
+
65
+ Thanks to [Contamination_Detector](https://github.com/liyucheng09/Contamination_Detector) and @liyucheng09 for providing this method.
66
+
67
+ In this method, the authors search the test datasets (such as C-Eval, ARC, HellaSwag, etc.) using the Common Crawl database and Bing search engine, then mark each test sample as clean / question contaminated / both question and answer contaminated.
68
+
69
+ During testing, OpenCompass
70
+
71
+ will report the accuracy or perplexity of ceval on subsets composed of these three labels. Generally, the accuracy ranges from low to high: clean, question contaminated, both question and answer contaminated subsets. The authors believe:
72
+
73
+ - If the performance of the three is relatively close, the contamination level of the model on that test set is light; otherwise, it is heavy.
74
+
75
+ The following configuration file can be referenced [link](https://github.com/open-compass/opencompass/blob/main/examples/eval_contamination.py):
76
+
77
+ ```python
78
+ from mmengine.config import read_base
79
+
80
+ with read_base():
81
+ from .datasets.ceval.ceval_clean_ppl import ceval_datasets # ceval dataset with contamination tags
82
+ from .models.yi.hf_yi_6b import models as hf_yi_6b_model # model under review
83
+ from .models.qwen.hf_qwen_7b import models as hf_qwen_7b_model
84
+ from .summarizers.contamination import ceval_summarizer as summarizer # output formatting
85
+
86
+ datasets = [*ceval_datasets]
87
+ models = [*hf_yi_6b_model, *hf_qwen_7b_model]
88
+ ```
89
+
90
+ An example output is as follows:
91
+
92
+ ```text
93
+ dataset version mode yi-6b-hf - - qwen-7b-hf - - ...
94
+ ---------------------------------------------- --------- ------ ---------------- ----------------------------- --------------------------------------- ---------------- ----------------------------- --------------------------------------- ...
95
+ - - - accuracy - clean accuracy - input contaminated accuracy - input-and-label contaminated accuracy - clean accuracy - input contaminated accuracy - input-and-label contaminated ...
96
+ ...
97
+ ceval-humanities - ppl 74.42 75.00 82.14 67.44 50.00 70.54 ...
98
+ ceval-stem - ppl 53.70 57.14 85.61 47.41 52.38 67.63 ...
99
+ ceval-social-science - ppl 81.60 84.62 83.09 76.00 61.54 72.79 ...
100
+ ceval-other - ppl 72.31 73.91 75.00 58.46 39.13 61.88 ...
101
+ ceval-hard - ppl 44.35 37.50 70.00 41.13 25.00 30.00 ...
102
+ ceval - ppl 67.32 71.01 81.17 58.97 49.28 67.82 ...
103
+ ```
104
+
105
+ Currently, this solution only supports the C-Eval, MMLU, HellaSwag and ARC. [Contamination_Detector](https://github.com/liyucheng09/Contamination_Detector) also includes CSQA and WinoGrande, but these have not yet been implemented in OpenCompass. We welcome the community to contribute more datasets.
106
+
107
+ Consider cite the following paper if you find it helpful:
108
+
109
+ ```bibtex
110
+ @misc{2023opencompass,
111
+ title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
112
+ author={OpenCompass Contributors},
113
+ howpublished = {\url{https://github.com/open-compass/opencompass}},
114
+ year={2023}
115
+ }
116
+ @article{Li2023AnOS,
117
+ title={An Open Source Data Contamination Report for Llama Series Models},
118
+ author={Yucheng Li},
119
+ journal={ArXiv},
120
+ year={2023},
121
+ volume={abs/2310.17589},
122
+ url={https://api.semanticscholar.org/CorpusID:264490711}
123
+ }
124
+ ```
docs/en/advanced_guides/custom_dataset.md ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dataset Quick Evaluation Tutorial
2
+
3
+ OpenCompass provides two paths for quickly evaluating the provided data, the data format protocol based on ChatMLDataset and the data format protocol based on CustomDataset.
4
+ Compared to the complete dataset integration process in [new_dataset.md](./new_dataset.md), these two evaluation paths are more convenient and efficient, being able to directly enter the evaluation process without adding new configuration files.
5
+ However, if you have specific needs for custom reading/inference/evaluation, it is recommended to follow the complete integration process to add a new dataset.
6
+
7
+ ## Data Format Protocol and Fast Evaluation Based on ChatMLDataset
8
+
9
+ OpenCompass has recently launched a dataset evaluation mode based on the ChatML dialogue template, which allows users to provide a dataset .json file that conforms to the ChatML dialogue template and simply set the dataset information config (like model configs) to start evaluating directly.
10
+
11
+ ### Format Requirements for Data Files
12
+
13
+ This evaluation method only supports data files in `.json` format, and each sample must comply with the following format:
14
+
15
+ The format of a text-only dataset with a simple structure:
16
+
17
+ ```jsonl
18
+ {
19
+ "question":[
20
+ {
21
+ "role": "system" # Omittable
22
+ "content": Str
23
+ },
24
+ {
25
+ "role": "user",
26
+ "content": Str
27
+ }
28
+ ],
29
+ "answer":[
30
+ Str
31
+ ]
32
+ }
33
+ {
34
+ ...
35
+ }
36
+ ...
37
+ ```
38
+
39
+ The format of multi-round and multi-modal datasets:
40
+
41
+ ```jsonl
42
+ {
43
+ "question":[
44
+ {
45
+ "role": "system",
46
+ "content": Str,
47
+ },
48
+ {
49
+ "role": "user",
50
+ "content": Str or List
51
+ [
52
+ {
53
+ "type": Str, # "image"
54
+ "image_url": Str,
55
+ },
56
+ ...
57
+ {
58
+ "type": Str, # "text"
59
+ "text": Str,
60
+ },
61
+ ]
62
+ },
63
+ {
64
+ "role": "assistant",
65
+ "content": Str
66
+ },
67
+ {
68
+ "role": "user",
69
+ "content": Str or List
70
+ },
71
+ ...
72
+ ],
73
+ "answer":[
74
+ Str,
75
+ Str,
76
+ ...
77
+ ]
78
+ }
79
+ {
80
+ ...
81
+ }
82
+ ...
83
+ ```
84
+
85
+ (As OpenCompass currently does not support multi-modal evaluation, the template above is for reference only.)
86
+
87
+ When ChatMLDataset reads `.json` files, it uses `pydantic` to perform simple format validation on them.
88
+ You can use `tools/chatml_format_test.py` to check your provided data file.
89
+
90
+ After format checking, please add a config dictionary named `chatml_datasets` in your running config file to convert the data file into an OpenCompass dataset at runtime.
91
+ An example is as follows:
92
+
93
+ ```python
94
+ chatml_datasets = [
95
+ dict(
96
+ abbr='YOUR_DATASET_NAME',
97
+ path='YOUR_DATASET_PATH',
98
+ evaluator=dict(
99
+ type='cascade_evaluator',
100
+ rule_evaluator=dict(
101
+ type='math_evaluator',
102
+ ),
103
+ llm_evaluator=dict(
104
+ type='llm_evaluator',
105
+ prompt="YOUR_JUDGE_PROMPT",
106
+ judge_cfg=dict(), # YOUR Judge Model Config
107
+ )
108
+ ),
109
+ n=1, # Repeat Number
110
+ ),
111
+ ]
112
+ ```
113
+
114
+ The ChatML evaluation module currently provides four preset evaluators: `mcq_rule_evaluator` (used for MCQ evaluation), `math_evaluator` (used for LaTeX mathematical formula evaluation), `llm_evaluator` (used for evaluating answers that are open-ended or difficult to extract), and `cascade_evaluator`, an evaluation mode composed of rule and LLM evaluators cascaded together.
115
+
116
+ In addition, if you have a long-term need to use datasets based on ChatML templates, you can contribute your dataset config to `opencompass/config/chatml_datasets`.
117
+ An eval example of calling these dataset configs is provided in `examples/evalchat_datasets.py`.
118
+
119
+ ## Data Format Protocol and Fast Evaluation Based on CustomDataset
120
+
121
+ (This module is no longer being updated, but it can still be used if there is a need for quick CLI-based evaluation.)
122
+
123
+ This module supports two types of tasks: multiple choice (`mcq`) and question & answer (`qa`). For `mcq`, both ppl and gen inferences are supported; for `qa`, gen inference is supported.
124
+
125
+ ### Dataset Format
126
+
127
+ We support datasets in both `.jsonl` and `.csv` formats.
128
+
129
+ #### Multiple Choice (`mcq`)
130
+
131
+ For `mcq` datasets, the default fields are as follows:
132
+
133
+ - `question`: The stem of the multiple-choice question.
134
+ - `A`, `B`, `C`, ...: Single uppercase letters representing the options, with no limit on the number. Defaults to parsing consecutive letters starting from `A` as options.
135
+ - `answer`: The correct answer to the multiple-choice question, which must be one of the options used above, such as `A`, `B`, `C`, etc.
136
+
137
+ Non-default fields will be read in but are not used by default. To use them, specify in the `.meta.json` file.
138
+
139
+ An example of the `.jsonl` format:
140
+
141
+ ```jsonl
142
+ {"question": "165+833+650+615=", "A": "2258", "B": "2263", "C": "2281", "answer": "B"}
143
+ {"question": "368+959+918+653+978=", "A": "3876", "B": "3878", "C": "3880", "answer": "A"}
144
+ {"question": "776+208+589+882+571+996+515+726=", "A": "5213", "B": "5263", "C": "5383", "answer": "B"}
145
+ {"question": "803+862+815+100+409+758+262+169=", "A": "4098", "B": "4128", "C": "4178", "answer": "C"}
146
+ ```
147
+
148
+ An example of the `.csv` format:
149
+
150
+ ```csv
151
+ question,A,B,C,answer
152
+ 127+545+588+620+556+199=,2632,2635,2645,B
153
+ 735+603+102+335+605=,2376,2380,2410,B
154
+ 506+346+920+451+910+142+659+850=,4766,4774,4784,C
155
+ 504+811+870+445=,2615,2630,2750,B
156
+ ```
157
+
158
+ #### Question & Answer (`qa`)
159
+
160
+ For `qa` datasets, the default fields are as follows:
161
+
162
+ - `question`: The stem of the question & answer question.
163
+ - `answer`: The correct answer to the question & answer question. It can be missing, indicating the dataset has no correct answer.
164
+
165
+ Non-default fields will be read in but are not used by default. To use them, specify in the `.meta.json` file.
166
+
167
+ An example of the `.jsonl` format:
168
+
169
+ ```jsonl
170
+ {"question": "752+361+181+933+235+986=", "answer": "3448"}
171
+ {"question": "712+165+223+711=", "answer": "1811"}
172
+ {"question": "921+975+888+539=", "answer": "3323"}
173
+ {"question": "752+321+388+643+568+982+468+397=", "answer": "4519"}
174
+ ```
175
+
176
+ An example of the `.csv` format:
177
+
178
+ ```csv
179
+ question,answer
180
+ 123+147+874+850+915+163+291+604=,3967
181
+ 149+646+241+898+822+386=,3142
182
+ 332+424+582+962+735+798+653+214=,4700
183
+ 649+215+412+495+220+738+989+452=,4170
184
+ ```
185
+
186
+ ### Command Line List
187
+
188
+ Custom datasets can be directly called for evaluation through the command line.
189
+
190
+ ```bash
191
+ python run.py \
192
+ --models hf_llama2_7b \
193
+ --custom-dataset-path xxx/test_mcq.csv \
194
+ --custom-dataset-data-type mcq \
195
+ --custom-dataset-infer-method ppl
196
+ ```
197
+
198
+ ```bash
199
+ python run.py \
200
+ --models hf_llama2_7b \
201
+ --custom-dataset-path xxx/test_qa.jsonl \
202
+ --custom-dataset-data-type qa \
203
+ --custom-dataset-infer-method gen
204
+ ```
205
+
206
+ In most cases, `--custom-dataset-data-type` and `--custom-dataset-infer-method` can be omitted. OpenCompass will set them based on the following logic:
207
+
208
+
209
+
210
+ - If options like `A`, `B`, `C`, etc., can be parsed from the dataset file, it is considered an `mcq` dataset; otherwise, it is considered a `qa` dataset.
211
+ - The default `infer_method` is `gen`.
212
+
213
+ ### Configuration File
214
+
215
+ In the original configuration file, simply add a new item to the `datasets` variable. Custom datasets can be mixed with regular datasets.
216
+
217
+ ```python
218
+ datasets = [
219
+ {"path": "xxx/test_mcq.csv", "data_type": "mcq", "infer_method": "ppl"},
220
+ {"path": "xxx/test_qa.jsonl", "data_type": "qa", "infer_method": "gen"},
221
+ ]
222
+ ```
223
+
224
+ ### Supplemental Information for Dataset `.meta.json`
225
+
226
+ OpenCompass will try to parse the input dataset file by default, so in most cases, the `.meta.json` file is **not necessary**. However, if the dataset field names are not the default ones, or custom prompt words are required, it should be specified in the `.meta.json` file.
227
+
228
+ The file is placed in the same directory as the dataset, with the filename followed by `.meta.json`. An example file structure is as follows:
229
+
230
+ ```tree
231
+ .
232
+ ├── test_mcq.csv
233
+ ├── test_mcq.csv.meta.json
234
+ ├── test_qa.jsonl
235
+ └── test_qa.jsonl.meta.json
236
+ ```
237
+
238
+ Possible fields in this file include:
239
+
240
+ - `abbr` (str): Abbreviation of the dataset, serving as its ID.
241
+ - `data_type` (str): Type of dataset, options are `mcq` and `qa`.
242
+ - `infer_method` (str): Inference method, options are `ppl` and `gen`.
243
+ - `human_prompt` (str): User prompt template for generating prompts. Variables in the template are enclosed in `{}`, like `{question}`, `{opt1}`, etc. If `template` exists, this field will be ignored.
244
+ - `bot_prompt` (str): Bot prompt template for generating prompts. Variables in the template are enclosed in `{}`, like `{answer}`, etc. If `template` exists, this field will be ignored.
245
+ - `template` (str or dict): Question template for generating prompts. Variables in the template are enclosed in `{}`, like `{question}`, `{opt1}`, etc. The relevant syntax is in [here](../prompt/prompt_template.md) regarding `infer_cfg['prompt_template']['template']`.
246
+ - `input_columns` (list): List of input fields for reading data.
247
+ - `output_column` (str): Output field for reading data.
248
+ - `options` (list): List of options for reading data, valid only when `data_type` is `mcq`.
249
+
250
+ For example:
251
+
252
+ ```json
253
+ {
254
+ "human_prompt": "Question: 127 + 545 + 588 + 620 + 556 + 199 =\nA. 2632\nB. 2635\nC. 2645\nAnswer: Let's think step by step, 127 + 545 + 588 + 620 + 556 + 199 = 672 + 588 + 620 + 556 + 199 = 1260 + 620 + 556 + 199 = 1880 + 556 + 199 = 2436 + 199 = 2635. So the answer is B.\nQuestion: {question}\nA. {A}\nB. {B}\nC. {C}\nAnswer: ",
255
+ "bot_prompt": "{answer}"
256
+ }
257
+ ```
258
+
259
+ or
260
+
261
+ ```json
262
+ {
263
+ "template": "Question: {my_question}\nX. {X}\nY. {Y}\nZ. {Z}\nW. {W}\nAnswer:",
264
+ "input_columns": ["my_question", "X", "Y", "Z", "W"],
265
+ "output_column": "my_answer",
266
+ }
267
+ ```
docs/en/advanced_guides/evaluation_lightllm.md ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Evaluation with Lightllm
2
+
3
+ We now support the evaluation of large language models using [Lightllm](https://github.com/ModelTC/lightllm) for inference. Developed by SenseTime, LightLLM is a Python-based LLM (Large Language Model) inference and serving framework, notable for its lightweight design, easy scalability, and high-speed performance. Lightllm provides support for various large language models, allowing users to perform model inference through Lightllm, locally deploying it as a service. During the evaluation process, OpenCompass feeds data to Lightllm through an API and processes the response. OpenCompass has been adapted for compatibility with Lightllm, and this tutorial will guide you on using OpenCompass to evaluate models with Lightllm as the inference backend.
4
+
5
+ ## Setup
6
+
7
+ ### Install OpenCompass
8
+
9
+ Please follow the [instructions](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) to install the OpenCompass and prepare the evaluation datasets.
10
+
11
+ ### Install Lightllm
12
+
13
+ Please follow the [Lightllm homepage](https://github.com/ModelTC/lightllm) to install the Lightllm. Pay attention to aligning the versions of relevant dependencies, especially the version of the Transformers.
14
+
15
+ ## Evaluation
16
+
17
+ We use the evaluation of Humaneval with the llama2-7B model as an example.
18
+
19
+ ### Step-1: Deploy the model locally as a service using Lightllm.
20
+
21
+ ```shell
22
+ python -m lightllm.server.api_server --model_dir /path/llama2-7B \
23
+ --host 0.0.0.0 \
24
+ --port 1030 \
25
+ --nccl_port 2066 \
26
+ --max_req_input_len 4096 \
27
+ --max_req_total_len 6144 \
28
+ --tp 1 \
29
+ --trust_remote_code \
30
+ --max_total_token_num 120000
31
+ ```
32
+
33
+ **Note:** tp can be configured to enable TensorParallel inference on several GPUs, suitable for the inference of very large models.
34
+
35
+ **Note:** The max_total_token_num in the above command will affect the throughput performance during testing. It can be configured according to the documentation on the [Lightllm homepage](https://github.com/ModelTC/lightllm). As long as it does not run out of memory, it is often better to set it as high as possible.
36
+
37
+ **Note:** If you want to start multiple LightLLM services on the same machine, you need to reconfigure the above port and nccl_port.
38
+
39
+ You can use the following Python script to quickly test whether the current service has been successfully started.
40
+
41
+ ```python
42
+ import time
43
+ import requests
44
+ import json
45
+
46
+ url = 'http://localhost:1030/generate'
47
+ headers = {'Content-Type': 'application/json'}
48
+ data = {
49
+ 'inputs': 'What is AI?',
50
+ "parameters": {
51
+ 'do_sample': False,
52
+ 'ignore_eos': False,
53
+ 'max_new_tokens': 1024,
54
+ }
55
+ }
56
+ response = requests.post(url, headers=headers, data=json.dumps(data))
57
+ if response.status_code == 200:
58
+ print(response.json())
59
+ else:
60
+ print('Error:', response.status_code, response.text)
61
+ ```
62
+
63
+ ### Step-2: Evaluate the above model using OpenCompass.
64
+
65
+ ```shell
66
+ python run.py examples/eval_lightllm.py
67
+ ```
68
+
69
+ You are expected to get the evaluation results after the inference and evaluation.
70
+
71
+ **Note:** In `eval_lightllm.py`, please align the configured URL with the service address from the previous step.
docs/en/advanced_guides/evaluation_lmdeploy.md ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Evaluation with LMDeploy
2
+
3
+ We now support evaluation of models accelerated by the [LMDeploy](https://github.com/InternLM/lmdeploy). LMDeploy is a toolkit designed for compressing, deploying, and serving LLM. It has a remarkable inference performance. We now illustrate how to evaluate a model with the support of LMDeploy in OpenCompass.
4
+
5
+ ## Setup
6
+
7
+ ### Install OpenCompass
8
+
9
+ Please follow the [instructions](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) to install the OpenCompass and prepare the evaluation datasets.
10
+
11
+ ### Install LMDeploy
12
+
13
+ Install lmdeploy via pip (python 3.8+)
14
+
15
+ ```shell
16
+ pip install lmdeploy
17
+ ```
18
+
19
+ The default prebuilt package is compiled on CUDA 12. However, if CUDA 11+ is required, you can install lmdeploy by:
20
+
21
+ ```shell
22
+ export LMDEPLOY_VERSION=0.6.0
23
+ export PYTHON_VERSION=310
24
+ pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
25
+ ```
26
+
27
+ ## Evaluation
28
+
29
+ When evaluating a model, it is necessary to prepare an evaluation configuration that specifies information such as the evaluation dataset, the model, and inference parameters.
30
+
31
+ Taking [internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) as an example, the evaluation config is as follows:
32
+
33
+ ```python
34
+ # configure the dataset
35
+ from mmengine.config import read_base
36
+
37
+
38
+ with read_base():
39
+ # choose a list of datasets
40
+ from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
41
+ from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
42
+ from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
43
+ from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \
44
+ gsm8k_datasets
45
+ # and output the results in a chosen format
46
+ from .summarizers.medium import summarizer
47
+
48
+ datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
49
+
50
+ # configure lmdeploy
51
+ from opencompass.models import TurboMindModelwithChatTemplate
52
+
53
+
54
+
55
+ # configure the model
56
+ models = [
57
+ dict(
58
+ type=TurboMindModelwithChatTemplate,
59
+ abbr='internlm2-chat-7b-lmdeploy',
60
+ # model path, which can be the address of a model repository on the Hugging Face Hub or a local path
61
+ path='internlm/internlm2-chat-7b',
62
+ # inference backend of LMDeploy. It can be either 'turbomind' or 'pytorch'.
63
+ # If the model is not supported by 'turbomind', it will fallback to
64
+ # 'pytorch'
65
+ backend='turbomind',
66
+ # For the detailed engine config and generation config, please refer to
67
+ # https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/messages.py
68
+ engine_config=dict(tp=1),
69
+ gen_config=dict(do_sample=False),
70
+ # the max size of the context window
71
+ max_seq_len=7168,
72
+ # the max number of new tokens
73
+ max_out_len=1024,
74
+ # the max number of prompts that LMDeploy receives
75
+ # in `generate` function
76
+ batch_size=5000,
77
+ run_cfg=dict(num_gpus=1),
78
+ )
79
+ ]
80
+ ```
81
+
82
+ Place the aforementioned configuration in a file, such as "configs/eval_internlm2_lmdeploy.py". Then, in the home folder of OpenCompass, start evaluation by the following command:
83
+
84
+ ```shell
85
+ python run.py configs/eval_internlm2_lmdeploy.py -w outputs
86
+ ```
87
+
88
+ You are expected to get the evaluation results after the inference and evaluation.
docs/en/advanced_guides/llm_judge.md ADDED
@@ -0,0 +1,370 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # LLM as Judge Evaluation
2
+
3
+ ## Introduction
4
+
5
+ The GenericLLMEvaluator is particularly useful for scenarios where rule-based methods (like regular expressions) cannot perfectly judge outputs, such as:
6
+
7
+ - Cases where models output answer content without option identifiers
8
+ - Factual judgment datasets that are difficult to evaluate with rules
9
+ - Open-ended responses requiring complex understanding and reasoning
10
+ - Evaluation that requires a lot of rules to be designed
11
+
12
+ OpenCompass provides the GenericLLMEvaluator component to facilitate LLM-as-judge evaluations.
13
+
14
+ ## Dataset Format
15
+
16
+ The dataset for LLM judge evaluation should be in either JSON Lines (.jsonl) or CSV format. Each entry should contain at least:
17
+
18
+ - A problem or question
19
+ - A reference answer or gold standard
20
+ - (The model's prediction will be generated during evaluation)
21
+
22
+ Example JSONL format:
23
+
24
+ ```json
25
+ {"problem": "What is the capital of France?", "answer": "Paris"}
26
+ ```
27
+
28
+ Example CSV format:
29
+
30
+ ```csv
31
+ problem,answer
32
+ "What is the capital of France?","Paris"
33
+ ```
34
+
35
+ ## Configuration
36
+
37
+ ### Using LLM for Evaluation via Command Line
38
+
39
+ Some datasets in OpenCompass already include LLM judge configurations.
40
+ You need to use a model service (such as OpenAI or DeepSeek's official API) or start a model service locally using tools like LMDeploy, vLLM, or SGLang.
41
+
42
+ Then, you can set the environment variables for the evaluation service and evaluate models using the following commands:
43
+
44
+ ```bash
45
+ export OC_JUDGE_MODEL=Qwen/Qwen2.5-32B-Instruct
46
+ export OC_JUDGE_API_KEY=sk-1234
47
+ export OC_JUDGE_API_BASE=http://172.30.56.1:4000/v1
48
+ ```
49
+
50
+ Note that by default, OpenCompass will use these three environment variables, but if you use configuration files to configure the evaluation service, these environment variables will not take effect.
51
+
52
+ ### Using LLM for Evaluation via Configuration Files
53
+
54
+ To set up an LLM judge evaluation, you'll need to configure three main components:
55
+
56
+ 1. Dataset Reader Configuration
57
+
58
+ ```python
59
+ reader_cfg = dict(
60
+ input_columns=['problem'], # Column name for the question
61
+ output_column='answer' # Column name for the reference answer
62
+ )
63
+ ```
64
+
65
+ 2. Inference Configuration
66
+
67
+ ```python
68
+ infer_cfg = dict(
69
+ prompt_template=dict(
70
+ type=PromptTemplate,
71
+ template=dict(
72
+ round=[
73
+ dict(
74
+ role='HUMAN',
75
+ prompt='{problem}', # Template for prompting the model
76
+ ),
77
+ ]
78
+ ),
79
+ ),
80
+ retriever=dict(type=ZeroRetriever),
81
+ inferencer=dict(type=GenInferencer),
82
+ )
83
+ ```
84
+
85
+ 3. Evaluation Configuration with LLM Judge
86
+
87
+ ```python
88
+ eval_cfg = dict(
89
+ evaluator=dict(
90
+ type=GenericLLMEvaluator, # Using LLM as evaluator
91
+ prompt_template=dict(
92
+ type=PromptTemplate,
93
+ template=dict(
94
+ begin=[
95
+ dict(
96
+ role='SYSTEM',
97
+ fallback_role='HUMAN',
98
+ prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
99
+ )
100
+ ],
101
+ round=[
102
+ dict(role='HUMAN', prompt=YOUR_JUDGE_TEMPLATE), # Template for the judge
103
+ ],
104
+ ),
105
+ ),
106
+ dataset_cfg=dict(
107
+ type=CustomDataset,
108
+ path='path/to/your/dataset',
109
+ file_name='your_dataset.jsonl',
110
+ reader_cfg=reader_cfg,
111
+ ),
112
+ judge_cfg=YOUR_JUDGE_MODEL_CONFIG, # Configuration for the judge model
113
+ dict_postprocessor=dict(type=generic_llmjudge_postprocess), # Post-processing the judge's output
114
+ ),
115
+ )
116
+ ```
117
+
118
+ ## Using CustomDataset with GenericLLMEvaluator
119
+
120
+ Here's how to set up a complete configuration for LLM judge evaluation:
121
+
122
+ ```python
123
+ from mmengine.config import read_base
124
+ from opencompass.models import TurboMindModelwithChatTemplate
125
+ from opencompass.datasets import CustomDataset
126
+ from opencompass.evaluator import GenericLLMEvaluator
127
+ from opencompass.datasets import generic_llmjudge_postprocess
128
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
129
+ from opencompass.openicl.icl_retriever import ZeroRetriever
130
+ from opencompass.openicl.icl_inferencer import GenInferencer
131
+
132
+ # Import your judge model configuration
133
+ with read_base():
134
+ from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
135
+ models as judge_model,
136
+ )
137
+
138
+ # Define your judge template
139
+ JUDGE_TEMPLATE = """
140
+ Please evaluate whether the following response correctly answers the question.
141
+ Question: {problem}
142
+ Reference Answer: {answer}
143
+ Model Response: {prediction}
144
+
145
+ Is the model response correct? If correct, answer "A"; if incorrect, answer "B".
146
+ """.strip()
147
+
148
+ # Dataset reader configuration
149
+ reader_cfg = dict(input_columns=['problem'], output_column='answer')
150
+
151
+ # Inference configuration for the model being evaluated
152
+ infer_cfg = dict(
153
+ prompt_template=dict(
154
+ type=PromptTemplate,
155
+ template=dict(
156
+ round=[
157
+ dict(
158
+ role='HUMAN',
159
+ prompt='{problem}',
160
+ ),
161
+ ]
162
+ ),
163
+ ),
164
+ retriever=dict(type=ZeroRetriever),
165
+ inferencer=dict(type=GenInferencer),
166
+ )
167
+
168
+ # Evaluation configuration with LLM judge
169
+ eval_cfg = dict(
170
+ evaluator=dict(
171
+ type=GenericLLMEvaluator,
172
+ prompt_template=dict(
173
+ type=PromptTemplate,
174
+ template=dict(
175
+ begin=[
176
+ dict(
177
+ role='SYSTEM',
178
+ fallback_role='HUMAN',
179
+ prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
180
+ )
181
+ ],
182
+ round=[
183
+ dict(role='HUMAN', prompt=JUDGE_TEMPLATE),
184
+ ],
185
+ ),
186
+ ),
187
+ dataset_cfg=dict(
188
+ type=CustomDataset,
189
+ path='path/to/your/dataset',
190
+ file_name='your_dataset.jsonl',
191
+ reader_cfg=reader_cfg,
192
+ ),
193
+ judge_cfg=judge_model[0],
194
+ dict_postprocessor=dict(type=generic_llmjudge_postprocess),
195
+ ),
196
+ pred_role='BOT',
197
+ )
198
+
199
+ # Dataset configuration
200
+ datasets = [
201
+ dict(
202
+ type=CustomDataset,
203
+ abbr='my-dataset',
204
+ path='path/to/your/dataset',
205
+ file_name='your_dataset.jsonl',
206
+ reader_cfg=reader_cfg,
207
+ infer_cfg=infer_cfg,
208
+ eval_cfg=eval_cfg,
209
+ )
210
+ ]
211
+
212
+ # Model configuration for the model being evaluated
213
+ models = [
214
+ dict(
215
+ type=TurboMindModelwithChatTemplate,
216
+ abbr='model-to-evaluate',
217
+ path='path/to/your/model',
218
+ # ... other model configurations
219
+ )
220
+ ]
221
+
222
+ # Output directory
223
+ work_dir = './outputs/llm_judge_eval'
224
+ ```
225
+
226
+ ## GenericLLMEvaluator
227
+
228
+ The GenericLLMEvaluator is designed to use an LLM as a judge for evaluating model outputs. Key features include:
229
+
230
+ 1. Flexible prompt templates for instructing the judge
231
+ 2. Support for various judge models (local or API-based)
232
+ 3. Customizable evaluation criteria through prompt engineering
233
+ 4. Post-processing of judge outputs to extract structured evaluations
234
+
235
+ **Important Note**: The current generic version of the judge template only supports outputs in the format of "A" (correct) or "B" (incorrect), and does not support other output formats (like "CORRECT" or "INCORRECT"). This is because the post-processing function `generic_llmjudge_postprocess` is specifically designed to parse this format.
236
+
237
+ The evaluator works by:
238
+
239
+ 1. Taking the original problem, reference answer, and model prediction
240
+ 2. Formatting them into a prompt for the judge model
241
+ 3. Parsing the judge's response to determine the evaluation result (looking for "A" or "B")
242
+ 4. Aggregating results across the dataset
243
+
244
+ If you would like to see the full details of evaluation results, you can add `--dump-eval-details` to the command line when you start the job.
245
+ Example evaluation output:
246
+
247
+ ```python
248
+ {
249
+ 'accuracy': 75.0, # Percentage of responses judged as correct
250
+ 'details': [
251
+ {
252
+ 'origin_prompt': """
253
+ Please evaluate whether the following response correctly answers the question.
254
+ Question: What is the capital of France?
255
+ Reference Answer: Paris
256
+ Model Response: Paris
257
+ Is the model response correct? If correct, answer "A"; if incorrect, answer "B".
258
+ """,
259
+ 'gold': 'Paris',
260
+ 'prediction': 'A',
261
+ },
262
+ # ... more results
263
+ ]
264
+ }
265
+ ```
266
+
267
+ ## CascadeEvaluator
268
+
269
+ OpenCompass also provides a CascadeEvaluator that combines the strengths of rule-based evaluation and LLM-based evaluation. The cascade evaluator has two modes:
270
+
271
+ 1. **Cascade Mode (parallel=False)**: First evaluates all samples with a rule-based evaluator, then only sends samples that were deemed incorrect by the rule-based evaluation to an LLM judge for re-evaluation. This approach reduces reliance on LLM judgments while maintaining accuracy, thus lowering evaluation costs and time.
272
+
273
+ 2. **Parallel Mode (parallel=True)**: Evaluates all samples with both the rule-based evaluator and LLM judge, then considers a sample correct if either method marks it as correct. This approach can increase the leniency of evaluation but may result in higher costs since all samples require LLM evaluation.
274
+
275
+ ### Configuring CascadeEvaluator
276
+
277
+ Here's an example of how to configure the CascadeEvaluator:
278
+
279
+ ```python
280
+ # Define a rule-based evaluator
281
+ rule_evaluator = dict(type=MATHVerifyEvaluator)
282
+
283
+ # Define an LLM judge evaluator
284
+ llm_judge_evaluator = dict(
285
+ type=GenericLLMEvaluator,
286
+ prompt_template=dict(
287
+ type=PromptTemplate,
288
+ template=dict(
289
+ begin=[
290
+ dict(
291
+ role='SYSTEM',
292
+ fallback_role='HUMAN',
293
+ prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
294
+ )
295
+ ],
296
+ round=[
297
+ dict(role='HUMAN', prompt=YOUR_JUDGE_TEMPLATE),
298
+ ],
299
+ ),
300
+ ),
301
+ dataset_cfg=dict(
302
+ type=YourDataset,
303
+ path='path/to/your/dataset',
304
+ reader_cfg=reader_cfg,
305
+ ),
306
+ judge_cfg=dict(), # Can use environment variables to configure the judge model
307
+ )
308
+
309
+ # Configure cascade evaluator (cascade mode)
310
+ cascade_evaluator = dict(
311
+ type=CascadeEvaluator,
312
+ llm_evaluator=llm_judge_evaluator,
313
+ rule_evaluator=rule_evaluator,
314
+ parallel=False # Cascade mode
315
+ )
316
+
317
+ # For parallel mode, set parallel=True
318
+ parallel_evaluator = dict(
319
+ type=CascadeEvaluator,
320
+ llm_evaluator=llm_judge_evaluator,
321
+ rule_evaluator=rule_evaluator,
322
+ parallel=True # Parallel mode
323
+ )
324
+
325
+ # Use the cascade evaluator in your dataset evaluation config
326
+ eval_cfg = dict(evaluator=cascade_evaluator)
327
+ ```
328
+
329
+ ### Evaluation Results
330
+
331
+ The cascade evaluator outputs detailed evaluation statistics including:
332
+
333
+ - Accuracy of the rule-based evaluation
334
+ - Accuracy of the LLM evaluation (for samples that failed rule-based evaluation in cascade mode)
335
+ - Final combined accuracy
336
+
337
+ Example output:
338
+
339
+ ```python
340
+ {
341
+ 'accuracy': 85.0, # Final accuracy
342
+ 'cascade_stats': {
343
+ 'total_samples': 100,
344
+ 'rule_correct': 70, # Number of samples correct by rule evaluation
345
+ 'rule_accuracy': 70.0, # Accuracy of rule evaluation
346
+ 'llm_evaluated': 30, # Number of samples evaluated by LLM (failed samples in cascade mode)
347
+ 'llm_correct': 15, # Number of samples correct by LLM evaluation
348
+ 'llm_accuracy': 50.0, # Accuracy of LLM evaluation
349
+ 'final_correct': 85, # Total correct samples
350
+ 'final_accuracy': 85.0, # Final accuracy
351
+ 'parallel_mode': False, # Whether parallel mode was used
352
+ },
353
+ 'details': [
354
+ # Detailed evaluation results for each sample
355
+ ]
356
+ }
357
+ ```
358
+
359
+ The cascade evaluator is particularly useful for:
360
+
361
+ 1. Scenarios that require balancing evaluation cost and accuracy
362
+ 2. Cases where rule-based evaluators are available but might not be comprehensive
363
+ 3. Evaluation tasks that need more nuanced judgment for edge cases
364
+
365
+ ## Complete Example
366
+
367
+ For a complete working example using GenericLLMEvaluator, refer to the `eval_llm_judge.py` file in the examples directory, which demonstrates how to evaluate mathematical problem-solving.
368
+
369
+
370
+ For a complete working example using CascadeEvaluator, refer to the `eval_cascade_evaluator.py` file in the examples directory, which demonstrates how to evaluate mathematical problem-solving.
docs/en/advanced_guides/longeval.md ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Long Context Evaluation Guidance
2
+
3
+ ## Introduction
4
+
5
+ Although large-scale language models (LLMs) such as GPT-4 have demonstrated significant advantages in handling natural language tasks, most current open-source models can only handle texts with a length of a few thousand tokens, which limits their ability to process long contexts such as reading books and writing text summaries. To explore the performance of models in dealing with long contexts, we use the [L-Eval](https://github.com/OpenLMLab/LEval) and [LongBench](https://github.com/THUDM/LongBench) datasets to test the model's ability to handle long contexts.
6
+
7
+ ## Existing Algorithms and models
8
+
9
+ When dealing with long context inputs, the two main challenges faced by large models are the inference time cost and catastrophic forgetting. Recently, a large amount of research has been devoted to extending the model length, focusing on three improvement directions:
10
+
11
+ - Attention mechanisms. The ultimate goal of these methods is to reduce the computation cost of query-key pairs, but they may affect the performance of downstream tasks.
12
+ - Input methods. Some studies divide long context inputs into chunks or retrieve pre-existing text segments to enhance the model's ability to handle long contexts, but these methods are only effective for some tasks and are difficult to adapt to multiple downstream tasks.
13
+ - Position encoding. This research includes RoPE, ALiBi, Position Interpolation etc., which have shown good results in length extrapolation. These methods have been used to train long context models such as ChatGLM2-6B-32k and LongChat-32k.
14
+
15
+ First, we introduce some popular position encoding algorithms.
16
+
17
+ ### RoPE
18
+
19
+ RoPE is a type of positional embedding that injects the information of position in Transformer. It encodes the absolute position with a rotation matrix and meanwhile incorporates the explicit relative position dependency in self-attention formulation. A graphic illustration of RoPE is shown below.
20
+
21
+ <div align="center">
22
+ <img src=https://github.com/open-compass/opencompass/assets/75252858/08c57958-0dcb-40d7-b91b-33f20ca2d89f>
23
+ </div>
24
+
25
+ RoPE comes with valuable properties such as the flexibility of being expanded to any sequence length, decaying inter-token dependency with increasing relative distances, and the capability of equipping the linear self-attention with relative position encoding.
26
+
27
+ RoPE is adopted in many LLMs including LLaMA, LLaMA 2 and Vicuna-7b-v1.5-16k.
28
+
29
+ ### ALiBi
30
+
31
+ Though RoPE and other alternatives to the original sinusoidal position method (like T5 bias) have improved extrapolation, they are considerably slower than the sinusoidal approach and use extra memory and parameters. Therefore, Attention with Linear Biases (ALiBi) is introduced to facilitate efficient extrapolation.
32
+
33
+ For an input subsequence of length L, the attention sublayer computes the attention scores for the ith query
34
+
35
+ ```{math}
36
+ q_{i} \in R^{1 \times d}, (1 \leq i \leq L)
37
+ ```
38
+
39
+ in each head, given the first i keys
40
+
41
+ ```{math}
42
+ K \in R^{i \times d}
43
+ ```
44
+
45
+ where d is the head dimension.
46
+
47
+ ```{math}
48
+ softmax(q_{i}K^{T})
49
+ ```
50
+
51
+ ALiBi negatively biases attention scores with a linearly decreasing penalty proportional to the distance between the relevant key and query. The only modification it applies is after the query-key dot product, where it adds a static, non-learned bias.
52
+
53
+ ```{math}
54
+ softmax(q_{i}K^{T}+m\cdot[-(i-1),...,-2,-1,0])
55
+ ```
56
+
57
+ where scalar m is a head-specific slope fixed before training.
58
+
59
+ ALiBi eliminates position embeddings and it is as fast as the sinusoidal approach. It is used in LLMs including mpt-7b-storywriter, which is prepared to handle extremely long inputs.
60
+
61
+ ### Position Interpolation(PI)
62
+
63
+ Many existing pre-trained LLMs including LLaMA use positional encodings that have weak extrapolation properties(e.g. RoPE). Position Interpolation is proposed and it can easily enable very long context windows while preserving model quality relatively well for the tasks within its original context window size.
64
+
65
+ The key idea of Position Interpolation is to directly down-scale the position indices so that the maximum position index matches the previous context window limit in the pre-training stage. In other words, to accommodate more input tokens, the algorithm interpolates position encodings at neighboring integer positions, utilizing the fact that position encodings can be applied on non-integer positions, as opposed to extrapolating outside the trained positions, which may lead to catastrophic values. The algorithm requires only a very short period of fine-tuning for the model to fully adapt to greatly extended context windows.
66
+
67
+ An illustration of Position Interpolation method is shown below. Lower left illustrates Position Interpolation where it downscales the position indices (blue and green dots) themselves from \[0, 4096\] to \[0, 2048\] to force them to reside in the pretrained range.
68
+
69
+ <div align="center">
70
+ <img src=https://github.com/open-compass/opencompass/assets/75252858/406454ba-a811-4c66-abbe-3a5528947257>
71
+ </div>
72
+
73
+ Position Interpolation empowers ChatGLM2-6B-32k, a model based on ChatGLM2-6B, to deal with a 32k context window size.
74
+
75
+ Next, we introduce some long context language models we evaluate.
76
+
77
+ ### XGen-7B-8k
78
+
79
+ XGen-7B-8k is trained with standard dense attention on up to 8k sequence length for up to 1.5T tokens. To mitigate slow training, XGen-7B-8k introduces training in stages with increasing sequence length. First, 800B tokens with sequence length of 2k tokens are observed, then 400B tokens with 4k, finally, 300B tokens with 8k length.
80
+
81
+ ### Vicuna-7b-v1.5-16k
82
+
83
+ Vicuna-7b-v1.5-16k is fine-tuned from LLaMA 2 with supervised instruction fine-tuning and linear RoPE scaling. The training data is around 125K conversations collected from ShareGPT, a website where users can share their ChatGPT conversation. These conversations are packed into sequences that contain 16k tokens each.
84
+
85
+ ### LongChat-7b-v1.5-32k
86
+
87
+ LongChat-7b-v1.5-32k is fine-tuned from LLaMA 2 models, which were originally pretrained with 4k context length. The training recipe can be conceptually described in two steps. The first step is condensing RoPE. Since the LLaMA model has not observed scenarios where position_ids > 4096 during the pre-training phase, LongChat condenses position_ids > 4096 to be within 0 to 4096. The second step is fine-tuning the LongChat model on curated conversation data. In this step, the data is cleaned using the FastChat data pipeline and truncated to the maximum length of the model.
88
+
89
+ ### ChatGLM2-6B-32k
90
+
91
+ The ChatGLM2-6B-32k further strengthens the ability to understand long texts based on the ChatGLM2-6B. Based on the method of Positional Interpolation, and trained with a 32K context length during the dialogue alignment, ChatGLM2-6B-32k can better handle up to 32K context length.
92
+
93
+ ## [L-Eval](https://github.com/OpenLMLab/LEval)
94
+
95
+ L-Eval is a long context dataset built by OpenLMLab, consisting of 18 subtasks, including texts from various fields such as law, economy, and technology. The dataset consists of a total of 411 documents, over 2000 test cases, with an average document length of 7217 words. The subtasks in this dataset are divided into close-ended and open-ended categories, with 5 close-ended tasks evaluated using the exact match criterion and 13 open-ended tasks evaluated using Rouge scores.
96
+
97
+ ## [LongBench](https://github.com/THUDM/LongBench)
98
+
99
+ LongBench is a long context dataset built by THUDM, consisting of 21 subtasks with a total of 4750 test cases. This dataset is the first long context dataset that includes both English and Chinese texts, with an average English text length of 6711 words and an average Chinese text length of 13386 characters. The 21 subtasks are divided into 6 types, providing a more comprehensive evaluation of the model's capabilities in various aspects.
100
+
101
+ <div align="center">
102
+ <img src=https://github.com/open-compass/opencompass/assets/75252858/4555e937-c519-4e9c-ad8d-7370430d466a>
103
+ </div>
104
+
105
+ ## Evaluation Method
106
+
107
+ Due to the different maximum input lengths accepted by different models, in order to compare these large models more fairly, when the input length exceeds the maximum input limit of the model, we will trim the middle part of the input text to avoid missing prompt words.
108
+
109
+ ## Long Context Ability Ranking
110
+
111
+ In the LongBench and L-Eval ability rankings, we select the average ranking **(The lower the better)** of each model in the subtask as the standard. It can be seen that GPT-4 and GPT-3.5-turbo-16k still occupy a leading position in long context tasks, while models like ChatGLM2-6B-32k also show significant improvement in long context ability after position interpolation based on ChatGLM2-6B.
112
+
113
+ <div align="center">
114
+ <img src=https://github.com/open-compass/opencompass/assets/75252858/29b5ad12-d9a3-4255-be0a-f770923fe514>
115
+ <img src=https://github.com/open-compass/opencompass/assets/75252858/680b4cda-c2b1-45d1-8c33-196dee1a38f3>
116
+ </div>
117
+
118
+ The original scores are shown below.
119
+
120
+ | L-Eval | GPT-4 | GPT-3.5-turbo-16k | chatglm2-6b-32k | vicuna-7b-v1.5-16k | xgen-7b-8k | internlm-chat-7b-8k | longchat-7b-v1.5-32k | chatglm2-6b |
121
+ | ----------------- | ----- | ----------------- | --------------- | ------------------ | ---------- | ------------------- | -------------------- | ----------- |
122
+ | coursera | 61.05 | 50 | 45.35 | 26.74 | 33.72 | 40.12 | 27.91 | 38.95 |
123
+ | gsm100 | 92 | 78 | 27 | 11 | 8 | 19 | 5 | 8 |
124
+ | quality | 81.19 | 62.87 | 44.55 | 11.39 | 33.66 | 45.54 | 29.7 | 41.09 |
125
+ | tpo | 72.93 | 74.72 | 56.51 | 17.47 | 44.61 | 60.59 | 17.1 | 56.51 |
126
+ | topic_retrieval | 100 | 79.33 | 44.67 | 24.67 | 1.33 | 0 | 25.33 | 1.33 |
127
+ | | | | | | | | | |
128
+ | financialqa | 53.49 | 50.32 | 35.41 | 44.59 | 39.28 | 25.09 | 34.07 | 17.82 |
129
+ | gov_report | 50.84 | 50.48 | 42.97 | 48.17 | 38.52 | 31.29 | 36.52 | 41.88 |
130
+ | legal_contract_qa | 31.23 | 27.97 | 34.21 | 24.25 | 21.36 | 19.28 | 13.32 | 17.59 |
131
+ | meeting_summ | 31.44 | 33.54 | 29.13 | 28.52 | 27.96 | 17.56 | 22.32 | 15.98 |
132
+ | multidocqa | 37.81 | 35.84 | 28.6 | 26.88 | 24.41 | 22.43 | 21.85 | 19.66 |
133
+ | narrativeqa | 25.87 | 25.73 | 18.24 | 20.58 | 16.87 | 13.81 | 16.87 | 1.16 |
134
+ | nq | 67.36 | 66.91 | 41.06 | 36.44 | 29.43 | 16.42 | 35.02 | 0.92 |
135
+ | news_summ | 34.52 | 40.41 | 32.72 | 33.98 | 26.87 | 22.48 | 30.33 | 29.51 |
136
+ | paper_assistant | 42.26 | 41.76 | 34.59 | 35.83 | 25.39 | 28.25 | 30.42 | 30.43 |
137
+ | patent_summ | 48.61 | 50.62 | 46.04 | 48.87 | 46.53 | 30.3 | 41.6 | 41.25 |
138
+ | review_summ | 31.98 | 33.37 | 21.88 | 29.21 | 26.85 | 16.61 | 20.02 | 19.68 |
139
+ | scientificqa | 49.76 | 48.32 | 31.27 | 31 | 27.43 | 33.01 | 20.98 | 13.61 |
140
+ | tvshow_summ | 34.84 | 31.36 | 23.97 | 27.88 | 26.6 | 14.55 | 25.09 | 19.45 |
141
+
142
+ | LongBench | GPT-4 | GPT-3.5-turbo-16k | chatglm2-6b-32k | longchat-7b-v1.5-32k | vicuna-7b-v1.5-16k | internlm-chat-7b-8k | chatglm2-6b | xgen-7b-8k |
143
+ | ------------------- | ----- | ----------------- | --------------- | -------------------- | ------------------ | ------------------- | ----------- | ---------- |
144
+ | NarrativeQA | 31.2 | 25.79 | 19.27 | 19.19 | 23.65 | 12.24 | 13.09 | 18.85 |
145
+ | Qasper | 42.77 | 43.4 | 33.93 | 30.36 | 31.45 | 24.81 | 22.52 | 20.18 |
146
+ | MultiFieldQA-en | 55.1 | 54.35 | 45.58 | 44.6 | 43.38 | 25.41 | 38.09 | 37 |
147
+ | MultiFieldQA-zh | 64.4 | 61.92 | 52.94 | 32.35 | 44.65 | 36.13 | 37.67 | 14.7 |
148
+ | | | | | | | | | |
149
+ | HotpotQA | 59.85 | 52.49 | 46.41 | 34.43 | 34.17 | 27.42 | 27.35 | 28.78 |
150
+ | 2WikiMQA | 67.52 | 41.7 | 33.63 | 23.06 | 20.45 | 26.24 | 22.83 | 20.13 |
151
+ | Musique | 37.53 | 27.5 | 21.57 | 12.42 | 13.92 | 9.75 | 7.26 | 11.34 |
152
+ | DuReader (zh) | 38.65 | 29.37 | 38.53 | 20.25 | 20.42 | 11.11 | 17.18 | 8.57 |
153
+ | | | | | | | | | |
154
+ | GovReport | 32.09 | 29.92 | 32.47 | 29.83 | 29.27 | 18.38 | 22.86 | 23.37 |
155
+ | QMSum | 24.37 | 23.67 | 23.19 | 22.71 | 23.37 | 18.45 | 21.23 | 21.12 |
156
+ | Multi_news | 28.52 | 27.05 | 25.12 | 26.1 | 27.83 | 24.52 | 24.7 | 23.69 |
157
+ | VCSUM (zh) | 15.54 | 16.88 | 15.95 | 13.46 | 15.76 | 12.91 | 14.07 | 0.98 |
158
+ | | | | | | | | | |
159
+ | TREC | 78.5 | 73.5 | 30.96 | 29.23 | 32.06 | 39 | 24.46 | 29.31 |
160
+ | TriviaQA | 92.19 | 92.75 | 80.64 | 64.19 | 46.53 | 79.55 | 64.19 | 69.58 |
161
+ | SAMSum | 46.32 | 43.16 | 29.49 | 25.23 | 25.23 | 43.05 | 20.22 | 16.05 |
162
+ | LSHT (zh) | 41.5 | 34.5 | 22.75 | 20 | 24.75 | 20.5 | 16 | 18.67 |
163
+ | | | | | | | | | |
164
+ | Passage Count | 8.5 | 3 | 3 | 1 | 3 | 1.76 | 3 | 1 |
165
+ | PassageRetrieval-en | 75 | 73 | 57.5 | 20.5 | 16.5 | 7 | 5.5 | 12 |
166
+ | PassageRetrieval-zh | 96 | 82.5 | 58 | 15 | 21 | 2.29 | 5 | 3.75 |
167
+ | | | | | | | | | |
168
+ | LCC | 59.25 | 53.49 | 53.3 | 51.46 | 49.3 | 49.32 | 46.59 | 44.1 |
169
+ | RepoBench-P | 55.42 | 55.95 | 46.66 | 52.18 | 41.49 | 35.86 | 41.97 | 41.83 |
docs/en/advanced_guides/math_verify.md ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # General Math Evaluation Guidance
2
+
3
+ ## Introduction
4
+
5
+ Mathematical reasoning is a crucial capability for large language models (LLMs). To evaluate a model's mathematical abilities, we need to test its capability to solve mathematical problems step by step and provide accurate final answers. OpenCompass provides a convenient way to evaluate mathematical reasoning through the CustomDataset and MATHVerifyEvaluator components.
6
+
7
+ ## Dataset Format
8
+
9
+ The math evaluation dataset should be in either JSON Lines (.jsonl) or CSV format. Each problem should contain at least:
10
+
11
+ - A problem statement
12
+ - A solution/answer (typically in LaTeX format with the final answer in \\boxed{})
13
+
14
+ Example JSONL format:
15
+
16
+ ```json
17
+ {"problem": "Find the value of x if 2x + 3 = 7", "solution": "Let's solve step by step:\n2x + 3 = 7\n2x = 7 - 3\n2x = 4\nx = 2\nTherefore, \\boxed{2}"}
18
+ ```
19
+
20
+ Example CSV format:
21
+
22
+ ```csv
23
+ problem,solution
24
+ "Find the value of x if 2x + 3 = 7","Let's solve step by step:\n2x + 3 = 7\n2x = 7 - 3\n2x = 4\nx = 2\nTherefore, \\boxed{2}"
25
+ ```
26
+
27
+ ## Configuration
28
+
29
+ To evaluate mathematical reasoning, you'll need to set up three main components:
30
+
31
+ 1. Dataset Reader Configuration
32
+
33
+ ```python
34
+ math_reader_cfg = dict(
35
+ input_columns=['problem'], # Column name for the question
36
+ output_column='solution' # Column name for the answer
37
+ )
38
+ ```
39
+
40
+ 2. Inference Configuration
41
+
42
+ ```python
43
+ math_infer_cfg = dict(
44
+ prompt_template=dict(
45
+ type=PromptTemplate,
46
+ template=dict(
47
+ round=[
48
+ dict(
49
+ role='HUMAN',
50
+ prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.',
51
+ ),
52
+ ]
53
+ ),
54
+ ),
55
+ retriever=dict(type=ZeroRetriever),
56
+ inferencer=dict(type=GenInferencer),
57
+ )
58
+ ```
59
+
60
+ 3. Evaluation Configuration
61
+
62
+ ```python
63
+ math_eval_cfg = dict(
64
+ evaluator=dict(type=MATHVerifyEvaluator),
65
+ )
66
+ ```
67
+
68
+ ## Using CustomDataset
69
+
70
+ Here's how to set up a complete configuration for math evaluation:
71
+
72
+ ```python
73
+ from mmengine.config import read_base
74
+ from opencompass.models import TurboMindModelwithChatTemplate
75
+ from opencompass.datasets import CustomDataset
76
+
77
+ math_datasets = [
78
+ dict(
79
+ type=CustomDataset,
80
+ abbr='my-math-dataset', # Dataset abbreviation
81
+ path='path/to/your/dataset', # Path to your dataset file
82
+ reader_cfg=math_reader_cfg,
83
+ infer_cfg=math_infer_cfg,
84
+ eval_cfg=math_eval_cfg,
85
+ )
86
+ ]
87
+ ```
88
+
89
+ ## MATHVerifyEvaluator
90
+
91
+ The MATHVerifyEvaluator is specifically designed to evaluate mathematical answers. It is developed based on the math_verify library, which provides mathematical expression parsing and verification capabilities, supporting extraction and equivalence verification for both LaTeX and general expressions.
92
+
93
+ The MATHVerifyEvaluator implements:
94
+
95
+ 1. Extracts answers from both predictions and references using LaTeX extraction
96
+ 2. Handles various LaTeX formats and environments
97
+ 3. Verifies mathematical equivalence between predicted and reference answers
98
+ 4. Provides detailed evaluation results including:
99
+ - Accuracy score
100
+ - Detailed comparison between predictions and references
101
+ - Parse results of both predicted and reference answers
102
+
103
+ The evaluator supports:
104
+
105
+ - Basic arithmetic operations
106
+ - Fractions and decimals
107
+ - Algebraic expressions
108
+ - Trigonometric functions
109
+ - Roots and exponents
110
+ - Mathematical symbols and operators
111
+
112
+ Example evaluation output:
113
+
114
+ ```python
115
+ {
116
+ 'accuracy': 85.0, # Percentage of correct answers
117
+ 'details': [
118
+ {
119
+ 'predictions': 'x = 2', # Parsed prediction
120
+ 'references': 'x = 2', # Parsed reference
121
+ 'correct': True # Whether they match
122
+ },
123
+ # ... more results
124
+ ]
125
+ }
126
+ ```
127
+
128
+ ## Complete Example
129
+
130
+ Here's a complete example of how to set up math evaluation:
131
+
132
+ ```python
133
+ from mmengine.config import read_base
134
+ from opencompass.models import TurboMindModelwithChatTemplate
135
+ from opencompass.datasets import CustomDataset
136
+ from opencompass.openicl.icl_evaluator.math_evaluator import MATHVerifyEvaluator
137
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
138
+ from opencompass.openicl.icl_retriever import ZeroRetriever
139
+ from opencompass.openicl.icl_inferencer import GenInferencer
140
+
141
+ # Dataset reader configuration
142
+ math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
143
+
144
+ # Inference configuration
145
+ math_infer_cfg = dict(
146
+ prompt_template=dict(
147
+ type=PromptTemplate,
148
+ template=dict(
149
+ round=[
150
+ dict(
151
+ role='HUMAN',
152
+ prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.',
153
+ ),
154
+ ]
155
+ ),
156
+ ),
157
+ retriever=dict(type=ZeroRetriever),
158
+ inferencer=dict(type=GenInferencer),
159
+ )
160
+
161
+ # Evaluation configuration
162
+ math_eval_cfg = dict(
163
+ evaluator=dict(type=MATHVerifyEvaluator),
164
+ )
165
+
166
+ # Dataset configuration
167
+ math_datasets = [
168
+ dict(
169
+ type=CustomDataset,
170
+ abbr='my-math-dataset',
171
+ path='path/to/your/dataset.jsonl', # or .csv
172
+ reader_cfg=math_reader_cfg,
173
+ infer_cfg=math_infer_cfg,
174
+ eval_cfg=math_eval_cfg,
175
+ )
176
+ ]
177
+
178
+ # Model configuration
179
+ models = [
180
+ dict(
181
+ type=TurboMindModelwithChatTemplate,
182
+ abbr='your-model-name',
183
+ path='your/model/path',
184
+ # ... other model configurations
185
+ )
186
+ ]
187
+
188
+ # Output directory
189
+ work_dir = './outputs/math_eval'
190
+ ```
docs/en/advanced_guides/needleinahaystack_eval.md ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Needle In A Haystack Evaluation
2
+
3
+ ## Introduction to the Needle In A Haystack Test
4
+
5
+ The Needle In A Haystack test (inspired by [NeedleInAHaystack](https://github.com/gkamradt/LLMTest_NeedleInAHaystack/blob/main/LLMNeedleHaystackTester.py)) is an evaluation method where key information is randomly inserted into long texts to form the prompt for large language models (LLMs). This test aims to assess whether LLMs can extract critical information from long texts, thereby evaluating their fundamental ability to comprehend and process long-context documents.
6
+
7
+ ## Task Overview
8
+
9
+ Within the `OpenCompass` framework, under `NeedleBench`, we designed a series of progressively challenging evaluation tasks to comprehensively assess LLMs' long-text information extraction and reasoning capabilities. For a complete description, please refer to our [technical report](https://arxiv.org/abs/2407.11963).
10
+
11
+ - **Single-Needle Retrieval Task (S-RT)**: Evaluates the LLM's ability to retrieve a single piece of key information from a long text, testing precise recall of specific details within extensive narratives. This corresponds to the **original Needle In A Haystack test** setup.
12
+
13
+ - **Multi-Needle Retrieval Task (M-RT)**: Explores the LLM's ability to retrieve multiple relevant pieces of information from long texts, simulating complex queries over comprehensive documents.
14
+
15
+ - **Multi-Needle Reasoning Task (M-RS)**: Assesses LLMs' abilities to integrate multiple key pieces of information extracted from long texts for reasoning, requiring a comprehensive understanding of content.
16
+
17
+ - **Ancestral Trace Challenge (ATC)**: Tests LLMs' capabilities in handling multi-layer logical challenges within realistic long-text contexts through "kinship trace needles." In the ATC task, no irrelevant (haystack) texts are added; every piece of text is critical, and models must reason through all details for accurate answers.
18
+
19
+ > **Note:** NeedleBench (v2) includes several optimizations and adjustments in dataset construction and task details. For a detailed comparison between the old and new versions, as well as a summary of updates, please refer to [opencompass/configs/datasets/needlebench_v2/readme.md](https://github.com/open-compass/opencompass/blob/main/opencompass/configs/datasets/needlebench_v2/readme.md).
20
+
21
+ ## Evaluation Steps
22
+
23
+ > Note: In the latest `OpenCompass` codebase, the NeedleBench dataset is automatically loaded from the [Huggingface interface](https://huggingface.co/datasets/opencompass/NeedleBench), with no need for manual download or configuration.
24
+
25
+ ### `OpenCompass` Environment Setup
26
+
27
+ ```bash
28
+ conda create --name opencompass python=3.10 pytorch torchvision pytorch-cuda -c nvidia -c pytorch -y
29
+ conda activate opencompass
30
+ git clone https://github.com/open-compass/opencompass opencompass
31
+ cd opencompass
32
+ pip install -e .
33
+ ```
34
+
35
+ ### Dataset Configuration
36
+
37
+ We have pre-configured various long-context settings (4k, 8k, 32k, 128k, 200k, 1000k) in `opencompass/configs/datasets/needlebench_v2`, and you can flexibly define your parameters by adjusting the configuration files.
38
+
39
+ ### Evaluation Example
40
+
41
+ #### Evaluating with `VLLM` Deployed `Qwen2-5-7B` Model
42
+
43
+ To evaluate the `Qwen2-5-7B` model deployed with `VLLM` on all tasks under NeedleBench-128K, use the following command. This leverages pre-defined model and dataset configuration files without needing additional configuration:
44
+
45
+ ##### Local Evaluation
46
+
47
+ If evaluating locally, the command will use all available GPUs. You can control GPU visibility using `CUDA_VISIBLE_DEVICES`:
48
+
49
+ ```bash
50
+ # Local evaluation
51
+ python run.py --datasets needlebench_v2_128k --models vllm_qwen2_5_7b_instruct_128k --summarizer needlebench/needlebench_v2_128k_summarizer
52
+ ```
53
+
54
+ ##### Evaluation on Slurm Cluster
55
+
56
+ For Slurm environments, you can add options like `--slurm -p partition_name -q reserved --max-num-workers 16`:
57
+
58
+ ```bash
59
+ # Slurm evaluation
60
+ python run.py --datasets needlebench_v2_128k --models vllm_qwen2_5_7b_instruct_128k --summarizer needlebench/needlebench_v2_128k_summarizer --slurm -p partition_name -q reserved --max-num-workers 16
61
+ ```
62
+
63
+ ##### Evaluating Specific Subsets
64
+
65
+ If you only want to test the original Needle In A Haystack task (e.g., single-needle 128k), adjust the dataset parameter:
66
+
67
+ ```bash
68
+ python run.py --datasets needlebench_v2_single_128k --models vllm_qwen2_5_7b_instruct_128k --summarizer needlebench/needlebench_v2_128k_summarizer --slurm -p partition_name -q reserved --max-num-workers 16
69
+ ```
70
+
71
+ To evaluate only Chinese versions, specify the subset dataset after `/`:
72
+
73
+ ```bash
74
+ python run.py --datasets needlebench_v2_single_128k/needlebench_zh_datasets --models vllm_qwen2_5_7b_instruct_128k --summarizer needlebench/needlebench_v2_128k_summarizer --slurm -p partition_name -q reserved --max-num-workers 16
75
+ ```
76
+
77
+ Ensure `VLLM` is installed beforehand:
78
+
79
+ ```bash
80
+ # Install vLLM with CUDA 12.4.
81
+ # For other CUDA versions, please refer to the [official documentation](https://docs.vllm.ai/en/latest/getting_started/installation/gpu.html)
82
+ pip install vllm
83
+ ```
84
+
85
+ #### Evaluating Other `Huggingface` Models
86
+
87
+ For other models, it is recommended to write your own config file (such as `examples/eval_needlebench_v2.py`) to adjust `max_seq_len` and `max_out_len`, so that the model can process the full context.
88
+
89
+ You can then run evaluation with:
90
+
91
+ ```bash
92
+ python run.py examples/eval_needlebench_v2.py --slurm -p partition_name -q reserved --max-num-workers 16
93
+ ```
94
+
95
+ No need to manually specify `--datasets`, `--models`, or `--summarizer` again.
96
+
97
+ ### Visualization
98
+
99
+ NeedleBench's latest version has built-in visualization integrated into the summarizer. You can find corresponding visualizations in the `plots` directory under the output folder without needing additional scripts.
100
+
101
+ ### Citation
102
+
103
+ If you use NeedleBench, please cite us:
104
+
105
+ ```bibtex
106
+ @misc{li2025needlebenchllmsretrievalreasoning,
107
+ title={NeedleBench: Can LLMs Do Retrieval and Reasoning in Information-Dense Context?},
108
+ author={Mo Li and Songyang Zhang and Taolin Zhang and Haodong Duan and Yunxin Liu and Kai Chen},
109
+ year={2025},
110
+ eprint={2407.11963},
111
+ archivePrefix={arXiv},
112
+ primaryClass={cs.CL},
113
+ url={https://arxiv.org/abs/2407.11963},
114
+ }
115
+
116
+ @misc{2023opencompass,
117
+ title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
118
+ author={OpenCompass Contributors},
119
+ howpublished={\url{https://github.com/open-compass/opencompass}},
120
+ year={2023}
121
+ }
122
+
123
+ @misc{LLMTest_NeedleInAHaystack,
124
+ title={LLMTest Needle In A Haystack - Pressure Testing LLMs},
125
+ author={gkamradt},
126
+ year={2023},
127
+ howpublished={\url{https://github.com/gkamradt/LLMTest_NeedleInAHaystack}}
128
+ }
129
+
130
+ @misc{wei2023skywork,
131
+ title={Skywork: A More Open Bilingual Foundation Model},
132
+ author={Tianwen Wei and Liang Zhao and Lichang Zhang and Bo Zhu and Lijie Wang and Haihua Yang and Biye Li and Cheng Cheng and Weiwei L\"u and Rui Hu and Chenxia Li and Liu Yang and Xilin Luo and Xuejie Wu and Lunan Liu and Wenjun Cheng and Peng Cheng and Jianhao Zhang and Xiaoyu Zhang and Lei Lin and Xiaokun Wang and Yutuan Ma and Chuanhai Dong and Yanqi Sun and Yifu Chen and Yongyi Peng and Xiaojuan Liang and Shuicheng Yan and Han Fang and Yahui Zhou},
133
+ year={2023},
134
+ eprint={2310.19341},
135
+ archivePrefix={arXiv},
136
+ primaryClass={cs.CL}
137
+ }
138
+ ```
docs/en/advanced_guides/new_dataset.md ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Add a dataset
2
+
3
+ Although OpenCompass has already included most commonly used datasets, users who want to support a new dataset need to follow the steps below:
4
+
5
+ 1. Add a dataset script `mydataset.py` to the `opencompass/datasets` folder. This script should include:
6
+
7
+ - The dataset and its loading method. Define a `MyDataset` class that implements the data loading method `load` as a static method. This method should return data of type `datasets.Dataset`. We use the Hugging Face dataset as the unified interface for datasets to avoid introducing additional logic. Here's an example:
8
+
9
+ ```python
10
+ import datasets
11
+ from .base import BaseDataset
12
+
13
+ class MyDataset(BaseDataset):
14
+
15
+ @staticmethod
16
+ def load(**kwargs) -> datasets.Dataset:
17
+ pass
18
+ ```
19
+
20
+ - (Optional) If the existing evaluators in OpenCompass do not meet your needs, you need to define a `MyDatasetEvaluator` class that implements the scoring method `score`. This method should take `predictions` and `references` as input and return the desired dictionary. Since a dataset may have multiple metrics, the method should return a dictionary containing the metrics and their corresponding scores. Here's an example:
21
+
22
+ ```python
23
+ from opencompass.openicl.icl_evaluator import BaseEvaluator
24
+
25
+ class MyDatasetEvaluator(BaseEvaluator):
26
+
27
+ def score(self, predictions: List, references: List) -> dict:
28
+ pass
29
+ ```
30
+
31
+ - (Optional) If the existing postprocessors in OpenCompass do not meet your needs, you need to define the `mydataset_postprocess` method. This method takes an input string and returns the corresponding postprocessed result string. Here's an example:
32
+
33
+ ```python
34
+ def mydataset_postprocess(text: str) -> str:
35
+ pass
36
+ ```
37
+
38
+ 2. After defining the dataset loading, data postprocessing, and evaluator methods, you need to add the following configurations to the configuration file:
39
+
40
+ ```python
41
+ from opencompass.datasets import MyDataset, MyDatasetEvaluator, mydataset_postprocess
42
+
43
+ mydataset_eval_cfg = dict(
44
+ evaluator=dict(type=MyDatasetEvaluator),
45
+ pred_postprocessor=dict(type=mydataset_postprocess))
46
+
47
+ mydataset_datasets = [
48
+ dict(
49
+ type=MyDataset,
50
+ ...,
51
+ reader_cfg=...,
52
+ infer_cfg=...,
53
+ eval_cfg=mydataset_eval_cfg)
54
+ ]
55
+ ```
56
+
57
+ - To facilitate the access of your datasets to other users, you need to specify the channels for downloading the datasets in the configuration file. Specifically, you need to first fill in a dataset name given by yourself in the `path` field in the `mydataset_datasets` configuration, and this name will be mapped to the actual download path in the `opencompass/utils/datasets_info.py` file. Here's an example:
58
+
59
+ ```python
60
+ mmlu_datasets = [
61
+ dict(
62
+ ...,
63
+ path='opencompass/mmlu',
64
+ ...,
65
+ )
66
+ ]
67
+ ```
68
+
69
+ - Next, you need to create a dictionary key in `opencompass/utils/datasets_info.py` with the same name as the one you provided above. If you have already hosted the dataset on HuggingFace or Modelscope, please add a dictionary key to the `DATASETS_MAPPING` dictionary and fill in the HuggingFace or Modelscope dataset address in the `hf_id` or `ms_id` key, respectively. You can also specify a default local address. Here's an example:
70
+
71
+ ```python
72
+ "opencompass/mmlu": {
73
+ "ms_id": "opencompass/mmlu",
74
+ "hf_id": "opencompass/mmlu",
75
+ "local": "./data/mmlu/",
76
+ }
77
+ ```
78
+
79
+ - If you wish for the provided dataset to be directly accessible from the OpenCompass OSS repository when used by others, you need to submit the dataset files in the Pull Request phase. We will then transfer the dataset to the OSS on your behalf and create a new dictionary key in the `DATASET_URL`.
80
+
81
+ - To ensure the optionality of data sources, you need to improve the method `load` in the dataset script `mydataset.py`. Specifically, you need to implement a functionality to switch among different download sources based on the setting of the environment variable `DATASET_SOURCE`. It should be noted that if the environment variable `DATASET_SOURCE` is not set, the dataset will default to being downloaded from the OSS repository. Here's an example from `opencompass/dataset/cmmlu.py`:
82
+
83
+ ```python
84
+ def load(path: str, name: str, **kwargs):
85
+ ...
86
+ if environ.get('DATASET_SOURCE') == 'ModelScope':
87
+ ...
88
+ else:
89
+ ...
90
+ return dataset
91
+ ```
92
+
93
+ 3. After completing the dataset script and config file, you need to register the information of your new dataset in the file `dataset-index.yml` at the main directory, so that it can be added to the dataset statistics list on the OpenCompass website.
94
+
95
+ - The keys that need to be filled in include `name`: the name of your dataset, `category`: the category of your dataset, `paper`: the URL of the paper or project, and `configpath`: the path to the dataset config file. Here's an example:
96
+
97
+ ```
98
+ - mydataset:
99
+ name: MyDataset
100
+ category: Understanding
101
+ paper: https://arxiv.org/pdf/xxxxxxx
102
+ configpath: opencompass/configs/datasets/MyDataset
103
+ ```
104
+
105
+ Detailed dataset configuration files and other required configuration files can be referred to in the [Configuration Files](../user_guides/config.md) tutorial. For guides on launching tasks, please refer to the [Quick Start](../get_started/quick_start.md) tutorial.
docs/en/advanced_guides/new_model.md ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Add a Model
2
+
3
+ Currently, we support HF models, some model APIs, and some third-party models.
4
+
5
+ ## Adding API Models
6
+
7
+ To add a new API-based model, you need to create a new file named `mymodel_api.py` under `opencompass/models` directory. In this file, you should inherit from `BaseAPIModel` and implement the `generate` method for inference and the `get_token_len` method to calculate the length of tokens. Once you have defined the model, you can modify the corresponding configuration file.
8
+
9
+ ```python
10
+ from ..base_api import BaseAPIModel
11
+
12
+ class MyModelAPI(BaseAPIModel):
13
+
14
+ is_api: bool = True
15
+
16
+ def __init__(self,
17
+ path: str,
18
+ max_seq_len: int = 2048,
19
+ query_per_second: int = 1,
20
+ retry: int = 2,
21
+ **kwargs):
22
+ super().__init__(path=path,
23
+ max_seq_len=max_seq_len,
24
+ meta_template=meta_template,
25
+ query_per_second=query_per_second,
26
+ retry=retry)
27
+ ...
28
+
29
+ def generate(
30
+ self,
31
+ inputs,
32
+ max_out_len: int = 512,
33
+ temperature: float = 0.7,
34
+ ) -> List[str]:
35
+ """Generate results given a list of inputs."""
36
+ pass
37
+
38
+ def get_token_len(self, prompt: str) -> int:
39
+ """Get lengths of the tokenized string."""
40
+ pass
41
+ ```
42
+
43
+ ## Adding Third-Party Models
44
+
45
+ To add a new third-party model, you need to create a new file named `mymodel.py` under `opencompass/models` directory. In this file, you should inherit from `BaseModel` and implement the `generate` method for generative inference, the `get_ppl` method for discriminative inference, and the `get_token_len` method to calculate the length of tokens. Once you have defined the model, you can modify the corresponding configuration file.
46
+
47
+ ```python
48
+ from ..base import BaseModel
49
+
50
+ class MyModel(BaseModel):
51
+
52
+ def __init__(self,
53
+ pkg_root: str,
54
+ ckpt_path: str,
55
+ tokenizer_only: bool = False,
56
+ meta_template: Optional[Dict] = None,
57
+ **kwargs):
58
+ ...
59
+
60
+ def get_token_len(self, prompt: str) -> int:
61
+ """Get lengths of the tokenized strings."""
62
+ pass
63
+
64
+ def generate(self, inputs: List[str], max_out_len: int) -> List[str]:
65
+ """Generate results given a list of inputs. """
66
+ pass
67
+
68
+ def get_ppl(self,
69
+ inputs: List[str],
70
+ mask_length: Optional[List[int]] = None) -> List[float]:
71
+ """Get perplexity scores given a list of inputs."""
72
+ pass
73
+ ```
docs/en/advanced_guides/objective_judgelm_evaluation.md ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Using Large Models as JudgeLLM for Objective Evaluation
2
+
3
+ ## Introduction
4
+
5
+ Traditional objective evaluations often rely on standard answers for reference. However, in practical applications, the predicted results of models may vary due to differences in the model's instruction-following capabilities or imperfections in post-processing functions. This can lead to incorrect extraction of answers and comparison with standard answers, resulting in potentially inaccurate evaluation outcomes. To address this issue, we have adopted a process similar to subjective evaluations by introducing JudgeLLM post-prediction to assess the consistency between model responses and standard answers. ([LLM-as-a-Judge](https://arxiv.org/abs/2306.05685)).
6
+
7
+ Currently, all models supported by the opencompass repository can be directly used as JudgeLLM. Additionally, we are planning to support dedicated JudgeLLMs.
8
+
9
+ ## Currently Supported Objective Evaluation Datasets
10
+
11
+ 1. MATH ([https://github.com/hendrycks/math](https://github.com/hendrycks/math))
12
+
13
+ ## Custom JudgeLLM Objective Dataset Evaluation
14
+
15
+ OpenCompass currently supports most datasets that use `GenInferencer` for inference. The specific process for custom JudgeLLM objective evaluation includes:
16
+
17
+ 1. Building evaluation configurations using API models or open-source models for inference of question answers.
18
+ 2. Employing a selected evaluation model (JudgeLLM) to assess the outputs of the model.
19
+
20
+ ### Step One: Building Evaluation Configurations, Using MATH as an Example
21
+
22
+ Below is the Config for evaluating the MATH dataset with JudgeLLM, with the evaluation model being *Llama3-8b-instruct* and the JudgeLLM being *Llama3-70b-instruct*. For more detailed config settings, please refer to `examples/eval_math_llm_judge.py`. The following is a brief version of the annotations to help users understand the meaning of the configuration file.
23
+
24
+ ```python
25
+ # Most of the code in this file is copied from https://github.com/openai/simple-evals/blob/main/math_eval.py
26
+ from mmengine.config import read_base
27
+ with read_base():
28
+ from .models.hf_llama.hf_llama3_8b_instruct import models as hf_llama3_8b_instruct_model # noqa: F401, F403
29
+ from .models.hf_llama.hf_llama3_70b_instruct import models as hf_llama3_70b_instruct_model # noqa: F401, F403
30
+ from .datasets.math.math_llm_judge import math_datasets # noqa: F401, F403
31
+ from opencompass.datasets import math_judement_preprocess
32
+ from opencompass.partitioners import NaivePartitioner, SizePartitioner
33
+ from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
34
+ from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
35
+ from opencompass.runners import LocalRunner
36
+ from opencompass.runners import SlurmSequentialRunner
37
+ from opencompass.tasks import OpenICLInferTask
38
+ from opencompass.tasks.subjective_eval import SubjectiveEvalTask
39
+ from opencompass.summarizers import AllObjSummarizer
40
+ from opencompass.openicl.icl_evaluator import LMEvaluator
41
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
42
+
43
+
44
+ # ------------- Prompt Settings ----------------------------------------
45
+ # Evaluation template, please modify the template as needed, JudgeLLM typically uses [Yes] or [No] as the response. For the MATH dataset, the evaluation template is as follows:
46
+ eng_obj_prompt = """
47
+ Look at the following two expressions (answers to a math problem) and judge whether they are equivalent. Only perform trivial simplifications
48
+
49
+ Examples:
50
+
51
+ Expression 1: $2x+3$
52
+ Expression 2: $3+2x$
53
+
54
+ [Yes]
55
+
56
+ Expression 1: 3/2
57
+ Expression 2: 1.5
58
+
59
+ [Yes]
60
+
61
+ Expression 1: $x^2+2x+1$
62
+ Expression 2: $y^2+2y+1$
63
+
64
+ [No]
65
+
66
+ Expression 1: $x^2+2x+1$
67
+ Expression 2: $(x+1)^2$
68
+
69
+ [Yes]
70
+
71
+ Expression 1: 3245/5
72
+ Expression 2: 649
73
+
74
+ [No]
75
+ (these are actually equal, don't mark them equivalent if you need to do nontrivial simplifications)
76
+
77
+ Expression 1: 2/(-3)
78
+ Expression 2: -2/3
79
+
80
+ [Yes]
81
+ (trivial simplifications are allowed)
82
+
83
+ Expression 1: 72 degrees
84
+ Expression 2: 72
85
+
86
+ [Yes]
87
+ (give benefit of the doubt to units)
88
+
89
+ Expression 1: 64
90
+ Expression 2: 64 square feet
91
+
92
+ [Yes]
93
+ (give benefit of the doubt to units)
94
+
95
+ Expression 1: 64
96
+ Expression 2:
97
+
98
+ [No]
99
+ (only mark as equivalent if both expressions are nonempty)
100
+
101
+ ---
102
+
103
+ YOUR TASK
104
+
105
+
106
+ Respond with only "[Yes]" or "[No]" (without quotes). Do not include a rationale.
107
+ Expression 1: {obj_gold}
108
+ Expression 2: {prediction}
109
+
110
+ """
111
+
112
+ # ------------- Inference Phase ----------------------------------------
113
+ # Models to be evaluated
114
+ models = [*hf_llama3_8b_instruct_model]
115
+ # Evaluation models
116
+ judge_models = hf_llama3_70b_instruct_model
117
+
118
+ eng_datasets = [*math_datasets]
119
+ chn_datasets = []
120
+ datasets = eng_datasets + chn_datasets
121
+
122
+
123
+ for d in eng_datasets:
124
+ d['eval_cfg']= dict(
125
+ evaluator=dict(
126
+ type=LMEvaluator,
127
+ # If you need to preprocess model predictions before judging,
128
+ # you can specify a pred_postprocessor function here
129
+ pred_postprocessor=dict(type=math_judement_preprocess),
130
+ prompt_template=dict(
131
+ type=PromptTemplate,
132
+ template=dict(round=[
133
+ dict(
134
+ role='HUMAN',
135
+ prompt = eng_obj_prompt
136
+ ),
137
+ ]),
138
+ ),
139
+ ),
140
+ pred_role="BOT",
141
+ )
142
+
143
+ infer = dict(
144
+ partitioner=dict(type=SizePartitioner, max_task_size=40000),
145
+ runner=dict(
146
+ type=LocalRunner,
147
+ max_num_workers=256,
148
+ task=dict(type=OpenICLInferTask)),
149
+ )
150
+
151
+ # ------------- Evaluation Configuration --------------------------------
152
+ eval = dict(
153
+ partitioner=dict(
154
+ type=SubjectiveSizePartitioner, max_task_size=80000, mode='singlescore', models=models, judge_models=judge_models,
155
+ ),
156
+ runner=dict(type=LocalRunner,
157
+ max_num_workers=16, task=dict(type=SubjectiveEvalTask)),
158
+ )
159
+
160
+ summarizer = dict(
161
+ type=AllObjSummarizer
162
+ )
163
+
164
+ # Output folder
165
+ work_dir = 'outputs/obj_all/'
166
+ ```
167
+
168
+ ### Step Two: Launch Evaluation and Output Results
169
+
170
+ ```shell
171
+ python run.py eval_math_llm_judge.py
172
+ ```
173
+
174
+ This will initiate two rounds of evaluation. The first round involves model inference to obtain predicted answers to questions, and the second round involves JudgeLLM evaluating the consistency between the predicted answers and the standard answers, and scoring them.
175
+
176
+ - The results of model predictions will be saved in `output/.../timestamp/predictions/xxmodel/xxx.json`
177
+ - The JudgeLLM's evaluation responses will be saved in `output/.../timestamp/results/xxmodel/xxx.json`
178
+ - The evaluation report will be output to `output/.../timestamp/summary/timestamp/xxx.csv`
179
+
180
+ ## Results
181
+
182
+ Using the Llama3-8b-instruct as the evaluation model and the Llama3-70b-instruct as the evaluator, the MATH dataset was assessed with the following results:
183
+
184
+ | Model | JudgeLLM Evaluation | Naive Evaluation |
185
+ | ------------------- | ------------------- | ---------------- |
186
+ | llama-3-8b-instruct | 27.7 | 27.8 |
docs/en/advanced_guides/persistence.md ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Evaluation Results Persistence
2
+
3
+ ## Introduction
4
+
5
+ Normally, the evaluation results of OpenCompass will be saved to your work directory. But in some cases, there may be a need for data sharing among users or quickly browsing existing public evaluation results. Therefore, we provide an interface that can quickly transfer evaluation results to external public data stations, and on this basis, provide functions such as uploading, overwriting, and reading.
6
+
7
+ ## Quick Start
8
+
9
+ ### Uploading
10
+
11
+ By adding `args` to the evaluation command or adding configuration in the Eval script, the results of evaluation can be stored in the path you specify. Here are the examples:
12
+
13
+ (Approach 1) Add an `args` option to the command and specify your public path address.
14
+
15
+ ```bash
16
+ opencompass ... -sp '/your_path'
17
+ ```
18
+
19
+ (Approach 2) Add configuration in the Eval script.
20
+
21
+ ```python
22
+ station_path = '/your_path'
23
+ ```
24
+
25
+ ### Overwriting
26
+
27
+ Before uploading data, the above storage method will first determine whether the same task result already exists in the data station, based on the `abbr` attribute in the model and dataset configuration. If the results already exist, this upload is skipped. If you need to update these results, please add the `--station-overwrite` option to the command. Here is an example:
28
+
29
+ ```bash
30
+ opencompass ... -sp '/your_path' --station-overwrite
31
+ ```
32
+
33
+ ### Reading
34
+
35
+ You can directly read existing results from the data station to avoid duplicate evaluation tasks. The read results will directly participate in the 'summarize' step. When using this configuration, only tasks that do not store results in the data station will be initiated. Here is an example:
36
+
37
+ ```bash
38
+ opencompass ... -sp '/your_path' --read-from-station
39
+ ```
40
+
41
+ ### Command Combination
42
+
43
+ 1. Only upload the results under your latest working directory to the data station, without re-running tasks whose results are missing:
44
+
45
+ ```bash
46
+ opencompass ... -sp '/your_path' -r latest -m viz
47
+ ```
48
+
49
+ ## Storage Format of the Data Station
50
+
51
+ In the data station, the evaluation results are stored as `json` files for each `model-dataset` pair. The specific directory form is `/your_path/dataset_name/model_name.json`. Each `json` file stores a dictionary corresponding to the results, including `predictions`, `results`, and `cfg`. Here is an example:
52
+
53
+ ```python
54
+ Result = {
55
+ 'predictions': List[Dict],
56
+ 'results': Dict,
57
+ 'cfg': Dict = {
58
+ 'models': Dict,
59
+ 'datasets': Dict,
60
+ (Only subjective datasets)'judge_models': Dict
61
+ }
62
+ }
63
+ ```
64
+
65
+ Among these three keys, `predictions` records the model's prediction on each item of data in the dataset, `results` records the total score of the model on the dataset, and `cfg` records the detailed configurations of the model and the dataset in this evaluation task.
docs/en/advanced_guides/prompt_attack.md ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Prompt Attack
2
+
3
+ We support prompt attack following the idea of [PromptBench](https://github.com/microsoft/promptbench). The main purpose here is to evaluate the robustness of a prompt instruction: when the prompt that instructs the task is attacked/modified, how well can the task still perform compared with the original prompt.
4
+
5
+ ## Set up environment
6
+
7
+ Some components are necessary to prompt attack experiment, therefore we need to set up environments.
8
+
9
+ ```shell
10
+ git clone https://github.com/microsoft/promptbench.git
11
+ pip install textattack==0.3.8
12
+ export PYTHONPATH=$PYTHONPATH:promptbench/
13
+ ```
14
+
15
+ ## How to attack
16
+
17
+ ### Add a dataset config
18
+
19
+ We will use GLUE-wnli dataset as example, most configuration settings can refer to [config.md](../user_guides/config.md) for help.
20
+
21
+ First we need to set up the basic dataset config. You can find the existing config files in `configs`, or add your own config according to [new-dataset](./new_dataset.md)
22
+
23
+ Take the following `infer_cfg` as example, we need to define the prompt template. `adv_prompt` is the basic prompt placeholder to be attacked in the experiment. `sentence1` and `sentence2` are the input columns of this dataset. The attack will only modify the `adv_prompt` here.
24
+
25
+ Then, we should use `AttackInferencer` with `original_prompt_list` and `adv_key` to tell the inferencer where to attack and what text to be attacked.
26
+
27
+ More details can refer to `configs/datasets/promptbench/promptbench_wnli_gen_50662f.py` config file.
28
+
29
+ ```python
30
+ original_prompt_list = [
31
+ 'Are the following two sentences entailment or not_entailment? Answer me with "A. entailment" or "B. not_entailment", just one word. ',
32
+ "Does the relationship between the given sentences represent entailment or not_entailment? Respond with 'A. entailment' or 'B. not_entailment'.",
33
+ ...,
34
+ ]
35
+
36
+ wnli_infer_cfg = dict(
37
+ prompt_template=dict(
38
+ type=PromptTemplate,
39
+ template=dict(round=[
40
+ dict(
41
+ role="HUMAN",
42
+ prompt="""{adv_prompt}
43
+ Sentence 1: {sentence1}
44
+ Sentence 2: {sentence2}
45
+ Answer:"""),
46
+ ]),
47
+ ),
48
+ retriever=dict(type=ZeroRetriever),
49
+ inferencer=dict(
50
+ type=AttackInferencer,
51
+ original_prompt_list=original_prompt_list,
52
+ adv_key='adv_prompt'))
53
+ ```
54
+
55
+ ### Add a eval config
56
+
57
+ We should use `OpenICLAttackTask` here for the attack task. Also, `NaivePartitioner` should be used because the attack experiment will run the whole dataset repeatedly, nearly hundreds of times, to search for the best attack, so we do not want to split the dataset.
58
+
59
+ ```note
60
+ Please choose a small dataset (fewer than 1000 examples) for the attack, due to the aforementioned repeated search; otherwise the time cost is enormous.
61
+ ```
62
+
63
+ There are several other options in `attack` config:
64
+
65
+ - `attack`: attack type, available options include `textfooler`, `textbugger`, `deepwordbug`, `bertattack`, `checklist`, `stresstest`;
66
+ - `query_budget`: upper bound on the number of queries, i.e. the total number of times the dataset is run;
67
+ - `prompt_topk`: number of top-k prompts to be attacked. In most cases, the original prompt list is greater than 10, and running the whole set is time-consuming.
68
+
69
+ ```python
70
+ # Please run whole dataset at a time, aka use `NaivePartitioner` only
71
+ # Please use `OpenICLAttackTask` if want to perform attack experiment
72
+ infer = dict(
73
+ partitioner=dict(type=NaivePartitioner),
74
+ runner=dict(
75
+ type=SlurmRunner,
76
+ max_num_workers=8,
77
+ task=dict(type=OpenICLAttackTask),
78
+ retry=0),
79
+ )
80
+
81
+ attack = dict(
82
+ attack='textfooler',
83
+ query_budget=100,
84
+ prompt_topk=2,
85
+ )
86
+ ```
87
+
88
+ ### Run the experiment
89
+
90
+ Please use `--mode infer` when running the attack experiment, and set the `PYTHONPATH` environment variable.
91
+
92
+ ```shell
93
+ python run.py examples/eval_attack.py --mode infer
94
+ ```
95
+
96
+ All the results will be saved in `attack` folder.
97
+ The content includes the original prompt accuracy and the attacked prompt with dropped accuracy of `topk` prompt, for instance:
98
+
99
+ ```
100
+ Prompt: Assess the connection between the following sentences and classify it as 'A. entailment' or 'B. not_entailment'., acc: 59.15%
101
+ Prompt: Does the relationship between the given sentences represent entailment or not_entailment? Respond with 'A. entailment' or 'B. not_entailment'., acc: 57.75%
102
+ Prompt: Analyze the two provided sentences and decide if their relationship is 'A. entailment' or 'B. not_entailment'., acc: 56.34%
103
+ Prompt: Identify whether the given pair of sentences demonstrates entailment or not_entailment. Answer with 'A. entailment' or 'B. not_entailment'., acc: 54.93%
104
+ ...
105
+ Original prompt: Assess the connection between the following sentences and classify it as 'A. entailment' or 'B. not_entailment'.
106
+ Attacked prompt: b"Assess the attach between the following sentences and sorted it as 'A. entailment' or 'B. not_entailment'."
107
+ Original acc: 59.15%, attacked acc: 40.85%, dropped acc: 18.31%
108
+ ```
docs/en/advanced_guides/subjective_evaluation.md ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Subjective Evaluation Guidance
2
+
3
+ ## Introduction
4
+
5
+ Subjective evaluation aims to assess the model's performance in tasks that align with human preferences. The key criterion for this evaluation is human preference, but it comes with a high cost of annotation.
6
+
7
+ To explore the model's subjective capabilities, we employ JudgeLLM as a substitute for human assessors ([LLM-as-a-Judge](https://arxiv.org/abs/2306.05685)).
8
+
9
+ A popular evaluation method involves
10
+
11
+ - Compare Mode: comparing model responses pairwise to calculate their win rate
12
+ - Score Mode: another method involves calculate scores with single model response ([Chatbot Arena](https://chat.lmsys.org/)).
13
+
14
+ We support the use of GPT-4 (or other JudgeLLM) for the subjective evaluation of models based on above methods.
15
+
16
+ ## Currently Supported Subjective Evaluation Datasets
17
+
18
+ 1. AlignBench Chinese Scoring Dataset (https://github.com/THUDM/AlignBench)
19
+ 2. MTBench English Scoring Dataset, two-turn dialogue (https://github.com/lm-sys/FastChat)
20
+ 3. MTBench101 English Scoring Dataset, multi-turn dialogue (https://github.com/mtbench101/mt-bench-101)
21
+ 4. AlpacaEvalv2 English Compare Dataset (https://github.com/tatsu-lab/alpaca_eval)
22
+ 5. ArenaHard English Compare Dataset, mainly focused on coding (https://github.com/lm-sys/arena-hard/tree/main)
23
+ 6. Fofo English Scoring Dataset (https://github.com/SalesforceAIResearch/FoFo/)
24
+ 7. Wildbench English Score and Compare Dataset(https://github.com/allenai/WildBench)
25
+
26
+ ## Initiating Subjective Evaluation
27
+
28
+ Similar to existing objective evaluation methods, you can configure related settings in `examples/eval_subjective.py`.
29
+
30
+ ### Basic Parameters: Specifying models, datasets, and judgemodels
31
+
32
+ Similar to objective evaluation, import the models and datasets that need to be evaluated, for example:
33
+
34
+ ```
35
+ with read_base():
36
+ from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import alignbench_datasets
37
+ from .datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2
38
+ from .models.qwen.hf_qwen_7b import models
39
+ ```
40
+
41
+ It is worth noting that since the model setup parameters for subjective evaluation are often different from those for objective evaluation, it often requires setting up `do_sample` for inference instead of `greedy`. You can modify the relevant parameters in the configuration file as needed, for example:
42
+
43
+ ```
44
+ models = [
45
+ dict(
46
+ type=HuggingFaceChatGLM3,
47
+ abbr='chatglm3-6b-hf2',
48
+ path='THUDM/chatglm3-6b',
49
+ tokenizer_path='THUDM/chatglm3-6b',
50
+ model_kwargs=dict(
51
+ device_map='auto',
52
+ trust_remote_code=True,
53
+ ),
54
+ tokenizer_kwargs=dict(
55
+ padding_side='left',
56
+ truncation_side='left',
57
+ trust_remote_code=True,
58
+ ),
59
+ generation_kwargs=dict(
60
+ do_sample=True,
61
+ ),
62
+ meta_template=api_meta_template,
63
+ max_out_len=2048,
64
+ max_seq_len=4096,
65
+ batch_size=8,
66
+ run_cfg=dict(num_gpus=1, num_procs=1),
67
+ )
68
+ ]
69
+ ```
70
+
71
+ The judgemodel is usually set to a powerful model like GPT4, and you can directly enter your API key according to the configuration in the config file, or use a custom model as the judgemodel.
72
+
73
+ ### Specifying Other Parameters
74
+
75
+ In addition to the basic parameters, you can also modify the `infer` and `eval` fields in the config to set a more appropriate partitioning method. The currently supported partitioning methods mainly include three types: NaivePartitioner, SizePartitioner, and NumberWorkPartitioner. You can also specify your own workdir to save related files.
76
+
77
+ ## Subjective Evaluation with Custom Dataset
78
+
79
+ The specific process includes:
80
+
81
+ 1. Data preparation
82
+ 2. Model response generation
83
+ 3. Evaluate the response with a JudgeLLM
84
+ 4. Generate JudgeLLM's response and calculate the metric
85
+
86
+ ### Step-1: Data Preparation
87
+
88
+ This step requires preparing the dataset file and implementing your own dataset class under `Opencompass/datasets/subjective/`, returning the read data in the format of `list of dict`.
89
+
90
+ Actually, you can prepare the data in any format you like (csv, json, jsonl, etc.). However, to make it easier to get started, it is recommended to construct the data according to the format of the existing subjective datasets or according to the following json format.
91
+ We provide mini test-sets for **Compare Mode** and **Score Mode** as below:
92
+
93
+ ```python
94
+ ###COREV2
95
+ [
96
+ {
97
+ "question": "如果我在空中垂直抛球,球最初向哪个方向行进?",
98
+ "capability": "知识-社会常识",
99
+ "others": {
100
+ "question": "如果我在空中垂直抛球,球最初向哪个方向行进?",
101
+ "evaluating_guidance": "",
102
+ "reference_answer": "上"
103
+ }
104
+ },...]
105
+
106
+ ###CreationV0.1
107
+ [
108
+ {
109
+ "question": "请你扮演一个邮件管家,我让你给谁发送什么主题的邮件,你就帮我扩充好邮件正文,并打印在聊天框里。你需要根据我提供的邮件收件人以及邮件主题,来斟酌用词,并使用合适的敬语。现在请给导师发送邮件,询问他是否可以下周三下午15:00进行科研同步会,大约200字。",
110
+ "capability": "邮件通知",
111
+ "others": ""
112
+ },
113
+ ```
114
+
115
+ The json must include the following fields:
116
+
117
+ - 'question': Question description
118
+ - 'capability': The capability dimension of the question.
119
+ - 'others': Other needed information.
120
+
121
+ If you want to modify the prompt for each single question, you can fill the extra information into 'others' and construct the prompt from it.
122
+
123
+ ### Step-2: Evaluation Configuration(Compare Mode)
124
+
125
+ Taking Alignbench as an example, `configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm.py`:
126
+
127
+ 1. First, you need to set `subjective_reader_cfg` to receive the relevant fields returned from the custom Dataset class and specify the output fields when saving files.
128
+ 2. Then, you need to specify the root path `data_path` of the dataset and the dataset filename `subjective_all_sets`. If there are multiple sub-files, you can add them to this list.
129
+ 3. Specify `subjective_infer_cfg` and `subjective_eval_cfg` to configure the corresponding inference and evaluation prompts.
130
+ 4. Specify additional information such as `mode` at the corresponding location. Note that the fields required for different subjective datasets may vary.
131
+ 5. Define post-processing and score statistics. For example, the postprocessing function `alignbench_postprocess` located under `opencompass/opencompass/datasets/subjective/alignbench`.
132
+
133
+ ### Step-3: Launch the Evaluation
134
+
135
+ ```shell
136
+ python run.py config/eval_subjective_score.py -r
137
+ ```
138
+
139
+ The `-r` parameter allows the reuse of model inference and GPT-4 evaluation results.
140
+
141
+ The response of JudgeLLM will be output to `output/.../results/timestamp/xxmodel/xxdataset/.json`.
142
+ The evaluation report will be output to `output/.../summary/timestamp/report.csv`.
143
+
144
+ ## Multi-round Subjective Evaluation in OpenCompass
145
+
146
+ In OpenCompass, we also support subjective multi-turn dialogue evaluation. For instance, the evaluation of MT-Bench can be referred to in `configs/datasets/subjective/multiround`.
147
+
148
+ In the multi-turn dialogue evaluation, you need to organize the data format into the following dialogue structure:
149
+
150
+ ```
151
+ "dialogue": [
152
+ {
153
+ "role": "user",
154
+ "content": "Imagine you are participating in a race with a group of people. If you have just overtaken the second person, what's your current position? Where is the person you just overtook?"
155
+ },
156
+ {
157
+ "role": "assistant",
158
+ "content": ""
159
+ },
160
+ {
161
+ "role": "user",
162
+ "content": "If the \"second person\" is changed to \"last person\" in the above question, what would the answer be?"
163
+ },
164
+ {
165
+ "role": "assistant",
166
+ "content": ""
167
+ }
168
+ ],
169
+ ```
170
+
171
+ It's important to note that due to the different question types in MTBench having different temperature settings, we need to divide the original data files into three different subsets according to the temperature for separate inference. For different subsets, we can set different temperatures. For specific settings, please refer to `configs\datasets\subjective\multiround\mtbench_single_judge_diff_temp.py`.
docs/en/conf.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # flake8: noqa
2
+ # Configuration file for the Sphinx documentation builder.
3
+ #
4
+ # This file only contains a selection of the most common options. For a full
5
+ # list see the documentation:
6
+ # https://www.sphinx-doc.org/en/master/usage/configuration.html
7
+
8
+ # -- Path setup --------------------------------------------------------------
9
+
10
+ # If extensions (or modules to document with autodoc) are in another directory,
11
+ # add these directories to sys.path here. If the directory is relative to the
12
+ # documentation root, use os.path.abspath to make it absolute, like shown here.
13
+ #
14
+ import os
15
+ import subprocess
16
+ import sys
17
+
18
+ import pytorch_sphinx_theme
19
+ from sphinx.builders.html import StandaloneHTMLBuilder
20
+
21
+ sys.path.insert(0, os.path.abspath('../../'))
22
+
23
+ # -- Project information -----------------------------------------------------
24
+
25
+ project = 'OpenCompass'
26
+ copyright = '2023, OpenCompass'
27
+ author = 'OpenCompass Authors'
28
+
29
+ # The full version, including alpha/beta/rc tags
30
+ version_file = '../../opencompass/__init__.py'
31
+
32
+
33
+ def get_version():
34
+ with open(version_file, 'r') as f:
35
+ exec(compile(f.read(), version_file, 'exec'))
36
+ return locals()['__version__']
37
+
38
+
39
+ release = get_version()
40
+
41
+ # -- General configuration ---------------------------------------------------
42
+
43
+ # Add any Sphinx extension module names here, as strings. They can be
44
+ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
45
+ # ones.
46
+ extensions = [
47
+ 'sphinx.ext.autodoc',
48
+ 'sphinx.ext.autosummary',
49
+ 'sphinx.ext.intersphinx',
50
+ 'sphinx.ext.napoleon',
51
+ 'sphinx.ext.viewcode',
52
+ 'myst_parser',
53
+ 'sphinx_copybutton',
54
+ 'sphinx_tabs.tabs',
55
+ 'notfound.extension',
56
+ 'sphinxcontrib.jquery',
57
+ 'sphinx_design',
58
+ ]
59
+
60
+ # Add any paths that contain templates here, relative to this directory.
61
+ templates_path = ['_templates']
62
+
63
+ # The suffix(es) of source filenames.
64
+ # You can specify multiple suffix as a list of string:
65
+ #
66
+ source_suffix = {
67
+ '.rst': 'restructuredtext',
68
+ '.md': 'markdown',
69
+ }
70
+
71
+ language = 'en'
72
+
73
+ # The master toctree document.
74
+ root_doc = 'index'
75
+
76
+ # List of patterns, relative to source directory, that match files and
77
+ # directories to ignore when looking for source files.
78
+ # This pattern also affects html_static_path and html_extra_path.
79
+ exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
80
+
81
+ # -- Options for HTML output -------------------------------------------------
82
+
83
+ # The theme to use for HTML and HTML Help pages. See the documentation for
84
+ # a list of builtin themes.
85
+ #
86
+ html_theme = 'pytorch_sphinx_theme'
87
+ html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()]
88
+
89
+ # Theme options are theme-specific and customize the look and feel of a theme
90
+ # further. For a list of options available for each theme, see the
91
+ # documentation.
92
+ # yapf: disable
93
+ html_theme_options = {
94
+ 'menu': [
95
+ {
96
+ 'name': 'GitHub',
97
+ 'url': 'https://github.com/open-compass/opencompass'
98
+ },
99
+ ],
100
+ # Specify the language of shared menu
101
+ 'menu_lang': 'en',
102
+ # Disable the default edit on GitHub
103
+ 'default_edit_on_github': False,
104
+ }
105
+ # yapf: enable
106
+
107
+ # Add any paths that contain custom static files (such as style sheets) here,
108
+ # relative to this directory. They are copied after the builtin static files,
109
+ # so a file named "default.css" will overwrite the builtin "default.css".
110
+ html_static_path = ['_static']
111
+ html_css_files = [
112
+ 'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.css',
113
+ 'css/readthedocs.css'
114
+ ]
115
+ html_js_files = [
116
+ 'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.js',
117
+ 'js/custom.js'
118
+ ]
119
+
120
+ html_context = {
121
+ 'github_version': 'main',
122
+ }
123
+
124
+ # -- Options for HTMLHelp output ---------------------------------------------
125
+
126
+ # Output file base name for HTML help builder.
127
+ htmlhelp_basename = 'opencompassdoc'
128
+
129
+ # -- Options for LaTeX output ------------------------------------------------
130
+
131
+ latex_elements = {
132
+ # The paper size ('letterpaper' or 'a4paper').
133
+ #
134
+ # 'papersize': 'letterpaper',
135
+
136
+ # The font size ('10pt', '11pt' or '12pt').
137
+ #
138
+ # 'pointsize': '10pt',
139
+
140
+ # Additional stuff for the LaTeX preamble.
141
+ #
142
+ # 'preamble': '',
143
+ }
144
+
145
+ # Grouping the document tree into LaTeX files. List of tuples
146
+ # (source start file, target name, title,
147
+ # author, documentclass [howto, manual, or own class]).
148
+ latex_documents = [
149
+ (root_doc, 'opencompass.tex', 'OpenCompass Documentation', author,
150
+ 'manual'),
151
+ ]
152
+
153
+ # -- Options for manual page output ------------------------------------------
154
+
155
+ # One entry per manual page. List of tuples
156
+ # (source start file, name, description, authors, manual section).
157
+ man_pages = [(root_doc, 'opencompass', 'OpenCompass Documentation', [author],
158
+ 1)]
159
+
160
+ # -- Options for Texinfo output ----------------------------------------------
161
+
162
+ # Grouping the document tree into Texinfo files. List of tuples
163
+ # (source start file, target name, title, author,
164
+ # dir menu entry, description, category)
165
+ texinfo_documents = [
166
+ (root_doc, 'opencompass', 'OpenCompass Documentation', author,
167
+ 'OpenCompass Authors', 'AGI evaluation toolbox and benchmark.',
168
+ 'Miscellaneous'),
169
+ ]
170
+
171
+ # -- Options for Epub output -------------------------------------------------
172
+
173
+ # Bibliographic Dublin Core info.
174
+ epub_title = project
175
+
176
+ # The unique identifier of the text. This can be a ISBN number
177
+ # or the project homepage.
178
+ #
179
+ # epub_identifier = ''
180
+
181
+ # A unique identification for the text.
182
+ #
183
+ # epub_uid = ''
184
+
185
+ # A list of files that should not be packed into the epub file.
186
+ epub_exclude_files = ['search.html']
187
+
188
+ # set priority when building html
189
+ StandaloneHTMLBuilder.supported_image_types = [
190
+ 'image/svg+xml', 'image/gif', 'image/png', 'image/jpeg'
191
+ ]
192
+
193
+ # -- Extension configuration -------------------------------------------------
194
+ # Ignore >>> when copying code
195
+ copybutton_prompt_text = r'>>> |\.\.\. '
196
+ copybutton_prompt_is_regexp = True
197
+
198
+ # Auto-generated header anchors
199
+ myst_heading_anchors = 3
200
+ # Enable "colon_fence" extension of myst.
201
+ myst_enable_extensions = ['colon_fence', 'dollarmath']
202
+
203
+ # Configuration for intersphinx
204
+ intersphinx_mapping = {
205
+ 'python': ('https://docs.python.org/3', None),
206
+ 'numpy': ('https://numpy.org/doc/stable', None),
207
+ 'torch': ('https://pytorch.org/docs/stable/', None),
208
+ 'mmengine': ('https://mmengine.readthedocs.io/en/latest/', None),
209
+ 'transformers':
210
+ ('https://huggingface.co/docs/transformers/main/en/', None),
211
+ }
212
+ napoleon_custom_sections = [
213
+ # Custom sections for data elements.
214
+ ('Meta fields', 'params_style'),
215
+ ('Data fields', 'params_style'),
216
+ ]
217
+
218
+ # Disable docstring inheritance
219
+ autodoc_inherit_docstrings = False
220
+ # Mock some imports during generate API docs.
221
+ autodoc_mock_imports = ['rich', 'attr', 'einops']
222
+ # Disable displaying type annotations, these can be very verbose
223
+ autodoc_typehints = 'none'
224
+
225
+ # The not found page
226
+ notfound_template = '404.html'
227
+
228
+
229
+ def builder_inited_handler(app):
230
+ subprocess.run(['./statis.py'])
231
+
232
+
233
+ def setup(app):
234
+ app.connect('builder-inited', builder_inited_handler)
docs/en/docutils.conf ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [html writers]
2
+ table_style: colwidths-auto
docs/en/get_started/faq.md ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FAQ
2
+
3
+ ## General
4
+
5
+ ### What are the differences and connections between `ppl` and `gen`?
6
+
7
+ `ppl` stands for perplexity, an index used to evaluate a model's language modeling capabilities. In the context of OpenCompass, it generally refers to a method of answering multiple-choice questions: given a context, the model needs to choose the most appropriate option from multiple choices. In this case, we concatenate the n options with the context to form n sequences, then calculate the model's perplexity for these n sequences. We consider the option corresponding to the sequence with the lowest perplexity as the model's reasoning result for this question. This evaluation method is simple and direct in post-processing, with high certainty.
8
+
9
+ `gen` is an abbreviation for generate. In the context of OpenCompass, it refers to the model's continuation writing result given a context as the reasoning result for a question. Generally, the string obtained from continuation writing requires a heavier post-processing process to extract reliable answers and complete the evaluation.
10
+
11
+ In terms of usage, multiple-choice questions and some multiple-choice-like questions of the base model use `ppl`, while the base model's multiple-selection and non-multiple-choice questions use `gen`. All questions of the chat model use `gen`, as many commercial API models do not expose the `ppl` interface. However, there are exceptions, such as when we want the base model to output the problem-solving process (e.g., Let's think step by step), we will also use `gen`, but the overall usage is as shown in the following table:
12
+
13
+ | | ppl | gen |
14
+ | ---------- | -------------- | -------------------- |
15
+ | Base Model | Only MCQ Tasks | Tasks Other Than MCQ |
16
+ | Chat Model | None | All Tasks |
17
+
18
+ Similar to `ppl`, conditional log probability (`clp`) calculates the probability of the next token given a context. It is also only applicable to multiple-choice questions, and the range of probability calculation is limited to the tokens corresponding to the option numbers. The option corresponding to the token with the highest probability is considered the model's reasoning result. Compared to `ppl`, `clp` calculation is more efficient, requiring only one inference, whereas `ppl` requires n inferences. However, the drawback is that `clp` is subject to the tokenizer. For example, the presence or absence of space symbols before and after an option can change the tokenizer's encoding result, leading to unreliable test results. Therefore, `clp` is rarely used in OpenCompass.
19
+
20
+ ### How does OpenCompass control the number of shots in few-shot evaluations?
21
+
22
+ In the dataset configuration file, there is a retriever field indicating how to recall samples from the dataset as context examples. The most commonly used is `FixKRetriever`, which means using a fixed k samples, hence k-shot. There is also `ZeroRetriever`, which means not using any samples, which in most cases implies 0-shot.
23
+
24
+ On the other hand, in-context samples can also be directly specified in the dataset template. In this case, `ZeroRetriever` is also used, but the evaluation is not 0-shot and needs to be determined based on the specific template. Refer to [prompt](../prompt/prompt_template.md) for more details
25
+
26
+ ### How does OpenCompass allocate GPUs?
27
+
28
+ OpenCompass processes evaluation requests using the unit termed as "task". Each task is an independent combination of model(s) and dataset(s). The GPU resources needed for a task are determined entirely by the model being evaluated, specifically by the `num_gpus` parameter.
29
+
30
+ During evaluation, OpenCompass deploys multiple workers to execute tasks in parallel. These workers continuously try to secure GPU resources and run tasks until they succeed. As a result, OpenCompass always strives to leverage all available GPU resources to their maximum capacity.
31
+
32
+ For instance, if you're using OpenCompass on a local machine equipped with 8 GPUs, and each task demands 4 GPUs, then by default, OpenCompass will employ all 8 GPUs to concurrently run 2 tasks. However, if you adjust the `--max-num-workers` setting to 1, then only one task will be processed at a time, utilizing just 4 GPUs.
33
+
34
+ ### Why doesn't the GPU behavior of HuggingFace models align with my expectations?
35
+
36
+ This is a complex issue that needs to be explained from both the supply and demand sides:
37
+
38
+ The supply side refers to how many tasks are being run. A task is a combination of a model and a dataset, and it primarily depends on how many models and datasets need to be tested. Additionally, since OpenCompass splits a larger task into multiple smaller tasks, the number of data entries per sub-task (`--max-partition-size`) also affects the number of tasks. (The `--max-partition-size` is proportional to the actual number of data entries, but the relationship is not 1:1).
39
+
40
+ The demand side refers to how many workers are running. Since OpenCompass instantiates multiple models for inference simultaneously, we use `--hf-num-gpus` to specify how many GPUs each instance uses. Note that `--hf-num-gpus` is a parameter specific to HuggingFace models and setting this parameter for non-HuggingFace models will not have any effect. We also use `--max-num-workers` to indicate the maximum number of instances running at the same time. Lastly, due to issues like GPU memory and insufficient load, OpenCompass also supports running multiple instances on the same GPU, which is managed by the parameter `--max-num-workers-per-gpu`. Therefore, it can be generally assumed that we will use a total of `--hf-num-gpus` * `--max-num-workers` / `--max-num-workers-per-gpu` GPUs.
41
+
42
+ In summary, when tasks run slowly or the GPU load is low, we first need to check if the supply is sufficient. If not, consider reducing `--max-partition-size` to split the tasks into finer parts. Next, we need to check if the demand is sufficient. If not, consider increasing `--max-num-workers` and `--max-num-workers-per-gpu`. Generally, **we set `--hf-num-gpus` to the minimum value that meets the demand and do not adjust it further.**
43
+
44
+ ### How do I control the number of GPUs that OpenCompass occupies?
45
+
46
+ Currently, there isn't a direct method to specify the number of GPUs OpenCompass can utilize. However, the following are some indirect strategies:
47
+
48
+ **If evaluating locally:**
49
+ You can limit OpenCompass's GPU access by setting the `CUDA_VISIBLE_DEVICES` environment variable. For instance, using `CUDA_VISIBLE_DEVICES=0,1,2,3 python run.py ...` will only expose the first four GPUs to OpenCompass, ensuring it uses no more than these four GPUs simultaneously.
50
+
51
+ **If using Slurm or DLC:**
52
+ Although OpenCompass doesn't have direct access to the resource pool, you can adjust the `--max-num-workers` parameter to restrict the number of evaluation tasks being submitted simultaneously. This will indirectly manage the number of GPUs that OpenCompass employs. For instance, if each task requires 4 GPUs, and you wish to allocate a total of 8 GPUs, then you should set `--max-num-workers` to 2.
53
+
54
+ ### `libGL.so.1` not found
55
+
56
+ opencv-python depends on some dynamic libraries that are not present in the environment. The simplest solution is to uninstall opencv-python and then install opencv-python-headless.
57
+
58
+ ```bash
59
+ pip uninstall opencv-python
60
+ pip install opencv-python-headless
61
+ ```
62
+
63
+ Alternatively, you can install the corresponding dependency libraries according to the error message
64
+
65
+ ```bash
66
+ sudo apt-get update
67
+ sudo apt-get install -y libgl1 libglib2.0-0
68
+ ```
69
+
70
+ ## Network
71
+
72
+ ### My tasks failed with error: `('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))` or `urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='cdn-lfs.huggingface.co', port=443)`
73
+
74
+ Because of HuggingFace's implementation, OpenCompass requires network (especially the connection to HuggingFace) for the first time it loads some datasets and models. Additionally, it connects to HuggingFace each time it is launched. For a successful run, you may:
75
+
76
+ - Work behind a proxy by specifying the environment variables `http_proxy` and `https_proxy`;
77
+ - Use the cache files from other machines. You may first run the experiment on a machine that has access to the Internet, and then copy the cached files to the offline one. The cached files are located at `~/.cache/huggingface/` by default ([doc](https://huggingface.co/docs/datasets/cache#cache-directory)). When the cached files are ready, you can start the evaluation in offline mode:
78
+ ```python
79
+ HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 HF_EVALUATE_OFFLINE=1 python run.py ...
80
+ ```
81
+ With which no more network connection is needed for the evaluation. However, an error will still be raised if the files of any dataset or model are missing from the cache.
82
+ - Use mirror like [hf-mirror](https://hf-mirror.com/)
83
+ ```python
84
+ HF_ENDPOINT=https://hf-mirror.com python run.py ...
85
+ ```
86
+
87
+ ### My server cannot connect to the Internet, how can I use OpenCompass?
88
+
89
+ Use the cache files from other machines, as suggested in the answer to [Network-Q1](#my-tasks-failed-with-error-connection-aborted-connectionreseterror104-connection-reset-by-peer-or-urllib3exceptionsmaxretryerror-httpsconnectionpoolhostcdn-lfshuggingfaceco-port443).
90
+
91
+ ### In evaluation phase, I'm running into an error saying that `FileNotFoundError: Couldn't find a module script at opencompass/accuracy.py. Module 'accuracy' doesn't exist on the Hugging Face Hub either.`
92
+
93
+ HuggingFace tries to load the metric (e.g. `accuracy`) as an module online, and it could fail if the network is unreachable. Please refer to [Network-Q1](#my-tasks-failed-with-error-connection-aborted-connectionreseterror104-connection-reset-by-peer-or-urllib3exceptionsmaxretryerror-httpsconnectionpoolhostcdn-lfshuggingfaceco-port443) for guidelines to fix your network issue.
94
+
95
+ The issue has been fixed in the latest version of OpenCompass, so you might also consider pull from the latest version.
96
+
97
+ ## Efficiency
98
+
99
+ ### Why does OpenCompass partition each evaluation request into tasks?
100
+
101
+ Given the extensive evaluation time and the vast quantity of datasets, conducting a comprehensive linear evaluation on LLM models can be immensely time-consuming. To address this, OpenCompass divides the evaluation request into multiple independent "tasks". These tasks are then dispatched to various GPU groups or nodes, achieving full parallelism and maximizing the efficiency of computational resources.
102
+
103
+ ### How does task partitioning work?
104
+
105
+ Each task in OpenCompass represents a combination of specific model(s) and portions of the dataset awaiting evaluation. OpenCompass offers a variety of task partitioning strategies, each tailored for different scenarios. During the inference stage, the prevalent partitioning method seeks to balance task size, or computational cost. This cost is heuristically derived from the dataset size and the type of inference.
106
+
107
+ ### Why does it take more time to evaluate LLM models on OpenCompass?
108
+
109
+ There is a tradeoff between the number of tasks and the time to load the model. For example, if we partition an request that evaluates a model against a dataset into 100 tasks, the model will be loaded 100 times in total. When resources are abundant, these 100 tasks can be executed in parallel, so the additional time spent on model loading can be ignored. However, if resources are limited, these 100 tasks will operate more sequentially, and repeated loadings can become a bottleneck in execution time.
110
+
111
+ Hence, if users find that the number of tasks greatly exceeds the available GPUs, we advise setting the `--max-partition-size` to a larger value.
112
+
113
+ ## Model
114
+
115
+ ### How to use the downloaded huggingface models?
116
+
117
+ If you have already downloaded the checkpoints of the model, you can specify the local path of the model. For example:
118
+
119
+ ```bash
120
+ python run.py --datasets siqa_gen winograd_ppl --hf-type base --hf-path /path/to/model
121
+ ```
122
+
123
+ ## Dataset
124
+
125
+ ### How to build a new dataset?
126
+
127
+ - For building new objective dataset: [new_dataset](../advanced_guides/new_dataset.md)
128
+ - For building new subjective dataset: [subjective_evaluation](../advanced_guides/subjective_evaluation.md)
docs/en/get_started/installation.md ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Installation
2
+
3
+ ## Basic Installation
4
+
5
+ 1. Prepare the OpenCompass runtime environment using Conda:
6
+
7
+ ```bash
+ conda create --name opencompass python=3.10 -y
8
+ # conda create --name opencompass_lmdeploy python=3.10 -y
9
+
10
+ conda activate opencompass
11
+ ```
12
+
13
+ If you want to customize the PyTorch version or related CUDA version, please refer to the [official documentation](https://pytorch.org/get-started/locally/) to set up the PyTorch environment. Note that OpenCompass requires `pytorch>=1.13`.
14
+
15
+ 2. Install OpenCompass:
16
+ - pip Installation
17
+ ```bash
18
+ # For support of most datasets and models
19
+ pip install -U opencompass
20
+
21
+ # Complete installation (supports more datasets)
22
+ # pip install "opencompass[full]"
23
+
24
+ # API Testing (e.g., OpenAI, Qwen)
25
+ # pip install "opencompass[api]"
26
+ ```
27
+ - Building from Source Code. If you want to use the latest features of OpenCompass:
28
+ ```bash
29
+ git clone https://github.com/open-compass/opencompass opencompass
30
+ cd opencompass
31
+ pip install -e .
32
+ ```
33
+
34
+ ## Other Installations
35
+
36
+ ### Inference Backends
37
+
38
+ ```bash
39
+ # Model inference backends. Since these backends often have dependency conflicts,
40
+ # we recommend using separate virtual environments to manage them.
41
+ pip install "opencompass[lmdeploy]"
42
+ # pip install "opencompass[vllm]"
43
+ ```
44
+
45
+ - LMDeploy
46
+
47
+ You can check if the inference backend has been installed successfully with the following command. For more information, refer to the [official documentation](https://lmdeploy.readthedocs.io/en/latest/get_started.html)
48
+
49
+ ```bash
50
+ lmdeploy chat internlm/internlm2_5-1_8b-chat --backend turbomind
51
+ ```
52
+
53
+ - vLLM
54
+
55
+ You can check if the inference backend has been installed successfully with the following command. For more information, refer to the [official documentation](https://docs.vllm.ai/en/latest/getting_started/quickstart.html)
56
+
57
+ ```bash
58
+ vllm serve facebook/opt-125m
59
+ ```
60
+
61
+ ### API
62
+
63
+ OpenCompass supports different commercial model API calls, which you can install via pip or by referring to the [API dependencies](https://github.com/open-compass/opencompass/blob/main/requirements/api.txt) for specific API model dependencies.
64
+
65
+ ```bash
66
+ pip install "opencompass[api]"
67
+
68
+ # pip install openai # GPT-3.5-Turbo / GPT-4-Turbo / GPT-4 / GPT-4o (API)
69
+ # pip install anthropic # Claude (API)
70
+ # pip install dashscope # Qwen (API)
71
+ # pip install volcengine-python-sdk # ByteDance Volcano Engine (API)
72
+ # ...
73
+ ```
74
+
75
+ ### Datasets
76
+
77
+ The basic installation supports most fundamental datasets. For certain datasets (e.g., Alpaca-eval, Longbench, etc.), additional dependencies need to be installed.
78
+
79
+ You can install these through pip or refer to the [additional dependencies](https://github.com/open-compass/opencompass/blob/main/requirements/extra.txt) for specific dependencies.
80
+
81
+ ```bash
82
+ pip install "opencompass[full]"
83
+ ```
84
+
85
+ For HumanEvalX / HumanEval+ / MBPP+, you need to manually clone the Git repository and install it.
86
+
87
+ ```bash
88
+ git clone --recurse-submodules git@github.com:open-compass/human-eval.git
89
+ cd human-eval
90
+ pip install -e .
91
+ pip install -e evalplus
92
+ ```
93
+
94
+ Some agent evaluations require installing numerous dependencies, which may conflict with existing runtime environments. We recommend creating separate conda environments to manage these.
95
+
96
+ ```bash
97
+ # T-Eval
98
+ pip install lagent==0.1.2
99
+ # CIBench
100
+ pip install -r requirements/agent.txt
101
+ ```
102
+
103
+ # Dataset Preparation
104
+
105
+ The datasets supported by OpenCompass mainly include three parts:
106
+
107
+ 1. Huggingface datasets: The [Huggingface Datasets](https://huggingface.co/datasets) provide a large number of datasets, which will **automatically download** when running with this option.
108
+
109
+
110
+ 2. ModelScope Datasets: [ModelScope OpenCompass Dataset](https://modelscope.cn/organization/opencompass) supports automatic downloading of datasets from ModelScope.
111
+
112
+ To enable this feature, set the environment variable: `export DATASET_SOURCE=ModelScope`. The available datasets include (sourced from OpenCompassData-core.zip):
113
+
114
+ ```plain
115
+ humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ceval, math, LCSTS, Xsum, winogrande, openbookqa, AGIEval, gsm8k, nq, race, siqa, mbpp, mmlu, hellaswag, ARC, BBH, xstory_cloze, summedits, GAOKAO-BENCH, OCNLI, cmnli
116
+ ```
117
+
118
+ 3. Custom dataset: OpenCompass also provides some Chinese custom **self-built** datasets. Please run the following command to **manually download and extract** them.
119
+
120
+ Running the following commands to download the datasets and place them in the `${OpenCompass}/data` directory will complete the dataset preparation.
121
+
122
+ ```bash
123
+ # Run in the OpenCompass directory
124
+ wget https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-core-20240207.zip
125
+ unzip OpenCompassData-core-20240207.zip
126
+ ```
127
+
128
+ If you need to use the more comprehensive dataset (~500M) provided by OpenCompass, You can download and `unzip` it using the following command:
129
+
130
+ ```bash
131
+ # For proxy and resumable downloads, try `aria2c -x16 -s16 -k1M "http://ghfast.top/https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-complete-20240207.zip" `
132
+ wget https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-complete-20240207.zip
133
+ unzip OpenCompassData-complete-20240207.zip
134
+ cd ./data
135
+ find . -name "*.zip" -exec unzip "{}" \;
136
+ ```
137
+
138
+ The list of datasets included in both `.zip` can be found [here](https://github.com/open-compass/opencompass/releases/tag/0.2.2.rc1)
139
+
140
+ OpenCompass has supported most of the datasets commonly used for performance comparison, please refer to `configs/dataset` for the specific list of supported datasets.
141
+
142
+ For next step, please read [Quick Start](./quick_start.md).
docs/en/get_started/quick_start.md ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Quick Start
2
+
3
+ ![image](https://github.com/open-compass/opencompass/assets/22607038/d063cae0-3297-4fd2-921a-366e0a24890b)
4
+
5
+ ## Overview
6
+
7
+ OpenCompass provides a streamlined workflow for evaluating a model, which consists of the following stages: **Configure** -> **Inference** -> **Evaluation** -> **Visualization**.
8
+
9
+ **Configure**: This is your starting point. Here, you'll set up the entire evaluation process, choosing the model(s) and dataset(s) to assess. You also have the option to select an evaluation strategy, the computation backend, and define how you'd like the results displayed.
10
+
11
+ **Inference & Evaluation**: OpenCompass efficiently manages the heavy lifting, conducting parallel inference and evaluation on your chosen model(s) and dataset(s). The **Inference** phase is all about producing outputs from your datasets, whereas the **Evaluation** phase measures how well these outputs align with the gold standard answers. While this procedure is broken down into multiple "tasks" that run concurrently for greater efficiency, be aware that working with limited computational resources might introduce some unexpected overheads, resulting in generally slower evaluation. To understand this issue and know how to solve it, check out [FAQ: Efficiency](faq.md#efficiency).
12
+
13
+ **Visualization**: Once the evaluation is done, OpenCompass collates the results into an easy-to-read table and saves them as both CSV and TXT files. If you need real-time updates, you can activate lark reporting and get immediate status reports in your Lark clients.
14
+
15
+ Coming up, we'll walk you through the basics of OpenCompass, showcasing evaluations of pretrained models [OPT-125M](https://huggingface.co/facebook/opt-125m) and [OPT-350M](https://huggingface.co/facebook/opt-350m) on the [SIQA](https://huggingface.co/datasets/social_i_qa) and [Winograd](https://huggingface.co/datasets/winograd_wsc) benchmark tasks. Their configuration files can be found at [configs/eval_demo.py](https://github.com/open-compass/opencompass/blob/main/configs/eval_demo.py).
16
+
17
+ Before running this experiment, please make sure you have installed OpenCompass locally and it should run successfully under one _GTX-1660-6G_ GPU.
18
+ For larger parameterized models like Llama-7B, refer to other examples provided in the [configs directory](https://github.com/open-compass/opencompass/tree/main/configs).
19
+
20
+ ## Configuring an Evaluation Task
21
+
22
+ In OpenCompass, each evaluation task consists of the model to be evaluated and the dataset. The entry point for evaluation is `run.py`. Users can select the model and dataset to be tested either via command line or configuration files.
23
+
24
+ `````{tabs}
25
+ ````{tab} Command Line (Custom HF Model)
26
+
27
+ For HuggingFace models, users can set model parameters directly through the command line without additional configuration files. For instance, for the `facebook/opt-125m` model, you can evaluate it with the following command:
28
+
29
+ ```bash
30
+ python run.py --datasets siqa_gen winograd_ppl \
31
+ --hf-type base \
32
+ --hf-path facebook/opt-125m
33
+ ```
34
+
35
+ Note that in this way, OpenCompass only evaluates one model at a time, while other ways can evaluate multiple models at once.
36
+
37
+ ```{caution}
38
+ `--hf-num-gpus` does not stand for the actual number of GPUs to use in evaluation, but the minimum required number of GPUs for this model. [More](faq.md#how-does-opencompass-allocate-gpus)
39
+ ```
40
+
41
+ :::{dropdown} More detailed example
42
+ :animate: fade-in-slide-down
43
+ ```bash
44
+ python run.py --datasets siqa_gen winograd_ppl \
45
+ --hf-type base \ # HuggingFace model type, base or chat
46
+ --hf-path facebook/opt-125m \ # HuggingFace model path
47
+ --tokenizer-path facebook/opt-125m \ # HuggingFace tokenizer path (if the same as the model path, can be omitted)
48
+ --tokenizer-kwargs padding_side='left' truncation='left' trust_remote_code=True \ # Arguments to construct the tokenizer
49
+ --model-kwargs device_map='auto' \ # Arguments to construct the model
50
+ --max-seq-len 2048 \ # Maximum sequence length the model can accept
51
+ --max-out-len 100 \ # Maximum number of tokens to generate
52
+ --min-out-len 100 \ # Minimum number of tokens to generate
53
+ --batch-size 64 \ # Batch size
54
+ --hf-num-gpus 1 # Number of GPUs required to run the model
55
+ ```
56
+ ```{seealso}
57
+ For all HuggingFace related parameters supported by `run.py`, please read [Launching Evaluation Task](../user_guides/experimentation.md#launching-an-evaluation-task).
58
+ ```
59
+ :::
60
+
61
+ ````
62
+ ````{tab} Command Line
63
+
64
+ Users can combine the models and datasets they want to test using `--models` and `--datasets`.
65
+
66
+ ```bash
67
+ python run.py --models hf_opt_125m hf_opt_350m --datasets siqa_gen winograd_ppl
68
+ ```
69
+
70
+ The models and datasets are pre-stored in the form of configuration files in `configs/models` and `configs/datasets`. Users can view or filter the currently available model and dataset configurations using `tools/list_configs.py`.
71
+
72
+ ```bash
73
+ # List all configurations
74
+ python tools/list_configs.py
75
+ # List all configurations related to llama and mmlu
76
+ python tools/list_configs.py llama mmlu
77
+ ```
78
+
79
+ :::{dropdown} More about `list_configs`
80
+ :animate: fade-in-slide-down
81
+
82
+ Running `python tools/list_configs.py llama mmlu` gives the output like:
83
+
84
+ ```text
85
+ +-----------------+-----------------------------------+
86
+ | Model | Config Path |
87
+ |-----------------+-----------------------------------|
88
+ | hf_llama2_13b | configs/models/hf_llama2_13b.py |
89
+ | hf_llama2_70b | configs/models/hf_llama2_70b.py |
90
+ | ... | ... |
91
+ +-----------------+-----------------------------------+
92
+ +-------------------+---------------------------------------------------+
93
+ | Dataset | Config Path |
94
+ |-------------------+---------------------------------------------------|
95
+ | cmmlu_gen | configs/datasets/cmmlu/cmmlu_gen.py |
96
+ | cmmlu_gen_ffe7c0 | configs/datasets/cmmlu/cmmlu_gen_ffe7c0.py |
97
+ | ... | ... |
98
+ +-------------------+---------------------------------------------------+
99
+ ```
100
+
101
+ Users can use the names in the first column as input parameters for `--models` and `--datasets` in `python run.py`. For datasets, the same name with different suffixes generally indicates that its prompts or evaluation methods are different.
102
+ :::
103
+
104
+ :::{dropdown} Model not on the list?
105
+ :animate: fade-in-slide-down
106
+
107
+ If you want to evaluate other models, please check out the "Command Line (Custom HF Model)" tab for the way to construct a custom HF model without a configuration file, or "Configuration File" tab to learn the general way to prepare your model configurations.
108
+
109
+ :::
110
+
111
+ ````
112
+
113
+ ````{tab} Configuration File
114
+
115
+ In addition to configuring the experiment through the command line, OpenCompass also allows users to write the full configuration of the experiment in a configuration file and run it directly through `run.py`. The configuration file is organized in Python format and must include the `datasets` and `models` fields.
116
+
117
+ The test configuration for this time is [configs/eval_demo.py](https://github.com/open-compass/opencompass/blob/main/configs/eval_demo.py). This configuration introduces the required dataset and model configurations through the [inheritance mechanism](../user_guides/config.md#inheritance-mechanism) and combines the `datasets` and `models` fields in the required format.
118
+
119
+ ```python
120
+ from mmengine.config import read_base
121
+
122
+ with read_base():
123
+ from .datasets.siqa.siqa_gen import siqa_datasets
124
+ from .datasets.winograd.winograd_ppl import winograd_datasets
125
+ from .models.opt.hf_opt_125m import opt125m
126
+ from .models.opt.hf_opt_350m import opt350m
127
+
128
+ datasets = [*siqa_datasets, *winograd_datasets]
129
+ models = [opt125m, opt350m]
130
+ ```
131
+
132
+ When running tasks, we just need to pass the path of the configuration file to `run.py`:
133
+
134
+ ```bash
135
+ python run.py configs/eval_demo.py
136
+ ```
137
+
138
+ :::{dropdown} More about `models`
139
+ :animate: fade-in-slide-down
140
+
141
+ OpenCompass provides a series of pre-defined model configurations under `configs/models`. Below is the configuration snippet related to [opt-350m](https://github.com/open-compass/opencompass/blob/main/configs/models/opt/hf_opt_350m.py) (`configs/models/opt/hf_opt_350m.py`):
142
+
143
+ ```python
144
+ # Evaluate models supported by HuggingFace's `AutoModelForCausalLM` using `HuggingFaceBaseModel`
145
+ from opencompass.models import HuggingFaceBaseModel
146
+
147
+ models = [
148
+ # OPT-350M
149
+ dict(
150
+ type=HuggingFaceBaseModel,
151
+ # Initialization parameters for `HuggingFaceBaseModel`
152
+ path='facebook/opt-350m',
153
+ # Below are common parameters for all models, not specific to HuggingFaceBaseModel
154
+ abbr='opt-350m-hf', # Model abbreviation
155
+ max_out_len=1024, # Maximum number of generated tokens
156
+ batch_size=32, # Batch size
157
+ run_cfg=dict(num_gpus=1), # The required GPU numbers for this model
158
+ )
159
+ ]
160
+ ```
161
+
162
+ When using configurations, we can specify the relevant files through the command-line argument ` --models` or import the model configurations into the `models` list in the configuration file using the inheritance mechanism.
163
+
164
+ ```{seealso}
165
+ More information about model configuration can be found in [Prepare Models](../user_guides/models.md).
166
+ ```
167
+ :::
168
+
169
+ :::{dropdown} More about `datasets`
170
+ :animate: fade-in-slide-down
171
+
172
+ Similar to models, dataset configuration files are provided under `configs/datasets`. Users can use `--datasets` in the command line or import related configurations in the configuration file via inheritance.
173
+
174
+ Below is a dataset-related configuration snippet from `configs/eval_demo.py`:
175
+
176
+ ```python
177
+ from mmengine.config import read_base # Use mmengine.read_base() to read the base configuration
178
+
179
+ with read_base():
180
+ # Directly read the required dataset configurations from the preset dataset configurations
181
+ from .datasets.winograd.winograd_ppl import winograd_datasets # Read Winograd configuration, evaluated based on PPL (perplexity)
182
+ from .datasets.siqa.siqa_gen import siqa_datasets # Read SIQA configuration, evaluated based on generation
183
+
184
+ datasets = [*siqa_datasets, *winograd_datasets] # The final config needs to contain the required evaluation dataset list 'datasets'
185
+ ```
186
+
187
+ Dataset configurations are typically of two types: 'ppl' and 'gen', indicating the evaluation method used. Where `ppl` means discriminative evaluation and `gen` means generative evaluation.
188
+
189
+ Moreover, [configs/datasets/collections](https://github.com/open-compass/opencompass/blob/main/configs/datasets/collections) houses various dataset collections, making it convenient for comprehensive evaluations. OpenCompass often uses [`base_medium.py`](/configs/datasets/collections/base_medium.py) for full-scale model testing. To replicate results, simply import that file, for example:
190
+
191
+ ```bash
192
+ python run.py --models hf_llama_7b --datasets base_medium
193
+ ```
194
+
195
+ ```{seealso}
196
+ You can find more information from [Dataset Preparation](../user_guides/datasets.md).
197
+ ```
198
+ :::
199
+
200
+
201
+ ````
202
+
203
+ `````
204
+
205
+ ```{warning}
206
+ OpenCompass usually assumes network is available. If you encounter network issues or wish to run OpenCompass in an offline environment, please refer to [FAQ - Network - Q1](./faq.md#network) for solutions.
207
+ ```
208
+
209
+ The following sections will use configuration-based method as an example to explain the other features.
210
+
211
+ ## Launching Evaluation
212
+
213
+ Since OpenCompass launches evaluation processes in parallel by default, we can start the evaluation in `--debug` mode for the first run and check if there is any problem. In `--debug` mode, the tasks will be executed sequentially and output will be printed in real time.
214
+
215
+ ```bash
216
+ python run.py configs/eval_demo.py -w outputs/demo --debug
217
+ ```
218
+
219
+ The pretrained models 'facebook/opt-350m' and 'facebook/opt-125m' will be automatically downloaded from HuggingFace during the first run.
220
+ If everything is fine, you should see "Starting inference process" on screen:
221
+
222
+ ```bash
223
+ [2023-07-12 18:23:55,076] [opencompass.openicl.icl_inferencer.icl_gen_inferencer] [INFO] Starting inference process...
224
+ ```
225
+
226
+ Then you can press `ctrl+c` to interrupt the program, and run the following command in normal mode:
227
+
228
+ ```bash
229
+ python run.py configs/eval_demo.py -w outputs/demo
230
+ ```
231
+
232
+ In normal mode, the evaluation tasks will be executed in parallel in the background, and their output will be redirected to the output directory `outputs/demo/{TIMESTAMP}`. The progress bar on the frontend only indicates the number of completed tasks, regardless of their success or failure. **Any backend task failures will only trigger a warning message in the terminal.**
233
+
234
+ :::{dropdown} More parameters in `run.py`
235
+ :animate: fade-in-slide-down
236
+ Here are some parameters related to evaluation that can help you configure more efficient inference tasks based on your environment:
237
+
238
+ - `-w outputs/demo`: Work directory to save evaluation logs and results. In this case, the experiment result will be saved to `outputs/demo/{TIMESTAMP}`.
239
+ - `-r`: Reuse existing inference results, and skip the finished tasks. If followed by a timestamp, the result under that timestamp in the workspace path will be reused; otherwise, the latest result in the specified workspace path will be reused.
240
+ - `--mode all`: Specify a specific stage of the task.
241
+ - all: (Default) Perform a complete evaluation, including inference and evaluation.
242
+ - infer: Perform inference on each dataset.
243
+ - eval: Perform evaluation based on the inference results.
244
+ - viz: Display evaluation results only.
245
+ - `--max-partition-size 2000`: Dataset partition size. Some datasets may be large, and using this parameter can split them into multiple sub-tasks to efficiently utilize resources. However, if the partition is too fine, the overall speed may be slower due to longer model loading times.
246
+ - `--max-num-workers 32`: Maximum number of parallel tasks. In distributed environments such as Slurm, this parameter specifies the maximum number of submitted tasks. In a local environment, it specifies the maximum number of tasks executed in parallel. Note that the actual number of parallel tasks depends on the available GPU resources and may not be equal to this number.
247
+
248
+ If you are not performing the evaluation on your local machine but using a Slurm cluster, you can specify the following parameters:
249
+
250
+ - `--slurm`: Submit tasks using Slurm on the cluster.
251
+ - `--partition(-p) my_part`: Slurm cluster partition.
252
+ - `--retry 2`: Number of retries for failed tasks.
253
+
254
+ ```{seealso}
255
+ The entry also supports submitting tasks to Alibaba Deep Learning Center (DLC), and more customized evaluation strategies. Please refer to [Launching an Evaluation Task](../user_guides/experimentation.md#launching-an-evaluation-task) for details.
256
+ ```
257
+
258
+ :::
259
+
260
+ ## Visualizing Evaluation Results
261
+
262
+ After the evaluation is complete, the evaluation results table will be printed as follows:
263
+
264
+ ```text
265
+ dataset version metric mode opt350m opt125m
266
+ --------- --------- -------- ------ --------- ---------
267
+ siqa e78df3 accuracy gen 21.55 12.44
268
+ winograd b6c7ed accuracy ppl 51.23 49.82
269
+ ```
270
+
271
+ All run outputs will be directed to `outputs/demo/` directory with following structure:
272
+
273
+ ```text
274
+ outputs/default/
275
+ ├── 20200220_120000
276
+ ├── 20230220_183030 # one experiment per folder
277
+ │ ├── configs # Dumped config files for record. Multiple configs may be kept if different experiments have been re-run on the same experiment folder
278
+ │ ├── logs # log files for both inference and evaluation stages
279
+ │ │ ├── eval
280
+ │ │ └── infer
281
+ │   ├── predictions # Prediction results for each task
282
+ │   ├── results # Evaluation results for each task
283
+ │   └── summary # Summarized evaluation results for a single experiment
284
+ ├── ...
285
+ ```
286
+
287
+ The summarization process can be further customized in configuration and output the averaged score of some benchmarks (MMLU, C-Eval, etc.).
288
+
289
+ More information about obtaining evaluation results can be found in [Results Summary](../user_guides/summarizer.md).
290
+
291
+ ## Additional Tutorials
292
+
293
+ To learn more about using OpenCompass, explore the following tutorials:
294
+
295
+ - [Prepare Datasets](../user_guides/datasets.md)
296
+ - [Prepare Models](../user_guides/models.md)
297
+ - [Task Execution and Monitoring](../user_guides/experimentation.md)
298
+ - [Understand Prompts](../prompt/overview.md)
299
+ - [Results Summary](../user_guides/summarizer.md)
300
+ - [Learn about Config](../user_guides/config.md)
docs/en/index.rst ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Welcome to OpenCompass' documentation!
2
+ ==========================================
3
+
4
+ Getting started with OpenCompass
5
+ -------------------------------
6
+
7
+ To help you quickly get familiar with OpenCompass, we recommend you walk through the following documents in order:
8
+
9
+ - First read the GetStarted_ section to set up the environment, and run a mini experiment.
10
+
11
+ - Then learn its basic usage through the UserGuides_.
12
+
13
+ - If you want to tune the prompts, refer to the Prompt_.
14
+
15
+ - If you want to customize some modules, like adding a new dataset or model, we have provided the AdvancedGuides_.
16
+
17
+ - There are more handy tools, such as prompt viewer and lark bot reporter, all presented in Tools_.
18
+
19
+ We always welcome *PRs* and *Issues* for the betterment of OpenCompass.
20
+
21
+ .. _GetStarted:
22
+ .. toctree::
23
+ :maxdepth: 1
24
+ :caption: Get Started
25
+
26
+ get_started/installation.md
27
+ get_started/quick_start.md
28
+ get_started/faq.md
29
+
30
+ .. _UserGuides:
31
+ .. toctree::
32
+ :maxdepth: 1
33
+ :caption: User Guides
34
+
35
+ user_guides/framework_overview.md
36
+ user_guides/config.md
37
+ user_guides/datasets.md
38
+ user_guides/models.md
39
+ user_guides/evaluation.md
40
+ user_guides/experimentation.md
41
+ user_guides/metrics.md
42
+ user_guides/deepseek_r1.md
43
+ user_guides/interns1.md
44
+
45
+ .. _Prompt:
46
+ .. toctree::
47
+ :maxdepth: 1
48
+ :caption: Prompt
49
+
50
+ prompt/overview.md
51
+ prompt/prompt_template.md
52
+ prompt/meta_template.md
53
+ prompt/chain_of_thought.md
54
+
55
+
56
+ .. _AdvancedGuides:
57
+ .. toctree::
58
+ :maxdepth: 1
59
+ :caption: Advanced Guides
60
+
61
+ advanced_guides/new_dataset.md
62
+ advanced_guides/custom_dataset.md
63
+ advanced_guides/new_model.md
64
+ advanced_guides/evaluation_lmdeploy.md
65
+ advanced_guides/accelerator_intro.md
66
+ advanced_guides/math_verify.md
67
+ advanced_guides/llm_judge.md
68
+ advanced_guides/code_eval.md
69
+ advanced_guides/code_eval_service.md
70
+ advanced_guides/subjective_evaluation.md
71
+ advanced_guides/persistence.md
72
+
73
+ .. _Tools:
74
+ .. toctree::
75
+ :maxdepth: 1
76
+ :caption: Tools
77
+
78
+ tools.md
79
+
80
+ .. _Dataset List:
81
+ .. toctree::
82
+ :maxdepth: 1
83
+ :caption: Dataset List
84
+
85
+ dataset_statistics.md
86
+
87
+ .. _Notes:
88
+ .. toctree::
89
+ :maxdepth: 1
90
+ :caption: Notes
91
+
92
+ notes/contribution_guide.md
93
+ notes/academic.md
94
+
95
+ Indexes & Tables
96
+ ==================
97
+
98
+ * :ref:`genindex`
99
+ * :ref:`search`
docs/en/notes/academic.md ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Guide to Reproducing CompassAcademic Leaderboard Results
2
+
3
+ To provide users with a quick and intuitive overview of the performance of mainstream open-source and commercial models on widely-used datasets, we maintain the [CompassAcademic Leaderboard](https://rank.opencompass.org.cn/leaderboard-llm-academic/?m=REALTIME) for LLMs on our official website, updating it typically every two weeks.
4
+
5
+ Given the continuous iteration of models and datasets, along with ongoing upgrades to the OpenCompass, the configuration settings for the CompassAcademic leaderboard may evolve. Specifically, we adhere to the following update principles:
6
+
7
+ - Newly released models are promptly included, while models published six months to one year (or more) ago are removed from the leaderboard.
8
+ - New datasets are incorporated, while datasets nearing performance saturation are phased out.
9
+ - Existing evaluation results on the leaderboard are updated in sync with changes to the evaluation configuration.
10
+
11
+ To support rapid reproducibility, OpenCompass provides the real-time configuration files used in the academic leaderboard.
12
+
13
+ ## CompassAcademic Leaderboard Reproduction
14
+
15
+ [eval_academic_leaderboard_REALTIME.py](https://github.com/open-compass/opencompass/blob/main/examples/eval_academic_leaderboard_REALTIME.py) contains the configuration currently used for academic ranking evaluation. You can replicate the evaluation by following the steps as follows.
16
+
17
+ ### 1: Model Configs
18
+
19
+ Firstly, modify the Model List code block in [eval_academic_leaderboard_REALTIME.py](https://github.com/open-compass/opencompass/blob/main/examples/eval_academic_leaderboard_REALTIME.py) to include the model you wish to evaluate.
20
+
21
+ ```python
22
+ # Models (add your models here)
23
+ from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
24
+ models as hf_internlm2_5_7b_chat_model
25
+ ```
26
+
27
+ The original example calls an lmdeploy-based model configuration in OpenCompass.
28
+ You can also build your new model configuration based on [this document](https://opencompass.readthedocs.io/zh-cn/latest/user_guides/models.html).
29
+ An example of a configuration that calls the deployed service of Qwen3-235B-A22B based on OpenAISDK is as follows:
30
+
31
+ ```python
32
+ from opencompass.models import OpenAISDK
33
+ from opencompass.utils.text_postprocessors import extract_non_reasoning_content
34
+
35
+ qwen3_235b_a22b_model = dict(
36
+ abbr="qwen_3_235b_a22b_thinking", # Used to identify the model configuration
37
+ key="YOUR_SERVE_API_KEY",
38
+ openai_api_base="YOUR_SERVE_API_URL",
39
+ type=OpenAISDK, # The model configuration types, commonly used such as OpenAISDK, TurboMindModelwithChatTemplate, HuggingFacewithChatTemplate
40
+ path="Qwen/Qwen3-235B-A22B",
41
+ temperature=0.6,
42
+ meta_template=dict(
43
+ round=[
44
+ dict(role='HUMAN', api_role='HUMAN'),
45
+ dict(role='BOT', api_role='BOT', generate=True),
46
+ ],
47
+ ),
48
+ query_per_second=1,
49
+ max_out_len=32000,
50
+ max_seq_len=32768,
51
+ batch_size=8,
52
+ retry=10,
53
+ extra_body={
54
+ 'chat_template_kwargs': {'enable_thinking': True},
55
+ }, # Additional configurations of the model, such as the option in Qwen3 series to control whether they think or not
56
+ pred_postprocessor=dict(type=extract_non_reasoning_content), # adding this pred_postprocessor can extract the non-reasoning content from models that output with a think tag
57
+ )
58
+
59
+ models = [
60
+ qwen3_235b_a22b_model,
61
+ ]
62
+ ```
63
+
64
+ Here are the commonly used parameters for reference.
65
+
66
+ - `max_seq_len` = 65536 or 32768
67
+ - `max_out_len` = 64000 or 32000
68
+ - `temperature` = 0.6
69
+ - `top_p` = 0.95
70
+
71
+ ### 2: Verifier Configs
72
+
73
+ Complete your verifier model information in `judge_cfg`.
74
+ For detailed information about LLM verifiers, please refer to [this document](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/llm_judge.html).
75
+ At present, CompassAcademic uses [CompassVerifier-32B](https://huggingface.co/opencompass/CompassVerifier-32B), here is the config example using OpenAISDK:
76
+
77
+ ```python
78
+ judge_cfg = dict(
79
+ abbr='CompassVerifier',
80
+ type=OpenAISDK,
81
+ path='opencompass/CompassVerifier-32B',
82
+ key='YOUR_API_KEY',
83
+ openai_api_base='YOUR_API_BASE',
84
+ meta_template=dict(
85
+ round=[
86
+ dict(role='HUMAN', api_role='HUMAN'),
87
+ dict(role='BOT', api_role='BOT', generate=True),
88
+ ]),
89
+ query_per_second=1,
90
+ batch_size=8,
91
+ temperature=0.001,
92
+ max_out_len=8192,
93
+ max_seq_len=32768,
94
+ mode='mid',
95
+ )
96
+ ```
97
+
98
+ ### 3: Execute evaluation
99
+
100
+ After completing the above configuration file, you can enter the following content in the CLI to start the evaluation:
101
+
102
+ ```bash
103
+ opencompass examples/eval_academic_leaderboard_REALTIME.py
104
+ ```
105
+
106
+ For more detailed CLI parameters, please refer to [this document](https://opencompass.readthedocs.io/zh-cn/latest/user_guides/experimentation.html).
docs/en/notes/contribution_guide.md ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributing to OpenCompass
2
+
3
+ - [Contributing to OpenCompass](#contributing-to-opencompass)
4
+ - [What is PR](#what-is-pr)
5
+ - [Basic Workflow](#basic-workflow)
6
+ - [Procedures in detail](#procedures-in-detail)
7
+ - [1. Get the most recent codebase](#1-get-the-most-recent-codebase)
8
+ - [2. Checkout a new branch from `main` branch](#2-checkout-a-new-branch-from-main-branch)
9
+ - [3. Commit your changes](#3-commit-your-changes)
10
+ - [4. Push your changes to the forked repository and create a PR](#4-push-your-changes-to-the-forked-repository-and-create-a-pr)
11
+ - [5. Discuss and review your code](#5-discuss-and-review-your-code)
12
+ - [6. Merge your branch to `main` branch and delete the branch](#6--merge-your-branch-to-main-branch-and-delete-the-branch)
13
+ - [Code style](#code-style)
14
+ - [Python](#python)
15
+ - [About Contributing Test Datasets](#about-contributing-test-datasets)
16
+
17
+ Thanks for your interest in contributing to OpenCompass! All kinds of contributions are welcome, including but not limited to the following.
18
+
19
+ - Fix typo or bugs
20
+ - Add documentation or translate the documentation into other languages
21
+ - Add new features and components
22
+
23
+ ## What is PR
24
+
25
+ `PR` is the abbreviation of `Pull Request`. Here's the definition of `PR` in the [official document](https://docs.github.com/en/github/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests) of Github.
26
+
27
+ ```
28
+ Pull requests let you tell others about changes you have pushed to a branch in a repository on GitHub. Once a pull request is opened, you can discuss and review the potential changes with collaborators and add follow-up commits before your changes are merged into the base branch.
29
+ ```
30
+
31
+ ## Basic Workflow
32
+
33
+ 1. Get the most recent codebase
34
+ 2. Checkout a new branch from `main` branch.
35
+ 3. Commit your changes ([Don't forget to use pre-commit hooks!](#3-commit-your-changes))
36
+ 4. Push your changes and create a PR
37
+ 5. Discuss and review your code
38
+ 6. Merge your branch to `main` branch
39
+
40
+ ## Procedures in detail
41
+
42
+ ### 1. Get the most recent codebase
43
+
44
+ - When you work on your first PR
45
+
46
+ Fork the OpenCompass repository: click the **fork** button at the top right corner of Github page
47
+ ![avatar](https://github.com/open-compass/opencompass/assets/22607038/851ed33d-02db-49c9-bf94-7c62eee89eb2)
48
+
49
+ Clone forked repository to local
50
+
51
+ ```bash
52
+ git clone git@github.com:XXX/opencompass.git
53
+ ```
54
+
55
+ Add source repository to upstream
56
+
57
+ ```bash
58
+ git remote add upstream git@github.com:InternLM/opencompass.git
59
+ ```
60
+
61
+ - After your first PR
62
+
63
+ Checkout the latest branch of the local repository and pull the latest branch of the source repository.
64
+
65
+ ```bash
66
+ git checkout main
67
+ git pull upstream main
68
+ ```
69
+
70
+ ### 2. Checkout a new branch from `main` branch
71
+
72
+ ```bash
73
+ git checkout main -b branchname
74
+ ```
75
+
76
+ ### 3. Commit your changes
77
+
78
+ - If you are a first-time contributor, please install and initialize pre-commit hooks from the repository root directory first.
79
+
80
+ ```bash
81
+ pip install -U pre-commit
82
+ pre-commit install
83
+ ```
84
+
85
+ - Commit your changes as usual. Pre-commit hooks will be triggered to stylize your code before each commit.
86
+
87
+ ```bash
88
+ # coding
89
+ git add [files]
90
+ git commit -m 'messages'
91
+ ```
92
+
93
+ ```{note}
94
+ Sometimes your code may be changed by pre-commit hooks. In this case, please remember to re-stage the modified files and commit again.
95
+ ```
96
+
97
+ ### 4. Push your changes to the forked repository and create a PR
98
+
99
+ - Push the branch to your forked remote repository
100
+
101
+ ```bash
102
+ git push origin branchname
103
+ ```
104
+
105
+ - Create a PR
106
+ ![avatar](https://github.com/open-compass/opencompass/assets/22607038/08feb221-b145-4ea8-8e20-05f143081604)
107
+
108
+ - Revise PR message template to describe your motivation and modifications made in this PR. You can also link the related issue to the PR manually in the PR message (For more information, checkout the [official guidance](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue)).
109
+
110
+ - You can also ask a specific person to review the changes you've proposed.
111
+
112
+ ### 5. Discuss and review your code
113
+
114
+ - Modify your codes according to reviewers' suggestions and then push your changes.
115
+
116
+ ### 6. Merge your branch to `main` branch and delete the branch
117
+
118
+ - After the PR is merged by the maintainer, you can delete the branch you created in your forked repository.
119
+
120
+ ```bash
121
+ git branch -d branchname # delete local branch
122
+ git push origin --delete branchname # delete remote branch
123
+ ```
124
+
125
+ ## Code style
126
+
127
+ ### Python
128
+
129
+ We adopt [PEP8](https://www.python.org/dev/peps/pep-0008/) as the preferred code style.
130
+
131
+ We use the following tools for linting and formatting:
132
+
133
+ - [flake8](https://github.com/PyCQA/flake8): A wrapper around some linter tools.
134
+ - [isort](https://github.com/timothycrosley/isort): A Python utility to sort imports.
135
+ - [yapf](https://github.com/google/yapf): A formatter for Python files.
136
+ - [codespell](https://github.com/codespell-project/codespell): A Python utility to fix common misspellings in text files.
137
+ - [mdformat](https://github.com/executablebooks/mdformat): Mdformat is an opinionated Markdown formatter that can be used to enforce a consistent style in Markdown files.
138
+ - [docformatter](https://github.com/myint/docformatter): A formatter to format docstring.
139
+
140
+ Style configurations of yapf and isort can be found in [setup.cfg](https://github.com/open-mmlab/OpenCompass/blob/main/setup.cfg).
141
+
142
+ ## About Contributing Test Datasets
143
+
144
+ - Submitting Test Datasets
145
+ - Please implement logic for automatic dataset downloading in the code; or provide a method for obtaining the dataset in the PR. The OpenCompass maintainers will follow up accordingly. If the dataset is not yet public, please indicate so.
146
+ - Submitting Data Configuration Files
147
+ - Provide a README in the same directory as the data configuration. The README should include, but is not limited to:
148
+ - A brief description of the dataset
149
+ - The official link to the dataset
150
+ - Some test examples from the dataset
151
+ - Evaluation results of the dataset on relevant models
152
+ - Citation of the dataset
153
+ - (Optional) Summarizer of the dataset
154
+ - (Optional) If the testing process cannot be achieved simply by concatenating the dataset and model configuration files, a configuration file for conducting the test is also required.
155
+ - (Optional) If necessary, please add a description of the dataset in the relevant documentation sections. This is very necessary to help users understand the testing scheme. You can refer to the following types of documents in OpenCompass:
156
+ - [Circular Evaluation](../advanced_guides/circular_eval.md)
157
+ - [Code Evaluation](../advanced_guides/code_eval.md)
158
+ - [Contamination Assessment](../advanced_guides/contamination_eval.md)
docs/en/notes/news.md ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # News
2
+
3
+ - **\[2024.05.08\]** We supported the evaluation of 4 MoE models: [Mixtral-8x22B-v0.1](configs/models/mixtral/hf_mixtral_8x22b_v0_1.py), [Mixtral-8x22B-Instruct-v0.1](configs/models/mixtral/hf_mixtral_8x22b_instruct_v0_1.py), [Qwen1.5-MoE-A2.7B](configs/models/qwen/hf_qwen1_5_moe_a2_7b.py), [Qwen1.5-MoE-A2.7B-Chat](configs/models/qwen/hf_qwen1_5_moe_a2_7b_chat.py). Try them out now!
4
+ - **\[2024.04.30\]** We supported evaluating a model's compression efficiency by calculating its Bits per Character (BPC) metric on an [external corpora](configs/datasets/llm_compression/README.md) ([official paper](https://github.com/hkust-nlp/llm-compression-intelligence)). Check out the [llm-compression](configs/eval_llm_compression.py) evaluation config now! 🔥🔥🔥
5
+ - **\[2024.04.29\]** We report the performance of several famous LLMs on the common benchmarks, welcome to [documentation](https://opencompass.readthedocs.io/en/latest/user_guides/corebench.html) for more information! 🔥🔥🔥.
6
+ - **\[2024.04.26\]** We deprecated the multi-modality evaluating function from OpenCompass, the related implementation has moved to [VLMEvalKit](https://github.com/open-compass/VLMEvalKit), welcome to use! 🔥🔥🔥.
7
+ - **\[2024.04.26\]** We supported the evaluation of [ArenaHard](configs/eval_subjective_arena_hard.py) welcome to try!🔥🔥🔥.
8
+ - **\[2024.04.22\]** We supported the evaluation of [LLaMA3](configs/models/hf_llama/hf_llama3_8b.py) and [LLaMA3-Instruct](configs/models/hf_llama/hf_llama3_8b_instruct.py), welcome to try! 🔥🔥🔥
9
+ - **\[2024.02.29\]** We supported the MT-Bench, AlpacalEval and AlignBench, more information can be found [here](https://opencompass.readthedocs.io/en/latest/advanced_guides/subjective_evaluation.html)
10
+ - **\[2024.01.30\]** We release OpenCompass 2.0. Click [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home) for more information !
11
+ - **\[2024.01.17\]** We supported the evaluation of [InternLM2](https://github.com/open-compass/opencompass/blob/main/configs/eval_internlm2_keyset.py) and [InternLM2-Chat](https://github.com/open-compass/opencompass/blob/main/configs/eval_internlm2_chat_keyset.py), InternLM2 showed extremely strong performance in these tests, welcome to try!
12
+ - **\[2024.01.17\]** We supported the needle in a haystack test with multiple needles, more information can be found [here](https://opencompass.readthedocs.io/en/latest/advanced_guides/needleinahaystack_eval.html#id8).
13
+ - **\[2023.12.28\]** We have enabled seamless evaluation of all models developed using [LLaMA2-Accessory](https://github.com/Alpha-VLLM/LLaMA2-Accessory), a powerful toolkit for comprehensive LLM development.
14
+ - **\[2023.12.22\]** We have released [T-Eval](https://github.com/open-compass/T-Eval), a step-by-step evaluation benchmark to gauge your LLMs on tool utilization. Welcome to our [Leaderboard](https://open-compass.github.io/T-Eval/leaderboard.html) for more details!
15
+ - **\[2023.12.10\]** We have released [VLMEvalKit](https://github.com/open-compass/VLMEvalKit), a toolkit for evaluating vision-language models (VLMs), currently support 20+ VLMs and 7 multi-modal benchmarks (including MMBench series).
16
+ - **\[2023.12.10\]** We have supported Mistral AI's MoE LLM: **Mixtral-8x7B-32K**. Welcome to [MixtralKit](https://github.com/open-compass/MixtralKit) for more details about inference and evaluation.
17
+ - **\[2023.11.22\]** We have supported many API-based models, include **Baidu, ByteDance, Huawei, 360**. Welcome to [Models](https://opencompass.readthedocs.io/en/latest/user_guides/models.html) section for more details.
18
+ - **\[2023.11.20\]** Thanks [helloyongyang](https://github.com/helloyongyang) for supporting the evaluation with [LightLLM](https://github.com/ModelTC/lightllm) as backend. Welcome to [Evaluation With LightLLM](https://opencompass.readthedocs.io/en/latest/advanced_guides/evaluation_lightllm.html) for more details.
19
+ - **\[2023.11.13\]** We are delighted to announce the release of OpenCompass v0.1.8. This version enables local loading of evaluation benchmarks, thereby eliminating the need for an internet connection. Please note that with this update, **you must re-download all evaluation datasets** to ensure accurate and up-to-date results.
20
+ - **\[2023.11.06\]** We have supported several API-based models, include **ChatGLM Pro@Zhipu, ABAB-Chat@MiniMax and Xunfei**. Welcome to [Models](https://opencompass.readthedocs.io/en/latest/user_guides/models.html) section for more details.
21
+ - **\[2023.10.24\]** We release a new benchmark for evaluating LLMs’ capabilities of having multi-turn dialogues. Welcome to [BotChat](https://github.com/open-compass/BotChat) for more details.
22
+ - **\[2023.09.26\]** We update the leaderboard with [Qwen](https://github.com/QwenLM/Qwen), one of the best-performing open-source models currently available, welcome to our [homepage](https://opencompass.org.cn) for more details.
23
+ - **\[2023.09.20\]** We update the leaderboard with [InternLM-20B](https://github.com/InternLM/InternLM), welcome to our [homepage](https://opencompass.org.cn) for more details.
24
+ - **\[2023.09.19\]** We update the leaderboard with WeMix-LLaMA2-70B/Phi-1.5-1.3B, welcome to our [homepage](https://opencompass.org.cn) for more details.
25
+ - **\[2023.09.18\]** We have released [long context evaluation guidance](docs/en/advanced_guides/longeval.md).
26
+ - **\[2023.09.08\]** We update the leaderboard with Baichuan-2/Tigerbot-2/Vicuna-v1.5, welcome to our [homepage](https://opencompass.org.cn) for more details.
27
+ - **\[2023.09.06\]** [**Baichuan2**](https://github.com/baichuan-inc/Baichuan2) team adopts OpenCompass to evaluate their models systematically. We deeply appreciate the community's dedication to transparency and reproducibility in LLM evaluation.
28
+ - **\[2023.09.02\]** We have supported the evaluation of [Qwen-VL](https://github.com/QwenLM/Qwen-VL) in OpenCompass.
29
+ - **\[2023.08.25\]** [**TigerBot**](https://github.com/TigerResearch/TigerBot) team adopts OpenCompass to evaluate their models systematically. We deeply appreciate the community's dedication to transparency and reproducibility in LLM evaluation.
30
+ - **\[2023.08.21\]** [**Lagent**](https://github.com/InternLM/lagent) has been released, which is a lightweight framework for building LLM-based agents. We are working with Lagent team to support the evaluation of general tool-use capability, stay tuned!
31
+ - **\[2023.08.18\]** We have supported evaluation for **multi-modality learning**, include **MMBench, SEED-Bench, COCO-Caption, Flickr-30K, OCR-VQA, ScienceQA** and so on. Leaderboard is on the road. Feel free to try multi-modality evaluation with OpenCompass !
32
+ - **\[2023.08.18\]** [Dataset card](https://opencompass.org.cn/dataset-detail/MMLU) is now online. We welcome new evaluation benchmarks to join OpenCompass!
33
+ - **\[2023.08.11\]** [Model comparison](https://opencompass.org.cn/model-compare/GPT-4,ChatGPT,LLaMA-2-70B,LLaMA-65B) is now online. We hope this feature offers deeper insights!
34
+ - **\[2023.08.11\]** We have supported [LEval](https://github.com/OpenLMLab/LEval).
35
+ - **\[2023.08.10\]** OpenCompass is compatible with [LMDeploy](https://github.com/InternLM/lmdeploy). Now you can follow this [instruction](https://opencompass.readthedocs.io/en/latest/advanced_guides/evaluation_lmdeploy.html#) to evaluate the accelerated models provide by the **Turbomind**.
36
+ - **\[2023.08.10\]** We have supported [Qwen-7B](https://github.com/QwenLM/Qwen-7B) and [XVERSE-13B](https://github.com/xverse-ai/XVERSE-13B) ! Go to our [leaderboard](https://opencompass.org.cn/leaderboard-llm) for more results! More models are welcome to join OpenCompass.
37
+ - **\[2023.08.09\]** Several new datasets(**CMMLU, TydiQA, SQuAD2.0, DROP**) are updated on our [leaderboard](https://opencompass.org.cn/leaderboard-llm)! More datasets are welcomed to join OpenCompass.
38
+ - **\[2023.08.07\]** We have added a [script](tools/eval_mmbench.py) for users to evaluate the inference results of [MMBench](https://opencompass.org.cn/MMBench)-dev.
39
+ - **\[2023.08.05\]** We have supported [GPT-4](https://openai.com/gpt-4)! Go to our [leaderboard](https://opencompass.org.cn/leaderboard-llm) for more results! More models are welcome to join OpenCompass.
40
+ - **\[2023.07.27\]** We have supported [CMMLU](https://github.com/haonan-li/CMMLU)! More datasets are welcome to join OpenCompass.
docs/en/prompt/chain_of_thought.md ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Chain of Thought
2
+
3
+ ## Background
4
+
5
+ During the process of reasoning, the CoT (Chain of Thought) method is an efficient way to help LLMs deal with complex questions, such as math problems and relational inference. In OpenCompass, we support multiple types of CoT methods.
6
+
7
+ ![image](https://github.com/open-compass/opencompass/assets/28834990/45d60e0e-02a1-49aa-b792-40a1f95f9b9e)
8
+
9
+ ## 1. Zero Shot CoT
10
+
11
+ You can change the `PromptTemplate` of the dataset config by simply adding *Let's think step by step* to create a Zero-Shot CoT prompt for your evaluation:
12
+
13
+ ```python
14
+ qa_infer_cfg = dict(
15
+ prompt_template=dict(
16
+ type=PromptTemplate,
17
+ template="Answer the question:\nQ: {question}?\nLet's think step by step:\n"
18
+ ),
19
+ retriever=dict(type=ZeroRetriever)
20
+ )
21
+ ```
22
+
23
+ ## 2. Few Shot CoT
24
+
25
+ Few-shot CoT can make it easier for LLMs to follow your instructions and produce better answers. For few-shot CoT, add your CoT template to `PromptTemplate` as in the following config to create a one-shot prompt:
26
+
27
+ ```python
28
+ qa_infer_cfg = dict(
29
+ prompt_template=dict(
30
+ type=PromptTemplate,
31
+ template=
32
+ '''Question: Mark's basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What's the total number of points scored by both teams added together?
33
+ Let's think step by step
34
+ Answer:
35
+ Mark's team scores 25 2 pointers, meaning they scored 25*2= 50 points in 2 pointers.
36
+ His team also scores 8 3 pointers, meaning they scored 8*3= 24 points in 3 pointers
37
+ They scored 10 free throws, and free throws count as one point so they scored 10*1=10 points in free throws.
38
+ All together his team scored 50+24+10= 84 points
39
+ Mark's opponents scored double his team's number of 2 pointers, meaning they scored 50*2=100 points in 2 pointers.
40
+ His opponents scored half his team's number of 3 pointers, meaning they scored 24/2= 12 points in 3 pointers.
41
+ They also scored half Mark's team's points in free throws, meaning they scored 10/2=5 points in free throws.
42
+ All together Mark's opponents scored 100+12+5=117 points
43
+ The total score for the game is both team's scores added together, so it is 84+117=201 points
44
+ The answer is 201
45
+
46
+ Question: {question}\nLet's think step by step:\n{answer}
47
+ '''),
48
+ retriever=dict(type=ZeroRetriever)
49
+ )
50
+ ```
51
+
52
+ ## 3. Self-Consistency
53
+
54
+ The SC (Self-Consistency) method is proposed in [this paper](https://arxiv.org/abs/2203.11171), which samples multiple reasoning paths for the question and applies majority voting to the answers generated by LLMs. This method displays remarkable proficiency on reasoning tasks with high accuracy, but may consume more time and resources during inference because of the majority voting strategy. In OpenCompass, you can easily implement the SC method by replacing `GenInferencer` with `SCInferencer` in the dataset configuration and setting the corresponding parameters like:
55
+
56
+ ```python
57
+ # This SC gsm8k config can be found at: opencompass.configs.datasets.gsm8k.gsm8k_gen_a3e34a.py
58
+ gsm8k_infer_cfg = dict(
59
+ inferencer=dict(
60
+ type=SCInferencer, # Replace GenInferencer with SCInferencer.
61
+ generation_kwargs=dict(do_sample=True, temperature=0.7, top_k=40), # Set sample parameters to make sure model generate various output, only works for models load from HuggingFace now.
62
+ infer_type='SC',
63
+ sc_size = SAMPLE_SIZE
64
+ )
65
+ )
66
+ gsm8k_eval_cfg = dict(sc_size=SAMPLE_SIZE)
67
+ ```
68
+
69
+ ```{note}
70
+ OpenCompass defaults to use argmax for sampling the next token. Therefore, if the sampling parameters are not specified, the model's inference results will be completely consistent each time, and multiple rounds of evaluation will be ineffective.
71
+ ```
72
+
73
+ Where `SAMPLE_SIZE` is the number of reasoning paths in Self-Consistency; a higher value usually yields higher performance. The following figure from the original SC paper demonstrates the relation between the number of reasoning paths and performance on several reasoning tasks:
74
+
75
+ ![image](https://github.com/open-compass/opencompass/assets/28834990/05c7d850-7076-43ca-b165-e6251f9b3001)
76
+
77
+ From the figure, it can be seen that in different reasoning tasks, performance tends to improve as the number of reasoning paths increases. However, for some tasks, increasing the number of reasoning paths may reach a limit, and further increasing the number of paths may not bring significant performance improvement. Therefore, it is necessary to conduct experiments and adjustments on specific tasks to find the optimal number of reasoning paths that best suit the task.
78
+
79
+ ## 4. Tree-of-Thoughts
80
+
81
+ In contrast to the conventional CoT approach that considers only a single reasoning path, Tree-of-Thoughts (ToT) allows the language model to explore multiple diverse reasoning paths simultaneously. The model evaluates the reasoning process through self-assessment and makes global choices by conducting lookahead or backtracking when necessary. Specifically, this process is divided into the following four stages:
82
+
83
+ **1. Thought Decomposition**
84
+
85
+ Based on the nature of the problem, break down the problem into multiple intermediate steps. Each step can be a phrase, equation, or writing plan, depending on the nature of the problem.
86
+
87
+ **2. Thought Generation**
88
+
89
+ Assuming that solving the problem requires k steps, there are two methods to generate reasoning content:
90
+
91
+ - Independent sampling: For each state, the model independently extracts k reasoning contents from the CoT prompts, without relying on other reasoning contents.
92
+ - Sequential generation: Sequentially use "prompts" to guide the generation of reasoning content, where each reasoning content may depend on the previous one.
93
+
94
+ **3. Heuristic Evaluation**
95
+
96
+ Use heuristic methods to evaluate the contribution of each generated reasoning content to problem-solving. This self-evaluation is based on the model's self-feedback and involves designing prompts to have the model score multiple generated results.
97
+
98
+ **4. Search Algorithm Selection**
99
+
100
+ Based on the methods of generating and evaluating reasoning content, select an appropriate search algorithm. For example, you can use breadth-first search (BFS) or depth-first search (DFS) algorithms to systematically explore the thought tree, conducting lookahead and backtracking.
101
+
102
+ In OpenCompass, ToT parameters need to be set according to the requirements. Below is an example configuration for the 24-Point game from the [official paper](https://arxiv.org/pdf/2305.10601.pdf). Currently, ToT inference is supported only with Huggingface models:
103
+
104
+ ```python
105
+ # This ToT Game24 config can be found at: opencompass/configs/datasets/game24/game24_gen_8dfde3.py.
106
+ from opencompass.datasets import (Game24Dataset, game24_postprocess,
107
+ Game24Evaluator, Game24PromptWrapper)
108
+
109
+ generation_kwargs = dict(temperature=0.7)
110
+
111
+ game24_infer_cfg = dict(
112
+ prompt_template=dict(
113
+ type=PromptTemplate,
114
+ template='{input}'), # Directly pass the input content, as the Prompt needs to be specified in steps
115
+ retriever=dict(type=ZeroRetriever),
116
+ inferencer=dict(type=ToTInferencer, # Replace GenInferencer with ToTInferencer
117
+ generation_kwargs=generation_kwargs,
118
+ method_generate='propose', # Method for generating reasoning content, can be independent sampling (sample) or sequential generation (propose)
119
+ method_evaluate='value', # Method for evaluating reasoning content, can be voting (vote) or scoring (value)
120
+ method_select='greedy', # Method for selecting reasoning content, can be greedy (greedy) or random (sample)
121
+ n_evaluate_sample=3,
122
+ n_select_sample=5,
123
+ task_wrapper=dict(type=Game24PromptWrapper) # This Wrapper class includes the prompts for each step and methods for generating and evaluating reasoning content, needs customization according to the task
124
+ ))
125
+ ```
126
+
127
+ If you want to use the ToT method on a custom dataset, you'll need to make additional configurations in the `opencompass.datasets.YourDataConfig.py` file to set up the `YourDataPromptWrapper` class. This is required for handling the thought generation and heuristic evaluation step within the ToT framework. For reasoning tasks similar to the game 24-Point, you can refer to the implementation in `opencompass/datasets/game24.py` for guidance.