danieldk HF Staff committed on Jan 28, 2025

Commit

f1e9385

1 Parent(s): 91deba9

Build

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

build/torch24-cxx11-cu118-x86_64-linux/moe/__init__.py +0 -47
build/{torch25-cxx11-cu118-x86_64-linux/moe/_moe_nskz7v224zllw.abi3.so → torch24-cxx11-cu118-x86_64-linux/moe/_moe_w3lspmuramohg.abi3.so} +1 -1
build/torch24-cxx11-cu118-x86_64-linux/moe/_ops.py +3 -3
build/torch24-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py +46 -6
build/torch24-cxx11-cu118-x86_64-linux/moe/fused_moe.py +2 -2
build/torch24-cxx11-cu121-x86_64-linux/moe/__init__.py +0 -47
build/{torch25-cxx11-cu121-x86_64-linux/moe/_moe_t32bhzwhzero6.abi3.so → torch24-cxx11-cu121-x86_64-linux/moe/_moe_xztwj3vfii47s.abi3.so} +1 -1
build/torch24-cxx11-cu121-x86_64-linux/moe/_ops.py +3 -3
build/torch24-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py +46 -6
build/torch24-cxx11-cu121-x86_64-linux/moe/fused_moe.py +2 -2
build/torch24-cxx11-cu124-x86_64-linux/moe/__init__.py +0 -47
build/{torch25-cxx11-cu124-x86_64-linux/moe/_moe_pgljmg5ek5k4e.abi3.so → torch24-cxx11-cu124-x86_64-linux/moe/_moe_zjfwjryvbxcss.abi3.so} +1 -1
build/torch24-cxx11-cu124-x86_64-linux/moe/_ops.py +3 -3
build/torch24-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py +46 -6
build/torch24-cxx11-cu124-x86_64-linux/moe/fused_moe.py +2 -2
build/torch24-cxx98-cu118-x86_64-linux/moe/__init__.py +0 -47
build/{torch24-cxx11-cu118-x86_64-linux/moe/_moe_wtjc356yopxde.abi3.so → torch24-cxx98-cu118-x86_64-linux/moe/_moe_vjujc4o4hplak.abi3.so} +2 -2
build/torch24-cxx98-cu118-x86_64-linux/moe/_ops.py +3 -3
build/torch24-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py +46 -6
build/torch24-cxx98-cu118-x86_64-linux/moe/fused_moe.py +2 -2
build/torch24-cxx98-cu121-x86_64-linux/moe/__init__.py +0 -47
build/{torch25-cxx98-cu121-x86_64-linux/moe/_moe_plblvprmwqffy.abi3.so → torch24-cxx98-cu121-x86_64-linux/moe/_moe_bjua6v5mj6njy.abi3.so} +1 -1
build/torch24-cxx98-cu121-x86_64-linux/moe/_moe_hrq7opevcb4ug.abi3.so +0 -3
build/torch24-cxx98-cu121-x86_64-linux/moe/_ops.py +3 -3
build/torch24-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py +46 -6
build/torch24-cxx98-cu121-x86_64-linux/moe/fused_moe.py +2 -2
build/torch24-cxx98-cu124-x86_64-linux/moe/__init__.py +0 -47
build/{torch25-cxx98-cu124-x86_64-linux/moe/_moe_k6bmwmtgkqymw.abi3.so → torch24-cxx98-cu124-x86_64-linux/moe/_moe_ajhcvhc2njy6q.abi3.so} +1 -1
build/torch24-cxx98-cu124-x86_64-linux/moe/_ops.py +3 -3
build/torch24-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py +46 -6
build/torch24-cxx98-cu124-x86_64-linux/moe/fused_moe.py +2 -2
build/torch25-cxx11-cu118-x86_64-linux/moe/__init__.py +0 -47
build/{torch24-cxx11-cu121-x86_64-linux/moe/_moe_fidhfyl4jgbje.abi3.so → torch25-cxx11-cu118-x86_64-linux/moe/_moe_wbafjrt24mw7y.abi3.so} +2 -2
build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py +3 -3
build/torch25-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py +46 -6
build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py +2 -2
build/torch25-cxx11-cu121-x86_64-linux/moe/__init__.py +0 -47
build/{torch24-cxx98-cu118-x86_64-linux/moe/_moe_v3wdnwni3a5ce.abi3.so → torch25-cxx11-cu121-x86_64-linux/moe/_moe_ezuwtpw27xv6u.abi3.so} +2 -2
build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py +3 -3
build/torch25-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py +46 -6
build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py +2 -2
build/torch25-cxx11-cu124-x86_64-linux/moe/__init__.py +0 -47
build/{torch24-cxx11-cu124-x86_64-linux/moe/_moe_sg5gu4g3brle6.abi3.so → torch25-cxx11-cu124-x86_64-linux/moe/_moe_b3lelvb3xhtk2.abi3.so} +1 -1
build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py +3 -3
build/torch25-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py +46 -6
build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py +2 -2
build/torch25-cxx98-cu118-x86_64-linux/moe/__init__.py +0 -47
build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_dtibz76vuxaaq.abi3.so +0 -3
build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_mqt4gjnisx6je.abi3.so +3 -0
build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py +3 -3

build/torch24-cxx11-cu118-x86_64-linux/moe/__init__.py CHANGED Viewed

@@ -1,19 +1,5 @@
-from typing import TYPE_CHECKING
 import torch
-# neuron has torch version that doesn't even have impl_abstract
-if TYPE_CHECKING:
-    def register_fake(fn):
-        return lambda name: fn
-else:
-    try:
-        from torch.library import register_fake
-    except ImportError:
-        from torch.library import impl_abstract as register_fake
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
@@ -91,39 +77,6 @@ def topk_softmax(
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
-if hasattr(ops, "marlin_gemm_moe"):
-    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
-    def marlin_gemm_moe_fake(
-        a: torch.Tensor,
-        b_q_weights: torch.Tensor,
-        sorted_ids: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        b_scales: torch.Tensor,
-        b_zero_points: torch.Tensor,
-        g_idx: torch.Tensor,
-        perm: torch.Tensor,
-        workspace: torch.Tensor,
-        b_q_type: ScalarType,
-        size_m: torch.SymInt,
-        size_n: torch.SymInt,
-        size_k: torch.SymInt,
-        is_k_full: bool,
-        num_experts: int,
-        topk: int,
-        moe_block_size: int,
-        replicate_input: bool,
-        apply_weights: bool,
-    ) -> torch.Tensor:
-        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
-def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    ops.silu_and_mul(out, x)
-    return out
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",

 import torch
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",

build/{torch25-cxx11-cu118-x86_64-linux/moe/_moe_nskz7v224zllw.abi3.so → torch24-cxx11-cu118-x86_64-linux/moe/_moe_w3lspmuramohg.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e5defb7114c1ba9cfdb740230057cb0c5cb21efe628840771db32494a89b5aa7
 size 84165672

 version https://git-lfs.github.com/spec/v1
+oid sha256:2faeea044dbfd59eaf429d039ae368ed0c3e500817ac1acaefb3720ceca1f5ea
 size 84165672

build/torch24-cxx11-cu118-x86_64-linux/moe/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _moe_wtjc356yopxde
-ops = torch.ops._moe_wtjc356yopxde
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_wtjc356yopxde::{op_name}"

 import torch
+from . import _moe_w3lspmuramohg
+ops = torch.ops._moe_w3lspmuramohg
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_moe_w3lspmuramohg::{op_name}"

build/torch24-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py CHANGED Viewed

@@ -1,13 +1,25 @@
 """Fused MoE utilities for GPTQ."""
 import functools
-from typing import Any, Dict, Optional
 import torch
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
-from .scalar_type import scalar_types
-import moe as ops
 def get_scalar_type(num_bits: int, has_zp: bool):
@@ -116,7 +128,7 @@ def single_marlin_moe(
     scalar_type = get_scalar_type(num_bits, has_zero_point)
-    intermediate_cache = ops.ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
@@ -287,7 +299,7 @@ def fused_marlin_moe(
         dtype=hidden_states.dtype,
     )
-    intermediate_cache1 = ops.ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
@@ -312,7 +324,7 @@ def fused_marlin_moe(
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
-    intermediate_cache3 = ops.ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
@@ -336,3 +348,31 @@ def fused_marlin_moe(
     )
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)

 """Fused MoE utilities for GPTQ."""
 import functools
+from typing import TYPE_CHECKING, Any, Dict, Optional
 import torch
+from ._ops import add_op_namespace_prefix, ops
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
+from .scalar_type import ScalarType, scalar_types
+# neuron has torch version that doesn't even have impl_abstract
+if TYPE_CHECKING:
+    def register_fake(fn):
+        return lambda name: fn
+else:
+    try:
+        from torch.library import register_fake
+    except ImportError:
+        from torch.library import impl_abstract as register_fake
 def get_scalar_type(num_bits: int, has_zp: bool):
     scalar_type = get_scalar_type(num_bits, has_zero_point)
+    intermediate_cache = ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
         dtype=hidden_states.dtype,
     )
+    intermediate_cache1 = ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
+    intermediate_cache3 = ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
     )
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)
+if hasattr(ops, "marlin_gemm_moe"):
+    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
+    def marlin_gemm_moe_fake(
+        a: torch.Tensor,
+        b_q_weights: torch.Tensor,
+        sorted_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        b_scales: torch.Tensor,
+        b_zero_points: torch.Tensor,
+        g_idx: torch.Tensor,
+        perm: torch.Tensor,
+        workspace: torch.Tensor,
+        b_q_type: ScalarType,
+        size_m: torch.SymInt,
+        size_n: torch.SymInt,
+        size_k: torch.SymInt,
+        is_k_full: bool,
+        num_experts: int,
+        topk: int,
+        moe_block_size: int,
+        replicate_input: bool,
+        apply_weights: bool,
+    ) -> torch.Tensor:
+        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)

build/torch24-cxx11-cu118-x86_64-linux/moe/fused_moe.py CHANGED Viewed

@@ -9,9 +9,9 @@ import torch
 import triton
 import triton.language as tl
-from .platforms import current_platform
 from .fp8 import scaled_fp8_quant
-import moe as ops
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))

 import triton
 import triton.language as tl
+from ._ops import ops
 from .fp8 import scaled_fp8_quant
+from .platforms import current_platform
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))

build/torch24-cxx11-cu121-x86_64-linux/moe/__init__.py CHANGED Viewed

@@ -1,19 +1,5 @@
-from typing import TYPE_CHECKING
 import torch
-# neuron has torch version that doesn't even have impl_abstract
-if TYPE_CHECKING:
-    def register_fake(fn):
-        return lambda name: fn
-else:
-    try:
-        from torch.library import register_fake
-    except ImportError:
-        from torch.library import impl_abstract as register_fake
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
@@ -91,39 +77,6 @@ def topk_softmax(
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
-if hasattr(ops, "marlin_gemm_moe"):
-    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
-    def marlin_gemm_moe_fake(
-        a: torch.Tensor,
-        b_q_weights: torch.Tensor,
-        sorted_ids: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        b_scales: torch.Tensor,
-        b_zero_points: torch.Tensor,
-        g_idx: torch.Tensor,
-        perm: torch.Tensor,
-        workspace: torch.Tensor,
-        b_q_type: ScalarType,
-        size_m: torch.SymInt,
-        size_n: torch.SymInt,
-        size_k: torch.SymInt,
-        is_k_full: bool,
-        num_experts: int,
-        topk: int,
-        moe_block_size: int,
-        replicate_input: bool,
-        apply_weights: bool,
-    ) -> torch.Tensor:
-        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
-def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    ops.silu_and_mul(out, x)
-    return out
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",

 import torch
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",

build/{torch25-cxx11-cu121-x86_64-linux/moe/_moe_t32bhzwhzero6.abi3.so → torch24-cxx11-cu121-x86_64-linux/moe/_moe_xztwj3vfii47s.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8094d225249868d1f1c0abbfe8db3a486a99bd1f0928705e7dd5a998f125d8bf
 size 84364504

 version https://git-lfs.github.com/spec/v1
+oid sha256:9c5d4bd811ee24dd293d42959e6d23d66dddcc186b2ede701ebcbf6d66705fe1
 size 84364504

build/torch24-cxx11-cu121-x86_64-linux/moe/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _moe_fidhfyl4jgbje
-ops = torch.ops._moe_fidhfyl4jgbje
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_fidhfyl4jgbje::{op_name}"

 import torch
+from . import _moe_xztwj3vfii47s
+ops = torch.ops._moe_xztwj3vfii47s
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_moe_xztwj3vfii47s::{op_name}"

build/torch24-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py CHANGED Viewed

@@ -1,13 +1,25 @@
 """Fused MoE utilities for GPTQ."""
 import functools
-from typing import Any, Dict, Optional
 import torch
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
-from .scalar_type import scalar_types
-import moe as ops
 def get_scalar_type(num_bits: int, has_zp: bool):
@@ -116,7 +128,7 @@ def single_marlin_moe(
     scalar_type = get_scalar_type(num_bits, has_zero_point)
-    intermediate_cache = ops.ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
@@ -287,7 +299,7 @@ def fused_marlin_moe(
         dtype=hidden_states.dtype,
     )
-    intermediate_cache1 = ops.ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
@@ -312,7 +324,7 @@ def fused_marlin_moe(
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
-    intermediate_cache3 = ops.ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
@@ -336,3 +348,31 @@ def fused_marlin_moe(
     )
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)

 """Fused MoE utilities for GPTQ."""
 import functools
+from typing import TYPE_CHECKING, Any, Dict, Optional
 import torch
+from ._ops import add_op_namespace_prefix, ops
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
+from .scalar_type import ScalarType, scalar_types
+# neuron has torch version that doesn't even have impl_abstract
+if TYPE_CHECKING:
+    def register_fake(fn):
+        return lambda name: fn
+else:
+    try:
+        from torch.library import register_fake
+    except ImportError:
+        from torch.library import impl_abstract as register_fake
 def get_scalar_type(num_bits: int, has_zp: bool):
     scalar_type = get_scalar_type(num_bits, has_zero_point)
+    intermediate_cache = ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
         dtype=hidden_states.dtype,
     )
+    intermediate_cache1 = ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
+    intermediate_cache3 = ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
     )
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)
+if hasattr(ops, "marlin_gemm_moe"):
+    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
+    def marlin_gemm_moe_fake(
+        a: torch.Tensor,
+        b_q_weights: torch.Tensor,
+        sorted_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        b_scales: torch.Tensor,
+        b_zero_points: torch.Tensor,
+        g_idx: torch.Tensor,
+        perm: torch.Tensor,
+        workspace: torch.Tensor,
+        b_q_type: ScalarType,
+        size_m: torch.SymInt,
+        size_n: torch.SymInt,
+        size_k: torch.SymInt,
+        is_k_full: bool,
+        num_experts: int,
+        topk: int,
+        moe_block_size: int,
+        replicate_input: bool,
+        apply_weights: bool,
+    ) -> torch.Tensor:
+        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)

build/torch24-cxx11-cu121-x86_64-linux/moe/fused_moe.py CHANGED Viewed

@@ -9,9 +9,9 @@ import torch
 import triton
 import triton.language as tl
-from .platforms import current_platform
 from .fp8 import scaled_fp8_quant
-import moe as ops
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))

 import triton
 import triton.language as tl
+from ._ops import ops
 from .fp8 import scaled_fp8_quant
+from .platforms import current_platform
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))

build/torch24-cxx11-cu124-x86_64-linux/moe/__init__.py CHANGED Viewed

@@ -1,19 +1,5 @@
-from typing import TYPE_CHECKING
 import torch
-# neuron has torch version that doesn't even have impl_abstract
-if TYPE_CHECKING:
-    def register_fake(fn):
-        return lambda name: fn
-else:
-    try:
-        from torch.library import register_fake
-    except ImportError:
-        from torch.library import impl_abstract as register_fake
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
@@ -91,39 +77,6 @@ def topk_softmax(
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
-if hasattr(ops, "marlin_gemm_moe"):
-    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
-    def marlin_gemm_moe_fake(
-        a: torch.Tensor,
-        b_q_weights: torch.Tensor,
-        sorted_ids: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        b_scales: torch.Tensor,
-        b_zero_points: torch.Tensor,
-        g_idx: torch.Tensor,
-        perm: torch.Tensor,
-        workspace: torch.Tensor,
-        b_q_type: ScalarType,
-        size_m: torch.SymInt,
-        size_n: torch.SymInt,
-        size_k: torch.SymInt,
-        is_k_full: bool,
-        num_experts: int,
-        topk: int,
-        moe_block_size: int,
-        replicate_input: bool,
-        apply_weights: bool,
-    ) -> torch.Tensor:
-        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
-def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    ops.silu_and_mul(out, x)
-    return out
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",

 import torch
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",

build/{torch25-cxx11-cu124-x86_64-linux/moe/_moe_pgljmg5ek5k4e.abi3.so → torch24-cxx11-cu124-x86_64-linux/moe/_moe_zjfwjryvbxcss.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:23f0aec499051a34ed7ba7ac4e58d7d84c5501b8beb1794d6ae8c13f54b08b9e
 size 84063160

 version https://git-lfs.github.com/spec/v1
+oid sha256:a8e33340a0b05f5776c1e5ef66e371b2c198dc00c03c810e2c4ef20923d7a417
 size 84063160

build/torch24-cxx11-cu124-x86_64-linux/moe/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _moe_sg5gu4g3brle6
-ops = torch.ops._moe_sg5gu4g3brle6
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_sg5gu4g3brle6::{op_name}"

 import torch
+from . import _moe_zjfwjryvbxcss
+ops = torch.ops._moe_zjfwjryvbxcss
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_moe_zjfwjryvbxcss::{op_name}"

build/torch24-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py CHANGED Viewed

@@ -1,13 +1,25 @@
 """Fused MoE utilities for GPTQ."""
 import functools
-from typing import Any, Dict, Optional
 import torch
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
-from .scalar_type import scalar_types
-import moe as ops
 def get_scalar_type(num_bits: int, has_zp: bool):
@@ -116,7 +128,7 @@ def single_marlin_moe(
     scalar_type = get_scalar_type(num_bits, has_zero_point)
-    intermediate_cache = ops.ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
@@ -287,7 +299,7 @@ def fused_marlin_moe(
         dtype=hidden_states.dtype,
     )
-    intermediate_cache1 = ops.ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
@@ -312,7 +324,7 @@ def fused_marlin_moe(
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
-    intermediate_cache3 = ops.ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
@@ -336,3 +348,31 @@ def fused_marlin_moe(
     )
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)

 """Fused MoE utilities for GPTQ."""
 import functools
+from typing import TYPE_CHECKING, Any, Dict, Optional
 import torch
+from ._ops import add_op_namespace_prefix, ops
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
+from .scalar_type import ScalarType, scalar_types
+# neuron has torch version that doesn't even have impl_abstract
+if TYPE_CHECKING:
+    def register_fake(fn):
+        return lambda name: fn
+else:
+    try:
+        from torch.library import register_fake
+    except ImportError:
+        from torch.library import impl_abstract as register_fake
 def get_scalar_type(num_bits: int, has_zp: bool):
     scalar_type = get_scalar_type(num_bits, has_zero_point)
+    intermediate_cache = ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
         dtype=hidden_states.dtype,
     )
+    intermediate_cache1 = ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
+    intermediate_cache3 = ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
     )
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)
+if hasattr(ops, "marlin_gemm_moe"):
+    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
+    def marlin_gemm_moe_fake(
+        a: torch.Tensor,
+        b_q_weights: torch.Tensor,
+        sorted_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        b_scales: torch.Tensor,
+        b_zero_points: torch.Tensor,
+        g_idx: torch.Tensor,
+        perm: torch.Tensor,
+        workspace: torch.Tensor,
+        b_q_type: ScalarType,
+        size_m: torch.SymInt,
+        size_n: torch.SymInt,
+        size_k: torch.SymInt,
+        is_k_full: bool,
+        num_experts: int,
+        topk: int,
+        moe_block_size: int,
+        replicate_input: bool,
+        apply_weights: bool,
+    ) -> torch.Tensor:
+        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)

build/torch24-cxx11-cu124-x86_64-linux/moe/fused_moe.py CHANGED Viewed

@@ -9,9 +9,9 @@ import torch
 import triton
 import triton.language as tl
-from .platforms import current_platform
 from .fp8 import scaled_fp8_quant
-import moe as ops
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))

 import triton
 import triton.language as tl
+from ._ops import ops
 from .fp8 import scaled_fp8_quant
+from .platforms import current_platform
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))

build/torch24-cxx98-cu118-x86_64-linux/moe/__init__.py CHANGED Viewed

@@ -1,19 +1,5 @@
-from typing import TYPE_CHECKING
 import torch
-# neuron has torch version that doesn't even have impl_abstract
-if TYPE_CHECKING:
-    def register_fake(fn):
-        return lambda name: fn
-else:
-    try:
-        from torch.library import register_fake
-    except ImportError:
-        from torch.library import impl_abstract as register_fake
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
@@ -91,39 +77,6 @@ def topk_softmax(
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
-if hasattr(ops, "marlin_gemm_moe"):
-    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
-    def marlin_gemm_moe_fake(
-        a: torch.Tensor,
-        b_q_weights: torch.Tensor,
-        sorted_ids: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        b_scales: torch.Tensor,
-        b_zero_points: torch.Tensor,
-        g_idx: torch.Tensor,
-        perm: torch.Tensor,
-        workspace: torch.Tensor,
-        b_q_type: ScalarType,
-        size_m: torch.SymInt,
-        size_n: torch.SymInt,
-        size_k: torch.SymInt,
-        is_k_full: bool,
-        num_experts: int,
-        topk: int,
-        moe_block_size: int,
-        replicate_input: bool,
-        apply_weights: bool,
-    ) -> torch.Tensor:
-        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
-def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    ops.silu_and_mul(out, x)
-    return out
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",

 import torch
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",

build/{torch24-cxx11-cu118-x86_64-linux/moe/_moe_wtjc356yopxde.abi3.so → torch24-cxx98-cu118-x86_64-linux/moe/_moe_vjujc4o4hplak.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6330aa66b63067a8c9c031419773dc47e8853a717ef20b03c57df76660188831
-size 84165640

 version https://git-lfs.github.com/spec/v1
+oid sha256:0aea1e40159b3d8ca879344b36d6c3229d764baf9553b1bef2a04460f1f03f31
+size 84157888

build/torch24-cxx98-cu118-x86_64-linux/moe/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _moe_v3wdnwni3a5ce
-ops = torch.ops._moe_v3wdnwni3a5ce
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_v3wdnwni3a5ce::{op_name}"

 import torch
+from . import _moe_vjujc4o4hplak
+ops = torch.ops._moe_vjujc4o4hplak
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_moe_vjujc4o4hplak::{op_name}"

build/torch24-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py CHANGED Viewed

@@ -1,13 +1,25 @@
 """Fused MoE utilities for GPTQ."""
 import functools
-from typing import Any, Dict, Optional
 import torch
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
-from .scalar_type import scalar_types
-import moe as ops
 def get_scalar_type(num_bits: int, has_zp: bool):
@@ -116,7 +128,7 @@ def single_marlin_moe(
     scalar_type = get_scalar_type(num_bits, has_zero_point)
-    intermediate_cache = ops.ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
@@ -287,7 +299,7 @@ def fused_marlin_moe(
         dtype=hidden_states.dtype,
     )
-    intermediate_cache1 = ops.ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
@@ -312,7 +324,7 @@ def fused_marlin_moe(
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
-    intermediate_cache3 = ops.ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
@@ -336,3 +348,31 @@ def fused_marlin_moe(
     )
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)

 """Fused MoE utilities for GPTQ."""
 import functools
+from typing import TYPE_CHECKING, Any, Dict, Optional
 import torch
+from ._ops import add_op_namespace_prefix, ops
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
+from .scalar_type import ScalarType, scalar_types
+# neuron has torch version that doesn't even have impl_abstract
+if TYPE_CHECKING:
+    def register_fake(fn):
+        return lambda name: fn
+else:
+    try:
+        from torch.library import register_fake
+    except ImportError:
+        from torch.library import impl_abstract as register_fake
 def get_scalar_type(num_bits: int, has_zp: bool):
     scalar_type = get_scalar_type(num_bits, has_zero_point)
+    intermediate_cache = ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
         dtype=hidden_states.dtype,
     )
+    intermediate_cache1 = ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
+    intermediate_cache3 = ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
     )
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)
+if hasattr(ops, "marlin_gemm_moe"):
+    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
+    def marlin_gemm_moe_fake(
+        a: torch.Tensor,
+        b_q_weights: torch.Tensor,
+        sorted_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        b_scales: torch.Tensor,
+        b_zero_points: torch.Tensor,
+        g_idx: torch.Tensor,
+        perm: torch.Tensor,
+        workspace: torch.Tensor,
+        b_q_type: ScalarType,
+        size_m: torch.SymInt,
+        size_n: torch.SymInt,
+        size_k: torch.SymInt,
+        is_k_full: bool,
+        num_experts: int,
+        topk: int,
+        moe_block_size: int,
+        replicate_input: bool,
+        apply_weights: bool,
+    ) -> torch.Tensor:
+        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)

build/torch24-cxx98-cu118-x86_64-linux/moe/fused_moe.py CHANGED Viewed

@@ -9,9 +9,9 @@ import torch
 import triton
 import triton.language as tl
-from .platforms import current_platform
 from .fp8 import scaled_fp8_quant
-import moe as ops
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))

 import triton
 import triton.language as tl
+from ._ops import ops
 from .fp8 import scaled_fp8_quant
+from .platforms import current_platform
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))

build/torch24-cxx98-cu121-x86_64-linux/moe/__init__.py CHANGED Viewed

@@ -1,19 +1,5 @@
-from typing import TYPE_CHECKING
 import torch
-# neuron has torch version that doesn't even have impl_abstract
-if TYPE_CHECKING:
-    def register_fake(fn):
-        return lambda name: fn
-else:
-    try:
-        from torch.library import register_fake
-    except ImportError:
-        from torch.library import impl_abstract as register_fake
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
@@ -91,39 +77,6 @@ def topk_softmax(
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
-if hasattr(ops, "marlin_gemm_moe"):
-    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
-    def marlin_gemm_moe_fake(
-        a: torch.Tensor,
-        b_q_weights: torch.Tensor,
-        sorted_ids: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        b_scales: torch.Tensor,
-        b_zero_points: torch.Tensor,
-        g_idx: torch.Tensor,
-        perm: torch.Tensor,
-        workspace: torch.Tensor,
-        b_q_type: ScalarType,
-        size_m: torch.SymInt,
-        size_n: torch.SymInt,
-        size_k: torch.SymInt,
-        is_k_full: bool,
-        num_experts: int,
-        topk: int,
-        moe_block_size: int,
-        replicate_input: bool,
-        apply_weights: bool,
-    ) -> torch.Tensor:
-        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
-def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    ops.silu_and_mul(out, x)
-    return out
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",

 import torch
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",

build/{torch25-cxx98-cu121-x86_64-linux/moe/_moe_plblvprmwqffy.abi3.so → torch24-cxx98-cu121-x86_64-linux/moe/_moe_bjua6v5mj6njy.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:704adc83ab06534f1af22b829003765b42c118df3790569b346ef36e7be570de
 size 84360960

 version https://git-lfs.github.com/spec/v1
+oid sha256:71767ce941c8fb0e823c11cdebb01bfd77f2250df2873b862473803072276bf4
 size 84360960

build/torch24-cxx98-cu121-x86_64-linux/moe/_moe_hrq7opevcb4ug.abi3.so DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0d1b063e4c52f5d744025e000fd79c5f41cdf56a32883c2d269b9c59f586c9e4
-size 84360992

build/torch24-cxx98-cu121-x86_64-linux/moe/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _moe_hrq7opevcb4ug
-ops = torch.ops._moe_hrq7opevcb4ug
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_hrq7opevcb4ug::{op_name}"

 import torch
+from . import _moe_bjua6v5mj6njy
+ops = torch.ops._moe_bjua6v5mj6njy
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_moe_bjua6v5mj6njy::{op_name}"

build/torch24-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py CHANGED Viewed

@@ -1,13 +1,25 @@
 """Fused MoE utilities for GPTQ."""
 import functools
-from typing import Any, Dict, Optional
 import torch
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
-from .scalar_type import scalar_types
-import moe as ops
 def get_scalar_type(num_bits: int, has_zp: bool):
@@ -116,7 +128,7 @@ def single_marlin_moe(
     scalar_type = get_scalar_type(num_bits, has_zero_point)
-    intermediate_cache = ops.ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
@@ -287,7 +299,7 @@ def fused_marlin_moe(
         dtype=hidden_states.dtype,
     )
-    intermediate_cache1 = ops.ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
@@ -312,7 +324,7 @@ def fused_marlin_moe(
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
-    intermediate_cache3 = ops.ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
@@ -336,3 +348,31 @@ def fused_marlin_moe(
     )
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)

 """Fused MoE utilities for GPTQ."""
 import functools
+from typing import TYPE_CHECKING, Any, Dict, Optional
 import torch
+from ._ops import add_op_namespace_prefix, ops
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
+from .scalar_type import ScalarType, scalar_types
+# neuron has torch version that doesn't even have impl_abstract
+if TYPE_CHECKING:
+    def register_fake(fn):
+        return lambda name: fn
+else:
+    try:
+        from torch.library import register_fake
+    except ImportError:
+        from torch.library import impl_abstract as register_fake
 def get_scalar_type(num_bits: int, has_zp: bool):
     scalar_type = get_scalar_type(num_bits, has_zero_point)
+    intermediate_cache = ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
         dtype=hidden_states.dtype,
     )
+    intermediate_cache1 = ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
+    intermediate_cache3 = ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
     )
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)
+if hasattr(ops, "marlin_gemm_moe"):
+    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
+    def marlin_gemm_moe_fake(
+        a: torch.Tensor,
+        b_q_weights: torch.Tensor,
+        sorted_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        b_scales: torch.Tensor,
+        b_zero_points: torch.Tensor,
+        g_idx: torch.Tensor,
+        perm: torch.Tensor,
+        workspace: torch.Tensor,
+        b_q_type: ScalarType,
+        size_m: torch.SymInt,
+        size_n: torch.SymInt,
+        size_k: torch.SymInt,
+        is_k_full: bool,
+        num_experts: int,
+        topk: int,
+        moe_block_size: int,
+        replicate_input: bool,
+        apply_weights: bool,
+    ) -> torch.Tensor:
+        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)

build/torch24-cxx98-cu121-x86_64-linux/moe/fused_moe.py CHANGED Viewed

@@ -9,9 +9,9 @@ import torch
 import triton
 import triton.language as tl
-from .platforms import current_platform
 from .fp8 import scaled_fp8_quant
-import moe as ops
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))

 import triton
 import triton.language as tl
+from ._ops import ops
 from .fp8 import scaled_fp8_quant
+from .platforms import current_platform
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))

build/torch24-cxx98-cu124-x86_64-linux/moe/__init__.py CHANGED Viewed

@@ -1,19 +1,5 @@
-from typing import TYPE_CHECKING
 import torch
-# neuron has torch version that doesn't even have impl_abstract
-if TYPE_CHECKING:
-    def register_fake(fn):
-        return lambda name: fn
-else:
-    try:
-        from torch.library import register_fake
-    except ImportError:
-        from torch.library import impl_abstract as register_fake
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
@@ -91,39 +77,6 @@ def topk_softmax(
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
-if hasattr(ops, "marlin_gemm_moe"):
-    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
-    def marlin_gemm_moe_fake(
-        a: torch.Tensor,
-        b_q_weights: torch.Tensor,
-        sorted_ids: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        b_scales: torch.Tensor,
-        b_zero_points: torch.Tensor,
-        g_idx: torch.Tensor,
-        perm: torch.Tensor,
-        workspace: torch.Tensor,
-        b_q_type: ScalarType,
-        size_m: torch.SymInt,
-        size_n: torch.SymInt,
-        size_k: torch.SymInt,
-        is_k_full: bool,
-        num_experts: int,
-        topk: int,
-        moe_block_size: int,
-        replicate_input: bool,
-        apply_weights: bool,
-    ) -> torch.Tensor:
-        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
-def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    ops.silu_and_mul(out, x)
-    return out
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",

 import torch
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",

build/{torch25-cxx98-cu124-x86_64-linux/moe/_moe_k6bmwmtgkqymw.abi3.so → torch24-cxx98-cu124-x86_64-linux/moe/_moe_ajhcvhc2njy6q.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:259f926d53dc10e91ef41311f61bcea93fbdbda94758fdca164b37256f9c69de
 size 84059616

 version https://git-lfs.github.com/spec/v1
+oid sha256:38256704ec3f4ad93da175dff5054670c8e9db26b5573579d80331af6f271373
 size 84059616

build/torch24-cxx98-cu124-x86_64-linux/moe/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _moe_p3swbnotpexcc
-ops = torch.ops._moe_p3swbnotpexcc
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_p3swbnotpexcc::{op_name}"

 import torch
+from . import _moe_ajhcvhc2njy6q
+ops = torch.ops._moe_ajhcvhc2njy6q
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_moe_ajhcvhc2njy6q::{op_name}"

build/torch24-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py CHANGED Viewed

@@ -1,13 +1,25 @@
 """Fused MoE utilities for GPTQ."""
 import functools
-from typing import Any, Dict, Optional
 import torch
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
-from .scalar_type import scalar_types
-import moe as ops
 def get_scalar_type(num_bits: int, has_zp: bool):
@@ -116,7 +128,7 @@ def single_marlin_moe(
     scalar_type = get_scalar_type(num_bits, has_zero_point)
-    intermediate_cache = ops.ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
@@ -287,7 +299,7 @@ def fused_marlin_moe(
         dtype=hidden_states.dtype,
     )
-    intermediate_cache1 = ops.ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
@@ -312,7 +324,7 @@ def fused_marlin_moe(
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
-    intermediate_cache3 = ops.ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
@@ -336,3 +348,31 @@ def fused_marlin_moe(
     )
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)

 """Fused MoE utilities for GPTQ."""
 import functools
+from typing import TYPE_CHECKING, Any, Dict, Optional
 import torch
+from ._ops import add_op_namespace_prefix, ops
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
+from .scalar_type import ScalarType, scalar_types
+# neuron has torch version that doesn't even have impl_abstract
+if TYPE_CHECKING:
+    def register_fake(fn):
+        return lambda name: fn
+else:
+    try:
+        from torch.library import register_fake
+    except ImportError:
+        from torch.library import impl_abstract as register_fake
 def get_scalar_type(num_bits: int, has_zp: bool):
     scalar_type = get_scalar_type(num_bits, has_zero_point)
+    intermediate_cache = ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
         dtype=hidden_states.dtype,
     )
+    intermediate_cache1 = ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
+    intermediate_cache3 = ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
     )
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)
+if hasattr(ops, "marlin_gemm_moe"):
+    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
+    def marlin_gemm_moe_fake(
+        a: torch.Tensor,
+        b_q_weights: torch.Tensor,
+        sorted_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        b_scales: torch.Tensor,
+        b_zero_points: torch.Tensor,
+        g_idx: torch.Tensor,
+        perm: torch.Tensor,
+        workspace: torch.Tensor,
+        b_q_type: ScalarType,
+        size_m: torch.SymInt,
+        size_n: torch.SymInt,
+        size_k: torch.SymInt,
+        is_k_full: bool,
+        num_experts: int,
+        topk: int,
+        moe_block_size: int,
+        replicate_input: bool,
+        apply_weights: bool,
+    ) -> torch.Tensor:
+        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)

build/torch24-cxx98-cu124-x86_64-linux/moe/fused_moe.py CHANGED Viewed

@@ -9,9 +9,9 @@ import torch
 import triton
 import triton.language as tl
-from .platforms import current_platform
 from .fp8 import scaled_fp8_quant
-import moe as ops
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))

 import triton
 import triton.language as tl
+from ._ops import ops
 from .fp8 import scaled_fp8_quant
+from .platforms import current_platform
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))

build/torch25-cxx11-cu118-x86_64-linux/moe/__init__.py CHANGED Viewed

@@ -1,19 +1,5 @@
-from typing import TYPE_CHECKING
 import torch
-# neuron has torch version that doesn't even have impl_abstract
-if TYPE_CHECKING:
-    def register_fake(fn):
-        return lambda name: fn
-else:
-    try:
-        from torch.library import register_fake
-    except ImportError:
-        from torch.library import impl_abstract as register_fake
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
@@ -91,39 +77,6 @@ def topk_softmax(
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
-if hasattr(ops, "marlin_gemm_moe"):
-    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
-    def marlin_gemm_moe_fake(
-        a: torch.Tensor,
-        b_q_weights: torch.Tensor,
-        sorted_ids: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        b_scales: torch.Tensor,
-        b_zero_points: torch.Tensor,
-        g_idx: torch.Tensor,
-        perm: torch.Tensor,
-        workspace: torch.Tensor,
-        b_q_type: ScalarType,
-        size_m: torch.SymInt,
-        size_n: torch.SymInt,
-        size_k: torch.SymInt,
-        is_k_full: bool,
-        num_experts: int,
-        topk: int,
-        moe_block_size: int,
-        replicate_input: bool,
-        apply_weights: bool,
-    ) -> torch.Tensor:
-        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
-def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    ops.silu_and_mul(out, x)
-    return out
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",

 import torch
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",

build/{torch24-cxx11-cu121-x86_64-linux/moe/_moe_fidhfyl4jgbje.abi3.so → torch25-cxx11-cu118-x86_64-linux/moe/_moe_wbafjrt24mw7y.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b0ca4f733821a564c525a36bb13e35ae960dc1e20f6472b177f67b9b165597ff
-size 84364504

 version https://git-lfs.github.com/spec/v1
+oid sha256:eb03ab835bafe70c299a49cec39abf27f5b5d78715b16eed3527a683181df529
+size 84165672

build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _moe_nskz7v224zllw
-ops = torch.ops._moe_nskz7v224zllw
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_nskz7v224zllw::{op_name}"

 import torch
+from . import _moe_wbafjrt24mw7y
+ops = torch.ops._moe_wbafjrt24mw7y
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_moe_wbafjrt24mw7y::{op_name}"

build/torch25-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py CHANGED Viewed

@@ -1,13 +1,25 @@
 """Fused MoE utilities for GPTQ."""
 import functools
-from typing import Any, Dict, Optional
 import torch
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
-from .scalar_type import scalar_types
-import moe as ops
 def get_scalar_type(num_bits: int, has_zp: bool):
@@ -116,7 +128,7 @@ def single_marlin_moe(
     scalar_type = get_scalar_type(num_bits, has_zero_point)
-    intermediate_cache = ops.ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
@@ -287,7 +299,7 @@ def fused_marlin_moe(
         dtype=hidden_states.dtype,
     )
-    intermediate_cache1 = ops.ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
@@ -312,7 +324,7 @@ def fused_marlin_moe(
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
-    intermediate_cache3 = ops.ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
@@ -336,3 +348,31 @@ def fused_marlin_moe(
     )
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)

 """Fused MoE utilities for GPTQ."""
 import functools
+from typing import TYPE_CHECKING, Any, Dict, Optional
 import torch
+from ._ops import add_op_namespace_prefix, ops
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
+from .scalar_type import ScalarType, scalar_types
+# neuron has torch version that doesn't even have impl_abstract
+if TYPE_CHECKING:
+    def register_fake(fn):
+        return lambda name: fn
+else:
+    try:
+        from torch.library import register_fake
+    except ImportError:
+        from torch.library import impl_abstract as register_fake
 def get_scalar_type(num_bits: int, has_zp: bool):
     scalar_type = get_scalar_type(num_bits, has_zero_point)
+    intermediate_cache = ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
         dtype=hidden_states.dtype,
     )
+    intermediate_cache1 = ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
+    intermediate_cache3 = ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
     )
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)
+if hasattr(ops, "marlin_gemm_moe"):
+    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
+    def marlin_gemm_moe_fake(
+        a: torch.Tensor,
+        b_q_weights: torch.Tensor,
+        sorted_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        b_scales: torch.Tensor,
+        b_zero_points: torch.Tensor,
+        g_idx: torch.Tensor,
+        perm: torch.Tensor,
+        workspace: torch.Tensor,
+        b_q_type: ScalarType,
+        size_m: torch.SymInt,
+        size_n: torch.SymInt,
+        size_k: torch.SymInt,
+        is_k_full: bool,
+        num_experts: int,
+        topk: int,
+        moe_block_size: int,
+        replicate_input: bool,
+        apply_weights: bool,
+    ) -> torch.Tensor:
+        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)

build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py CHANGED Viewed

@@ -9,9 +9,9 @@ import torch
 import triton
 import triton.language as tl
-from .platforms import current_platform
 from .fp8 import scaled_fp8_quant
-import moe as ops
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))

 import triton
 import triton.language as tl
+from ._ops import ops
 from .fp8 import scaled_fp8_quant
+from .platforms import current_platform
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))

build/torch25-cxx11-cu121-x86_64-linux/moe/__init__.py CHANGED Viewed

@@ -1,19 +1,5 @@
-from typing import TYPE_CHECKING
 import torch
-# neuron has torch version that doesn't even have impl_abstract
-if TYPE_CHECKING:
-    def register_fake(fn):
-        return lambda name: fn
-else:
-    try:
-        from torch.library import register_fake
-    except ImportError:
-        from torch.library import impl_abstract as register_fake
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
@@ -91,39 +77,6 @@ def topk_softmax(
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
-if hasattr(ops, "marlin_gemm_moe"):
-    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
-    def marlin_gemm_moe_fake(
-        a: torch.Tensor,
-        b_q_weights: torch.Tensor,
-        sorted_ids: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        b_scales: torch.Tensor,
-        b_zero_points: torch.Tensor,
-        g_idx: torch.Tensor,
-        perm: torch.Tensor,
-        workspace: torch.Tensor,
-        b_q_type: ScalarType,
-        size_m: torch.SymInt,
-        size_n: torch.SymInt,
-        size_k: torch.SymInt,
-        is_k_full: bool,
-        num_experts: int,
-        topk: int,
-        moe_block_size: int,
-        replicate_input: bool,
-        apply_weights: bool,
-    ) -> torch.Tensor:
-        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
-def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    ops.silu_and_mul(out, x)
-    return out
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",

 import torch
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",

build/{torch24-cxx98-cu118-x86_64-linux/moe/_moe_v3wdnwni3a5ce.abi3.so → torch25-cxx11-cu121-x86_64-linux/moe/_moe_ezuwtpw27xv6u.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e83b7db92da1ee38a3a4e5a453d4279024e6af95efcf0ad4b34e275029e44729
-size 84157912

 version https://git-lfs.github.com/spec/v1
+oid sha256:378a8a453186ae62a92342077a988271cd7a02f46fbe303b4505d4484f1bfaef
+size 84364536

build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _moe_t32bhzwhzero6
-ops = torch.ops._moe_t32bhzwhzero6
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_t32bhzwhzero6::{op_name}"

 import torch
+from . import _moe_ezuwtpw27xv6u
+ops = torch.ops._moe_ezuwtpw27xv6u
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_moe_ezuwtpw27xv6u::{op_name}"

build/torch25-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py CHANGED Viewed

@@ -1,13 +1,25 @@
 """Fused MoE utilities for GPTQ."""
 import functools
-from typing import Any, Dict, Optional
 import torch
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
-from .scalar_type import scalar_types
-import moe as ops
 def get_scalar_type(num_bits: int, has_zp: bool):
@@ -116,7 +128,7 @@ def single_marlin_moe(
     scalar_type = get_scalar_type(num_bits, has_zero_point)
-    intermediate_cache = ops.ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
@@ -287,7 +299,7 @@ def fused_marlin_moe(
         dtype=hidden_states.dtype,
     )
-    intermediate_cache1 = ops.ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
@@ -312,7 +324,7 @@ def fused_marlin_moe(
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
-    intermediate_cache3 = ops.ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
@@ -336,3 +348,31 @@ def fused_marlin_moe(
     )
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)

 """Fused MoE utilities for GPTQ."""
 import functools
+from typing import TYPE_CHECKING, Any, Dict, Optional
 import torch
+from ._ops import add_op_namespace_prefix, ops
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
+from .scalar_type import ScalarType, scalar_types
+# neuron has torch version that doesn't even have impl_abstract
+if TYPE_CHECKING:
+    def register_fake(fn):
+        return lambda name: fn
+else:
+    try:
+        from torch.library import register_fake
+    except ImportError:
+        from torch.library import impl_abstract as register_fake
 def get_scalar_type(num_bits: int, has_zp: bool):
     scalar_type = get_scalar_type(num_bits, has_zero_point)
+    intermediate_cache = ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
         dtype=hidden_states.dtype,
     )
+    intermediate_cache1 = ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
+    intermediate_cache3 = ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
     )
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)
+if hasattr(ops, "marlin_gemm_moe"):
+    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
+    def marlin_gemm_moe_fake(
+        a: torch.Tensor,
+        b_q_weights: torch.Tensor,
+        sorted_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        b_scales: torch.Tensor,
+        b_zero_points: torch.Tensor,
+        g_idx: torch.Tensor,
+        perm: torch.Tensor,
+        workspace: torch.Tensor,
+        b_q_type: ScalarType,
+        size_m: torch.SymInt,
+        size_n: torch.SymInt,
+        size_k: torch.SymInt,
+        is_k_full: bool,
+        num_experts: int,
+        topk: int,
+        moe_block_size: int,
+        replicate_input: bool,
+        apply_weights: bool,
+    ) -> torch.Tensor:
+        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)

build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py CHANGED Viewed

@@ -9,9 +9,9 @@ import torch
 import triton
 import triton.language as tl
-from .platforms import current_platform
 from .fp8 import scaled_fp8_quant
-import moe as ops
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))

 import triton
 import triton.language as tl
+from ._ops import ops
 from .fp8 import scaled_fp8_quant
+from .platforms import current_platform
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))

build/torch25-cxx11-cu124-x86_64-linux/moe/__init__.py CHANGED Viewed

@@ -1,19 +1,5 @@
-from typing import TYPE_CHECKING
 import torch
-# neuron has torch version that doesn't even have impl_abstract
-if TYPE_CHECKING:
-    def register_fake(fn):
-        return lambda name: fn
-else:
-    try:
-        from torch.library import register_fake
-    except ImportError:
-        from torch.library import impl_abstract as register_fake
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
@@ -91,39 +77,6 @@ def topk_softmax(
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
-if hasattr(ops, "marlin_gemm_moe"):
-    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
-    def marlin_gemm_moe_fake(
-        a: torch.Tensor,
-        b_q_weights: torch.Tensor,
-        sorted_ids: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        b_scales: torch.Tensor,
-        b_zero_points: torch.Tensor,
-        g_idx: torch.Tensor,
-        perm: torch.Tensor,
-        workspace: torch.Tensor,
-        b_q_type: ScalarType,
-        size_m: torch.SymInt,
-        size_n: torch.SymInt,
-        size_k: torch.SymInt,
-        is_k_full: bool,
-        num_experts: int,
-        topk: int,
-        moe_block_size: int,
-        replicate_input: bool,
-        apply_weights: bool,
-    ) -> torch.Tensor:
-        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
-def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    ops.silu_and_mul(out, x)
-    return out
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",

 import torch
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",

build/{torch24-cxx11-cu124-x86_64-linux/moe/_moe_sg5gu4g3brle6.abi3.so → torch25-cxx11-cu124-x86_64-linux/moe/_moe_b3lelvb3xhtk2.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0864e745883f687c46c9ce743f1e2887113734c57268b9bc0e290185be28cf65
 size 84063128

 version https://git-lfs.github.com/spec/v1
+oid sha256:3ae1204c5e2f4c7692676e0ef703dbab4f20a9f14652c75dee41b8d56560db19
 size 84063128

build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _moe_pgljmg5ek5k4e
-ops = torch.ops._moe_pgljmg5ek5k4e
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_pgljmg5ek5k4e::{op_name}"

 import torch
+from . import _moe_b3lelvb3xhtk2
+ops = torch.ops._moe_b3lelvb3xhtk2
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_moe_b3lelvb3xhtk2::{op_name}"

build/torch25-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py CHANGED Viewed

@@ -1,13 +1,25 @@
 """Fused MoE utilities for GPTQ."""
 import functools
-from typing import Any, Dict, Optional
 import torch
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
-from .scalar_type import scalar_types
-import moe as ops
 def get_scalar_type(num_bits: int, has_zp: bool):
@@ -116,7 +128,7 @@ def single_marlin_moe(
     scalar_type = get_scalar_type(num_bits, has_zero_point)
-    intermediate_cache = ops.ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
@@ -287,7 +299,7 @@ def fused_marlin_moe(
         dtype=hidden_states.dtype,
     )
-    intermediate_cache1 = ops.ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
@@ -312,7 +324,7 @@ def fused_marlin_moe(
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
-    intermediate_cache3 = ops.ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
@@ -336,3 +348,31 @@ def fused_marlin_moe(
     )
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)

 """Fused MoE utilities for GPTQ."""
 import functools
+from typing import TYPE_CHECKING, Any, Dict, Optional
 import torch
+from ._ops import add_op_namespace_prefix, ops
 from .fused_moe import fused_topk, moe_align_block_size, try_get_optimal_moe_config
+from .scalar_type import ScalarType, scalar_types
+# neuron has torch version that doesn't even have impl_abstract
+if TYPE_CHECKING:
+    def register_fake(fn):
+        return lambda name: fn
+else:
+    try:
+        from torch.library import register_fake
+    except ImportError:
+        from torch.library import impl_abstract as register_fake
 def get_scalar_type(num_bits: int, has_zp: bool):
     scalar_type = get_scalar_type(num_bits, has_zero_point)
+    intermediate_cache = ops.marlin_gemm_moe(
         hidden_states,
         w,
         sorted_token_ids,
         dtype=hidden_states.dtype,
     )
+    intermediate_cache1 = ops.marlin_gemm_moe(
         hidden_states,
         w1,
         sorted_token_ids,
     ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
+    intermediate_cache3 = ops.marlin_gemm_moe(
         intermediate_cache2,
         w2,
         sorted_token_ids,
     )
     return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)
+if hasattr(ops, "marlin_gemm_moe"):
+    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
+    def marlin_gemm_moe_fake(
+        a: torch.Tensor,
+        b_q_weights: torch.Tensor,
+        sorted_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        b_scales: torch.Tensor,
+        b_zero_points: torch.Tensor,
+        g_idx: torch.Tensor,
+        perm: torch.Tensor,
+        workspace: torch.Tensor,
+        b_q_type: ScalarType,
+        size_m: torch.SymInt,
+        size_n: torch.SymInt,
+        size_k: torch.SymInt,
+        is_k_full: bool,
+        num_experts: int,
+        topk: int,
+        moe_block_size: int,
+        replicate_input: bool,
+        apply_weights: bool,
+    ) -> torch.Tensor:
+        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)

build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py CHANGED Viewed

@@ -9,9 +9,9 @@ import torch
 import triton
 import triton.language as tl
-from .platforms import current_platform
 from .fp8 import scaled_fp8_quant
-import moe as ops
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))

 import triton
 import triton.language as tl
+from ._ops import ops
 from .fp8 import scaled_fp8_quant
+from .platforms import current_platform
 VLLM_FUSED_MOE_CHUNK_SIZE = int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768"))

build/torch25-cxx98-cu118-x86_64-linux/moe/__init__.py CHANGED Viewed

@@ -1,19 +1,5 @@
-from typing import TYPE_CHECKING
 import torch
-# neuron has torch version that doesn't even have impl_abstract
-if TYPE_CHECKING:
-    def register_fake(fn):
-        return lambda name: fn
-else:
-    try:
-        from torch.library import register_fake
-    except ImportError:
-        from torch.library import impl_abstract as register_fake
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
@@ -91,39 +77,6 @@ def topk_softmax(
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
-if hasattr(ops, "marlin_gemm_moe"):
-    @register_fake(add_op_namespace_prefix("marlin_gemm_moe"))
-    def marlin_gemm_moe_fake(
-        a: torch.Tensor,
-        b_q_weights: torch.Tensor,
-        sorted_ids: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        b_scales: torch.Tensor,
-        b_zero_points: torch.Tensor,
-        g_idx: torch.Tensor,
-        perm: torch.Tensor,
-        workspace: torch.Tensor,
-        b_q_type: ScalarType,
-        size_m: torch.SymInt,
-        size_n: torch.SymInt,
-        size_k: torch.SymInt,
-        is_k_full: bool,
-        num_experts: int,
-        topk: int,
-        moe_block_size: int,
-        replicate_input: bool,
-        apply_weights: bool,
-    ) -> torch.Tensor:
-        return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device)
-def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    ops.silu_and_mul(out, x)
-    return out
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",

 import torch
 from ._ops import add_op_namespace_prefix, ops
 from .fused_marlin_moe import fused_marlin_moe
 from .fused_moe import fused_moe, fused_topk, grouped_topk
     ops.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
 __all__ = [
     "gptq_marlin_moe_repack",
     "awq_marlin_moe_repack",

build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_dtibz76vuxaaq.abi3.so DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b1eef7e6a15aca930caa813a845147beeec16159c8cce89891c40d080a6f3062
-size 84157880

build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_mqt4gjnisx6je.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9b8ebfaa74892fb13f34924a63e188b9593cc3290831bf31e0f78ae99c9526b0
+size 84157856

build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _moe_dtibz76vuxaaq
-ops = torch.ops._moe_dtibz76vuxaaq
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_dtibz76vuxaaq::{op_name}"

 import torch
+from . import _moe_mqt4gjnisx6je
+ops = torch.ops._moe_mqt4gjnisx6je
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_moe_mqt4gjnisx6je::{op_name}"