harness / diffs /37449.patch
ArthurZ's picture
ArthurZ HF Staff
Initial harness: 100 perf tasks + Gradio browser
dfefe0b verified
diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile
index 1eb50ee4ad7f..72853d4ca4d6 100644
--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@@ -14,6 +14,8 @@ ARG PYTORCH='2.6.0'
ARG INTEL_TORCH_EXT='2.3.0'
# Example: `cu102`, `cu113`, etc.
ARG CUDA='cu121'
+# Disable kernel mapping for now until all tests pass
+ENV DISABLE_KERNEL_MAPPING=1
RUN apt update
RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs
diff --git a/src/transformers/models/aria/modeling_aria.py b/src/transformers/models/aria/modeling_aria.py
index d64e2746d492..fdb825cad370 100644
--- a/src/transformers/models/aria/modeling_aria.py
+++ b/src/transformers/models/aria/modeling_aria.py
@@ -228,7 +228,6 @@ def forward(self, key_value_states: torch.Tensor, attn_mask: Optional[torch.Tens
return out
-@use_kernel_forward_from_hub("MLP")
class AriaSharedExpertsMLP(nn.Module):
"""
Shared Expert MLP for shared experts.
diff --git a/src/transformers/models/bamba/modeling_bamba.py b/src/transformers/models/bamba/modeling_bamba.py
index 0cf23edb7510..8fd2483bcd60 100644
--- a/src/transformers/models/bamba/modeling_bamba.py
+++ b/src/transformers/models/bamba/modeling_bamba.py
@@ -882,7 +882,6 @@ def forward(
return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)
-@use_kernel_forward_from_hub("MLP")
class BambaMLP(nn.Module):
def __init__(self, config):
super().__init__()
diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py
index fd888c38d7fd..8cbb7128c734 100644
--- a/src/transformers/models/cohere/modeling_cohere.py
+++ b/src/transformers/models/cohere/modeling_cohere.py
@@ -36,7 +36,6 @@
from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, StaticCache
from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
@@ -118,7 +117,6 @@ def forward(self, x, position_ids):
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
-@use_kernel_forward_from_hub("MLP")
class CohereMLP(nn.Module):
def __init__(self, config):
super().__init__()
diff --git a/src/transformers/models/cohere2/modeling_cohere2.py b/src/transformers/models/cohere2/modeling_cohere2.py
index e419379969d3..18a3a50ac157 100644
--- a/src/transformers/models/cohere2/modeling_cohere2.py
+++ b/src/transformers/models/cohere2/modeling_cohere2.py
@@ -28,7 +28,6 @@
from ...activations import ACT2FN
from ...cache_utils import Cache, HybridCache, StaticCache
from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
@@ -268,7 +267,6 @@ def forward(
return attn_output, attn_weights
-@use_kernel_forward_from_hub("MLP")
class Cohere2MLP(nn.Module):
def __init__(self, config):
super().__init__()
diff --git a/src/transformers/models/diffllama/modeling_diffllama.py b/src/transformers/models/diffllama/modeling_diffllama.py
index ed536cbebaf3..e7fecb4be6a9 100644
--- a/src/transformers/models/diffllama/modeling_diffllama.py
+++ b/src/transformers/models/diffllama/modeling_diffllama.py
@@ -74,7 +74,6 @@
_CONFIG_FOR_DOC = "DiffLlamaConfig"
-@use_kernel_forward_from_hub("MLP")
class DiffLlamaMLP(nn.Module):
def __init__(self, config):
super().__init__()
diff --git a/src/transformers/models/emu3/modeling_emu3.py b/src/transformers/models/emu3/modeling_emu3.py
index 4646b9f9bdee..fcc55b67d153 100644
--- a/src/transformers/models/emu3/modeling_emu3.py
+++ b/src/transformers/models/emu3/modeling_emu3.py
@@ -84,7 +84,6 @@ def extra_repr(self):
return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
-@use_kernel_forward_from_hub("MLP")
class Emu3MLP(nn.Module):
def __init__(self, config):
super().__init__()
diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py
index 679bc0869855..40497433284a 100644
--- a/src/transformers/models/gemma/modeling_gemma.py
+++ b/src/transformers/models/gemma/modeling_gemma.py
@@ -27,7 +27,6 @@
from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, StaticCache
from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import (
@@ -85,7 +84,6 @@ def extra_repr(self):
return f"{tuple(self.weight.shape)}, eps={self.eps}"
-@use_kernel_forward_from_hub("MLP")
class GemmaMLP(nn.Module):
def __init__(self, config):
super().__init__()
diff --git a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py
index c7040de011ba..144a94ef33e9 100644
--- a/src/transformers/models/gemma2/modeling_gemma2.py
+++ b/src/transformers/models/gemma2/modeling_gemma2.py
@@ -28,7 +28,6 @@
from ...activations import ACT2FN
from ...cache_utils import Cache, HybridCache, StaticCache
from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import (
BaseModelOutputWithPast,
@@ -78,7 +77,6 @@ def extra_repr(self):
return f"{tuple(self.weight.shape)}, eps={self.eps}"
-@use_kernel_forward_from_hub("MLP")
class Gemma2MLP(nn.Module):
def __init__(self, config):
super().__init__()
diff --git a/src/transformers/models/gemma3/modeling_gemma3.py b/src/transformers/models/gemma3/modeling_gemma3.py
index 23f28281a1de..0988e2692aa4 100644
--- a/src/transformers/models/gemma3/modeling_gemma3.py
+++ b/src/transformers/models/gemma3/modeling_gemma3.py
@@ -31,7 +31,6 @@
from ...activations import ACT2FN
from ...cache_utils import Cache, HybridCache, StaticCache
from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, ModelOutput
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
@@ -107,7 +106,6 @@ def forward(self, input_ids: torch.Tensor):
return super().forward(input_ids) * self.embed_scale.to(self.weight.dtype)
-@use_kernel_forward_from_hub("MLP")
class Gemma3MLP(nn.Module):
def __init__(self, config: Gemma3TextConfig):
super().__init__()
diff --git a/src/transformers/models/granite/modeling_granite.py b/src/transformers/models/granite/modeling_granite.py
index 6f15f9ca095a..80d3ad696dc0 100644
--- a/src/transformers/models/granite/modeling_granite.py
+++ b/src/transformers/models/granite/modeling_granite.py
@@ -228,7 +228,6 @@ def extra_repr(self):
return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
-@use_kernel_forward_from_hub("MLP")
class GraniteMLP(nn.Module):
def __init__(self, config):
super().__init__()
diff --git a/src/transformers/models/helium/modeling_helium.py b/src/transformers/models/helium/modeling_helium.py
index 2597ce27fa94..d565af9e27f1 100644
--- a/src/transformers/models/helium/modeling_helium.py
+++ b/src/transformers/models/helium/modeling_helium.py
@@ -29,7 +29,6 @@
from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, StaticCache
from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import (
@@ -118,7 +117,6 @@ def forward(self, x, position_ids):
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
-@use_kernel_forward_from_hub("MLP")
class HeliumMLP(nn.Module):
def __init__(self, config):
super().__init__()
diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py
index e8dd13952661..d36fb1b6a47e 100644
--- a/src/transformers/models/llama/modeling_llama.py
+++ b/src/transformers/models/llama/modeling_llama.py
@@ -160,7 +160,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
return q_embed, k_embed
-@use_kernel_forward_from_hub("MLP")
class LlamaMLP(nn.Module):
def __init__(self, config):
super().__init__()
diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py
index 8f1b416d5b16..7f88b8d8570c 100644
--- a/src/transformers/models/mistral/modeling_mistral.py
+++ b/src/transformers/models/mistral/modeling_mistral.py
@@ -45,7 +45,6 @@
_CONFIG_FOR_DOC = "MistralConfig"
-@use_kernel_forward_from_hub("MLP")
class MistralMLP(nn.Module):
def __init__(self, config):
super().__init__()
diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py
index 8b8783d1ad8d..5b6ca9f4b356 100644
--- a/src/transformers/models/olmo/modeling_olmo.py
+++ b/src/transformers/models/olmo/modeling_olmo.py
@@ -14,7 +14,6 @@
from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, StaticCache
from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
@@ -58,7 +57,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
)
-@use_kernel_forward_from_hub("MLP")
class OlmoMLP(nn.Module):
def __init__(self, config):
super().__init__()
diff --git a/src/transformers/models/olmo2/modeling_olmo2.py b/src/transformers/models/olmo2/modeling_olmo2.py
index bcf990ccda60..4046dc582673 100644
--- a/src/transformers/models/olmo2/modeling_olmo2.py
+++ b/src/transformers/models/olmo2/modeling_olmo2.py
@@ -218,7 +218,6 @@ def forward(
return attn_output, attn_weights
-@use_kernel_forward_from_hub("MLP")
class Olmo2MLP(nn.Module):
def __init__(self, config):
super().__init__()
diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py
index d3180b35b3a4..7b62632bd8e4 100644
--- a/src/transformers/models/qwen2/modeling_qwen2.py
+++ b/src/transformers/models/qwen2/modeling_qwen2.py
@@ -45,7 +45,6 @@
_CONFIG_FOR_DOC = "Qwen2Config"
-@use_kernel_forward_from_hub("MLP")
class Qwen2MLP(nn.Module):
def __init__(self, config):
super().__init__()
diff --git a/src/transformers/models/qwen3/modeling_qwen3.py b/src/transformers/models/qwen3/modeling_qwen3.py
index 5852470d1c23..15773b4516ae 100644
--- a/src/transformers/models/qwen3/modeling_qwen3.py
+++ b/src/transformers/models/qwen3/modeling_qwen3.py
@@ -81,7 +81,6 @@ def extra_repr(self):
return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
-@use_kernel_forward_from_hub("MLP")
class Qwen3MLP(nn.Module):
def __init__(self, config):
super().__init__()