# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from functools import partial
from typing import Optional, Union
from ..modeling_flash_attention_utils import lazy_import_flash_attention
from .flash_attention import flash_attention_forward
try:
from kernels import (
Device,
LayerRepository,
Mode,
get_kernel,
register_kernel_mapping,
replace_kernel_forward_from_hub,
use_kernel_forward_from_hub,
)
_kernels_available = True
    _KERNEL_MAPPING: dict[str, dict[Union[Device, str], Union[LayerRepository, dict[Mode, LayerRepository]]]] = {
"MultiScaleDeformableAttention": {
"cuda": LayerRepository(
repo_id="kernels-community/deformable-detr",
layer_name="MultiScaleDeformableAttention",
)
},
"Llama4TextMoe": {
"cuda": LayerRepository(
                repo_id="kernels-community/moe",
layer_name="Llama4TextMoe",
)
},
"RMSNorm": {
"cuda": LayerRepository(
repo_id="kernels-community/liger_kernels",
layer_name="LigerRMSNorm",
# revision="pure-layer-test",
),
"rocm": {
Mode.INFERENCE: LayerRepository(
repo_id="kernels-community/liger_kernels",
layer_name="LigerRMSNorm",
# revision="pure-layer-test",
)
},
},
"MLP": {
"cuda": LayerRepository(
repo_id="medmekk/triton-llama-mlp",
layer_name="TritonLlamaMLP",
)
},
"MegaBlocksMoeMLP": {
"cuda": {
Mode.TRAINING: LayerRepository(
repo_id="kernels-community/megablocks",
layer_name="MegaBlocksMoeMLP",
),
Mode.INFERENCE: LayerRepository(
repo_id="kernels-community/megablocks",
layer_name="MegaBlocksMoeMLP",
),
},
"rocm": {
Mode.INFERENCE: LayerRepository(
repo_id="ahadnagy/megablocks",
layer_name="MegaBlocksMoeMLP",
)
},
},
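        # `Mode` flags can be combined with `|`: `Mode.INFERENCE | Mode.TORCH_COMPILE` below registers a kernel
        # for inference under `torch.compile`.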
"FastGELU": {
"cuda": {
Mode.INFERENCE | Mode.TORCH_COMPILE: LayerRepository(
repo_id="kernels-community/activation",
layer_name="FastGELU",
version=">=0.0.4,<0.1.0",
)
}
},
"QuickGELU": {
"cuda": {
Mode.INFERENCE | Mode.TORCH_COMPILE: LayerRepository(
repo_id="kernels-community/activation",
layer_name="QuickGELU",
version=">=0.0.4,<0.1.0",
)
}
},
"NewGELU": {
"cuda": {
Mode.INFERENCE | Mode.TORCH_COMPILE: LayerRepository(
repo_id="kernels-community/activation",
layer_name="NewGELU",
version=">=0.0.4,<0.1.0",
)
}
},
"SiLU": {
"cuda": {
Mode.INFERENCE | Mode.TORCH_COMPILE: LayerRepository(
repo_id="kernels-community/activation", layer_name="Silu", version=">=0.1.0"
)
}
},
"GeLU": {
"cuda": {
Mode.INFERENCE | Mode.TORCH_COMPILE: LayerRepository(
repo_id="kernels-community/activation", layer_name="Gelu", version=">=0.1.0"
)
}
},
"GeluTanh": {
"cuda": {
Mode.INFERENCE | Mode.TORCH_COMPILE: LayerRepository(
repo_id="kernels-community/activation", layer_name="GeluTanh", version=">=0.1.0"
)
}
},
}
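    # Registering this mapping tells `kernels` which hub repository provides the forward pass for each layer
    # name used with `@use_kernel_forward_from_hub(...)`. Downstream code can extend or override it with an
    # additional `register_kernel_mapping` call, e.g. (illustrative repo id only):
    #
    #     register_kernel_mapping(
    #         {"RMSNorm": {"cuda": LayerRepository(repo_id="my-org/my-rms-norm", layer_name="RMSNorm")}}
    #     )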
register_kernel_mapping(_KERNEL_MAPPING)
except ImportError:
_kernels_available = False
    # Stubs to make the decorators in transformers work when `kernels`
    # is not installed.
def use_kernel_forward_from_hub(*args, **kwargs):
def decorator(cls):
return cls
return decorator
class LayerRepository:
def __init__(self, *args, **kwargs):
raise RuntimeError("LayerRepository requires `kernels` to be installed. Run `pip install kernels`.")
def replace_kernel_forward_from_hub(*args, **kwargs):
raise RuntimeError(
"replace_kernel_forward_from_hub requires `kernels` to be installed. Run `pip install kernels`."
)
def register_kernel_mapping(*args, **kwargs):
raise RuntimeError("register_kernel_mapping requires `kernels` to be installed. Run `pip install kernels`.")
def is_kernel(attn_implementation: Optional[str]) -> bool:
"""Check whether `attn_implementation` matches a kernel pattern from the hub."""
return (
attn_implementation is not None
and re.search(r"^[^/:]+/[^/:]+(?:@[^/:]+)?(?::[^/:]+)?$", attn_implementation) is not None
)
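# Strings accepted by `is_kernel` follow the `repo_id[@revision][:kernel_name]` pattern, e.g. "org/repo",
# "org/repo@main", "org/repo:custom_attention" or "org/repo@main:custom_attention" (generic names here for
# illustration); built-in implementation names such as "sdpa" or "flash_attention_2" do not match.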
def load_and_register_kernel(attn_implementation: str) -> None:
    """Load and register the kernel associated with `attn_implementation`."""
if not is_kernel(attn_implementation):
return
if not _kernels_available:
raise ImportError(
"`kernels` is either not installed or uses an incompatible version. "
"Please install the latest version with `pip install -U kernels`."
)
    # These need to be imported here, as we would otherwise have a circular import in `modeling_utils`
from ..masking_utils import ALL_MASK_ATTENTION_FUNCTIONS
from ..modeling_utils import ALL_ATTENTION_FUNCTIONS
attention_wrapper = None
    # FIXME: @ArthurZucker this is dirty, did not want to do a lot of extra work
actual_attn_name = attn_implementation
if "|" in attn_implementation:
attention_wrapper, actual_attn_name = attn_implementation.split("|")
        # `transformers` has wrappers for sdpa, paged, flash, flex, etc.
attention_wrapper = ALL_ATTENTION_FUNCTIONS.get(attention_wrapper)
# Extract repo_id and kernel_name from the string
if ":" in actual_attn_name:
repo_id, kernel_name = actual_attn_name.split(":")
kernel_name = kernel_name.strip()
else:
repo_id = actual_attn_name
kernel_name = None
repo_id = repo_id.strip()
# extract the rev after the @ if it exists
repo_id, _, rev = repo_id.partition("@")
repo_id = repo_id.strip()
rev = rev.strip() if rev else None
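    # e.g. "org/repo@some-branch:custom_attention" is parsed into repo_id="org/repo", rev="some-branch" and
    # kernel_name="custom_attention"; a bare "org/repo" yields rev=None and kernel_name=None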
# Load the kernel from hub
try:
kernel = get_kernel(repo_id, revision=rev)
except Exception as e:
        raise ValueError(f"An error occurred while trying to load from '{repo_id}': {e}.") from e
# correctly wrap the kernel
if hasattr(kernel, "flash_attn_varlen_func"):
if attention_wrapper is None:
attention_wrapper = flash_attention_forward
kernel_function = partial(attention_wrapper, implementation=kernel)
lazy_import_flash_attention(kernel, force_import=True)
    elif kernel_name is not None:
        kernel_function = getattr(kernel, kernel_name)
    else:
        raise ValueError(
            f"Could not find a kernel function to register for '{repo_id}': the kernel does not provide "
            "`flash_attn_varlen_func` and no kernel name was specified (use the `repo_id:kernel_name` syntax)."
        )
    # Register the kernel as a valid attention implementation
    ALL_ATTENTION_FUNCTIONS.register(attn_implementation, kernel_function)
    ALL_MASK_ATTENTION_FUNCTIONS.register(attn_implementation, ALL_MASK_ATTENTION_FUNCTIONS["flash_attention_2"])
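# In practice this is reached when a hub kernel reference is passed as the attention implementation, e.g.
#   model = AutoModelForCausalLM.from_pretrained(checkpoint, attn_implementation="kernels-community/flash-attn")
# which loads the kernel from the hub and registers it under that same string in `ALL_ATTENTION_FUNCTIONS` and
# `ALL_MASK_ATTENTION_FUNCTIONS`.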
__all__ = [
"LayerRepository",
"use_kernel_forward_from_hub",
"register_kernel_mapping",
"replace_kernel_forward_from_hub",
]