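"""Integration of hub kernels via the `kernels` package.

This module maps a set of layer names to pre-built kernel implementations hosted on the
Hugging Face Hub, and resolves `attn_implementation` strings of the form
"org/repo[@revision][:kernel_name]" into attention functions registered with transformers.
"""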
import re
from functools import partial
from typing import Optional, Union

from ..modeling_flash_attention_utils import lazy_import_flash_attention
from .flash_attention import flash_attention_forward


try:
    from kernels import (
        Device,
        LayerRepository,
        Mode,
        get_kernel,
        register_kernel_mapping,
        replace_kernel_forward_from_hub,
        use_kernel_forward_from_hub,
    )

    _kernels_available = True

    _KERNEL_MAPPING: dict[str, dict[Union[Device, str], LayerRepository]] = {
        "MultiScaleDeformableAttention": {
            "cuda": LayerRepository(
                repo_id="kernels-community/deformable-detr",
                layer_name="MultiScaleDeformableAttention",
            )
        },
        "Llama4TextMoe": {
            "cuda": LayerRepository(
                repo_id="kernels-community/moe",
                layer_name="Llama4TextMoe",
            )
        },
        "RMSNorm": {
            "cuda": LayerRepository(
                repo_id="kernels-community/liger_kernels",
                layer_name="LigerRMSNorm",
            ),
            "rocm": {
                Mode.INFERENCE: LayerRepository(
                    repo_id="kernels-community/liger_kernels",
                    layer_name="LigerRMSNorm",
                )
            },
        },
        "MLP": {
            "cuda": LayerRepository(
                repo_id="medmekk/triton-llama-mlp",
                layer_name="TritonLlamaMLP",
            )
        },
        "MegaBlocksMoeMLP": {
            "cuda": {
                Mode.TRAINING: LayerRepository(
                    repo_id="kernels-community/megablocks",
                    layer_name="MegaBlocksMoeMLP",
                ),
                Mode.INFERENCE: LayerRepository(
                    repo_id="kernels-community/megablocks",
                    layer_name="MegaBlocksMoeMLP",
                ),
            },
            "rocm": {
                Mode.INFERENCE: LayerRepository(
                    repo_id="ahadnagy/megablocks",
                    layer_name="MegaBlocksMoeMLP",
                )
            },
        },
        "FastGELU": {
            "cuda": {
                Mode.INFERENCE | Mode.TORCH_COMPILE: LayerRepository(
                    repo_id="kernels-community/activation",
                    layer_name="FastGELU",
                    version=">=0.0.4,<0.1.0",
                )
            }
        },
        "QuickGELU": {
            "cuda": {
                Mode.INFERENCE | Mode.TORCH_COMPILE: LayerRepository(
                    repo_id="kernels-community/activation",
                    layer_name="QuickGELU",
                    version=">=0.0.4,<0.1.0",
                )
            }
        },
        "NewGELU": {
            "cuda": {
                Mode.INFERENCE | Mode.TORCH_COMPILE: LayerRepository(
                    repo_id="kernels-community/activation",
                    layer_name="NewGELU",
                    version=">=0.0.4,<0.1.0",
                )
            }
        },
        "SiLU": {
            "cuda": {
                Mode.INFERENCE | Mode.TORCH_COMPILE: LayerRepository(
                    repo_id="kernels-community/activation", layer_name="Silu", version=">=0.1.0"
                )
            }
        },
        "GeLU": {
            "cuda": {
                Mode.INFERENCE | Mode.TORCH_COMPILE: LayerRepository(
                    repo_id="kernels-community/activation", layer_name="Gelu", version=">=0.1.0"
                )
            }
        },
        "GeluTanh": {
            "cuda": {
                Mode.INFERENCE | Mode.TORCH_COMPILE: LayerRepository(
                    repo_id="kernels-community/activation", layer_name="GeluTanh", version=">=0.1.0"
                )
            }
        },
    }

    register_kernel_mapping(_KERNEL_MAPPING)

except ImportError:
    _kernels_available = False

    # Stubs so that the public names still exist when `kernels` is not installed: the
    # decorator becomes a no-op and the other entry points fail with a clear message.
    def use_kernel_forward_from_hub(*args, **kwargs):
        def decorator(cls):
            return cls

        return decorator

    class LayerRepository:
        def __init__(self, *args, **kwargs):
            raise RuntimeError("LayerRepository requires `kernels` to be installed. Run `pip install kernels`.")

    def replace_kernel_forward_from_hub(*args, **kwargs):
        raise RuntimeError(
            "replace_kernel_forward_from_hub requires `kernels` to be installed. Run `pip install kernels`."
        )

    def register_kernel_mapping(*args, **kwargs):
        raise RuntimeError("register_kernel_mapping requires `kernels` to be installed. Run `pip install kernels`.")
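

# How the mapping above is typically consumed (sketch, not executed here): a module opts
# in by decorating its layer class with the key used in `_KERNEL_MAPPING`, and the hub
# forward is only swapped in when the model is kernelized by the `kernels` package (e.g.
# via its `kernelize` helper) for a matching device/mode. `MyRMSNorm` is a made-up name
# used purely for illustration.
#
#     @use_kernel_forward_from_hub("RMSNorm")
#     class MyRMSNorm(torch.nn.Module):
#         def forward(self, hidden_states):
#             ...  # eager implementation, used when no hub kernel is available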


def is_kernel(attn_implementation: Optional[str]) -> bool:
    """Check whether `attn_implementation` matches a kernel pattern from the hub."""
    return (
        attn_implementation is not None
        and re.search(r"^[^/:]+/[^/:]+(?:@[^/:]+)?(?::[^/:]+)?$", attn_implementation) is not None
    )
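

# A few illustrative inputs (not exhaustive): the pattern accepts "org/repo", optionally
# followed by "@revision" and/or ":kernel_name". "main" and "attn_fn" below are made-up
# placeholders.
#
#     is_kernel("kernels-community/flash-attn")              -> True
#     is_kernel("kernels-community/flash-attn@main:attn_fn") -> True
#     is_kernel("sdpa")                                       -> False  (no "org/repo" part)
#     is_kernel(None)                                         -> False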


def load_and_register_kernel(attn_implementation: str) -> None:
    """Load and register the kernel associated with `attn_implementation`."""
    if not is_kernel(attn_implementation):
        return
    if not _kernels_available:
        raise ImportError(
            "`kernels` is either not installed or uses an incompatible version. "
            "Please install the latest version with `pip install -U kernels`."
        )

    # Imported here to avoid a circular import with `modeling_utils`.
    from ..masking_utils import ALL_MASK_ATTENTION_FUNCTIONS
    from ..modeling_utils import ALL_ATTENTION_FUNCTIONS

    attention_wrapper = None
    actual_attn_name = attn_implementation
    # An optional transformers attention wrapper can be prepended with "|", e.g. "paged|org/repo".
    if "|" in attn_implementation:
        attention_wrapper, actual_attn_name = attn_implementation.split("|")
        attention_wrapper = ALL_ATTENTION_FUNCTIONS.get(attention_wrapper)

    # Split off an explicit kernel name, e.g. "org/repo:kernel_name".
    if ":" in actual_attn_name:
        repo_id, kernel_name = actual_attn_name.split(":")
        kernel_name = kernel_name.strip()
    else:
        repo_id = actual_attn_name
        kernel_name = None
    repo_id = repo_id.strip()

    # Split off an optional revision, e.g. "org/repo@revision".
    repo_id, _, rev = repo_id.partition("@")
    repo_id = repo_id.strip()
    rev = rev.strip() if rev else None

    # Load the kernel from the hub.
    try:
        kernel = get_kernel(repo_id, revision=rev)
    except Exception as e:
        raise ValueError(f"An error occurred while trying to load from '{repo_id}': {e}.")

    # Wrap the kernel so that it can be called as an attention function.
    if hasattr(kernel, "flash_attn_varlen_func"):
        if attention_wrapper is None:
            attention_wrapper = flash_attention_forward
        kernel_function = partial(attention_wrapper, implementation=kernel)
        lazy_import_flash_attention(kernel, force_import=True)
    elif kernel_name is not None:
        kernel_function = getattr(kernel, kernel_name)
    else:
        # Without this branch, `kernel_function` would be unbound below and raise a NameError.
        raise ValueError(
            f"Kernel '{repo_id}' does not expose a flash attention interface and no kernel name was "
            "given; use the 'org/repo:kernel_name' syntax to select a function from the kernel."
        )

    # Register the kernel as a valid attention implementation.
    ALL_ATTENTION_FUNCTIONS.register(attn_implementation, kernel_function)
    ALL_MASK_ATTENTION_FUNCTIONS.register(attn_implementation, ALL_MASK_ATTENTION_FUNCTIONS["flash_attention_2"])
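
# Accepted `attn_implementation` formats (sketch): "org/repo", "org/repo@revision",
# "org/repo:kernel_name", optionally prefixed with a transformers attention wrapper as
# "wrapper|org/repo". For instance, something along these lines is expected to resolve a
# flash-attention kernel from the hub ("main" is a placeholder revision), after which the
# same string can be passed as `attn_implementation` when loading a model:
#
#     load_and_register_kernel("kernels-community/flash-attn@main")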


__all__ = [
    "LayerRepository",
    "use_kernel_forward_from_hub",
    "register_kernel_mapping",
    "replace_kernel_forward_from_hub",
]