| |
| |
|
|
| |
|
|
| |
| |
| |
| |
|
|
| |
| |
| |
| |
|
|
| |
| |
|
|
| |
| |
|
|
|
|
| from __future__ import annotations |
|
|
|
|
| import torch |
| import torch.nn as nn |
| import os |
| from torch import Tensor |
| from functools import lru_cache |
| from itertools import product |
| from typing import Any, Sequence, Tuple, List |
| from pathlib import Path |
| from collections import OrderedDict |
| from transformers.tokenization_utils import PreTrainedTokenizer |
|
|
|
|
# Name of the plain-text vocabulary file (one token per line) used by
# `Tokenizer.save_vocabulary` and expected when loading from a checkpoint.
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
# Special tokens serialized in the Hugging Face `AddedToken`-dict format;
# consumed by `get_tokenizer_config(add_special_tokens=True)`.
SPECIAL_TOKENS_MAP = {
    "pad_token": {
        "content": "<pad>",
        "lstrip": False,
        "normalized": False,
        "rstrip": False,
        "single_word": False,
    },
    "cls_token": {
        "content": "<cls>",
        "lstrip": False,
        "normalized": False,
        "rstrip": False,
        "single_word": False,
    },
    "eos_token": {
        "content": "<eos>",
        "lstrip": False,
        "normalized": False,
        "rstrip": False,
        "single_word": False,
    },
    "unk_token": {
        "content": "<unk>",
        "lstrip": False,
        "normalized": False,
        "rstrip": False,
        "single_word": False,
    },
    "mask_token": {
        "content": "<mask>",
        "lstrip": False,
        "normalized": False,
        "rstrip": False,
        "single_word": False,
    },
    "null_token": {
        "content": "<null>",
        "lstrip": False,
        "normalized": False,
        "rstrip": False,
        "single_word": False,
    },
}


# Full RNA alphabet: the four nucleobases, N, the IUPAC degenerate codes
# (expansions listed in VOCAB_MAPPING below), plus '.', 'X', '*', '-', 'I'
# — presumably gap/stop/inosine-style markers; TODO confirm exact semantics.
STANDARD_ALPHABET = list("ACGUNRYSWKMBDHV.X*-I")


# Nucleobases + N + IUPAC degenerate codes + '.'.
IUPAC_ALPHABET = list("ACGUNRYSWKMBDHV.")


# Nucleobases + N only; default base alphabet for k-mer vocabularies.
STREAMLINE_ALPHABET = list("ACGUN")


# The four RNA nucleobases.
NUCLEOBASE_ALPHABET = list("ACGU")


# Names accepted by `get_alphabet` when `alphabet` is given as a string.
ALPHABETS = {
    "standard": STANDARD_ALPHABET,
    "iupac": IUPAC_ALPHABET,
    "streamline": STREAMLINE_ALPHABET,
    "nucleobase": NUCLEOBASE_ALPHABET,
}


# IUPAC degenerate code -> the set of nucleobases it may stand for.
VOCAB_MAPPING = {
    "R": "AG",
    "Y": "CU",
    "S": "CG",
    "W": "AU",
    "K": "GU",
    "M": "AC",
    "B": "CGU",
    "D": "AGU",
    "H": "ACU",
    "V": "ACG",
    "X": "ACGU",
}


# Template for the tokenizer_config.json returned by `get_tokenizer_config`.
TOKENIZER_CONFIG = {
    "tokenizer_class": "RnaTokenizer",
    "clean_up_tokenization_spaces": True,
}
|
|
|
|
def get_alphabet(alphabet: List[str] | str | None = None, nmers: int = 1, **kwargs) -> Alphabet:
    """Resolve ``alphabet`` (a name, a token list, or None) into an `Alphabet`.

    When ``alphabet`` is None, the standard alphabet is used for single-token
    vocabularies and the streamline alphabet for k-mer vocabularies (to keep
    the k-mer vocabulary size manageable).
    """
    if isinstance(alphabet, str):
        tokens = ALPHABETS[alphabet]
    elif alphabet is None:
        tokens = STANDARD_ALPHABET if nmers <= 1 else STREAMLINE_ALPHABET
    else:
        tokens = alphabet
    return Alphabet(tokens, nmers=nmers, **kwargs)
|
|
|
|
def get_vocab_mapping():
    """Return the table expanding IUPAC degenerate codes to their nucleobases."""
    return VOCAB_MAPPING
|
|
|
|
def get_special_tokens_map():
    """Return the serialized special-token definitions for this tokenizer family."""
    return SPECIAL_TOKENS_MAP
|
|
|
|
def get_tokenizer_config(add_special_tokens: bool = False):
    """Build a tokenizer configuration dict.

    Args:
        add_special_tokens: When True, include an ``added_tokens_decoder`` entry
            mapping token ids (as strings) to the special-token definitions.

    Returns:
        A fresh dict based on ``TOKENIZER_CONFIG``.

    Note:
        Works on a copy of ``TOKENIZER_CONFIG``; the previous implementation
        mutated the module-level constant via ``setdefault``, so a single call
        with ``add_special_tokens=True`` permanently leaked the
        ``added_tokens_decoder`` key into every subsequent call.
    """
    config = dict(TOKENIZER_CONFIG)
    if add_special_tokens:
        # Special tokens occupy ids 0..5 in declaration order.
        config["added_tokens_decoder"] = {
            str(index): token for index, token in enumerate(SPECIAL_TOKENS_MAP.values())
        }
    return config
|
|
|
|
class Alphabet:
    """An ordered token set plus special tokens, expandable to k-mer vocabularies.

    The full vocabulary is ``prepend_tokens + kmers(tokens) + append_tokens``,
    built lazily and memoised across instances with identical configuration.
    """

    # Class-level defaults; instances shadow them when explicit values are given.
    prepend_tokens: Tuple[str, ...] = ("<pad>", "<cls>", "<eos>", "<unk>", "<mask>", "<null>")
    append_tokens: Tuple[str, ...] = ()
    tokens: Tuple[str, ...]
    nmers: int

    def __init__(
        self,
        tokens: Sequence[str],
        prepend_tokens: Tuple[str, ...] | None = None,
        append_tokens: Tuple[str, ...] | None = None,
        nmers: int = 1,
    ):
        # Accept another Alphabet as a token source (copy-construction).
        source = tokens.tokens if isinstance(tokens, Alphabet) else tokens
        self.tokens = tuple(source)
        if prepend_tokens is not None:
            self.prepend_tokens = tuple(prepend_tokens)
        if append_tokens is not None:
            self.append_tokens = tuple(append_tokens)
        self.nmers = nmers

    @property
    def vocabulary(self) -> Tuple[str, ...]:
        """The complete token tuple, special tokens included."""
        return self._vocabulary(self.prepend_tokens, self.tokens, self.nmers, self.append_tokens)

    @staticmethod
    @lru_cache(maxsize=None)
    def _vocabulary(
        prepend_tokens: Tuple[str, ...], tokens: Tuple[str, ...], nmers: int, append_tokens: Tuple[str, ...]
    ) -> Tuple[str, ...]:
        # Memoised on the hashable tuple arguments so identical alphabets share work.
        return prepend_tokens + generate_kmer_vocabulary(tokens, nmers) + append_tokens

    def __iter__(self):
        yield from self.vocabulary

    def __len__(self):
        return len(self.vocabulary)

    def __contains__(self, item: str):
        return item in self.vocabulary

    def __repr__(self) -> str:
        fields = [f"tokens={self.tokens}"]
        if self.nmers > 1:
            fields.append(f"nmers={self.nmers}")
        fields.append(f"prepend_tokens={self.prepend_tokens}")
        fields.append(f"append_tokens={self.append_tokens}")
        return "Alphabet(" + ", ".join(fields) + ")"
|
|
|
|
| def _merge_extra_special_tokens( |
| additional_special_tokens: List | Tuple | None, |
| kwargs: dict[str, Any], |
| ) -> List | Tuple | None: |
| if "extra_special_tokens" not in kwargs: |
| return additional_special_tokens |
|
|
| extra_special_tokens = kwargs.pop("extra_special_tokens") |
| if additional_special_tokens is None: |
| merged_special_tokens = [] |
| else: |
| merged_special_tokens = list(additional_special_tokens) |
|
|
| if isinstance(extra_special_tokens, dict): |
| extra_tokens = list(extra_special_tokens.values()) |
| elif isinstance(extra_special_tokens, (list, tuple)): |
| extra_tokens = list(extra_special_tokens) |
| else: |
| raise TypeError( |
| f"extra_special_tokens must be dict, list, or tuple, got {type(extra_special_tokens).__name__}" |
| ) |
|
|
| for token in extra_tokens: |
| token_value = token |
| if isinstance(token, dict) and "content" in token: |
| token_value = token["content"] |
| if token_value not in merged_special_tokens: |
| merged_special_tokens.append(token_value) |
| return merged_special_tokens |
|
|
|
|
def generate_kmer_vocabulary(vocabulary: Tuple[str, ...], nmers: int = 1) -> Tuple[str, ...]:
    """
    Generates a kmer vocabulary given an original vocabulary and the size of kmer.

    Special tokens (those starting with ``<`` or ``[``) are kept as-is at the
    front; every remaining token participates in the cartesian k-mer expansion.

    Args:
        vocabulary: The original vocabulary.
        nmers (defaults to 1): The size of kmer to generate.

    Returns:
        The kmer vocabulary (input returned unchanged when ``nmers <= 1``).
    """
    if nmers <= 1:
        return vocabulary

    is_special = lambda tok: tok.startswith(("<", "["))  # noqa: E731
    specials = tuple(tok for tok in vocabulary if is_special(tok))
    plain = [tok for tok in vocabulary if not is_special(tok)]
    kmers = tuple("".join(combo) for combo in product(plain, repeat=nmers))
    return specials + kmers
|
|
|
|
class Tokenizer(PreTrainedTokenizer):
    """
    Constructs a Base tokenizer.

    Args:
        alphabet: List of tokens or an Alphabet object to use in tokenization.
            Either alphabet or vocab_file must be specified.
        bos_token: A special token representing the beginning of a sequence.
        cls_token: A special token representing the classification token.
        pad_token: A special token representing padding.
        eos_token: A special token representing the end of a sequence.
        sep_token: A special token representing the separator token.
        unk_token: A special token representing unknown tokens.
        mask_token: A special token representing the mask token.
        null_token: A special token representing the null token.
        additional_special_tokens: Additional special tokens to add to the vocabulary.
        do_upper_case: Whether to convert input to uppercase.
        vocab_file: Path to a vocabulary file.
            Either alphabet or vocab_file must be specified.

    Examples:
        >>> from multimolecule.tokenisers import Tokenizer
        >>> tokenizer = Tokenizer(["A", "C", "G", "T", "N"], unk_token="N")
        >>> tokenizer('ACGTN')["input_ids"]
        [0, 1, 2, 3, 4]
        >>> tokenizer('acgtn')["input_ids"]
        [0, 1, 2, 3, 4]
        >>> len(tokenizer)
        5
        >>> tokenizer = Tokenizer(["A", "C", "G", "T", "N"], unk_token="N", do_upper_case=False)
        >>> tokenizer('ACGTN')["input_ids"]
        [0, 1, 2, 3, 4]
        >>> tokenizer('acgtn')["input_ids"]
        [4, 4, 4, 4, 4]
        >>> tokenizer('ACgtN')["input_ids"]
        [0, 1, 4, 4, 4]
        >>> tokenizer = Tokenizer(["<pad>", "<cls>", "A", "C", "G", "T", "N", "<mask>", "<eos>"])
        >>> tokenizer('ACGTN')["input_ids"]
        [1, 2, 3, 4, 5, 6, 8]
        >>> tokenizer('AC<mask>GTN')["input_ids"]
        [1, 2, 3, 7, 4, 5, 6, 8]
        >>> tokenizer(['TATATAT', 'ATCGN'], padding=True)["input_ids"]
        [[1, 5, 2, 5, 2, 5, 2, 5, 8], [1, 2, 5, 3, 4, 6, 8, 0, 0]]
    """

    model_input_names = ["input_ids", "attention_mask"]
    vocab_files_names = VOCAB_FILES_NAMES
    # Class-level default; overridden per instance in __init__.
    do_upper_case: bool = True

    def __init__(
        self,
        alphabet: Alphabet | List[str] | None = None,
        bos_token: str | None = ...,
        cls_token: str | None = ...,
        pad_token: str | None = ...,
        eos_token: str | None = ...,
        sep_token: str | None = ...,
        unk_token: str | None = ...,
        mask_token: str | None = ...,
        null_token: str | None = ...,
        additional_special_tokens: List | Tuple | None = None,
        do_upper_case: bool = True,
        vocab_file: str | None = None,
        **kwargs,
    ):
        # `...` (Ellipsis) is the "not given" sentinel so callers can still pass
        # an explicit None to disable a special token.
        if alphabet is None and vocab_file is None:
            raise ValueError("You must specify either alphabet or vocab_file")

        if vocab_file is not None:
            alphabet = self.load_vocabulary(vocab_file)

        self._id_to_token: OrderedDict[int, str] = OrderedDict(enumerate(alphabet))
        self._token_to_id: OrderedDict[str, int] = OrderedDict({tok: ind for ind, tok in enumerate(alphabet)})

        # Auto-detect special tokens from the alphabet when not given explicitly.
        if cls_token is ...:
            cls_token = self.identify_special_token(alphabet, "cls")
        if bos_token is ...:
            bos_token = cls_token
        if pad_token is ...:
            pad_token = self.identify_special_token(alphabet, "pad")
        if eos_token is ...:
            eos_token = self.identify_special_token(alphabet, "eos")
        if sep_token is ...:
            # Fall back to EOS as the separator when no dedicated SEP exists.
            sep_token = self.identify_special_token(alphabet, "sep") or self.identify_special_token(alphabet, "eos")
        if unk_token is ...:
            unk_token = self.identify_special_token(alphabet, "unk")
        if mask_token is ...:
            mask_token = self.identify_special_token(alphabet, "mask")
        if null_token is ...:
            null_token = self.identify_special_token(alphabet, "null")
        additional_special_tokens = _merge_extra_special_tokens(additional_special_tokens, kwargs)
        if additional_special_tokens is None:
            additional_special_tokens = []
        # PreTrainedTokenizer has no dedicated null_token slot; register it as
        # an additional special token so it is never split during tokenization.
        if null_token in alphabet and null_token not in additional_special_tokens:
            additional_special_tokens = list(additional_special_tokens)
            additional_special_tokens.append(null_token)

        super().__init__(
            bos_token=bos_token,
            cls_token=cls_token,
            pad_token=pad_token,
            eos_token=eos_token,
            sep_token=sep_token,
            unk_token=unk_token,
            mask_token=mask_token,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )
        self.do_upper_case = do_upper_case
        # Tokens added by the base class (e.g. via kwargs) must be visible in
        # our own id<->token maps.
        self._id_to_token.update(self.added_tokens_decoder)
        self._token_to_id.update(self.added_tokens_encoder)

    def _tokenize(self, text: str, **kwargs):
        """Split text into single-character tokens, upper-casing if configured."""
        if self.do_upper_case:
            text = text.upper()
        return list(text)

    def _convert_token_to_id(self, token: str) -> int:
        # Renamed from `id` to avoid shadowing the builtin.
        token_id = self._token_to_id.get(token, self.unk_token_id)
        if token_id is None:
            raise ValueError(f"Token {token} is not in the vocabulary, and no UNK token is set!")
        return token_id

    def _convert_id_to_token(self, index: int) -> str:
        token = self._id_to_token.get(index, self.unk_token)
        if token is None:
            raise ValueError(f"ID {index} is not in the vocabulary, and no UNK token is set!")
        return token

    def token_to_id(self, token: str) -> int:
        """Public alias for `_convert_token_to_id`."""
        return self._convert_token_to_id(token)

    def id_to_token(self, index: int) -> str:
        """Public alias for `_convert_id_to_token`."""
        return self._convert_id_to_token(index)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: List[int] | None = None
    ) -> List[int]:
        """Wrap token ids with special tokens.

        Single sequence: ``[bos?] ids0 [eos?]`` (each only if the token is set).
        Pair: ``bos ids0 sep ids1 eos`` (requires EOS to be set).
        """
        bos = [self.bos_token_id]
        sep = [self.sep_token_id]
        eos = [self.eos_token_id]
        if token_ids_1 is None:
            if self.bos_token_id is None:
                if self.eos_token_id is None:
                    return token_ids_0
                return token_ids_0 + eos
            if self.eos_token_id is None:
                return bos + token_ids_0
            return bos + token_ids_0 + eos
        if self.eos_token_id is None:
            raise ValueError("Cannot tokenize multiple sequences when EOS token is not set!")
        return bos + token_ids_0 + sep + token_ids_1 + eos

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: List[int] | None = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of ids of the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                List of ids of the second sequence.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )
            return [1 if token in self.all_special_ids else 0 for token in token_ids_0]
        # Mirror build_inputs_with_special_tokens exactly. The previous
        # implementation appended a SEP mask entry even for single sequences,
        # yielding a mask one element longer than the built input.
        mask = [0] * len(token_ids_0)
        if token_ids_1 is None:
            if self.bos_token_id is not None:
                mask = [1] + mask
            if self.eos_token_id is not None:
                mask = mask + [1]
            return mask
        if self.eos_token_id is None:
            raise ValueError("Cannot tokenize multiple sequences when EOS token is not set!")
        # Pair: bos ids0 sep ids1 eos
        return [1] + mask + [1] + [0] * len(token_ids_1) + [1]

    @staticmethod
    def load_vocabulary(vocab_file: str | Path) -> List[str]:
        """Read a vocabulary file (one token per line, UTF-8)."""
        with open(vocab_file, encoding="utf-8") as reader:
            vocabulary = reader.read().splitlines()
        return vocabulary

    def save_vocabulary(self, save_directory: str, filename_prefix: str | None = None):
        """Write the vocabulary, one token per line, and return the file path."""
        vocab_file = os.path.join(save_directory, (filename_prefix + "-" if filename_prefix else "") + "vocab.txt")
        # Write UTF-8 explicitly so the file round-trips with load_vocabulary
        # regardless of the platform's default encoding.
        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write("\n".join(self.all_tokens))
        return (vocab_file,)

    @staticmethod
    def identify_special_token(alphabet: Alphabet | List[str], token: str) -> str | None:
        """Find the unique alphabet entry whose lowercase form contains `token`.

        Returns None when absent; raises if the match is ambiguous.
        """
        tokens = [i for i in alphabet if token in i.lower()]
        if len(tokens) == 1:
            return tokens[0]
        if len(tokens) == 0:
            return None
        raise ValueError(f"Token {token} is ambiguous, could be {tokens}")

    def get_vocab(self):
        """Return the full token -> id mapping, including added tokens."""
        return dict(self.vocab, **self.added_tokens_encoder)

    @property
    def vocab(self) -> OrderedDict[str, int]:
        # Copy so callers cannot mutate the internal mapping.
        return self._token_to_id.copy()

    @property
    def all_tokens(self) -> List[str]:
        return list(self.get_vocab().keys())

    @property
    def vocab_size(self) -> int:
        return len(self.all_tokens)
|
|
|
|
class RnaTokenizer(Tokenizer):
    """
    Tokenizer for RNA sequences.

    Args:
        alphabet: alphabet to use for tokenization.

            - If is `None`, the standard RNA alphabet will be used.
            - If is a `string`, it should correspond to the name of a predefined alphabet. The options include
                + `standard`
                + `iupac`
                + `streamline`
                + `nucleobase`
            - If is an alphabet or a list of characters, that specific alphabet will be used.
        nmers: Size of kmer to tokenize.
        codon: Whether to tokenize into codons.
        replace_T_with_U: Whether to replace T with U.
        do_upper_case: Whether to convert input to uppercase.

    Examples:
        >>> from multimolecule import RnaTokenizer
        >>> tokenizer = RnaTokenizer()
        >>> tokenizer('<pad><cls><eos><unk><mask><null>ACGUNRYSWKMBDHV.X*-I')["input_ids"]
        [1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 2]
        >>> tokenizer('acgu')["input_ids"]
        [1, 6, 7, 8, 9, 2]
        >>> tokenizer('acgt')["input_ids"]
        [1, 6, 7, 8, 9, 2]
        >>> tokenizer = RnaTokenizer(replace_T_with_U=False)
        >>> tokenizer('acgt')["input_ids"]
        [1, 6, 7, 8, 3, 2]
        >>> tokenizer = RnaTokenizer(nmers=3)
        >>> tokenizer('uagcuuauc')["input_ids"]
        [1, 83, 17, 64, 49, 96, 84, 22, 2]
        >>> tokenizer = RnaTokenizer(codon=True)
        >>> tokenizer('uagcuuauc')["input_ids"]
        [1, 83, 49, 22, 2]
        >>> tokenizer('uagcuuauca')["input_ids"]
        Traceback (most recent call last):
        ValueError: length of input sequence must be a multiple of 3 for codon tokenization, but got 10
    """

    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        alphabet: Alphabet | str | List[str] | None = None,
        nmers: int = 1,
        # Default False: the doctests above show `RnaTokenizer()` tokenizing per
        # nucleotide and `RnaTokenizer(nmers=3)` using a sliding window, neither
        # of which is possible when codon tokenization is on by default.
        codon: bool = False,
        replace_T_with_U: bool = True,
        do_upper_case: bool = True,
        additional_special_tokens: List | Tuple | None = None,
        **kwargs,
    ):
        if codon and (nmers > 1 and nmers != 3):
            raise ValueError("Codon and nmers cannot be used together.")
        if codon:
            # Codon tokenization is a non-overlapping 3-mer; share the vocabulary.
            nmers = 3
        if not isinstance(alphabet, Alphabet):
            alphabet = get_alphabet(alphabet, nmers=nmers)
        additional_special_tokens = _merge_extra_special_tokens(additional_special_tokens, kwargs)
        super().__init__(
            alphabet=alphabet,
            nmers=nmers,
            codon=codon,
            replace_T_with_U=replace_T_with_U,
            do_upper_case=do_upper_case,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )
        self.replace_T_with_U = replace_T_with_U
        self.nmers = nmers
        self.codon = codon

    def _tokenize(self, text: str, **kwargs):
        """Tokenize into characters, non-overlapping codons, or overlapping k-mers."""
        if self.do_upper_case:
            text = text.upper()
        if self.replace_T_with_U:
            # DNA-style input is accepted and transcribed to the RNA alphabet.
            text = text.replace("T", "U")
        if self.codon:
            if len(text) % 3 != 0:
                raise ValueError(
                    f"length of input sequence must be a multiple of 3 for codon tokenization, but got {len(text)}"
                )
            return [text[i : i + 3] for i in range(0, len(text), 3)]
        if self.nmers > 1:
            # Overlapping (sliding-window) k-mers.
            return [text[i : i + self.nmers] for i in range(len(text) - self.nmers + 1)]
        return list(text)
|
|
|
|
class RotaryEmbedding(nn.Module):
    """
    Rotary position embeddings based on those in
    [RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer).

    Query and keys are transformed by rotation
    matrices which depend on their relative positions.

    Tip: **Cache**
        The cos/sin tables are cached and recomputed only when the sequence length or the device changes.

    Success: **Sequence Length**
        Rotary Embedding is irrespective of the sequence length and can be used for any sequence length.
        Use the `scale` parameter to extend context length beyond training (e.g., scale=2.0 doubles effective context).

    Example:
        >>> embedding = RotaryEmbedding(embedding_dim=64)
        >>> query, key = torch.randn(2, 4, 28, 64), torch.randn(2, 4, 28, 64)
        >>> query, key = embedding(query, key)
        >>> query.shape
        torch.Size([2, 4, 28, 64])
        >>> # For extended context length
        >>> embedding_extended = RotaryEmbedding(embedding_dim=64, scale=2.0)
        >>> embedding.state_dict()  # no weight in state_dict
        OrderedDict()
    """

    # Cache of the last-computed cos/sin tables, keyed by length and device.
    _seq_len_cached: int | None = None
    _cos_cached: Tensor | None = None
    _sin_cached: Tensor | None = None

    def __init__(
        self,
        embedding_dim: int,
        base: float = 10000.0,
        scale: float = 1.0,
        dtype: torch.dtype = torch.float32,
    ):
        """
        Initialize rotary position embeddings.

        Args:
            embedding_dim: Dimension of the embeddings (must be even)
            base: Base for computing inverse frequencies. Defaults to 10000.0.
            scale: Scaling factor for frequencies. Values > 1.0 extend context length
                (e.g., scale=2.0 doubles the effective context). Defaults to 1.0.
            dtype: Data type for computations. Defaults to torch.float32.
        """
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, embedding_dim, 2, dtype=dtype) / embedding_dim))
        # Non-persistent: derived from hyperparameters, so kept out of state_dict.
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.scale = scale

    def forward(self, q: Tensor, k: Tensor, offset: int = 0, seq_length: int | None = None) -> Tuple[Tensor, Tensor]:
        """
        Apply rotary position embeddings to query and key tensors.

        Args:
            q: Query tensor of shape `(batch_size, num_heads, seq_length, embedding_dim)`
            k: Key tensor of shape `(batch_size, num_heads, seq_length, embedding_dim)`
            offset: Position offset for the start of the sequence (used with past_key_values).
                Defaults to 0.
            seq_length: Full sequence length including offset. If None, uses the sequence length
                from the input tensors. Required when offset > 0.

        Returns:
            Tuple of (rotated_query, rotated_key) tensors with the same shapes as inputs.

        Raises:
            ValueError: If `offset > 0` and `seq_length` is not provided.
        """
        if offset > 0 and seq_length is None:
            raise ValueError("seq_length must be provided when offset > 0")

        if seq_length is None:
            seq_length = k.shape[-2]

        self._update_cos_sin_tables(k, seq_len_dim=-2, seq_length=seq_length)
        return self.apply_rotary_pos_emb(q, offset=offset), self.apply_rotary_pos_emb(k, offset=offset)

    def _update_cos_sin_tables(
        self, x: Tensor, seq_len_dim: int = -2, seq_length: int | None = None
    ) -> Tuple[Tensor, Tensor]:
        """
        Update cached cos/sin tables for rotary embeddings.

        Args:
            x: Input tensor to determine device and dtype
            seq_len_dim: Dimension containing sequence length (default: -2)
            seq_length: Full sequence length to cache. If None, uses x.shape[seq_len_dim]
        """
        if seq_length is None:
            seq_length = x.shape[seq_len_dim]

        cos, sin = self._cos_cached, self._sin_cached
        if seq_length == self._seq_len_cached and cos is not None and sin is not None and cos.device == x.device:
            return cos, sin

        inv_freq = self.inv_freq
        if not isinstance(inv_freq, Tensor):
            raise RuntimeError("inv_freq buffer is not a Tensor")
        t = torch.arange(seq_length, device=x.device, dtype=inv_freq.dtype)
        # Dividing by `scale` lengthens the effective wavelength, extending context.
        # `.to(x.device)` guards against the buffer living on a different device.
        freqs = torch.outer(t, inv_freq.to(x.device)) / self.scale
        # Duplicate frequencies so indices i and i + dim/2 share the same angle.
        emb = torch.cat((freqs, freqs), dim=-1)
        cos = emb.cos()[None, None, :, :]
        sin = emb.sin()[None, None, :, :]
        self._seq_len_cached, self._cos_cached, self._sin_cached = seq_length, cos, sin
        return cos, sin

    def apply_rotary_pos_emb(self, x: Tensor, offset: int = 0) -> Tensor:
        """
        Apply rotary position embeddings to a tensor.

        Args:
            x: Input tensor of shape `(batch_size, num_heads, seq_length, embedding_dim)`
            offset: Position offset for the start of the sequence (used with past_key_values).
                Defaults to 0.

        Returns:
            Rotated tensor with the same shape as input.

        Raises:
            RuntimeError: If the cos/sin tables have not been initialized yet.
        """
        if self._cos_cached is None or self._sin_cached is None:
            raise RuntimeError("Cos/sin tables not initialized. Call forward() or _update_cos_sin_tables() first.")

        cos = self._cos_cached[:, :, offset : offset + x.shape[-2], :]
        sin = self._sin_cached[:, :, offset : offset + x.shape[-2], :]
        return (x * cos) + (self.rotate_half(x) * sin)

    @staticmethod
    def rotate_half(x: Tensor) -> Tensor:
        """Swap-and-negate the two halves of the last dim: (x1, x2) -> (-x2, x1)."""
        x1, x2 = x.chunk(2, dim=-1)
        return torch.cat((-x2, x1), dim=-1)
|
|