Infinite-World / infworld /models /umt5.py

Upload folder using huggingface_hub

01c7703 verified 1 day ago

19.5 kB

	# Modified from transformers.models.t5.modeling_t5
	# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
	import os
	import html
	import math
	import ftfy
	import string
	import logging
	import regex as re

	import torch
	import torch.nn as nn
	import torch.nn.functional as F

	from transformers import AutoTokenizer


	# Names exported by ``from <module> import *``: the T5 model classes, the
	# encoder-only wrapper, and the tokenizer wrapper defined in this file.
	__all__ = [
	'T5Model',
	'T5Encoder',
	'T5Decoder',
	'T5EncoderModel',
	'HuggingfaceTokenizer',
	]


	def basic_clean(text):
	    """Run ``ftfy.fix_text`` on *text*, HTML-unescape it twice, and strip it.

	    Args:
	        text: the string to clean.

	    Returns:
	        The cleaned string with leading/trailing whitespace removed.
	    """
	    repaired = ftfy.fix_text(text)
	    # Two unescape passes so doubly-escaped entities such as ``&amp;lt;``
	    # are fully decoded.
	    once = html.unescape(repaired)
	    twice = html.unescape(once)
	    return twice.strip()


	def whitespace_clean(text):
	    """Collapse each run of whitespace in *text* to one space and trim the ends."""
	    return re.sub(r'\s+', ' ', text).strip()


	def canonicalize(text, keep_punctuation_exact_string=None):
	    """Normalize *text*: drop punctuation, lowercase, and collapse whitespace.

	    Underscores are turned into spaces first, then every character in
	    ``string.punctuation`` is deleted, the text is lowercased, whitespace
	    runs are collapsed to a single space, and the result is stripped.

	    Args:
	        text: the string to normalize.
	        keep_punctuation_exact_string: optional marker substring. When it is
	            truthy, the text is split on this marker, punctuation is removed
	            from each piece, and the pieces are re-joined with the marker so
	            the marker itself survives untouched. Because underscores are
	            replaced before the split, a marker containing ``_`` will not
	            match.

	    Returns:
	        The normalized string.
	    """
	    # Build the punctuation-deleting table once per call rather than once
	    # per split fragment.
	    strip_punct = str.maketrans('', '', string.punctuation)

	    text = text.replace('_', ' ')
	    if keep_punctuation_exact_string:
	        text = keep_punctuation_exact_string.join(
	            part.translate(strip_punct)
	            for part in text.split(keep_punctuation_exact_string))
	    else:
	        text = text.translate(strip_punct)
	    text = text.lower()
	    text = re.sub(r'\s+', ' ', text)
	    return text.strip()


	class HuggingfaceTokenizer:
	    """Callable wrapper around ``AutoTokenizer`` with optional text cleaning.

	    When ``seq_len`` is set, every call pads and truncates to that length.
	    ``clean`` selects the preprocessing applied to each input string:
	    ``None`` (no cleaning), ``'whitespace'``, ``'lower'`` or
	    ``'canonicalize'``.
	    """

	    def __init__(self, name, seq_len=None, clean=None, **kwargs):
	        """Load the tokenizer called *name*; extra kwargs go to ``from_pretrained``."""
	        assert clean in (None, 'whitespace', 'lower', 'canonicalize')
	        self.name = name
	        self.seq_len = seq_len
	        self.clean = clean

	        # underlying Hugging Face tokenizer
	        self.tokenizer = AutoTokenizer.from_pretrained(name, **kwargs)
	        self.vocab_size = self.tokenizer.vocab_size

	    def __call__(self, sequence, **kwargs):
	        """Tokenize one string or a list of strings.

	        Args:
	            sequence: a ``str`` or a list of strings.
	            **kwargs: ``return_mask`` (default ``False``) is consumed here;
	                everything else overrides the default tokenizer options.

	        Returns:
	            ``input_ids``, or ``(input_ids, attention_mask)`` when
	            ``return_mask`` is true.
	        """
	        want_mask = kwargs.pop('return_mask', False)

	        # default options; caller-supplied kwargs take precedence
	        options = {'return_tensors': 'pt'}
	        if self.seq_len is not None:
	            options['padding'] = 'max_length'
	            options['truncation'] = True
	            options['max_length'] = self.seq_len
	        options.update(kwargs)

	        # a lone string is handled as a batch of one
	        batch = [sequence] if isinstance(sequence, str) else sequence
	        if self.clean:
	            batch = [self._clean(item) for item in batch]
	        encoded = self.tokenizer(batch, **options)

	        if want_mask:
	            return encoded.input_ids, encoded.attention_mask
	        return encoded.input_ids

	    def _clean(self, text):
	        """Apply the cleaning mode stored in ``self.clean`` to *text*."""
	        mode = self.clean
	        if mode == 'canonicalize':
	            return canonicalize(basic_clean(text))
	        if mode in ('whitespace', 'lower'):
	            cleaned = whitespace_clean(basic_clean(text))
	            return cleaned.lower() if mode == 'lower' else cleaned
	        # any other mode leaves the text untouched
	        return text

	def fp16_clamp(x):
	if x.dtype == torch.float16 and torch.isinf(x).any():
	clamp = torch.finfo(x.dtype).max - 1000
	x = torch.clamp(x, min=-clamp, max=clamp)
	return x


	def init_weights(m):
	    """Initialize the parameters of a single module *m* according to its type.

	    Intended for ``nn.Module.apply``. Modules that are not one of the T5
	    classes handled below are left untouched.
	    """
	    if isinstance(m, T5LayerNorm):
	        nn.init.ones_(m.weight)
	        return

	    if isinstance(m, T5Model):
	        nn.init.normal_(m.token_embedding.weight, std=1.0)
	        return

	    if isinstance(m, T5FeedForward):
	        # input projections scale with the model width, the output
	        # projection with the hidden width
	        in_std = m.dim**-0.5
	        nn.init.normal_(m.gate[0].weight, std=in_std)
	        nn.init.normal_(m.fc1.weight, std=in_std)
	        nn.init.normal_(m.fc2.weight, std=m.dim_ffn**-0.5)
	        return

	    if isinstance(m, T5Attention):
	        nn.init.normal_(m.q.weight, std=(m.dim * m.dim_attn)**-0.5)
	        nn.init.normal_(m.k.weight, std=m.dim**-0.5)
	        nn.init.normal_(m.v.weight, std=m.dim**-0.5)
	        nn.init.normal_(m.o.weight, std=(m.num_heads * m.dim_attn)**-0.5)
	        return

	    if isinstance(m, T5RelativeEmbedding):
	        bucket_std = (2 * m.num_buckets * m.num_heads)**-0.5
	        nn.init.normal_(m.embedding.weight, std=bucket_std)


	class GELU(nn.Module):
	    """GELU activation computed with the tanh approximation."""

	    def forward(self, x):
	        """Return ``0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))``."""
	        scale = math.sqrt(2.0 / math.pi)
	        inner = scale * (x + 0.044715 * torch.pow(x, 3.0))
	        return 0.5 * x * (1.0 + torch.tanh(inner))


	class T5LayerNorm(nn.Module):
	    """Root-mean-square layer norm: a learned scale, no mean subtraction, no bias."""

	    def __init__(self, dim, eps=1e-6):
	        """Create a norm over a last dimension of size *dim* with epsilon *eps*."""
	        super().__init__()
	        self.dim = dim
	        self.eps = eps
	        self.weight = nn.Parameter(torch.ones(dim))

	    def forward(self, x):
	        """Divide *x* by the RMS of its last dimension and apply the scale."""
	        # the mean of squares is taken in float32
	        mean_sq = x.float().pow(2).mean(dim=-1, keepdim=True)
	        x = x * torch.rsqrt(mean_sq + self.eps)
	        # cast back down when the learned scale is stored in half precision
	        if self.weight.dtype in (torch.float16, torch.bfloat16):
	            x = x.type_as(self.weight)
	        return self.weight * x


	class T5Attention(nn.Module):
	    """Multi-head attention without score scaling, with an additive bias."""

	    def __init__(self, dim, dim_attn, num_heads, dropout=0.1):
	        """Build bias-free q/k/v/o projections; ``dim_attn`` must divide by heads."""
	        assert dim_attn % num_heads == 0
	        super().__init__()
	        self.dim = dim
	        self.dim_attn = dim_attn
	        self.num_heads = num_heads
	        self.head_dim = dim_attn // num_heads

	        # projections
	        self.q = nn.Linear(dim, dim_attn, bias=False)
	        self.k = nn.Linear(dim, dim_attn, bias=False)
	        self.v = nn.Linear(dim, dim_attn, bias=False)
	        self.o = nn.Linear(dim_attn, dim, bias=False)
	        self.dropout = nn.Dropout(dropout)

	    def forward(self, x, context=None, mask=None, pos_bias=None):
	        """
	        x: [B, L1, C].
	        context: [B, L2, C] or None (self-attention over ``x`` when None).
	        mask: [B, L2] or [B, L1, L2] or None; entries equal to 0 are masked.
	        pos_bias: optional additive bias, added in place to a
	            [B, N, L1, L2] buffer (so it must broadcast to that shape).
	        """
	        if context is None:
	            context = x
	        batch, heads, head_dim = x.size(0), self.num_heads, self.head_dim

	        # project and split into heads: [B, L, N, C_head]
	        q = self.q(x).view(batch, -1, heads, head_dim)
	        k = self.k(context).view(batch, -1, heads, head_dim)
	        v = self.v(context).view(batch, -1, heads, head_dim)

	        # additive bias: relative positions plus a large negative on masked keys
	        attn_bias = x.new_zeros(batch, heads, q.size(1), k.size(1))
	        if pos_bias is not None:
	            attn_bias += pos_bias
	        if mask is not None:
	            assert mask.ndim in [2, 3]
	            if mask.ndim == 2:
	                mask = mask.view(batch, 1, 1, -1)
	            else:
	                mask = mask.unsqueeze(1)
	            attn_bias.masked_fill_(mask == 0, torch.finfo(x.dtype).min)

	        # scores are not divided by sqrt(head_dim); softmax runs in float32
	        scores = torch.einsum('binc,bjnc->bnij', q, k) + attn_bias
	        probs = F.softmax(scores.float(), dim=-1).type_as(scores)
	        out = torch.einsum('bnij,bjnc->binc', probs, v)

	        # merge heads and project back to the model width
	        out = out.reshape(batch, -1, heads * head_dim)
	        return self.dropout(self.o(out))


	class T5FeedForward(nn.Module):
	    """Gated feed-forward block: ``fc2(dropout(fc1(x) * GELU(gate_proj(x))))``."""

	    def __init__(self, dim, dim_ffn, dropout=0.1):
	        """Create bias-free projections ``dim -> dim_ffn -> dim``."""
	        super().__init__()
	        self.dim = dim
	        self.dim_ffn = dim_ffn

	        # gate branch (linear + GELU), value branch, and output projection
	        self.gate = nn.Sequential(nn.Linear(dim, dim_ffn, bias=False), GELU())
	        self.fc1 = nn.Linear(dim, dim_ffn, bias=False)
	        self.fc2 = nn.Linear(dim_ffn, dim, bias=False)
	        self.dropout = nn.Dropout(dropout)

	    def forward(self, x):
	        """Apply the gated projection; dropout runs before and after ``fc2``."""
	        hidden = self.fc1(x) * self.gate(x)
	        hidden = self.dropout(hidden)
	        return self.dropout(self.fc2(hidden))


	class T5SelfAttention(nn.Module):
	    """Pre-norm encoder block: self-attention then feed-forward, both residual."""

	    def __init__(self,
	                 dim,
	                 dim_attn,
	                 dim_ffn,
	                 num_heads,
	                 num_buckets,
	                 shared_pos=True,
	                 dropout=0.1):
	        """Build the block.

	        When ``shared_pos`` is false the block owns a bidirectional
	        ``T5RelativeEmbedding``; otherwise the position bias is expected to
	        be passed to ``forward``.
	        """
	        super().__init__()
	        self.dim = dim
	        self.dim_attn = dim_attn
	        self.dim_ffn = dim_ffn
	        self.num_heads = num_heads
	        self.num_buckets = num_buckets
	        self.shared_pos = shared_pos

	        # sub-layers
	        self.norm1 = T5LayerNorm(dim)
	        self.attn = T5Attention(dim, dim_attn, num_heads, dropout)
	        self.norm2 = T5LayerNorm(dim)
	        self.ffn = T5FeedForward(dim, dim_ffn, dropout)
	        if shared_pos:
	            self.pos_embedding = None
	        else:
	            self.pos_embedding = T5RelativeEmbedding(
	                num_buckets, num_heads, bidirectional=True)

	    def forward(self, x, mask=None, pos_bias=None):
	        """Run the block on *x*; ``pos_bias`` is only used when ``shared_pos``."""
	        if self.shared_pos:
	            bias = pos_bias
	        else:
	            seq_len = x.size(1)
	            bias = self.pos_embedding(seq_len, seq_len)

	        attn_out = self.attn(self.norm1(x), mask=mask, pos_bias=bias)
	        x = fp16_clamp(x + attn_out)
	        ffn_out = self.ffn(self.norm2(x))
	        return fp16_clamp(x + ffn_out)


	class T5CrossAttention(nn.Module):
	    """Pre-norm decoder block: self-attention, cross-attention, feed-forward."""

	    def __init__(self,
	                 dim,
	                 dim_attn,
	                 dim_ffn,
	                 num_heads,
	                 num_buckets,
	                 shared_pos=True,
	                 dropout=0.1):
	        """Build the block.

	        When ``shared_pos`` is false the block owns a unidirectional
	        ``T5RelativeEmbedding``; otherwise the position bias is expected to
	        be passed to ``forward``.
	        """
	        super().__init__()
	        self.dim = dim
	        self.dim_attn = dim_attn
	        self.dim_ffn = dim_ffn
	        self.num_heads = num_heads
	        self.num_buckets = num_buckets
	        self.shared_pos = shared_pos

	        # sub-layers
	        self.norm1 = T5LayerNorm(dim)
	        self.self_attn = T5Attention(dim, dim_attn, num_heads, dropout)
	        self.norm2 = T5LayerNorm(dim)
	        self.cross_attn = T5Attention(dim, dim_attn, num_heads, dropout)
	        self.norm3 = T5LayerNorm(dim)
	        self.ffn = T5FeedForward(dim, dim_ffn, dropout)
	        if shared_pos:
	            self.pos_embedding = None
	        else:
	            self.pos_embedding = T5RelativeEmbedding(
	                num_buckets, num_heads, bidirectional=False)

	    def forward(self,
	                x,
	                mask=None,
	                encoder_states=None,
	                encoder_mask=None,
	                pos_bias=None):
	        """Run the block on *x*.

	        The position bias is applied to the self-attention only; the
	        cross-attention over ``encoder_states`` receives ``encoder_mask``
	        and no position bias.
	        """
	        if self.shared_pos:
	            bias = pos_bias
	        else:
	            seq_len = x.size(1)
	            bias = self.pos_embedding(seq_len, seq_len)

	        self_out = self.self_attn(self.norm1(x), mask=mask, pos_bias=bias)
	        x = fp16_clamp(x + self_out)
	        cross_out = self.cross_attn(
	            self.norm2(x), context=encoder_states, mask=encoder_mask)
	        x = fp16_clamp(x + cross_out)
	        ffn_out = self.ffn(self.norm3(x))
	        return fp16_clamp(x + ffn_out)


	class T5RelativeEmbedding(nn.Module):
	    """Learned per-head attention bias indexed by bucketed relative position."""

	    def __init__(self, num_buckets, num_heads, bidirectional, max_dist=128):
	        """Create a ``num_buckets x num_heads`` embedding table."""
	        super().__init__()
	        self.num_buckets = num_buckets
	        self.num_heads = num_heads
	        self.bidirectional = bidirectional
	        self.max_dist = max_dist

	        # one bias value per (bucket, head)
	        self.embedding = nn.Embedding(num_buckets, num_heads)

	    def forward(self, lq, lk):
	        """Return the bias of shape [1, N, lq, lk] for the given lengths."""
	        device = self.embedding.weight.device
	        key_pos = torch.arange(lk, device=device).unsqueeze(0)
	        query_pos = torch.arange(lq, device=device).unsqueeze(1)
	        buckets = self._relative_position_bucket(key_pos - query_pos)
	        bias = self.embedding(buckets)  # [Lq, Lk, N]
	        bias = bias.permute(2, 0, 1).unsqueeze(0)  # [1, N, Lq, Lk]
	        return bias.contiguous()

	    def _relative_position_bucket(self, rel_pos):
	        """Map integer relative positions (key minus query) to bucket indices.

	        Distances below ``max_exact`` each get their own bucket; larger
	        distances share logarithmically spaced buckets, capped at the last
	        one. In bidirectional mode positive offsets use the upper half of
	        the buckets; otherwise positive offsets are treated as distance 0.
	        """
	        if self.bidirectional:
	            num_buckets = self.num_buckets // 2
	            offset = (rel_pos > 0).long() * num_buckets
	            distance = torch.abs(rel_pos)
	        else:
	            num_buckets = self.num_buckets
	            offset = 0
	            distance = -torch.min(rel_pos, torch.zeros_like(rel_pos))

	        # log-spaced bucket for far positions; values computed here for
	        # near positions are discarded by the ``where`` below
	        max_exact = num_buckets // 2
	        log_ratio = torch.log(distance.float() / max_exact) / math.log(
	            self.max_dist / max_exact)
	        coarse = max_exact + (log_ratio * (num_buckets - max_exact)).long()
	        coarse = torch.min(coarse, torch.full_like(coarse, num_buckets - 1))
	        return offset + torch.where(distance < max_exact, distance, coarse)


	class T5Encoder(nn.Module):
	    """Stack of ``T5SelfAttention`` blocks over token embeddings."""

	    def __init__(self,
	                 vocab,
	                 dim,
	                 dim_attn,
	                 dim_ffn,
	                 num_heads,
	                 num_layers,
	                 num_buckets,
	                 shared_pos=True,
	                 dropout=0.1):
	        """Build the encoder.

	        Args:
	            vocab: an existing ``nn.Embedding`` to reuse, or the vocabulary
	                size from which a new embedding is created.
	            shared_pos: when true, one bidirectional relative-position
	                embedding is shared by all blocks; otherwise each block
	                builds its own.
	        """
	        super().__init__()
	        self.dim = dim
	        self.dim_attn = dim_attn
	        self.dim_ffn = dim_ffn
	        self.num_heads = num_heads
	        self.num_layers = num_layers
	        self.num_buckets = num_buckets
	        self.shared_pos = shared_pos

	        # layers
	        if isinstance(vocab, nn.Embedding):
	            self.token_embedding = vocab
	        else:
	            self.token_embedding = nn.Embedding(vocab, dim)
	        if shared_pos:
	            self.pos_embedding = T5RelativeEmbedding(
	                num_buckets, num_heads, bidirectional=True)
	        else:
	            self.pos_embedding = None
	        self.dropout = nn.Dropout(dropout)
	        self.blocks = nn.ModuleList([
	            T5SelfAttention(dim, dim_attn, dim_ffn, num_heads, num_buckets,
	                            shared_pos, dropout) for _ in range(num_layers)
	        ])
	        self.norm = T5LayerNorm(dim)

	        # initialize weights
	        self.apply(init_weights)

	    def forward(self, ids, mask=None):
	        """Embed *ids*, run every block, then apply the final norm and dropout."""
	        x = self.dropout(self.token_embedding(ids))

	        if self.shared_pos:
	            seq_len = x.size(1)
	            bias = self.pos_embedding(seq_len, seq_len)
	        else:
	            bias = None

	        for block in self.blocks:
	            x = block(x, mask, pos_bias=bias)
	        return self.dropout(self.norm(x))


	class T5Decoder(nn.Module):
	    """Stack of ``T5CrossAttention`` blocks with a causal self-attention mask."""

	    def __init__(self,
	                 vocab,
	                 dim,
	                 dim_attn,
	                 dim_ffn,
	                 num_heads,
	                 num_layers,
	                 num_buckets,
	                 shared_pos=True,
	                 dropout=0.1):
	        """Build the decoder.

	        Args:
	            vocab: an existing ``nn.Embedding`` to reuse, or the vocabulary
	                size from which a new embedding is created.
	            shared_pos: when true, one unidirectional relative-position
	                embedding is shared by all blocks; otherwise each block
	                builds its own.
	        """
	        super().__init__()
	        self.dim = dim
	        self.dim_attn = dim_attn
	        self.dim_ffn = dim_ffn
	        self.num_heads = num_heads
	        self.num_layers = num_layers
	        self.num_buckets = num_buckets
	        self.shared_pos = shared_pos

	        # layers
	        if isinstance(vocab, nn.Embedding):
	            self.token_embedding = vocab
	        else:
	            self.token_embedding = nn.Embedding(vocab, dim)
	        if shared_pos:
	            self.pos_embedding = T5RelativeEmbedding(
	                num_buckets, num_heads, bidirectional=False)
	        else:
	            self.pos_embedding = None
	        self.dropout = nn.Dropout(dropout)
	        self.blocks = nn.ModuleList([
	            T5CrossAttention(dim, dim_attn, dim_ffn, num_heads, num_buckets,
	                             shared_pos, dropout) for _ in range(num_layers)
	        ])
	        self.norm = T5LayerNorm(dim)

	        # initialize weights
	        self.apply(init_weights)

	    def forward(self, ids, mask=None, encoder_states=None, encoder_mask=None):
	        """Decode *ids* while attending to ``encoder_states``.

	        With no mask a lower-triangular mask of ones is created; a 2-D mask
	        is expanded across the ``s`` query positions and made
	        lower-triangular. A mask of any other rank is passed through as-is.
	        """
	        _, s = ids.size()

	        # causal mask
	        if mask is None:
	            mask = torch.tril(torch.ones(1, s, s).to(ids.device))
	        elif mask.ndim == 2:
	            mask = torch.tril(mask.unsqueeze(1).expand(-1, s, -1))

	        x = self.dropout(self.token_embedding(ids))

	        if self.shared_pos:
	            seq_len = x.size(1)
	            bias = self.pos_embedding(seq_len, seq_len)
	        else:
	            bias = None

	        for block in self.blocks:
	            x = block(x, mask, encoder_states, encoder_mask, pos_bias=bias)
	        return self.dropout(self.norm(x))


	class T5Model(nn.Module):
	    """Encoder-decoder T5 with one token embedding shared by both halves."""

	    def __init__(self,
	                 vocab_size,
	                 dim,
	                 dim_attn,
	                 dim_ffn,
	                 num_heads,
	                 encoder_layers,
	                 decoder_layers,
	                 num_buckets,
	                 shared_pos=True,
	                 dropout=0.1):
	        """Build the shared embedding, encoder, decoder and output head."""
	        super().__init__()
	        self.vocab_size = vocab_size
	        self.dim = dim
	        self.dim_attn = dim_attn
	        self.dim_ffn = dim_ffn
	        self.num_heads = num_heads
	        self.encoder_layers = encoder_layers
	        self.decoder_layers = decoder_layers
	        self.num_buckets = num_buckets

	        # the same nn.Embedding instance is handed to encoder and decoder
	        shared_embedding = nn.Embedding(vocab_size, dim)
	        self.token_embedding = shared_embedding
	        self.encoder = T5Encoder(shared_embedding, dim, dim_attn, dim_ffn,
	                                 num_heads, encoder_layers, num_buckets,
	                                 shared_pos, dropout)
	        self.decoder = T5Decoder(shared_embedding, dim, dim_attn, dim_ffn,
	                                 num_heads, decoder_layers, num_buckets,
	                                 shared_pos, dropout)
	        self.head = nn.Linear(dim, vocab_size, bias=False)

	        # initialize weights
	        self.apply(init_weights)

	    def forward(self, encoder_ids, encoder_mask, decoder_ids, decoder_mask):
	        """Encode, decode against the encoder output, and project to the vocabulary."""
	        memory = self.encoder(encoder_ids, encoder_mask)
	        decoded = self.decoder(decoder_ids, decoder_mask, memory, encoder_mask)
	        return self.head(decoded)


	def _t5(name,
	        encoder_only=False,
	        decoder_only=False,
	        return_tokenizer=False,
	        tokenizer_kwargs=None,
	        dtype=torch.float32,
	        device='cpu',
	        **kwargs):
	    """Build a T5 model (full, encoder-only or decoder-only) from a config.

	    Args:
	        name: model name; only used to load the ``google/{name}`` tokenizer.
	        encoder_only: build a ``T5Encoder`` instead of the full model.
	        decoder_only: build a ``T5Decoder`` instead of the full model.
	            Mutually exclusive with ``encoder_only``.
	        return_tokenizer: also create and return a ``HuggingfaceTokenizer``.
	        tokenizer_kwargs: keyword arguments for the tokenizer wrapper;
	            ``None`` means no extra arguments.
	        dtype: dtype the model is converted to.
	        device: device the model is created on and moved to.
	        **kwargs: model configuration. For the single-stack variants it must
	            contain ``vocab_size``, ``encoder_layers`` and
	            ``decoder_layers``, which are translated to the ``vocab`` /
	            ``num_layers`` arguments of the chosen class.

	    Returns:
	        The model, or ``(model, tokenizer)`` when ``return_tokenizer`` is set.
	    """
	    # sanity check
	    assert not (encoder_only and decoder_only)

	    # A ``{}`` default would be one dict shared by every call; use None.
	    if tokenizer_kwargs is None:
	        tokenizer_kwargs = {}

	    # params
	    if encoder_only:
	        model_cls = T5Encoder
	        kwargs['vocab'] = kwargs.pop('vocab_size')
	        kwargs['num_layers'] = kwargs.pop('encoder_layers')
	        kwargs.pop('decoder_layers')
	    elif decoder_only:
	        model_cls = T5Decoder
	        kwargs['vocab'] = kwargs.pop('vocab_size')
	        kwargs['num_layers'] = kwargs.pop('decoder_layers')
	        kwargs.pop('encoder_layers')
	    else:
	        model_cls = T5Model

	    # init model directly on the target device
	    with torch.device(device):
	        model = model_cls(**kwargs)

	    # set dtype and device
	    model = model.to(dtype=dtype, device=device)

	    # init tokenizer
	    if return_tokenizer:
	        tokenizer = HuggingfaceTokenizer(f'google/{name}', **tokenizer_kwargs)
	        return model, tokenizer
	    return model


	def umt5_xxl(**kwargs):
	    """Build the ``umt5-xxl`` model via ``_t5``; keyword arguments override the defaults."""
	    defaults = {
	        'vocab_size': 256384,
	        'dim': 4096,
	        'dim_attn': 4096,
	        'dim_ffn': 10240,
	        'num_heads': 64,
	        'encoder_layers': 24,
	        'decoder_layers': 24,
	        'num_buckets': 32,
	        'shared_pos': False,
	        'dropout': 0.1,
	    }
	    return _t5('umt5-xxl', **{**defaults, **kwargs})


	class T5EncoderModel:
	    """Inference wrapper: frozen encoder-only UMT5-XXL plus its tokenizer."""

	    def __init__(
	        self,
	        model_max_length,
	        dtype=torch.bfloat16,
	        device=None,
	        checkpoint_path=None,
	        tokenizer_path=None,
	        shard_fn=None,
	    ):
	        """Build the encoder, load its weights and create the tokenizer.

	        Args:
	            model_max_length: sequence length the tokenizer pads/truncates to.
	            dtype: parameter dtype of the encoder.
	            device: target device. ``None`` selects the current CUDA device
	                at call time, or ``'cpu'`` when CUDA is unavailable.
	            checkpoint_path: file passed to ``torch.load`` for the state dict.
	            tokenizer_path: name or path handed to ``HuggingfaceTokenizer``.
	            shard_fn: optional callable applied to the model as
	                ``shard_fn(model, sync_module_states=False)``; when omitted
	                the model is moved to ``device`` instead.
	        """
	        # Resolve the default here, not in the signature: a default of
	        # ``torch.cuda.current_device()`` is evaluated when the class is
	        # defined, which touches CUDA (and fails without it) at import time.
	        if device is None:
	            if torch.cuda.is_available():
	                device = torch.cuda.current_device()
	            else:
	                device = 'cpu'

	        os.environ["TOKENIZERS_PARALLELISM"]="false"

	        self.model_max_length = model_max_length
	        self.dtype = dtype
	        self.device = device
	        self.checkpoint_path = checkpoint_path
	        self.tokenizer_path = tokenizer_path

	        # init model
	        model = umt5_xxl(
	            encoder_only=True,
	            return_tokenizer=False,
	            dtype=dtype,
	            device=device).eval().requires_grad_(False)
	        logging.info(f'loading {checkpoint_path}')
	        # NOTE(security): torch.load unpickles the file; only load
	        # checkpoints that come from a trusted source.
	        model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
	        self.model = model
	        if shard_fn is not None:
	            self.model = shard_fn(self.model, sync_module_states=False)
	        else:
	            self.model.to(self.device)

	        # init tokenizer
	        self.tokenizer = HuggingfaceTokenizer(
	            name=tokenizer_path, seq_len=model_max_length, clean='whitespace')

	        self.output_dim = self.model.dim
	        # Not populated by this class; ``null`` needs it to be assigned
	        # from outside before it is called.
	        self.y_embedder = None

	    @property
	    def t5(self,):
	        """Return this object itself."""
	        return self

	    def encode(self, texts):
	        """Tokenize *texts* and run them through the encoder.

	        Returns:
	            A dict with ``y``, the encoder output cast to float32 with a new
	            axis inserted at position 1, and ``y_mask``, the tokenizer's
	            attention mask on ``self.device``.
	        """
	        ids, mask = self.tokenizer(
	            texts, return_mask=True, add_special_tokens=True)
	        ids = ids.to(self.device)
	        mask = mask.to(self.device)
	        context = self.model(ids, mask).float()
	        return dict(y=context[:,None], y_mask=mask)

	    def null(self, n):
	        """Return ``self.y_embedder.y_embedding`` repeated *n* times along a
	        new leading axis, with an extra axis inserted at position 1.

	        ``self.y_embedder`` is ``None`` after ``__init__``, so it has to be
	        set externally first.
	        """
	        null_y = self.y_embedder.y_embedding[None].repeat(n, 1, 1)[:, None]
	        return null_y