from typing import Any, Optional

import numpy as np
import jax
import jax.numpy as jnp
import flax.linen as nn
import flax.serialization
from tokenizers import Tokenizer

# Paths to the tokenizer and the serialized model parameters (set before running).
TOKENIZER_PATH = "Path to tokenizer.json file"
MODEL_PARAMS_SAVE_PATH = "Path to model file"

# Global hyperparameters.
DTYPE = jnp.bfloat16
RMSNORM_EPS = 1e-5
dense_init = nn.initializers.normal(stddev=0.02)
CTX_LEN = 2048
NUM_KV_HEADS = 4  # key/value heads for grouped-query attention

config = {
    "d_model": 768,
    "nhead": 16,
    "num_layers": 24,
    "ff_hidden_dim": 3072,
    "vocab_size": 49800,
    "max_len": 2048,
    "dropout_rate": 0.1,
    "window_layer_indices": [2, 5, 8, 11, 14, 17, 20, 23],  # layers using sliding-window attention
    "moe_layer_indices": [4, 9, 14, 19],                    # layers using the MoE feed-forward block
    "window_size": 512,
    "moe_params": {"num_experts": 4, "num_experts_per_tok": 2},
}
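
# Quick sanity checks on the shapes this config implies (illustrative, not part
# of the original pipeline): each head gets head_dim = d_model // nhead =
# 768 // 16 = 48 dimensions, and grouped-query attention shares each of the
# NUM_KV_HEADS = 4 key/value heads across nhead // NUM_KV_HEADS = 4 query
# heads. Note that layer 14 appears in both window_layer_indices and
# moe_layer_indices, so that layer is windowed *and* MoE.
assert config["d_model"] % config["nhead"] == 0   # head_dim = 48
assert config["nhead"] % NUM_KV_HEADS == 0        # group factor = 4

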
class RMSNorm(nn.Module):
    """Root-mean-square layer norm: rescales by the RMS, no mean-centering or bias."""
    epsilon: float = RMSNORM_EPS
    dtype: Any = DTYPE

    @nn.compact
    def __call__(self, x):
        dim = x.shape[-1]
        scale = self.param("scale", nn.initializers.ones, (dim,))
        norm = jnp.sqrt(jnp.mean(x ** 2, axis=-1, keepdims=True) + self.epsilon)
        return (x / norm) * scale


class RoPE(nn.Module):
    """Rotary position embeddings (RoPE), applied per attention head."""
    d_model: int
    max_len: int  # kept for the constructor interface; frequencies use the runtime length
    dtype: Any = DTYPE

    def setup(self):
        self.inv_freq = 1.0 / (10000.0 ** (jnp.arange(0, self.d_model, 2, dtype=jnp.float32) / self.d_model))

    def __call__(self, x):
        # x: (batch, heads, seq, head_dim)
        seq_len = x.shape[-2]
        pos = jnp.arange(seq_len, dtype=jnp.float32)[None, None, :, None]
        inv_freq = self.inv_freq[None, None, None, :]
        freqs = pos * inv_freq
        cos = jnp.cos(freqs).astype(self.dtype)
        sin = jnp.sin(freqs).astype(self.dtype)
        # Rotate (even, odd) feature pairs, then concatenate the two halves.
        # This permutes features relative to the interleaved RoPE layout, but
        # the q.k dot products are unchanged as long as q and k share it.
        x1 = x[..., ::2]
        x2 = x[..., 1::2]
        return jnp.concatenate([x1 * cos - x2 * sin, x1 * sin + x2 * cos], axis=-1)
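
# Illustrative property check (not in the original script): RoPE is a rotation,
# so it preserves per-position vector norms. With float32 this can be verified
# directly, e.g.:
#
#   rope = RoPE(d_model=48, max_len=CTX_LEN, dtype=jnp.float32)
#   x = jax.random.normal(jax.random.PRNGKey(0), (1, 1, 8, 48))
#   y = rope.apply({}, x)  # RoPE defines no learnable parameters
#   assert jnp.allclose(jnp.linalg.norm(x, axis=-1), jnp.linalg.norm(y, axis=-1), atol=1e-3)

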
class FeedForward(nn.Module):
    """SwiGLU-style feed-forward block: a gated linear unit with SiLU gating."""
    d_model: int
    hidden_dim: int
    dropout_rate: float
    dtype: Any = DTYPE

    @nn.compact
    def __call__(self, x, deterministic: bool = True):
        # One fused projection produces both the value and gate halves.
        proj = nn.Dense(self.hidden_dim * 2, use_bias=False, kernel_init=dense_init, dtype=self.dtype)(x)
        x1, x2 = jnp.split(proj, 2, axis=-1)
        x_act = x1 * nn.silu(x2)
        x_act = nn.Dense(self.d_model, use_bias=False, kernel_init=dense_init, dtype=self.dtype)(x_act)
        return nn.Dropout(rate=self.dropout_rate)(x_act, deterministic=deterministic)


class ExpertFFN(nn.Module):
    """A single MoE expert: Dense -> SiLU -> Dense. Note dropout_rate is
    declared for interface parity but not applied inside the expert."""
    d_model: int
    hidden_dim: int
    dropout_rate: float
    dtype: Any = DTYPE

    @nn.compact
    def __call__(self, x, deterministic: bool = True):
        hidden = nn.Dense(self.hidden_dim, use_bias=False, kernel_init=dense_init, dtype=self.dtype)(x)
        hidden = nn.silu(hidden)
        out = nn.Dense(self.d_model, use_bias=False, kernel_init=dense_init, dtype=self.dtype)(hidden)
        return out


class MoEFeedForward(nn.Module):
    """Mixture-of-experts feed-forward block.

    Note: the forward pass below computes a *dense* (soft) mixture -- every
    expert runs on every token and outputs are combined by the softmax gate.
    num_experts_per_tok is declared for the config interface but is not used
    to sparsify routing here.
    """
    d_model: int
    hidden_dim: int
    dropout_rate: float
    num_experts: int = 4
    num_experts_per_tok: int = 2
    dtype: Any = DTYPE

    @nn.compact
    def __call__(self, x, deterministic: bool = True):
        # Gate: per-token softmax over experts.
        # Shapes: x is (B, T, d_model); gate_scores is (B, T, num_experts).
        gate_logits = nn.Dense(self.num_experts, use_bias=False, dtype=self.dtype)(x)
        gate_scores = nn.softmax(gate_logits, axis=-1)
        # Vectorize ExpertFFN over a leading expert axis, with independent
        # parameters per expert.
        expert_ffn = nn.vmap(
            ExpertFFN,
            variable_axes={"params": 0},
            split_rngs={"params": True},
            in_axes=0,
            out_axes=0,
        )(d_model=self.d_model, hidden_dim=self.hidden_dim,
          dropout_rate=self.dropout_rate, dtype=self.dtype)
        # Run all experts on all tokens: (num_experts, B, T, d_model).
        x_expert = jnp.broadcast_to(x, (self.num_experts,) + x.shape)
        experts = expert_ffn(x_expert)
        # Weight expert outputs by the gate: gate becomes (num_experts, B, T, 1).
        gate_scores = jnp.transpose(gate_scores, (2, 0, 1))[..., None]
        moe_output = jnp.sum(experts * gate_scores, axis=0)
        moe_output = nn.Dropout(rate=self.dropout_rate)(moe_output, deterministic=deterministic)
        return moe_output
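
# For reference, a sparse top-k gate of the kind num_experts_per_tok suggests
# could be sketched as below (an assumption about intent, not the behavior of
# the block above -- swapping it in would change the forward pass):
#
#   def topk_gate(gate_logits, k):
#       """Keep the k largest gate logits per token; softmax puts zero mass elsewhere."""
#       top_vals, _ = jax.lax.top_k(gate_logits, k)   # (B, T, k)
#       threshold = top_vals[..., -1:]                # k-th largest logit per token
#       masked = jnp.where(gate_logits >= threshold, gate_logits, -jnp.inf)
#       return nn.softmax(masked, axis=-1)

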
class LLaMAAttention(nn.Module):
    """Multi-head attention with RoPE, grouped-query KV heads, and an optional
    causal sliding window."""
    d_model: int
    nhead: int
    num_kv_heads: int
    dropout_rate: float
    dtype: Any = DTYPE
    use_sliding_window: bool = False
    window_size: int = 512

    def setup(self):
        self.head_dim = self.d_model // self.nhead
        self.q_proj = nn.Dense(self.d_model, use_bias=False, kernel_init=dense_init, dtype=self.dtype)
        # One projection emits keys and values for the (fewer) KV heads.
        self.kv_proj = nn.Dense(2 * (self.num_kv_heads * self.head_dim),
                                use_bias=False, kernel_init=dense_init, dtype=self.dtype)
        self.out_proj = nn.Dense(self.d_model, use_bias=False, kernel_init=dense_init, dtype=self.dtype)
        self.dropout = nn.Dropout(rate=self.dropout_rate)
        self.rope = RoPE(d_model=self.head_dim, max_len=CTX_LEN, dtype=self.dtype)
        self.layer_scale_attn = self.param("layer_scale_attn", nn.initializers.constant(0.1), (self.d_model,))

    def __call__(self, x, deterministic: bool = True):
        B, T, _ = x.shape
        q = self.q_proj(x).reshape(B, T, self.nhead, self.head_dim)
        kv = self.kv_proj(x).reshape(B, T, self.num_kv_heads, 2 * self.head_dim)
        k, v = jnp.split(kv, 2, axis=-1)
        # Grouped-query attention: replicate each KV head across its query group.
        group_factor = self.nhead // self.num_kv_heads
        k = jnp.repeat(k, repeats=group_factor, axis=2)
        v = jnp.repeat(v, repeats=group_factor, axis=2)
        # RoPE expects (B, heads, T, head_dim): transpose, rotate, transpose back.
        q = jnp.transpose(q, (0, 2, 1, 3))
        k = jnp.transpose(k, (0, 2, 1, 3))
        q = self.rope(q)
        k = self.rope(k)
        q = jnp.transpose(q, (0, 2, 1, 3))
        k = jnp.transpose(k, (0, 2, 1, 3))
        # Scores indexed as (batch, query_pos, head, key_pos).
        attn_weights = jnp.einsum("bthd,bThd->bthT", q, k) / jnp.sqrt(self.head_dim)
        if self.use_sliding_window:
            # Causal sliding window: position i attends to j with 0 <= i - j < window_size.
            i = jnp.arange(T)[:, None]
            j = jnp.arange(T)[None, :]
            sliding_mask = (i - j < self.window_size) & (i >= j)
            sliding_mask = sliding_mask[None, :, None, :]
            attn_weights = jnp.where(sliding_mask, attn_weights, -1e10)
        else:
            causal_mask = jnp.tril(jnp.ones((T, T), dtype=bool))[None, :, None, :]
            attn_weights = jnp.where(causal_mask, attn_weights, -1e10)
        attn_probs = nn.softmax(attn_weights, axis=-1)
        attn_probs = self.dropout(attn_probs, deterministic=deterministic)
        attn_output = jnp.einsum("bthT,bThd->bthd", attn_probs, v)
        attn_output = attn_output.reshape(B, T, self.d_model)
        output = self.out_proj(attn_output)
        output = self.dropout(output, deterministic=deterministic)
        # Per-channel layer scale (applied inside this module, unlike the FF branch).
        return output * self.layer_scale_attn
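
# To make the sliding-window mask concrete (illustrative only): with T = 5 and
# window_size = 2, position i attends to j where 0 <= i - j < 2, giving
#
#        j=0  j=1  j=2  j=3  j=4
#   i=0   1    .    .    .    .
#   i=1   1    1    .    .    .
#   i=2   .    1    1    .    .
#   i=3   .    .    1    1    .
#   i=4   .    .    .    1    1
#
# i.e. each token sees itself and the previous window_size - 1 tokens.

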
class TransformerLayer(nn.Module):
    """Pre-norm transformer block: RMSNorm -> attention and RMSNorm -> feed-forward,
    each wrapped in a residual connection."""
    d_model: int
    nhead: int
    ff_hidden_dim: int
    dropout_rate: float
    dtype: Any = DTYPE
    use_sliding_window: bool = False
    window_size: int = 512
    use_moe: bool = False
    moe_params: Optional[dict] = None

    def setup(self):
        self.attn_norm = RMSNorm(dtype=self.dtype)
        self.attn = LLaMAAttention(
            d_model=self.d_model,
            nhead=self.nhead,
            num_kv_heads=NUM_KV_HEADS,
            dropout_rate=0.0,  # attention dropout disabled; dropout_rate applies to the FF branch
            dtype=self.dtype,
            use_sliding_window=self.use_sliding_window,
            window_size=self.window_size,
        )
        self.ff_norm = RMSNorm(dtype=self.dtype)
        if self.use_moe:
            self.ff = MoEFeedForward(
                d_model=self.d_model,
                hidden_dim=self.ff_hidden_dim,
                dropout_rate=self.dropout_rate,
                num_experts=self.moe_params.get("num_experts", 4) if self.moe_params else 4,
                num_experts_per_tok=self.moe_params.get("num_experts_per_tok", 2) if self.moe_params else 2,
                dtype=self.dtype,
            )
        else:
            self.ff = FeedForward(
                d_model=self.d_model,
                hidden_dim=self.ff_hidden_dim,
                dropout_rate=self.dropout_rate,
                dtype=self.dtype,
            )
        self.layer_scale_ff = self.param("layer_scale_ff", nn.initializers.constant(0.1), (self.d_model,))

    def __call__(self, x, deterministic: bool = True):
        # The attention branch applies its layer scale internally; the FF scale
        # is applied here.
        x = x + self.attn(self.attn_norm(x), deterministic=deterministic)
        x = x + self.ff(self.ff_norm(x), deterministic=deterministic) * self.layer_scale_ff
        return x


class DeepSeekModel(nn.Module):
    """Decoder-only transformer with tied input/output embeddings."""
    vocab_size: int
    d_model: int
    nhead: int
    num_layers: int
    ff_hidden_dim: int
    max_len: int
    dropout_rate: float
    dtype: Any = DTYPE
    window_layer_indices: Optional[list] = None
    moe_layer_indices: Optional[list] = None
    window_size: int = 512
    moe_params: Optional[dict] = None

    def setup(self):
        self.embed = nn.Embed(
            num_embeddings=self.vocab_size,
            features=self.d_model,
            embedding_init=dense_init,
            dtype=self.dtype,
        )
        self.layers = [
            TransformerLayer(
                d_model=self.d_model,
                nhead=self.nhead,
                ff_hidden_dim=self.ff_hidden_dim,
                dropout_rate=self.dropout_rate,
                dtype=self.dtype,
                use_sliding_window=(self.window_layer_indices is not None and i in self.window_layer_indices),
                window_size=self.window_size,
                use_moe=(self.moe_layer_indices is not None and i in self.moe_layer_indices),
                moe_params=self.moe_params,
            )
            for i in range(self.num_layers)
        ]
        self.norm = RMSNorm(dtype=self.dtype)

    def __call__(self, input_ids, deterministic: bool = True):
        x = self.embed(input_ids)
        for layer in self.layers:
            x = layer(x, deterministic=deterministic)
        x = self.norm(x)
        # Weight tying: project back onto the vocabulary with the embedding matrix.
        logits = x @ self.embed.embedding.T
        return logits
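
# With the config above, layers 2, 5, 8, 11, 14, 17, 20 and 23 use sliding-window
# attention and layers 4, 9, 14 and 19 use MoE feed-forward blocks; every other
# layer uses full causal attention with the dense SwiGLU FFN. A quick way to
# list the per-layer kinds (illustrative):
#
#   for i in range(config["num_layers"]):
#       kinds = []
#       if i in config["window_layer_indices"]: kinds.append("window")
#       if i in config["moe_layer_indices"]: kinds.append("moe")
#       print(i, kinds or ["dense"])

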
# Load the tokenizer and look up the special-token IDs.
tokenizer = Tokenizer.from_file(TOKENIZER_PATH)
PAD_TOKEN_ID = tokenizer.token_to_id("<pad>")
START_TOKEN_ID = tokenizer.token_to_id("<s>")
END_SEQ_TOKEN_ID = tokenizer.token_to_id("</s>")

model_instance = DeepSeekModel(
    vocab_size=config["vocab_size"],
    d_model=config["d_model"],
    nhead=config["nhead"],
    num_layers=config["num_layers"],
    ff_hidden_dim=config["ff_hidden_dim"],
    max_len=config["max_len"],
    dropout_rate=config["dropout_rate"],
    dtype=DTYPE,
    window_layer_indices=config["window_layer_indices"],
    moe_layer_indices=config["moe_layer_indices"],
    window_size=config["window_size"],
    moe_params=config["moe_params"],
)

# Initialize a parameter tree with the right structure, then overwrite it with
# the saved weights; from_bytes uses init_params as the template.
dummy_input = jnp.ones((1, config["max_len"] - 1), dtype=jnp.int32)
rng = jax.random.PRNGKey(0)
init_params = model_instance.init(rng, dummy_input, deterministic=True)

with open(MODEL_PARAMS_SAVE_PATH, "rb") as f:
    saved_params_bytes = f.read()
saved_params = flax.serialization.from_bytes(init_params, saved_params_bytes)
print("Loaded model parameters.")
def temperature_sample(params, prompt_ids, model, max_length=15, temperature=0.7, top_p=0.9, end_token_id=END_SEQ_TOKEN_ID):
    """
    Generates text token-by-token using temperature scaling and nucleus (top-p) sampling.

    Args:
        params: Model parameters.
        prompt_ids: List of token IDs for the prompt.
        model: The language model.
        max_length: Maximum number of tokens to generate.
        temperature: Temperature for scaling logits.
        top_p: Nucleus sampling threshold.
        end_token_id: End-of-sequence token ID.

    Returns:
        A list of token IDs for the generated text (the prompt is included).
    """
    generated = list(prompt_ids)
    for step in range(max_length):
        # Re-run the full (growing) sequence each step; no KV cache is kept.
        input_seq = jnp.array(generated)[None, :]
        logits = model.apply(params, input_seq, deterministic=True)
        logits_last = logits[0, -1]
        scaled_logits = logits_last / temperature
        probs = jax.nn.softmax(scaled_logits)

        # Nucleus (top-p) filtering: keep the smallest set of tokens whose
        # cumulative probability exceeds top_p, then renormalize within it.
        probs_np = np.array(probs)
        sorted_indices = np.argsort(probs_np)[::-1]
        sorted_probs = probs_np[sorted_indices]
        cumulative_probs = np.cumsum(sorted_probs)
        cutoff_idx = np.where(cumulative_probs > top_p)[0]
        cutoff = cutoff_idx[0] + 1 if len(cutoff_idx) > 0 else len(sorted_probs)
        nucleus_indices = sorted_indices[:cutoff]
        nucleus_probs = sorted_probs[:cutoff]
        nucleus_probs /= np.sum(nucleus_probs)

        token_id = int(np.random.choice(nucleus_indices, p=nucleus_probs))
        generated.append(token_id)

        token_str = tokenizer.decode([token_id]).strip()
        print(f"Step {step+1}: Generated token '{token_str}' (ID: {token_id})")

        if token_id == end_token_id:
            break
    return generated
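
# Example usage (illustrative; assumes the tokenizer and weights loaded above):
#
#   prompt_ids = tokenizer.encode("<s> Hello").ids
#   ids = temperature_sample(saved_params, prompt_ids, model_instance, max_length=15)
#   print(tokenizer.decode(ids))

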
def chat():
    print("\nInteractive Chat (type 'exit' or 'quit' to end):")
    while True:
        user_input = input("\nUser: ").strip()
        if user_input.lower() in ["exit", "quit"]:
            break
        if not user_input.startswith("<s>"):
            user_input = "<s> " + user_input
        prompt_ids = tokenizer.encode(user_input).ids
        # Truncate from the left so the prompt fits in the context window.
        max_prompt_length = config["max_len"] - 1
        if len(prompt_ids) > max_prompt_length:
            prompt_ids = prompt_ids[-max_prompt_length:]

        print("\nModel generating response using temperature sampling (temp=0.7, top-p=0.9, max tokens=15)...")
        generated_ids = temperature_sample(
            saved_params, prompt_ids, model_instance,
            max_length=15, temperature=0.7, top_p=0.9, end_token_id=END_SEQ_TOKEN_ID
        )
        generated_text = tokenizer.decode(generated_ids)
        print("\nModel:", generated_text)


if __name__ == "__main__":
    chat()