import torch
import torch.nn as nn
from torch.nn import functional as F
import math
import tiktoken
import gradio as gr
import os
# Model definition (copied from the training script)

# hyperparameters
batch_size = 24  # how many independent sequences will we process in parallel?
block_size = 256  # what is the maximum context length for predictions?
max_iters = int(160000 * 64 / batch_size)  # how many batches to train on
eval_interval = 500  # how often to evaluate the model
learning_rate = 3e-4  # learning rate for the optimizer
device = ('mps' if torch.backends.mps.is_available()
          else 'cuda' if torch.cuda.is_available() else 'cpu')  # use GPU if available
eval_iters = 200  # how many batches to use for evaluation
n_embd = 384  # embedding dimension
n_head = 6  # number of attention heads
n_layer = 6  # number of transformer blocks
dropout = 0.2  # dropout rate
sliding_window_len = 128
# Tokenizer: GPT-2 BPE via tiktoken (created once and reused)
enc = tiktoken.get_encoding("gpt2")
vocab_size = enc.n_vocab


# Encoder/decoder functions
def encode(string):
    return enc.encode(string)


def decode(indices):
    return enc.decode(indices)
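# Illustrative round trip (the actual token IDs depend on the GPT-2 BPE vocabulary):
#   ids = encode("hello world")   # a short list of integer token IDs
#   decode(ids)                   # -> "hello world": decoding recovers the original string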
class FlashAttentionHead(nn.Module):
    """A single attention head using PyTorch's fused scaled_dot_product_attention."""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.o_proj = nn.Linear(head_size, head_size, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # batch size, sequence length, embedding dimension (n_embd)
        B, T, C = x.shape
        k = self.key(x)        # (B, T, head_size)
        q = self.query(x)      # (B, T, head_size)
        value = self.value(x)  # (B, T, head_size)
        # fused causal attention; apply attention dropout only during training
        output = F.scaled_dot_product_attention(
            q, k, value, attn_mask=None,
            dropout_p=dropout if self.training else 0.0,
            is_causal=True)
        output = self.o_proj(output)
        output = self.dropout(output)
        return output
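# Note: ignoring dropout, the fused call above computes the classic causal attention
# sketched below, where causal_mask stands for a lower-triangular (T, T) matrix of ones
# and head_size is the per-head dimension passed to __init__:
#   att = (q @ k.transpose(-2, -1)) / math.sqrt(head_size)   # (B, T, T) attention scores
#   att = att.masked_fill(causal_mask == 0, float('-inf'))    # hide future positions
#   output = F.softmax(att, dim=-1) @ value                   # (B, T, head_size)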
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList(
            [FlashAttentionHead(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # run each head independently, then concatenate along the channel dimension
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out
class FFN(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)
class EfficientMoEFFN(nn.Module):
    def __init__(self, n_embd, num_experts=4, num_experts_per_token=2):
        super().__init__()
        self.num_experts_per_token = num_experts_per_token
        self.num_experts = num_experts
        self.experts = nn.ModuleList([FFN(n_embd) for _ in range(num_experts)])
        self.gate = nn.Linear(n_embd, num_experts)

    def forward(self, x):
        B, T, C = x.shape
        x_flat = x.view(B * T, C)  # flatten tokens to (B*T, n_embd)

        # Gating: pick the top-k experts per token
        gate_scores = self.gate(x_flat)  # (B*T, num_experts)
        topk_scores, topk_indices = torch.topk(
            gate_scores, self.num_experts_per_token, dim=-1)  # (B*T, k)
        topk_probs = F.softmax(topk_scores, dim=-1)  # (B*T, k), normalized over selected experts

        # Output buffer
        out = torch.zeros_like(x_flat)

        # For each expert: route only the tokens assigned to it
        for expert_id, expert in enumerate(self.experts):
            # Find where this expert was selected
            mask = (topk_indices == expert_id)  # (B*T, k)
            if not mask.any():
                continue  # this expert is not in the top-k for any token, so skip it
            token_ids, which_slot = mask.nonzero(as_tuple=True)
            # Select the actual tokens routed to this expert
            tokens_for_expert = x_flat[token_ids]
            # Apply the expert FFN
            expert_out = expert(tokens_for_expert)  # (num_tokens, C)
            # Scale by the gating probability
            probs = topk_probs[token_ids, which_slot].unsqueeze(-1)
            expert_out = expert_out * probs
            # Scatter-add back into the output buffer
            out.index_add_(0, token_ids, expert_out)

        return out.view(B, T, C)
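# Quick sanity-check sketch of the MoE routing (illustrative, not executed by the app;
# the shapes assume the hyperparameters defined above):
#   moe = EfficientMoEFFN(n_embd, num_experts=4, num_experts_per_token=2)
#   x = torch.randn(2, block_size, n_embd)   # (B=2, T=256, C=384)
#   y = moe(x)                               # (2, 256, 384), same shape as x
#   # each token's output is the probability-weighted sum of its top-2 experts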
class Block(nn.Module):
    # transformer block: multi-head attention followed by a feed-forward (MoE) layer,
    # each applied after layer normalization with a residual connection
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = EfficientMoEFFN(n_embd, num_experts=4)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x
class LanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embed_table = nn.Embedding(vocab_size, n_embd)
        self.position_embed_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(
            *[Block(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        token_emb = self.token_embed_table(idx)  # (B, T, n_embd)
        position_emb = self.position_embed_table(
            torch.arange(T, device=idx.device))  # (T, n_embd)
        x = token_emb + position_emb  # (B, T, n_embd), position embedding broadcasts over B
        x = self.blocks(x)  # (B, T, n_embd)
        x = self.ln_f(x)  # (B, T, n_embd)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        # idx is a (B, T) tensor of token indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, _ = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply temperature scaling
            if temperature != 1.0:
                logits = logits / temperature
            # optionally restrict the logits to the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append the sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx
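# Illustrative usage of generate (not executed by the app; assumes the model
# instance created below has been loaded with trained weights):
#   context = torch.tensor([encode("Once upon a time")], dtype=torch.long, device=device)
#   with torch.no_grad():
#       tokens = model.generate(context, max_new_tokens=50, temperature=0.8, top_k=40)
#   print(decode(tokens[0].tolist()))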
# Load the model
model = LanguageModel().to(device)
model_path = "./model_v6_flash_attn.pth"

# Check if the model file exists
if os.path.exists(model_path):
    model.load_state_dict(torch.load(
        model_path, map_location=device, weights_only=False))
    model.eval()
    print("Model loaded successfully")
else:
    print("Model file not found")

# Compile the model for better performance
model = torch.compile(model)
def generate_text(prompt, max_tokens, temperature, top_k):
    if not os.path.exists(model_path):
        return "Model not found. Please train the model first."
    # Gradio sliders may pass floats, so cast the integer-valued parameters
    max_tokens = int(max_tokens)
    top_k = int(top_k)
    # Encode the prompt
    idx = torch.tensor(encode(prompt), dtype=torch.long,
                       device=device).unsqueeze(0)
    # Generate text
    with torch.no_grad():
        generated_idx = model.generate(
            idx, max_tokens, temperature=temperature, top_k=top_k)
    # Decode the generated text
    generated_text = decode(generated_idx[0].tolist())
    return generated_text[len(prompt):]  # return only the generated part
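# Illustrative direct call, equivalent to what the Gradio UI does below
# (assumes the checkpoint at model_path exists; the prompt is just an example):
#   sample = generate_text("The quick brown fox", max_tokens=100,
#                          temperature=1.0, top_k=50)
#   print(sample)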
# Create the Gradio interface
interface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(lines=5, label="Input Prompt",
                   placeholder="Enter your text prompt here..."),
        gr.Slider(1, 500, value=100, step=1, label="Max Tokens"),
        gr.Slider(0.1, 2.0, value=1.0, label="Temperature"),
        gr.Slider(1, 100, value=50, step=1, label="Top K"),
    ],
    outputs=gr.Textbox(label="Generated Text", lines=10),
    title="Text Generation with Transformer Model",
    description="Generate text using a trained transformer model. Adjust the parameters to control the output."
)
# Launch the app
if __name__ == "__main__":
    interface.launch()