Spaces:

KittyCat00
/

CatGPT

Sleeping

App Files Files Community

CatGPT / app.py

KittyCat00

Update app.py

02da43b verified over 1 year ago

raw

history blame contribute delete

20.9 kB

	import tiktoken
	import torch
	import time
	import math
	import re
	from torch.utils.data import Dataset, DataLoader

	import gradio as gr
	import torch.nn as nn

	class GPTModel(nn.Module):
	    """GPT-style decoder-only language model.

	    Token and learned position embeddings are summed, passed through
	    dropout and a stack of ``TransformerBlock`` layers, normalised, and
	    projected onto the vocabulary.

	    Args:
	        cfg: Mapping providing ``vocab_size``, ``emb_dim``,
	            ``context_length``, ``drop_rate`` and ``n_layers`` (the same
	            mapping is forwarded to every ``TransformerBlock``).
	    """

	    def __init__(self, cfg):
	        super().__init__()
	        vocab_size = cfg["vocab_size"]
	        emb_dim = cfg["emb_dim"]

	        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
	        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
	        self.drop_emb = nn.Dropout(cfg["drop_rate"])

	        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
	        self.trf_blocks = nn.Sequential(*blocks)

	        self.final_norm = LayerNorm(emb_dim)
	        # No bias on the output head.
	        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

	    def forward(self, in_idx):
	        """Return next-token logits for token ids of shape (batch, seq_len)."""
	        _, seq_len = in_idx.shape
	        # Positions are created on the input's device so the sum below works
	        # regardless of where the ids live.
	        positions = torch.arange(seq_len, device=in_idx.device)
	        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
	        hidden = self.drop_emb(hidden)
	        hidden = self.trf_blocks(hidden)
	        hidden = self.final_norm(hidden)
	        return self.out_head(hidden)

	class TransformerBlock(nn.Module):
	    """Pre-norm transformer block: self-attention then feed-forward.

	    Each sub-layer is applied as ``x + dropout(sublayer(norm(x)))``.

	    Args:
	        cfg: Mapping providing ``emb_dim``, ``context_length``, ``n_heads``,
	            ``drop_rate`` and ``qkv_bias`` (also forwarded to
	            ``FeedForward``).
	    """

	    def __init__(self, cfg):
	        super().__init__()
	        self.att = MultiHeadAttention(
	            d_in=cfg["emb_dim"],
	            d_out=cfg["emb_dim"],
	            context_length=cfg["context_length"],
	            num_heads=cfg["n_heads"],
	            dropout=cfg["drop_rate"],
	            qkv_bias=cfg["qkv_bias"],
	        )
	        self.ff = FeedForward(cfg)
	        self.norm1 = LayerNorm(cfg["emb_dim"])
	        self.norm2 = LayerNorm(cfg["emb_dim"])
	        # One dropout module is shared by both residual branches.
	        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

	    def forward(self, x):
	        """Apply attention and feed-forward sub-layers with residuals."""
	        # Shortcut connection for the attention block
	        shortcut = x
	        x = self.norm1(x)
	        x = self.att(x)
	        x = self.drop_shortcut(x)
	        x = x + shortcut  # add the original input back

	        # Shortcut connection for the feed-forward block
	        shortcut = x
	        x = self.norm2(x)
	        x = self.ff(x)
	        x = self.drop_shortcut(x)
	        x = x + shortcut  # add the original input back

	        return x

	class MultiHeadAttention(nn.Module):
	    """Causal multi-head self-attention.

	    Args:
	        d_in: Size of the last dimension of the input.
	        d_out: Size of the last dimension of the output; must be a multiple
	            of ``num_heads``.
	        context_length: Side length of the pre-built causal mask.
	        dropout: Dropout probability applied to the attention weights.
	        num_heads: Number of attention heads.
	        qkv_bias: Whether the query/key/value projections use a bias.
	    """

	    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
	        super().__init__()
	        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

	        self.d_out = d_out
	        self.num_heads = num_heads
	        # Each head works on an equal slice of the projected features.
	        self.head_dim = d_out // num_heads

	        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
	        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
	        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
	        self.out_proj = nn.Linear(d_out, d_out)  # mixes the head outputs
	        self.dropout = nn.Dropout(dropout)

	        # Ones strictly above the diagonal mark the "future" positions.
	        future = torch.triu(torch.ones(context_length, context_length), diagonal=1)
	        self.register_buffer("mask", future)

	    def forward(self, x):
	        """Attend over ``x`` of shape (batch, num_tokens, d_in)."""
	        b, num_tokens, _ = x.shape

	        def split_heads(projected):
	            # (b, num_tokens, d_out) -> (b, num_heads, num_tokens, head_dim)
	            per_head = projected.view(b, num_tokens, self.num_heads, self.head_dim)
	            return per_head.transpose(1, 2)

	        keys = split_heads(self.W_key(x))
	        queries = split_heads(self.W_query(x))
	        values = split_heads(self.W_value(x))

	        # Per-head dot products between every query and every key.
	        attn_scores = queries @ keys.transpose(2, 3)

	        # Crop the stored mask to the current length and hide future tokens.
	        causal = self.mask.bool()[:num_tokens, :num_tokens]
	        attn_scores.masked_fill_(causal, -torch.inf)

	        scale = keys.shape[-1] ** 0.5
	        attn_weights = self.dropout(torch.softmax(attn_scores / scale, dim=-1))

	        # (b, num_heads, num_tokens, head_dim) -> (b, num_tokens, num_heads, head_dim)
	        context_vec = (attn_weights @ values).transpose(1, 2)

	        # Merge the heads back into one feature dimension of size d_out.
	        merged = context_vec.contiguous().view(b, num_tokens, self.d_out)
	        return self.out_proj(merged)

	class FeedForward(nn.Module):
	    """Two-layer MLP: expand to ``4 * emb_dim``, apply GELU, project back.

	    Args:
	        cfg: Mapping providing ``emb_dim``.
	    """

	    def __init__(self, cfg):
	        super().__init__()
	        width = cfg["emb_dim"]
	        expanded = 4 * width
	        self.layers = nn.Sequential(
	            nn.Linear(width, expanded),
	            GELU(),
	            nn.Linear(expanded, width),
	        )

	    def forward(self, x):
	        """Run ``x`` through the MLP; the last dimension keeps its size."""
	        out = self.layers(x)
	        return out

	class GELU(nn.Module):
	    """Tanh approximation of the GELU activation.

	    Computes ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
	    element-wise.
	    """

	    # sqrt(2 / pi) never changes, so it is computed once here instead of
	    # allocating a fresh tensor on every forward pass.
	    _SQRT_2_OVER_PI = math.sqrt(2.0 / math.pi)

	    def __init__(self):
	        super().__init__()

	    def forward(self, x):
	        """Return the activation of ``x``; output has the same shape."""
	        inner = self._SQRT_2_OVER_PI * (x + 0.044715 * torch.pow(x, 3))
	        return 0.5 * x * (1 + torch.tanh(inner))

	class LayerNorm(nn.Module):
	    """Layer normalisation over the last dimension with learned scale/shift.

	    Args:
	        emb_dim: Size of the last dimension being normalised.
	    """

	    def __init__(self, emb_dim):
	        super().__init__()
	        self.eps = 1e-5  # keeps the division stable when the variance is ~0
	        self.scale = nn.Parameter(torch.ones(emb_dim))
	        self.shift = nn.Parameter(torch.zeros(emb_dim))

	    def forward(self, x):
	        """Normalise ``x`` to zero mean / unit variance, then scale and shift."""
	        centered = x - x.mean(dim=-1, keepdim=True)
	        # Biased (population) variance, i.e. divide by N rather than N - 1.
	        variance = x.var(dim=-1, keepdim=True, unbiased=False)
	        normalised = centered / torch.sqrt(variance + self.eps)
	        return self.scale * normalised + self.shift




	# Hyperparameters consumed by GPTModel and its sub-modules.
	GPT_CONFIG_124M = dict(
	    vocab_size=50257,    # vocabulary size
	    context_length=256,  # shortened context length (orig: 1024)
	    emb_dim=768,         # embedding dimension
	    n_heads=12,          # number of attention heads
	    n_layers=12,         # number of transformer blocks
	    drop_rate=0.1,       # dropout rate
	    qkv_bias=False,      # bias on the query/key/value projections
	)

	# Module-level model built from the config above; no weights are loaded
	# into it here.
	model = GPTModel(GPT_CONFIG_124M)

	def generate(model, idx, max_new_tokens, context_size, tokenizer=None,
	             text_to_token_ids=None, temperature=0.0, top_k=None, eos_id=None):
	    """Autoregressively append up to ``max_new_tokens`` tokens to ``idx``.

	    Args:
	        model: Callable mapping token ids (batch, n_tokens) to logits
	            (batch, n_tokens, vocab_size).
	        idx: Integer tensor (batch, n_tokens) holding the starting context.
	        max_new_tokens: Maximum number of tokens to append; coerced to int
	            so that float values (e.g. from a numeric UI field) work.
	        context_size: Only the last ``context_size`` tokens are fed to the
	            model at each step.
	        tokenizer: Unused; accepted for backward compatibility.
	        text_to_token_ids: Unused; accepted for backward compatibility.
	        temperature: If > 0, logits are divided by it and the next token is
	            sampled; otherwise the most likely token is taken (greedy).
	        top_k: If given, only the ``top_k`` highest logits per row are kept
	            before sampling / argmax.
	        eos_id: If given, generation stops (without appending) as soon as
	            every sequence in the batch produces this token.

	    Returns:
	        Tensor (batch, n_tokens + generated) with the new tokens appended.
	    """
	    for _ in range(int(max_new_tokens)):
	        idx_cond = idx[:, -context_size:]
	        with torch.no_grad():
	            logits = model(idx_cond)
	        # Only the prediction for the last position matters.
	        logits = logits[:, -1, :]

	        if top_k is not None:
	            k = min(top_k, logits.size(-1))
	            top_logits, _ = torch.topk(logits, k)
	            # Keep the trailing dimension so the per-row threshold
	            # broadcasts against (batch, vocab) for any batch size.
	            min_val = top_logits[:, -1:]
	            logits = logits.masked_fill(logits < min_val, float("-inf"))

	        if temperature > 0.0:
	            probs = torch.softmax(logits / temperature, dim=-1)  # (batch, vocab)
	            idx_next = torch.multinomial(probs, num_samples=1)   # (batch, 1)
	        else:
	            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch, 1)

	        # Reduce with .all() so the check is well-defined for batch > 1.
	        if eos_id is not None and bool((idx_next == eos_id).all()):
	            break

	        idx = torch.cat((idx, idx_next), dim=1)  # (batch, n_tokens + 1)

	    return idx

	def text_to_token_ids(text, tokenizer):
	    """Encode ``text`` and return the ids as a (1, n_tokens) integer tensor.

	    Args:
	        text: String to encode.
	        tokenizer: Object whose ``encode(text, allowed_special=...)`` returns
	            a list of integer token ids.

	    Returns:
	        ``torch.long`` tensor with a leading batch dimension of size 1.
	    """
	    # The special token must be spelled exactly; an escaped variant would
	    # not match it.
	    encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
	    # An explicit dtype keeps empty encodings integer-typed (torch would
	    # otherwise infer float32 for an empty list).
	    return torch.tensor(encoded, dtype=torch.long).unsqueeze(0)

	def token_ids_to_text(token_ids, tokenizer):
	    """Decode a tensor of token ids back into text.

	    A leading batch dimension of size 1 is dropped before the ids are
	    handed to ``tokenizer.decode`` as a plain list.
	    """
	    id_list = token_ids.squeeze(0).tolist()
	    return tokenizer.decode(id_list)

	def train_model(model, train_loader, val_loader, optimizer, device,
	                n_epochs, eval_freq, eval_iter, start_context, tokenizer,
	                warmup_steps, initial_lr=3e-05, min_lr=1e-6):
	    """Train ``model`` with linear LR warmup followed by cosine decay.

	    Args:
	        model: Module to train.
	        train_loader: Iterable of ``(input_batch, target_batch)`` pairs;
	            must support ``len()``.
	        val_loader: Loader used for the periodic validation loss.
	        optimizer: Optimizer whose first param group's ``lr`` is taken as
	            the peak learning rate.
	        device: Device the batches are moved to.
	        n_epochs: Number of passes over ``train_loader``.
	        eval_freq: Evaluate every ``eval_freq`` optimisation steps.
	        eval_iter: Number of batches used per evaluation.
	        start_context: Prompt for the sample printed after each epoch.
	        tokenizer: Tokenizer used to encode/decode that sample.
	        warmup_steps: Number of steps over which the LR ramps up linearly
	            from ``initial_lr`` to the peak LR (0 disables warmup).
	        initial_lr: Learning rate at step 0 of the warmup.
	        min_lr: Learning rate reached at the end of the cosine decay.

	    Returns:
	        ``(train_losses, val_losses, track_tokens_seen, track_lrs)``; the
	        first three get one entry per evaluation, the last one per step.
	    """
	    train_losses, val_losses, track_tokens_seen, track_lrs = [], [], [], []
	    tokens_seen, global_step = 0, -1

	    # The optimizer's configured LR is the peak of the schedule.
	    peak_lr = optimizer.param_groups[0]["lr"]
	    total_training_steps = len(train_loader) * n_epochs

	    for epoch in range(n_epochs):
	        model.train()
	        for input_batch, target_batch in train_loader:
	            optimizer.zero_grad()
	            global_step += 1

	            lr = _scheduled_lr(global_step, warmup_steps, total_training_steps,
	                               initial_lr, peak_lr, min_lr)
	            for param_group in optimizer.param_groups:
	                param_group["lr"] = lr
	            track_lrs.append(lr)

	            loss = calc_loss_batch(input_batch, target_batch, model, device)
	            loss.backward()

	            # Clip from the first post-warmup step onward; ">" would leave
	            # the step at global_step == warmup_steps unclipped.
	            if global_step >= warmup_steps:
	                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

	            optimizer.step()
	            tokens_seen += input_batch.numel()

	            # Periodically evaluate on the training and validation sets.
	            if global_step % eval_freq == 0:
	                train_loss, val_loss = evaluate_model(
	                    model, train_loader, val_loader,
	                    device, eval_iter
	                )
	                train_losses.append(train_loss)
	                val_losses.append(val_loss)
	                track_tokens_seen.append(tokens_seen)
	                print(f"Ep {epoch+1} (Iter {global_step:06d}): "
	                      f"Train loss {train_loss:.3f}, "
	                      f"Val loss {val_loss:.3f}"
	                      )

	        # Print a generated sample after every epoch to monitor progress.
	        generate_and_print_sample(
	            model, tokenizer, device, start_context
	        )

	    return train_losses, val_losses, track_tokens_seen, track_lrs


	def _scheduled_lr(step, warmup_steps, total_steps, initial_lr, peak_lr, min_lr):
	    """Return the LR for ``step``: linear warmup, then cosine decay to ``min_lr``."""
	    if step < warmup_steps:
	        # step >= 0 here, so warmup_steps is strictly positive.
	        return initial_lr + step * ((peak_lr - initial_lr) / warmup_steps)
	    # Guard against a zero-length decay phase (warmup covers all steps).
	    decay_steps = max(1, total_steps - warmup_steps)
	    progress = (step - warmup_steps) / decay_steps
	    return min_lr + (peak_lr - min_lr) * 0.5 * (1 + math.cos(math.pi * progress))

	def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
	    """Build a ``DataLoader`` of sliding-window samples over ``txt``.

	    Args:
	        txt: Raw text to tokenize with the GPT-2 encoding.
	        batch_size: Number of samples per batch.
	        max_length: Number of tokens per sample.
	        stride: Step between the starts of consecutive samples.
	        shuffle: Whether to shuffle the samples.
	        drop_last: Drop a final batch that is smaller than ``batch_size``.
	        num_workers: Number of worker processes used by the loader.

	    Returns:
	        A ``DataLoader`` yielding ``(input_ids, target_ids)`` batches.
	    """
	    tokenizer = tiktoken.get_encoding("gpt2")
	    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
	    return DataLoader(
	        dataset,
	        batch_size=batch_size,
	        shuffle=shuffle,
	        drop_last=drop_last,
	        # Honour the caller's value rather than a hard-coded 0.
	        num_workers=num_workers,
	    )



	class GPTDatasetV1(Dataset):
	    """Next-token-prediction dataset built with a sliding window.

	    ``txt`` is tokenized once; every window of ``max_length`` tokens
	    (advancing by ``stride``) becomes an input, and the same window shifted
	    right by one token becomes its target.
	    """

	    def __init__(self, txt, tokenizer, max_length, stride):
	        self.tokenizer = tokenizer
	        self.input_ids = []
	        self.target_ids = []

	        token_ids = tokenizer.encode(txt)

	        # The last start must leave room for the one-token-shifted target.
	        last_start = len(token_ids) - max_length
	        for start in range(0, last_start, stride):
	            stop = start + max_length
	            self.input_ids.append(torch.tensor(token_ids[start:stop]))
	            self.target_ids.append(torch.tensor(token_ids[start + 1:stop + 1]))

	    def __len__(self):
	        return len(self.input_ids)

	    def __getitem__(self, idx):
	        return self.input_ids[idx], self.target_ids[idx]


	def evaluate_model(model, train_loader, val_loader, device, eval_iter):
	    """Return ``(train_loss, val_loss)`` averaged over ``eval_iter`` batches.

	    The model is put in eval mode and gradients are disabled while the
	    losses are computed; it is switched back to train mode before returning.
	    """
	    model.eval()
	    with torch.no_grad():
	        train_loss, val_loss = [
	            calc_loss_loader(loader, model, device, num_batches=eval_iter)
	            for loader in (train_loader, val_loader)
	        ]
	    model.train()
	    return train_loss, val_loss

	def generate_and_print_sample(model, tokenizer, device, start_context):
	    """Greedily generate 50 tokens from ``start_context`` and print them.

	    The model is put in eval mode for the generation and back in train mode
	    afterwards.
	    """
	    model.eval()
	    prompt_ids = text_to_token_ids(start_context, tokenizer).to(device)
	    # The position-embedding table has one row per supported position.
	    max_context = model.pos_emb.weight.shape[0]
	    with torch.no_grad():
	        token_ids = generate_text_simple(
	            model=model,
	            idx=prompt_ids,
	            max_new_tokens=50,
	            context_size=max_context,
	        )
	    sample = token_ids_to_text(token_ids, tokenizer)
	    # Collapse newlines so the sample prints on a single line.
	    print(sample.replace("\n", " "))
	    model.train()

	def calc_loss_batch(input_batch, target_batch, model, device):
	    """Return the cross-entropy loss of ``model`` on one batch.

	    Both tensors are moved to ``device``; the model's logits have their
	    first two dimensions merged and the targets are flattened so every
	    position counts as one classification example.
	    """
	    inputs = input_batch.to(device)
	    targets = target_batch.to(device)
	    logits = model(inputs)
	    return torch.nn.functional.cross_entropy(
	        torch.flatten(logits, 0, 1),
	        torch.flatten(targets),
	    )

	def calc_loss_loader(data_loader, model, device, num_batches=None):
	    """Return the mean batch loss over the first ``num_batches`` batches.

	    Returns ``nan`` for an empty loader. ``num_batches=None`` means all
	    batches; a larger request is capped at the loader's length.
	    """
	    available = len(data_loader)
	    if available == 0:
	        return float("nan")

	    if num_batches is None:
	        num_batches = available
	    else:
	        num_batches = min(num_batches, available)

	    total_loss = 0.
	    for i, (input_batch, target_batch) in enumerate(data_loader):
	        if i >= num_batches:
	            break
	        batch_loss = calc_loss_batch(input_batch, target_batch, model, device)
	        total_loss += batch_loss.item()
	    return total_loss / num_batches

	def generate_text_simple(model, idx, max_new_tokens, context_size):
	    """Greedily extend ``idx`` by ``max_new_tokens`` tokens.

	    Args:
	        model: Callable mapping ids (batch, n_tokens) to logits
	            (batch, n_tokens, vocab_size).
	        idx: Tensor (batch, n_tokens) with the current context.
	        max_new_tokens: Number of tokens to append.
	        context_size: Only the last ``context_size`` tokens are fed to the
	            model at each step.

	    Returns:
	        Tensor (batch, n_tokens + max_new_tokens).
	    """
	    for _ in range(max_new_tokens):
	        # Crop the context to what the model supports.
	        window = idx[:, -context_size:]

	        with torch.no_grad():
	            logits = model(window)

	        # Keep only the last time step: (batch, vocab_size).
	        last_step = logits[:, -1, :]
	        probas = torch.softmax(last_step, dim=-1)

	        # Most probable token per row, kept as a column: (batch, 1).
	        idx_next = torch.argmax(probas, dim=-1, keepdim=True)

	        idx = torch.cat((idx, idx_next), dim=1)

	    return idx

	def main(input_text, max_new_tokens):
	    """Generate a continuation of ``input_text`` and append "Meow." to sentences.

	    Loads the weights from ``model_and_optimizer.pth``, samples up to
	    ``max_new_tokens`` tokens (top-k 25, temperature 1.4) and returns the
	    decoded text with every "." replaced by ". Meow.".

	    Args:
	        input_text: Prompt to continue; surrounding whitespace is stripped.
	        max_new_tokens: Maximum number of tokens to generate. May be a
	            float or ``None`` (as delivered by a numeric UI field); it is
	            coerced to a non-negative int.

	    Returns:
	        The generated text, or an empty string when the prompt encodes to
	        no tokens.
	    """
	    tokenizer = tiktoken.get_encoding("gpt2")

	    if torch.cuda.is_available():
	        device = torch.device("cuda")
	    elif torch.backends.mps.is_available():
	        device = torch.device("mps")
	    else:
	        device = torch.device("cpu")

	    prompt_tokens = tokenizer.encode((input_text or "").strip())
	    if not prompt_tokens:
	        # The model cannot be run on an empty sequence.
	        return ""

	    # range() in the generation loop needs an int.
	    max_new_tokens = max(0, int(max_new_tokens or 0))

	    # map_location lets a checkpoint saved on another device load here.
	    checkpoint = torch.load("model_and_optimizer.pth",
	                            map_location=device, weights_only=True)

	    model = GPTModel(GPT_CONFIG_124M)
	    model.load_state_dict(checkpoint["model_state_dict"])
	    # The model must live on the same device as the encoded prompt. No
	    # optimizer is rebuilt: inference does not need its state.
	    model.to(device)
	    model.eval()

	    context_size = model.pos_emb.weight.shape[0]
	    encoded = torch.tensor(prompt_tokens, dtype=torch.long).unsqueeze(0).to(device)

	    with torch.no_grad():
	        token_ids = generate(
	            model=model, idx=encoded,
	            max_new_tokens=max_new_tokens, context_size=context_size,
	            top_k=25, temperature=1.4, text_to_token_ids=text_to_token_ids, tokenizer=tokenizer
	        )
	    thingy = tokenizer.decode(token_ids.squeeze(0).tolist())
	    print(thingy)
	    # Plain substring replacement; no regex needed for a literal ".".
	    return thingy.replace(".", ". Meow.")

	# Gradio UI: a prompt box and a token-count field feeding main().
	thing = gr.Interface(
	    fn=main,
	    theme='ParityError/Anime',
	    inputs=[
	        gr.Textbox(label='Starting context'),
	        # precision=0 makes the token count arrive as an int rather than
	        # the field's default float.
	        gr.Number(label="Maximum output tokens", precision=0),
	    ],
	    outputs=[gr.Textbox(label="Response:")],
	    title="CatGPT",
	    article="Meow",
	)

	if __name__ == "__main__":
	    thing.launch()