Spaces:

TheVixhal
/

crekai

Sleeping

App Files Files Community

crekai / app.py

TheVixhal

Update app.py

c2ae57c verified 7 months ago

raw

history blame contribute delete

12.4 kB

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import json
	import os
	import math
	import gradio as gr
	from transformers import GPT2Tokenizer
	from huggingface_hub import hf_hub_download
	from typing import Optional, List, Tuple

	# =========================================================================================
	# >> 1. MODEL AND ATTENTION CLASS DEFINITIONS
	# >> All necessary classes must be defined here to reconstruct the model from the Hub.
	# =========================================================================================

	class BTreeNode:
	    """Placeholder for a B+ tree node; it holds no state in this inference build."""

	    def __init__(self, order: int, is_leaf: bool = False, device: str = 'cuda'):
	        # The arguments exist only so call sites keep working; none is stored.
	        return None

	class BTreeAttentionIndex:
	    """Placeholder B+ tree index: nothing is stored and every query is empty."""

	    def __init__(self, order: int = 5, device: str = 'cuda'):
	        # Signature-compatible no-op; neither argument is retained.
	        return None

	    def insert(self, key: int, value: torch.Tensor):
	        """Accept a key/value pair and discard it."""
	        return None

	    def range_query(self, start: int, end: int) -> List[torch.Tensor]:
	        """Return a fresh empty list, whatever range is requested."""
	        return list()

	class StandardSelfAttention(nn.Module):
	    """Multi-head scaled dot-product self-attention.

	    Args:
	        d_model: Width of the input and output features.
	        n_heads: Number of attention heads; must divide ``d_model`` evenly.
	        use_position_bias: When true, a learned bias of shape
	            ``(1, n_heads, block_size, block_size)`` is added to the scores.
	        block_size: Side length of the position-bias table. Required when
	            ``use_position_bias`` is true.
	        dropout: Dropout probability applied to the attention weights.

	    Raises:
	        ValueError: If ``n_heads`` is not a positive divisor of ``d_model``,
	            or if ``use_position_bias`` is true and ``block_size`` is ``None``.
	    """

	    def __init__(self, d_model, n_heads, use_position_bias=False, block_size=None, dropout=0.1):
	        super().__init__()
	        # Fail at construction time rather than with an opaque ``view`` shape
	        # error on the first forward pass.
	        if n_heads <= 0 or d_model % n_heads != 0:
	            raise ValueError(
	                f"d_model ({d_model}) must be divisible by a positive n_heads ({n_heads})"
	            )
	        self.d_model = d_model
	        self.n_heads = n_heads
	        self.d_head = d_model // n_heads
	        self.q_proj = nn.Linear(d_model, d_model)
	        self.k_proj = nn.Linear(d_model, d_model)
	        self.v_proj = nn.Linear(d_model, d_model)
	        self.out_proj = nn.Linear(d_model, d_model)
	        self.dropout = nn.Dropout(dropout)
	        self.use_position_bias = use_position_bias
	        if use_position_bias:
	            if block_size is None:
	                raise ValueError("block_size must be provided if use_position_bias is True")
	            self.block_size = block_size
	            self.position_bias = nn.Parameter(torch.randn(1, n_heads, block_size, block_size) * 0.1)

	    def _split_heads(self, projected, batch_size, seq_len):
	        """Reshape ``(batch, seq, d_model)`` into ``(batch, heads, seq, d_head)``."""
	        return projected.view(batch_size, seq_len, self.n_heads, self.d_head).transpose(1, 2)

	    def forward(self, x, mask=None, bias=None, **kwargs):
	        """Attend over ``x`` and return a tensor of the same shape.

	        Args:
	            x: Input of shape ``(batch, seq, d_model)``.
	            mask: Optional tensor broadcastable to the ``(batch, heads, seq,
	                seq)`` score matrix; positions equal to 0 are excluded.
	            bias: Optional additive term broadcastable to the score matrix.
	            **kwargs: Accepted and ignored.

	        Returns:
	            Tensor of shape ``(batch, seq, d_model)``.
	        """
	        batch_size, seq_len, _ = x.shape
	        q = self._split_heads(self.q_proj(x), batch_size, seq_len)
	        k = self._split_heads(self.k_proj(x), batch_size, seq_len)
	        v = self._split_heads(self.v_proj(x), batch_size, seq_len)

	        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_head)
	        # The learned bias only covers ``block_size`` positions, so it is
	        # skipped (not truncated) for longer sequences.
	        if self.use_position_bias and seq_len <= self.block_size:
	            scores = scores + self.position_bias[:, :, :seq_len, :seq_len]
	        if bias is not None:
	            scores = scores + bias
	        if mask is not None:
	            scores = scores.masked_fill(mask == 0, -float('inf'))

	        weights = self.dropout(F.softmax(scores, dim=-1))
	        context = torch.matmul(weights, v).transpose(1, 2).contiguous()
	        context = context.view(batch_size, seq_len, self.d_model)
	        return self.out_proj(context)

	class OptimizedParallelHierarchicalAttention(nn.Module):
	    """Two-level attention: inside fixed-size blocks, then across block summaries.

	    The sequence is padded to a multiple of ``block_size`` and split into
	    blocks. Each block is attended locally, the blocks are pooled into one
	    summary vector each, the summaries attend to each other, and the result is
	    broadcast back onto every position of its block and added to the local
	    output.

	    Args:
	        d_model: Feature width.
	        n_heads: Number of heads for both attention levels.
	        block_size: Number of positions per local block.
	        btree_order: Passed to the ``BTreeAttentionIndex`` constructor.
	        dropout: Dropout probability for both attention outputs.
	        device: Device the sub-modules are moved to at construction.
	    """

	    def __init__(self, d_model, n_heads, block_size, btree_order=5, dropout=0.1, device='cuda'):
	        super().__init__()
	        self.d_model, self.n_heads, self.block_size, self.device = d_model, n_heads, block_size, device
	        self.local_attention = StandardSelfAttention(d_model, n_heads, use_position_bias=True, block_size=block_size, dropout=dropout).to(device)
	        self.summary_attention = StandardSelfAttention(d_model, n_heads, dropout=dropout).to(device)
	        self.summarizer = nn.Linear(d_model, d_model).to(device)
	        self.dropout = nn.Dropout(dropout).to(device)
	        self.attention_index = BTreeAttentionIndex(btree_order, device=device)

	    def forward(self, x):
	        """Apply block-local then summary-level attention.

	        Args:
	            x: Tensor of shape ``(batch, seq, d_model)``.

	        Returns:
	            Tensor of shape ``(batch, seq, d_model)``; padding added internally
	            is stripped before returning.
	        """
	        batch_size, seq_len, d_model = x.shape

	        # Right-pad the sequence axis with zeros up to a whole number of blocks.
	        pad_len = (-seq_len) % self.block_size
	        if pad_len > 0:
	            x = F.pad(x, (0, 0, 0, pad_len))
	        padded_len = x.shape[1]
	        num_blocks = padded_len // self.block_size

	        # ``reshape`` (not ``view``) so a non-contiguous input is accepted too.
	        blocks = x.reshape(batch_size * num_blocks, self.block_size, d_model)
	        local_context = self.local_attention(blocks).reshape(batch_size, padded_len, d_model)
	        local_context = self.dropout(local_context)

	        block_view = local_context.reshape(batch_size, num_blocks, self.block_size, d_model)
	        # Build the pooling weights on the activations' own device and dtype.
	        # Relying on the constructor-time ``self.device`` string breaks once the
	        # module is moved with ``.to(...)`` or cast to another precision.
	        # NOTE(review): these weights are re-drawn from a normal distribution on
	        # every call, eval mode included, so outputs are only reproducible under
	        # a fixed RNG seed - confirm this matches how the checkpoint was trained.
	        weights = torch.softmax(
	            torch.randn(self.block_size, device=local_context.device, dtype=local_context.dtype),
	            dim=0,
	        )
	        pooled = (block_view * weights.view(1, 1, -1, 1)).sum(dim=2)
	        summary_tokens = self.summarizer(pooled)

	        summary_context = self.dropout(self.summary_attention(summary_tokens))

	        # Copy each block's summary onto all of that block's positions.
	        summary_context_expanded = summary_context.unsqueeze(2).expand(-1, -1, self.block_size, -1)
	        summary_context_distributed = summary_context_expanded.reshape(batch_size, padded_len, d_model)

	        final_context = local_context + summary_context_distributed
	        return final_context[:, :seq_len, :]

	class EnhancedTransformerBlock(nn.Module):
	    """One post-norm transformer layer: hierarchical attention, then a GELU MLP.

	    Both sub-layers use a residual connection followed by ``LayerNorm``; the
	    MLP expands to twice ``d_model`` before projecting back.
	    """

	    def __init__(self, d_model, n_heads, block_size, btree_order, dropout, device='cuda'):
	        super().__init__()
	        hidden_width = d_model * 2
	        self.attention = OptimizedParallelHierarchicalAttention(
	            d_model, n_heads, block_size, btree_order, dropout, device
	        )
	        self.norm1 = nn.LayerNorm(d_model).to(device)
	        self.norm2 = nn.LayerNorm(d_model).to(device)
	        mlp_stages = [
	            nn.Linear(d_model, hidden_width),
	            nn.GELU(),
	            nn.Dropout(dropout),
	            nn.Linear(hidden_width, d_model),
	        ]
	        self.ffn = nn.Sequential(*mlp_stages).to(device)

	    def forward(self, x):
	        """Return the layer output for ``x``; the shape is unchanged."""
	        attended = self.attention(x)
	        normed = self.norm1(attended + x)
	        expanded = self.ffn(normed)
	        return self.norm2(expanded + normed)

	class OptimizedSimpleTransformer(nn.Module):
	    """Language model: token embedding + learned positions + stacked blocks + LM head.

	    Args:
	        vocab_size: Number of token ids; also the width of the output logits.
	        num_layers: Number of ``EnhancedTransformerBlock`` layers.
	        d_model: Feature width.
	        n_heads: Attention heads per layer.
	        block_size: Local attention block size, forwarded to each layer.
	        btree_order: Forwarded to each layer.
	        dropout: Dropout probability, forwarded to each layer.
	        device: Device the parameters are created on.
	    """

	    def __init__(self, vocab_size, num_layers, d_model, n_heads, block_size, btree_order, dropout, device='cuda'):
	        super().__init__()
	        self.embedding = nn.Embedding(vocab_size, d_model).to(device)
	        # Move the tensor first and wrap it afterwards. Calling ``.to(device)``
	        # on an ``nn.Parameter`` returns a plain tensor whenever a copy is made
	        # (any non-CPU device), which would leave the table unregistered:
	        # absent from ``state_dict`` and ignored by ``model.to``.
	        self.pos_encoding = nn.Parameter((torch.randn(1, 512, d_model) * 0.1).to(device))
	        self.layers = nn.ModuleList([EnhancedTransformerBlock(d_model, n_heads, block_size, btree_order, dropout, device) for _ in range(num_layers)])
	        self.output_head = nn.Linear(d_model, vocab_size).to(device)
	        self.device, self.vocab_size = device, vocab_size

	    def forward(self, idx):
	        """Map token ids ``(batch, seq)`` to logits ``(batch, seq, vocab_size)``.

	        ``seq`` must not exceed the length of the positional table.
	        """
	        x = self.embedding(idx) + self.pos_encoding[:, :idx.shape[1], :]
	        for layer in self.layers:
	            x = layer(x)
	        return self.output_head(x)

	    @torch.no_grad()
	    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None, top_p=None, repetition_penalty=1.0):
	        """Sample ``max_new_tokens`` continuation tokens for each row of ``idx``.

	        Switches the module to eval mode as a side effect.

	        Args:
	            idx: Integer tensor ``(batch, seq)`` holding the prompt token ids.
	            max_new_tokens: Number of tokens to append.
	            temperature: Positive divisor applied to the logits.
	            top_k: Keep only the ``top_k`` highest logits; ``None`` or a
	                non-positive value disables the filter.
	            top_p: Nucleus threshold; ``None`` or a value ``>= 1.0`` disables it.
	            repetition_penalty: Penalty applied to every token already present
	                in the row; ``1.0`` disables it.

	        Returns:
	            Tensor ``(batch, seq + max_new_tokens)``: prompt plus sampled tokens.

	        Raises:
	            ValueError: If ``temperature`` is not positive.
	        """
	        if temperature <= 0:
	            raise ValueError(f"temperature must be positive, got {temperature}")
	        self.eval()
	        max_context = self.pos_encoding.size(1)
	        for _ in range(max_new_tokens):
	            # The model only sees the most recent ``max_context`` tokens.
	            idx_cond = idx[:, -max_context:]
	            logits = self(idx_cond)[:, -1, :] / temperature

	            if repetition_penalty != 1.0:
	                # Push seen tokens towards "less likely" whatever their sign:
	                # positive logits shrink, negative logits grow more negative.
	                # Dividing a negative logit would raise its probability. This
	                # is done per row, so every batch element is penalised.
	                seen = logits.gather(1, idx)
	                seen = torch.where(seen < 0, seen * repetition_penalty, seen / repetition_penalty)
	                logits = logits.scatter(1, idx, seen)

	            if top_k is not None and top_k > 0:
	                kth_best = torch.topk(logits, min(top_k, logits.size(-1))).values[:, [-1]]
	                logits = logits.masked_fill(logits < kth_best, -float('inf'))

	            if top_p is not None and top_p < 1.0:
	                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
	                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
	                sorted_indices_to_remove = cumulative_probs > top_p
	                # Shift right by one so the token that crosses the threshold is
	                # kept; the best token therefore always survives.
	                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
	                sorted_indices_to_remove[..., 0] = False
	                indices_to_remove = torch.zeros_like(sorted_indices_to_remove).scatter(
	                    1, sorted_indices, sorted_indices_to_remove
	                )
	                logits = logits.masked_fill(indices_to_remove, -float('inf'))

	            probs = F.softmax(logits, dim=-1)
	            idx_next = torch.multinomial(probs, num_samples=1)
	            idx = torch.cat((idx, idx_next), dim=1)
	        return idx


	# =========================================================================================
	# >> 2. MODEL LOADING
	# >> This block loads the model, tokenizer, and config from the Hugging Face Hub.
	# =========================================================================================

	REPO_ID = "TheVixhal/crek"
	device = 'cuda' if torch.cuda.is_available() else 'cpu'
	# Both globals stay ``None`` unless the whole load sequence below succeeds,
	# so the inference function can tell "ready" from "failed" with a None check.
	model = None
	tokenizer = None

	print(f"✅ Using device: {device}")
	print(f"⏳ Loading model from Hugging Face Hub: '{REPO_ID}'...")

	try:
	    # Download and load the configuration file
	    config_path = hf_hub_download(repo_id=REPO_ID, filename="config.json")
	    with open(config_path, 'r', encoding='utf-8') as f:
	        config = json.load(f)

	    # Load the tokenizer
	    tokenizer = GPT2Tokenizer.from_pretrained(REPO_ID)
	    tokenizer.pad_token = tokenizer.eos_token

	    # Instantiate the model with the loaded configuration
	    model = OptimizedSimpleTransformer(
	        vocab_size=config['vocab_size'], num_layers=config['num_layers'],
	        d_model=config['d_model'], n_heads=config['n_heads'],
	        block_size=config['block_size'], btree_order=config['btree_order'],
	        dropout=config['dropout'], device=device
	    )

	    # Download and load the trained model weights
	    model_weights_path = hf_hub_download(repo_id=REPO_ID, filename="pytorch_model.bin")
	    # NOTE(review): ``torch.load`` unpickles the downloaded file, which can run
	    # arbitrary code if the repository is compromised. Consider passing
	    # ``weights_only=True`` once the minimum supported torch version allows it.
	    model.load_state_dict(torch.load(model_weights_path, map_location=device))
	    model.to(device)
	    model.eval()

	    print("✅ Model loaded successfully!")

	except Exception as e:
	    # A failure part-way through would otherwise leave the tokenizer and/or a
	    # randomly initialised model bound to the globals, and the app would then
	    # serve garbage instead of reporting that loading failed.
	    model = None
	    tokenizer = None
	    print(f"❌ An error occurred during model loading: {e}")
	    # The app will still launch but will show an error message.

	# =========================================================================================
	# >> 3. GRADIO INFERENCE FUNCTION
	# =========================================================================================

	def generate_response(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0.9, repetition_penalty=1.5):
	    """Generate a continuation of ``prompt``; this is the Gradio callback.

	    Args:
	        prompt: Text to continue.
	        max_new_tokens: Number of tokens to sample (coerced to ``int``).
	        temperature: Sampling temperature passed to ``model.generate``.
	        top_k: Top-k cutoff (coerced to ``int``).
	        top_p: Nucleus-sampling threshold.
	        repetition_penalty: Penalty for tokens already in the sequence.

	    Returns:
	        The decoded prompt plus continuation, or a short message when the
	        model is unavailable or the prompt is empty.
	    """
	    if model is None or tokenizer is None:
	        return "Model not loaded. Please check the console logs for errors."

	    # An empty prompt tokenizes to a zero-length sequence, which the model
	    # cannot run on; answer with a hint instead of raising inside the UI.
	    if not prompt or not prompt.strip():
	        return "Please enter a prompt."

	    # Tokenize the prompt
	    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)

	    # Generate text
	    output_ids = model.generate(
	        idx=input_ids,
	        max_new_tokens=int(max_new_tokens),
	        temperature=temperature,
	        top_k=int(top_k),
	        top_p=top_p,
	        repetition_penalty=repetition_penalty
	    )

	    # Decode and return the result
	    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


	# =========================================================================================
	# >> 4. GRADIO UI SETUP
	# =========================================================================================

	if __name__ == "__main__":
	    # Input widgets, listed in the positional order generate_response expects.
	    prompt_box = gr.Textbox(lines=3, label="Your Prompt", placeholder="Enter your text here...")
	    max_tokens_slider = gr.Slider(minimum=10, maximum=500, value=10, step=10, label="Max New Tokens")
	    temperature_slider = gr.Slider(minimum=0.1, maximum=2.0, value=0.5, step=0.1, label="Temperature")
	    top_k_slider = gr.Slider(minimum=1, maximum=100, value=50, step=1, label="Top-K")
	    top_p_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-P (Nucleus Sampling)")
	    penalty_slider = gr.Slider(minimum=1.0, maximum=3.0, value=1.5, step=0.1, label="Repetition Penalty")

	    result_box = gr.Textbox(lines=10, label="Generated Text")

	    # Every example reuses the sliders' default settings; only the prompt varies.
	    example_settings = [10, 0.5, 50, 0.9, 1.5]
	    example_prompts = ("hey what's up", "how are you?", "i am feeling low")
	    example_rows = [[text, *example_settings] for text in example_prompts]

	    iface = gr.Interface(
	        fn=generate_response,
	        inputs=[
	            prompt_box,
	            max_tokens_slider,
	            temperature_slider,
	            top_k_slider,
	            top_p_slider,
	            penalty_slider,
	        ],
	        outputs=result_box,
	        title="🤖 Crek: A B-Tree Transformer Chatbot",
	        description="An interface for the custom-trained base model 'crek' by TheVixhal. This model predicts text based on patterns from its training data. Adjust the sliders to control the output's creativity and coherence.",
	        allow_flagging="never",
	        examples=example_rows,
	    )

	    # Start the web server; share=True additionally requests a public link.
	    print("\n🚀 Launching Gradio Interface...")
	    print("Open the public URL in your browser to interact with the model.")
	    iface.launch(share=True)