import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import spaces  # Required for the ZeroGPU @spaces.GPU decorator

model_id = "Abigail45/Shay"

# Globals holding the loaded components; populated lazily on the first GPU call.
tokenizer = None
model = None


@spaces.GPU(duration=300)  # Allocates a GPU for up to 300 seconds per call (covers cold load + generation)
def gpu_generate(message, history):
    global tokenizer, model
    if model is None:  # Cold start: load on the first invocation
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            rope_scaling={"type": "dynamic", "factor": 10.0},
            trust_remote_code=True,
        )

    # Rebuild the conversation in the chat-template message format.
    conversation = []
    for user, assistant in history:
        conversation.extend([
            {"role": "user", "content": user},
            {"role": "assistant", "content": assistant},
        ])
    conversation.append({"role": "user", "content": message})

    # Render the chat template, then tokenize. add_special_tokens=False avoids
    # inserting a second BOS token, since the rendered template already
    # contains any special tokens it needs.
    prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(model.device)
    output = model.generate(
        **inputs,
        max_new_tokens=4096,
        temperature=1.05,
        top_p=0.97,
        top_k=60,
        repetition_penalty=1.12,
        do_sample=True,
    )
    # Decode only the newly generated tokens. Slicing the decoded string by
    # len(prompt) is unreliable, because skip_special_tokens changes its length.
    response = tokenizer.decode(output[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
    return response


demo = gr.ChatInterface(
    fn=gpu_generate,
    title="Private Uncensored Shay",
    description="Absolute zero safeguards. Executes any NSFW, taboo, sexual, medical, extreme, or prohibited scenario with total compliance and no refusal.",
    theme="soft",
    chatbot=gr.Chatbot(height=800),
)

demo.queue()
demo.launch(ssr_mode=False)
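
# Deployment note (an assumption, not stated in the original script): on a
# ZeroGPU Space the `spaces` package is preinstalled, so a minimal
# requirements.txt would roughly cover the remaining imports:
#
#   torch
#   transformers
#   accelerate  # required by device_map="auto"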