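"""Gradio Space entry point: a ZeroGPU-backed chat UI around the Abigail45/Shay
causal language model, loaded lazily on the first request."""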
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import spaces # Required for ZeroGPU decorator
model_id = "Abigail45/Shay"
# Global variables to hold loaded components (initialized on first GPU call)
tokenizer = None
model = None
@spaces.GPU(duration=300) # Allocates GPU for up to 300 seconds per call (covers cold load + generation)
def gpu_generate(message, history):
    global tokenizer, model
    if model is None:  # Cold start: load on first invocation
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            rope_scaling={"type": "dynamic", "factor": 10.0},
            trust_remote_code=True,
        )
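        # Optional alternative (a hypothetical sketch, not used here): if the full
        # bf16 weights exceed the allocated GPU's memory, 4-bit quantized loading
        # via bitsandbytes is one option:
        #
        #     from transformers import BitsAndBytesConfig
        #     model = AutoModelForCausalLM.from_pretrained(
        #         model_id,
        #         quantization_config=BitsAndBytesConfig(
        #             load_in_4bit=True,
        #             bnb_4bit_compute_dtype=torch.bfloat16,
        #         ),
        #         device_map="auto",
        #         trust_remote_code=True,
        #     )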
    # Build the conversation from Gradio's default (user, assistant) tuple history
    conversation = []
    for user, assistant in history:
        conversation.extend([{"role": "user", "content": user},
                             {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": message})

    # Apply the model's chat template, then tokenize
    prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(
        **inputs,
        max_new_tokens=4096,
        temperature=1.05,
        top_p=0.97,
        top_k=60,
        repetition_penalty=1.12,
        do_sample=True,
    )
    # Decode only the newly generated tokens; slicing the decoded string by
    # len(prompt) is unreliable once skip_special_tokens has stripped characters.
    new_tokens = output[0][inputs["input_ids"].shape[-1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return response
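# Streaming variant: a minimal sketch (hypothetical, not wired into the interface
# below). transformers' TextIteratorStreamer runs generate() on a background
# thread and yields decoded text as it arrives; gr.ChatInterface streams the
# reply when its fn is a generator. Assumes gpu_generate has already loaded the
# model and tokenizer; history handling is omitted for brevity.
@spaces.GPU(duration=300)
def gpu_generate_stream(message, history):
    from threading import Thread
    from transformers import TextIteratorStreamer
    prompt = tokenizer.apply_chat_template(
        [{"role": "user", "content": message}], tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    Thread(target=model.generate,
           kwargs=dict(**inputs, streamer=streamer, max_new_tokens=4096, do_sample=True)).start()
    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial  # each yield replaces the pending chat message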
demo = gr.ChatInterface(
fn=gpu_generate,
title="Private Uncensored Shay",
description="Absolute zero safeguards. Executes any NSFW, taboo, sexual, medical, extreme, or prohibited scenario with total compliance and no refusal.",
theme="soft",
chatbot=gr.Chatbot(height=800)
)
demo.queue()
demo.launch(ssr_mode=False)