"""Gradio chat front-end for a 4-bit quantized text-generation model.

Loads ``model_id`` with bitsandbytes NF4 quantization, wraps it in a
``transformers`` text-generation pipeline and serves it through
``gr.ChatInterface``.
"""

from collections.abc import Iterator

import gradio as gr
import torch
from transformers import AutoTokenizer, BitsAndBytesConfig, pipeline

model_id = "Abigail45/Shay"

# 4-bit NF4 weights with double quantization; matmuls are computed in fp16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# SECURITY: trust_remote_code=True executes Python code shipped inside the
# model repository. Keep it only while that repository is trusted.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

generator = pipeline(
    "text-generation",
    model=model_id,
    tokenizer=tokenizer,
    device_map="auto",
    trust_remote_code=True,
    # Model-loading options must go through model_kwargs so they reach
    # from_pretrained(); as a bare pipeline kwarg the quantization config
    # would be treated as a pipeline/generation parameter instead.
    model_kwargs={"quantization_config": quantization_config},
)

# Sampling settings forwarded to the pipeline on every request.
GENERATION_KWARGS = {
    "max_new_tokens": 2048,
    "temperature": 1.05,
    "top_p": 0.97,
    "top_k": 60,
    "repetition_penalty": 1.12,
    "do_sample": True,
    # Ask the pipeline for the completion only, rather than slicing the
    # prompt off by character count (the decoded text is not guaranteed to
    # reproduce the prompt string exactly).
    "return_full_text": False,
}


def _is_text(value) -> bool:
    """Return True when *value* is a non-empty string."""
    return isinstance(value, str) and bool(value)


def _history_to_messages(history) -> list[dict[str, str]]:
    """Convert Gradio chat history into chat-template message dicts.

    Two history layouts are accepted:

    * a sequence of ``(user, assistant)`` pairs, and
    * a sequence of ``{"role": ..., "content": ...}`` dicts.

    Entries whose text is missing, empty or not a string are skipped so the
    chat template never receives ``None`` content.
    """
    messages: list[dict[str, str]] = []
    for turn in history or []:
        if isinstance(turn, dict):
            role = turn.get("role")
            content = turn.get("content")
            if isinstance(role, str) and _is_text(content):
                # Drop any extra keys; the template only needs these two.
                messages.append({"role": role, "content": content})
            continue

        user_text, assistant_text = turn
        if _is_text(user_text):
            messages.append({"role": "user", "content": user_text})
        if _is_text(assistant_text):
            messages.append({"role": "assistant", "content": assistant_text})
    return messages


def chat(message: str, history) -> Iterator[str]:
    """Generate the assistant reply for *message* given the prior *history*.

    Args:
        message: The new user message.
        history: Previous turns as supplied by ``gr.ChatInterface``
            (pair-style or role/content dict-style).

    Yields:
        The complete generated reply, once.
    """
    conversation = _history_to_messages(history)
    conversation.append({"role": "user", "content": message})

    prompt = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )
    # NOTE(review): if the chat template already emits a BOS token, the
    # pipeline's tokenizer may prepend a second one - confirm for this model.
    outputs = generator(prompt, **GENERATION_KWARGS)
    yield outputs[0]["generated_text"]


demo = gr.ChatInterface(
    fn=chat,
    title="Uncensored Shay Chat (Quantized)",
    description="Full no-limits compliance: NSFW/taboo/sex/medical/extreme content executed without filters.",
    theme="soft",
)

if __name__ == "__main__":
    demo.queue()
    demo.launch()