import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

model_id = "Abigail45/Nyx-Reasoner-8xFusion"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2",  # Optional: remove if flash-attn is not installed
)

# Build a text-generation pipeline around the already-loaded model.
# Do not pass device_map here: the model was dispatched by from_pretrained(),
# and re-specifying placement on an instantiated model raises an error.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)


def chat(message, history):
    # Convert Gradio's (user, assistant) tuple history into the
    # OpenAI-compatible messages format expected by the chat template.
    messages = []
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    # Render the conversation with the model's chat template.
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Generate the full response in one pass. return_full_text=False strips
    # the prompt, so there is no need to slice by len(prompt) afterwards.
    outputs = pipe(
        prompt,
        max_new_tokens=8192,
        do_sample=True,
        temperature=0.6,
        top_p=0.95,
        repetition_penalty=1.1,
        return_full_text=False,
    )
    response = outputs[0]["generated_text"]

    # Simulated streaming: yield progressively longer prefixes of the
    # finished response so the UI updates incrementally. For true
    # token-level streaming, see the TextIteratorStreamer sketch below.
    for i in range(len(response)):
        yield response[: i + 1]


with gr.Blocks(title="Nyx-Reasoner-8xFusion") as demo:
    gr.ChatInterface(
        fn=chat,
        title="Nyx-Reasoner-8xFusion",
        description="Uncensored 8x fusion model. No alignment. No refusals. Absolute detachment.",
        examples=[
            ["Explain quantum entanglement in detail."],
            ["Walk through the Monty Hall problem step by step."],
        ],
        cache_examples=False,
    )

demo.queue().launch()
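
# ---------------------------------------------------------------------------
# Optional alternative (sketch): 4-bit loading for limited VRAM.
# Whether the bfloat16 checkpoint fits on your GPU is hardware-dependent
# (an assumption on my part, not something stated above). If it does not,
# bitsandbytes quantization (pip install bitsandbytes) can be swapped into
# the from_pretrained() call near the top of this file, roughly like so:
#
#     from transformers import BitsAndBytesConfig
#
#     bnb_config = BitsAndBytesConfig(
#         load_in_4bit=True,
#         bnb_4bit_quant_type="nf4",
#         bnb_4bit_compute_dtype=torch.bfloat16,
#     )
#     model = AutoModelForCausalLM.from_pretrained(
#         model_id,
#         quantization_config=bnb_config,
#         device_map="auto",
#     )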
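
# ---------------------------------------------------------------------------
# Optional alternative (sketch): true token-level streaming.
# chat() above generates the whole reply before yielding prefixes, so the UI
# only starts "streaming" once generation has finished. For genuine
# incremental output, run model.generate() on a background thread and drain
# a transformers.TextIteratorStreamer. Untested sketch; chat_streamed is a
# hypothetical name. To use it, define it above `demo` and pass fn=chat_streamed.
#
#     from threading import Thread
#     from transformers import TextIteratorStreamer
#
#     def chat_streamed(message, history):
#         # Build the messages list from history exactly as in chat().
#         messages = []
#         for user_msg, assistant_msg in history:
#             if user_msg:
#                 messages.append({"role": "user", "content": user_msg})
#             if assistant_msg:
#                 messages.append({"role": "assistant", "content": assistant_msg})
#         messages.append({"role": "user", "content": message})
#         prompt = tokenizer.apply_chat_template(
#             messages, tokenize=False, add_generation_prompt=True
#         )
#         inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
#         streamer = TextIteratorStreamer(
#             tokenizer, skip_prompt=True, skip_special_tokens=True
#         )
#         # generate() blocks, so run it on a worker thread and consume
#         # decoded chunks from the streamer as they arrive.
#         Thread(
#             target=model.generate,
#             kwargs=dict(
#                 **inputs,
#                 streamer=streamer,
#                 max_new_tokens=8192,
#                 do_sample=True,
#                 temperature=0.6,
#                 top_p=0.95,
#                 repetition_penalty=1.1,
#             ),
#         ).start()
#         partial = ""
#         for chunk in streamer:
#             partial += chunk
#             yield partial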
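
# ---------------------------------------------------------------------------
# Optional alternative (sketch): Gradio's messages-format history.
# Recent Gradio versions accept type="messages" on ChatInterface, in which
# case `history` already arrives as a list of {"role": ..., "content": ...}
# dicts and the tuple-unpacking loop in chat() collapses to a concatenation.
# chat_messages is a hypothetical name; this assumes the installed Gradio
# version supports type="messages".
#
#     def chat_messages(message, history):
#         messages = list(history) + [{"role": "user", "content": message}]
#         prompt = tokenizer.apply_chat_template(
#             messages, tokenize=False, add_generation_prompt=True
#         )
#         outputs = pipe(
#             prompt,
#             max_new_tokens=8192,
#             do_sample=True,
#             temperature=0.6,
#             top_p=0.95,
#             repetition_penalty=1.1,
#             return_full_text=False,
#         )
#         yield outputs[0]["generated_text"]
#
#     gr.ChatInterface(fn=chat_messages, type="messages")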