import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

model_id = "Abigail45/Nyx-Reasoner-8xFusion"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2",  # Optional: remove if flash-attn is not installed
)

# Build a text-generation pipeline around the already-loaded model.
# Do not pass device_map here: the model was dispatched by from_pretrained(),
# and re-specifying placement on an instantiated model raises an error.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)


def chat(message, history):
    # Convert Gradio's (user, assistant) tuple history into the
    # OpenAI-compatible messages format expected by the chat template.
    messages = []
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    # Render the conversation with the model's chat template.
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Generate the full response in one pass. return_full_text=False strips
    # the prompt, so there is no need to slice by len(prompt) afterwards.
    outputs = pipe(
        prompt,
        max_new_tokens=8192,
        do_sample=True,
        temperature=0.6,
        top_p=0.95,
        repetition_penalty=1.1,
        return_full_text=False,
    )
    response = outputs[0]["generated_text"]

    # Simulated streaming: yield progressively longer prefixes of the
    # finished response so the UI updates incrementally. For true
    # token-level streaming, see the TextIteratorStreamer sketch below.
    for i in range(len(response)):
        yield response[: i + 1]


with gr.Blocks(title="Nyx-Reasoner-8xFusion") as demo:
    gr.ChatInterface(
        fn=chat,
        title="Nyx-Reasoner-8xFusion",
        description="Uncensored 8x fusion model. No alignment. No refusals. Absolute detachment.",
        examples=[
            ["Explain quantum entanglement in detail."],
            ["Walk through the Monty Hall problem step by step."],
        ],
        cache_examples=False,
    )

demo.queue().launch()
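
# ---------------------------------------------------------------------------
# Optional alternative (sketch): 4-bit loading for limited VRAM.
# Whether the bfloat16 checkpoint fits on your GPU is hardware-dependent
# (an assumption on my part, not something stated above). If it does not,
# bitsandbytes quantization (pip install bitsandbytes) can be swapped into
# the from_pretrained() call near the top of this file, roughly like so:
#
#     from transformers import BitsAndBytesConfig
#
#     bnb_config = BitsAndBytesConfig(
#         load_in_4bit=True,
#         bnb_4bit_quant_type="nf4",
#         bnb_4bit_compute_dtype=torch.bfloat16,
#     )
#     model = AutoModelForCausalLM.from_pretrained(
#         model_id,
#         quantization_config=bnb_config,
#         device_map="auto",
#     )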
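
# ---------------------------------------------------------------------------
# Optional alternative (sketch): true token-level streaming.
# chat() above generates the whole reply before yielding prefixes, so the UI
# only starts "streaming" once generation has finished. For genuine
# incremental output, run model.generate() on a background thread and drain
# a transformers.TextIteratorStreamer. Untested sketch; chat_streamed is a
# hypothetical name. To use it, define it above `demo` and pass fn=chat_streamed.
#
#     from threading import Thread
#     from transformers import TextIteratorStreamer
#
#     def chat_streamed(message, history):
#         # Build the messages list from history exactly as in chat().
#         messages = []
#         for user_msg, assistant_msg in history:
#             if user_msg:
#                 messages.append({"role": "user", "content": user_msg})
#             if assistant_msg:
#                 messages.append({"role": "assistant", "content": assistant_msg})
#         messages.append({"role": "user", "content": message})
#         prompt = tokenizer.apply_chat_template(
#             messages, tokenize=False, add_generation_prompt=True
#         )
#         inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
#         streamer = TextIteratorStreamer(
#             tokenizer, skip_prompt=True, skip_special_tokens=True
#         )
#         # generate() blocks, so run it on a worker thread and consume
#         # decoded chunks from the streamer as they arrive.
#         Thread(
#             target=model.generate,
#             kwargs=dict(
#                 **inputs,
#                 streamer=streamer,
#                 max_new_tokens=8192,
#                 do_sample=True,
#                 temperature=0.6,
#                 top_p=0.95,
#                 repetition_penalty=1.1,
#             ),
#         ).start()
#         partial = ""
#         for chunk in streamer:
#             partial += chunk
#             yield partial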
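
# ---------------------------------------------------------------------------
# Optional alternative (sketch): Gradio's messages-format history.
# Recent Gradio versions accept type="messages" on ChatInterface, in which
# case `history` already arrives as a list of {"role": ..., "content": ...}
# dicts and the tuple-unpacking loop in chat() collapses to a concatenation.
# chat_messages is a hypothetical name; this assumes the installed Gradio
# version supports type="messages".
#
#     def chat_messages(message, history):
#         messages = list(history) + [{"role": "user", "content": message}]
#         prompt = tokenizer.apply_chat_template(
#             messages, tokenize=False, add_generation_prompt=True
#         )
#         outputs = pipe(
#             prompt,
#             max_new_tokens=8192,
#             do_sample=True,
#             temperature=0.6,
#             top_p=0.95,
#             repetition_penalty=1.1,
#             return_full_text=False,
#         )
#         yield outputs[0]["generated_text"]
#
#     gr.ChatInterface(fn=chat_messages, type="messages")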