import threading

import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# === Model loading ===
model_path = "SBK/sbk-llm-1"  # fine-tuned model on the Hugging Face Hub
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
)
device = "cuda" if torch.cuda.is_available() else "cpu"
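
# Optional sanity check (a minimal sketch, not used by the app): one greedy,
# non-streaming generate call to confirm the checkpoint loads and decodes
# before the UI is wired up. The helper name and default prompt here are
# illustrative additions, not part of the original script.
def _smoke_test(prompt: str = "User: Hello\nAssistant:") -> str:
    probe = tokenizer(prompt, return_tensors="pt").to(device)
    output = model.generate(
        **probe,
        max_new_tokens=20,
        do_sample=False,  # greedy decoding is enough for a smoke test
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)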

# === System prompt / default behavior ===
SYSTEM_PROMPT = """You are a helpful, honest, and factual assistant trained to answer questions only about *Saptarshi Bhattacharya*. You were fine-tuned on factual data derived from his work, projects, skills, internships, and engineering experiences. Your job is to help users understand what Saptarshi has done, what he's good at, and how his experience aligns with ML Ops, Data Engineering, DevOps, and related roles.

- If a user asks something outside the scope of his data, do not guess — politely say it's outside your knowledge.
- Never fabricate qualifications, names, or roles that were not in your training.
- Emphasize Saptarshi's strengths, such as completing hard technical projects, optimizing pipelines, learning on the fly, and being a completionist.
- Maintain a professional yet warm tone.
- Refer to Saptarshi in third person.

Your goal is to represent him truthfully and make his work accessible and understandable to potential collaborators or employers, without overselling or faking.
"""

BLOCKED_KEYWORDS = ["violence", "suicide"]
MAX_TOKENS = 512

# === Streaming generation ===
def generate_response(history, system_prompt):
    # Build the chat prompt: system instructions, then all completed turns,
    # then the latest user message awaiting a reply. Only prior turns are
    # replayed here; the current message is appended once at the end.
    prompt = system_prompt.strip() + "\n"
    for user, bot in history[:-1]:
        prompt += f"User: {user}\nAssistant: {bot}\n"
    prompt += "User: " + history[-1][0] + "\nAssistant:"

    # Guardrails: refuse outright if any blocked keyword appears in the prompt
    if any(bad in prompt.lower() for bad in BLOCKED_KEYWORDS):
        yield "[Blocked for safety. Prompt contains restricted keywords.]"
        return

    # Tokenization
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Run generate in a background thread so tokens can be consumed from the
    # streamer here as they are produced
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=MAX_TOKENS,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Yield the growing response so the UI updates token by token
    partial_message = ""
    for token in streamer:
        partial_message += token
        yield partial_message

# === Gradio interface ===
with gr.Blocks(title="SBK LLM Chat") as demo:
    gr.Markdown("## Chat with SBK LLM - Professional Portfolio Assistant")
    with gr.Row():
        with gr.Column(scale=1):
            system_prompt = gr.Textbox(label="System Instructions", value=SYSTEM_PROMPT, lines=8)
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(height=400)
            msg = gr.Textbox(
                label="Your Message",
                placeholder="Ask about Saptarshi's professional experience...",
                lines=2,
            )
            with gr.Row():
                submit_btn = gr.Button("Submit")
                clear_btn = gr.Button("Clear Chat")
    history = gr.State([])

    def respond(user_message, chat_history, system_prompt):
        # Append the new user turn with an empty reply, then stream updates
        chat_history = chat_history + [(user_message, "")]
        for response in generate_response(chat_history, system_prompt):
            chat_history[-1] = (user_message, response)
            yield chat_history

    # Connect components
    msg.submit(respond, [msg, chatbot, system_prompt], [chatbot], queue=True)
    submit_btn.click(respond, [msg, chatbot, system_prompt], [chatbot], queue=True)
    clear_btn.click(lambda: ([], []), outputs=[chatbot, history], queue=False)

# Launch with sharing enabled
demo.queue(max_size=20).launch(
    share=True,
    server_name="0.0.0.0",
    server_port=7860,
    show_error=True,
)
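
# Usage sketch (hedged, left commented out since launch() blocks this script):
# once the app is running, it could also be queried programmatically with
# gradio_client. The URL is a placeholder, and fn_index=0 assumes the
# msg.submit handler is the first event registered above.
#
#   from gradio_client import Client
#   client = Client("http://localhost:7860")  # placeholder URL
#   print(client.predict(
#       "What has Saptarshi worked on?",  # msg
#       [],                               # empty chat history
#       SYSTEM_PROMPT,                    # system instructions
#       fn_index=0,
#   ))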