import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# peft is only needed to attach the adapter further down in this section.
from peft import PeftModel

# Hub repository of the fine-tuned LoRA adapter; the tokenizer is loaded
# from this same repository.
model_name = "lingadevaruhp/thoshan_Flash"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# bitsandbytes settings: 4-bit weights, float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)

# Load the base checkpoint through plain transformers (no unsloth needed),
# then wrap it with the LoRA adapter weights stored under `model_name`.
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(
        "unsloth/gemma-2-9b-it-bnb-4bit",
        device_map="auto",
        quantization_config=bnb_config,
    ),
    model_name,
)

# Switch to evaluation mode; this script only runs generation.
model.eval()

def chat(prompt, history):
    """Generate one sampled reply to ``prompt`` with the adapter-wrapped model.

    Args:
        prompt: The latest user message; it is inserted into the
            instruction/response template below.
        history: Earlier turns passed in by the chat interface. It is not
            used: every reply is generated from ``prompt`` alone.

    Returns:
        The decoded text of the newly generated tokens, with special tokens
        removed and surrounding whitespace stripped.
    """
    # NOTE(review): the literal "<s>" prefix is kept exactly as before; confirm
    # it matches the fine-tuning template, since the tokenizer may also add its
    # own BOS token.
    input_text = f"<s>### Instruction:\n{prompt}\n### Response:\n"
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.8,
            eos_token_id=tokenizer.eos_token_id,
        )

    # A causal LM returns the prompt tokens followed by the continuation.
    # Drop the prompt by position instead of splitting the decoded text on
    # "### Response:", which loses the real reply whenever the model emits
    # that marker again.
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

# Chat UI wired to the `chat` handler, with a title, a short description
# and a few clickable example prompts.
iface = gr.ChatInterface(
    chat,
    examples=[
        "Hey, yeno madtha idiya?",
        "Ninna hesarenu helu",
        "What's your plan tonight?",
    ],
    description="Kannada-English FlirtAI — Chat in Kanglish!",
    title="thoshan_Flash 🔥",
)

# Launch the interface.
iface.launch()