# app.py (LoRA-only loading)
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import re
import time
from datetime import datetime

# ===== Settings =====
device = 0 if torch.cuda.is_available() else -1
lora_repo = "rahul7star/GPT-Diffuser-v1"  # ONLY LoRA fine-tuned repo

log_lines = []


def log(msg):
    line = f"[{datetime.now().strftime('%H:%M:%S')}] {msg}"
    print(line)
    log_lines.append(line)


log(f"🚀 Loading LoRA-only model from {lora_repo}")
log(f"Device: {'GPU' if device == 0 else 'CPU'}")

# ====== Tokenizer ======
try:
    tokenizer = AutoTokenizer.from_pretrained(lora_repo, trust_remote_code=True)
    if tokenizer.pad_token is None:
        # Causal LMs often ship without a pad token; reuse EOS for padding.
        tokenizer.pad_token = tokenizer.eos_token
    log(f"✅ Tokenizer loaded: vocab size {tokenizer.vocab_size}")
except Exception as e:
    log(f"❌ Tokenizer load failed: {e}")
    tokenizer = None

# ====== LoRA-only model ======
# Loading an adapter-only repo directly with AutoModelForCausalLM works when
# `peft` is installed: transformers detects adapter_config.json, fetches the
# base model it references, and attaches the LoRA weights.
model = None
pipe = None
try:
    model = AutoModelForCausalLM.from_pretrained(
        lora_repo,
        trust_remote_code=True,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto" if torch.cuda.is_available() else None,
    )
    model.eval()
    log("✅ LoRA-only model loaded successfully")

    # Do not pass `device=` here: a model dispatched with device_map="auto"
    # is already placed by accelerate, and the pipeline errors out if asked
    # to move it. On CPU the default placement is already correct.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
    )
    log("✅ Pipeline ready for inference")
except Exception as e:
    log(f"❌ LoRA model load failed: {e}")


# ====== Chat Function ======
def chat_with_model(message, history):
    # Reset the log so the panel shows only this message's trace.
    log_lines.clear()
    log(f"💭 User message: {message}")

    if pipe is None:
        return "", history, "⚠️ Model pipeline not loaded."

    context = (
        "The following is a conversation between a user and an AI assistant "
        "trained on the Bhagavad Gita.\n"
    )
    for user, bot in history:
        context += f"User: {user}\nAssistant: {bot}\n"
    context += f"User: {message}\nAssistant:"

    log("📄 Built conversation context")
    log(context)

    start_time = time.time()
    try:
        output = pipe(
            context,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
        )[0]["generated_text"]
        log(f"⏱️ Inference took {time.time() - start_time:.2f}s")
    except Exception as e:
        log(f"❌ Generation failed: {e}")
        return "", history, "\n".join(log_lines)

    # Clean reply: drop the echoed prompt, strip stray markup, collapse
    # whitespace, and cut at any hallucinated next turn.
    reply = output[len(context):].strip()
    reply = re.sub(r"(ContentLoaded|<\/?[^>]+>|[\r\n]{2,})", " ", reply)
    reply = re.sub(r"\s{2,}", " ", reply).strip()
    reply = reply.split("User:")[0].split("Assistant:")[0].strip()

    log(f"🪄 Model reply: {reply}")
    history.append((message, reply))
    return "", history, "\n".join(log_lines)


# ===== Gradio =====
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
    gr.Markdown("## 💬 Qwen LoRA-only — Bhagavad Gita Assistant")
    with gr.Row():
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(height=500)
            msg = gr.Textbox(placeholder="Ask about the Gita...", label="Your Message")
            clear = gr.Button("Clear")
        with gr.Column(scale=1):
            log_box = gr.Textbox(label="Detailed Model Log", lines=25, interactive=False)

    msg.submit(chat_with_model, [msg, chatbot], [msg, chatbot, log_box])
    # The clear handler must return exactly one value per output component:
    # (chatbot, log_box).
    clear.click(lambda: (None, ""), None, [chatbot, log_box], queue=False)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
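
# --- Optional: explicit adapter loading (a minimal sketch, not part of the app) ---
# The one-step from_pretrained() above relies on transformers' PEFT
# integration. The equivalent explicit path, which can help debug a failed
# load, resolves the base model from the adapter config and attaches the
# LoRA weights by hand. This is illustration only, left commented out so it
# never runs with the app; the actual base model behind
# rahul7star/GPT-Diffuser-v1 is read from the adapter config, not assumed.
#
#   from peft import PeftConfig, PeftModel
#
#   peft_cfg = PeftConfig.from_pretrained(lora_repo)
#   base = AutoModelForCausalLM.from_pretrained(
#       peft_cfg.base_model_name_or_path, trust_remote_code=True
#   )
#   lora_model = PeftModel.from_pretrained(base, lora_repo)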