"""Obsidian chatbot: a Gradio ChatInterface wrapping Qwen1.5-14B-Chat."""
import gc
import json
import os

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr

# --- Configuration -----------------------------------------------------------
HF_TOKEN = os.environ.get("HF_TOKEN")  # securely load token from Hugging Face secrets
MODEL_ID = "Qwen/Qwen1.5-14B-Chat"
CACHE_DIR = "./qwen-cache"
MEMORY_FILE = os.path.join(CACHE_DIR, "chat_history.json")
# Hard ceiling on generated tokens; the real per-call budget is clamped below so
# that prompt + completion never exceed the model's context window.
MAX_NEW_TOKENS = 32000

os.makedirs(CACHE_DIR, exist_ok=True)

# --- Load tokenizer & model --------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    cache_dir=CACHE_DIR,
    token=HF_TOKEN,
    trust_remote_code=True,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    cache_dir=CACHE_DIR,
    token=HF_TOKEN,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
model.eval()


def chat_fn(message, history):
    """Generate one assistant reply for a Gradio ChatInterface turn.

    Args:
        message: The user's latest message (str).
        history: Prior turns as a list of (user_msg, assistant_msg) tuples,
            as supplied by gr.ChatInterface in tuple format.

    Returns:
        The assistant's reply text (str).
    """
    msgs = [{"role": "system", "content": "You are Obsidian, a helpful AI assistant."}]
    for user_msg, assistant_msg in history:
        msgs.append({"role": "user", "content": user_msg})
        msgs.append({"role": "assistant", "content": assistant_msg})
    msgs.append({"role": "user", "content": message})

    prompt = tokenizer.apply_chat_template(
        msgs, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Clamp the generation budget so prompt + new tokens fit the context window.
    prompt_len = inputs.input_ids.shape[-1]
    context_len = getattr(model.config, "max_position_embeddings", MAX_NEW_TOKENS)
    budget = max(1, min(MAX_NEW_TOKENS, context_len - prompt_len))

    # inference_mode: no autograd bookkeeping — lower memory for generation.
    with torch.inference_mode():
        output_ids = model.generate(
            **inputs,  # pass attention_mask along with input_ids
            max_new_tokens=budget,
            eos_token_id=tokenizer.eos_token_id,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    generated_ids = output_ids[0][prompt_len:]
    reply = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    # Best-effort persistence of the last 20 turns; never crash the chat on
    # disk errors. NOTE(review): this file is written but never read back —
    # confirm whether reloading it on startup was intended.
    updated_history = history + [(message, reply)]
    try:
        with open(MEMORY_FILE, "w", encoding="utf-8") as f:
            json.dump(updated_history[-20:], f, ensure_ascii=False)
    except OSError:
        pass

    # Reclaim memory between turns on long-running Spaces.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return reply


# --- Launch Gradio app -------------------------------------------------------
if __name__ == "__main__":
    gr.ChatInterface(
        fn=chat_fn,
        title="Obsidian Chatbot",
        theme="soft",
    ).launch()