"""Obsidian chatbot: a Gradio ChatInterface wrapping Qwen1.5-14B-Chat."""
import gc
import json
import os

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr

# --- Configuration -----------------------------------------------------------
HF_TOKEN = os.environ.get("HF_TOKEN")  # securely load token from Hugging Face secrets
MODEL_ID = "Qwen/Qwen1.5-14B-Chat"
CACHE_DIR = "./qwen-cache"
MEMORY_FILE = os.path.join(CACHE_DIR, "chat_history.json")
# Hard ceiling on generated tokens; the real per-call budget is clamped below so
# that prompt + completion never exceed the model's context window.
MAX_NEW_TOKENS = 32000

os.makedirs(CACHE_DIR, exist_ok=True)

# --- Load tokenizer & model --------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    cache_dir=CACHE_DIR,
    token=HF_TOKEN,
    trust_remote_code=True,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    cache_dir=CACHE_DIR,
    token=HF_TOKEN,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
model.eval()


def chat_fn(message, history):
    """Generate one assistant reply for a Gradio ChatInterface turn.

    Args:
        message: The user's latest message (str).
        history: Prior turns as a list of (user_msg, assistant_msg) tuples,
            as supplied by gr.ChatInterface in tuple format.

    Returns:
        The assistant's reply text (str).
    """
    msgs = [{"role": "system", "content": "You are Obsidian, a helpful AI assistant."}]
    for user_msg, assistant_msg in history:
        msgs.append({"role": "user", "content": user_msg})
        msgs.append({"role": "assistant", "content": assistant_msg})
    msgs.append({"role": "user", "content": message})

    prompt = tokenizer.apply_chat_template(
        msgs, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Clamp the generation budget so prompt + new tokens fit the context window.
    prompt_len = inputs.input_ids.shape[-1]
    context_len = getattr(model.config, "max_position_embeddings", MAX_NEW_TOKENS)
    budget = max(1, min(MAX_NEW_TOKENS, context_len - prompt_len))

    # inference_mode: no autograd bookkeeping — lower memory for generation.
    with torch.inference_mode():
        output_ids = model.generate(
            **inputs,  # pass attention_mask along with input_ids
            max_new_tokens=budget,
            eos_token_id=tokenizer.eos_token_id,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    generated_ids = output_ids[0][prompt_len:]
    reply = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    # Best-effort persistence of the last 20 turns; never crash the chat on
    # disk errors. NOTE(review): this file is written but never read back —
    # confirm whether reloading it on startup was intended.
    updated_history = history + [(message, reply)]
    try:
        with open(MEMORY_FILE, "w", encoding="utf-8") as f:
            json.dump(updated_history[-20:], f, ensure_ascii=False)
    except OSError:
        pass

    # Reclaim memory between turns on long-running Spaces.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return reply


# --- Launch Gradio app -------------------------------------------------------
if __name__ == "__main__":
    gr.ChatInterface(
        fn=chat_fn,
        title="Obsidian Chatbot",
        theme="soft",
    ).launch()