import os

import gradio as gr
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Settings
BASE_MODEL = "unsloth/Llama-3.2-1B-Instruct"
ADAPTER_PATH = "important/finetuning/models/ora_adapter"

# Globals populated by load_model()
model = None
tokenizer = None
device = "cuda" if torch.cuda.is_available() else "cpu"


def load_model():
    """Load the base model and, if present, the fine-tuned LoRA adapter."""
    global model, tokenizer
    print(f"Loading ORA model on {device}...")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        # fp16 halves memory on GPU; CPU inference needs fp32
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        device_map=device,
        low_cpu_mem_usage=True,
    )
    if os.path.exists(ADAPTER_PATH):
        print(f"Loading adapter from {ADAPTER_PATH}...")
        model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
    else:
        print(f"No adapter found at {ADAPTER_PATH}; using the base model.")
        model = base_model
    model.eval()  # inference mode: disables dropout, etc.
    print("Model loaded.")


def chat_response(message, history):
    system_prompt = (
        "You are ORA, a spiritual assistant specializing in theological "
        "insights and biblical wisdom. Provide discerning, compassionate, "
        "and doctrine-aware responses."
    )

    # Rebuild the conversation from Gradio's tuple-style history:
    # [[user, bot], [user, bot], ...]
    messages = [{"role": "system", "content": system_prompt}]
    for human, assistant in history:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    # return_dict=True also returns the attention mask, which generate()
    # uses to distinguish real tokens from padding.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(device)

    # Llama 3 models end turns with <|eot_id|> rather than the plain EOS token.
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    with torch.no_grad():  # inference only; skip gradient bookkeeping
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            eos_token_id=terminators,
            # Llama tokenizers ship without a pad token; reuse EOS to
            # silence the "pad_token_id is not set" warning.
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )

    # Strip the prompt tokens and decode only the newly generated reply.
    response_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(response_tokens, skip_special_tokens=True)


# Load the model once at startup so the first request isn't slow.
load_model()

# UI
with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple")) as demo:
    gr.Markdown("# ORA Spiritual Assistant")
    gr.ChatInterface(fn=chat_response)

if __name__ == "__main__":
    demo.launch(share=True)