import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Settings
BASE_MODEL = "unsloth/Llama-3.2-1B-Instruct"
ADAPTER_PATH = "important/finetuning/models/ora_adapter"


def chat():
    print("Loading ORA (may take a minute)...")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Device: {device}")

    # 1. Load base model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        device_map=device,
        low_cpu_mem_usage=True,
    )

    # 2. Load the LoRA adapter on top of the base model
    print(f"Loading adapter from {ADAPTER_PATH}...")
    try:
        model = PeftModel.from_pretrained(model, ADAPTER_PATH)
        print("Adapter loaded successfully!")
    except Exception as e:
        print(f"Error loading adapter: {e}")
        print("Running with base model only.")

    # 3. Chat loop
    print("\n" + "=" * 40)
    print("ORA: Peace be with you. How can I guide you today?")
    print("=" * 40 + "\n")

    history = []

    # System prompt
    system_prompt = (
        "You are ORA, a spiritual assistant specializing in theological "
        "insights and biblical wisdom. Provide discerning, compassionate, "
        "and doctrine-aware responses."
    )

    while True:
        try:
            user_input = input("You: ")
            if user_input.lower() in ["quit", "exit"]:
                break

            # Construct the prompt (Llama 3 chat format)
            messages = [
                {"role": "system", "content": system_prompt},
            ]

            # Add history: the last four messages, i.e. two turns of context
            messages.extend(history[-4:])
            messages.append({"role": "user", "content": user_input})

            input_ids = tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True,
                return_tensors="pt",
            ).to(device)

            # Stop on either the model's EOS token or Llama 3's end-of-turn token
            terminators = [
                tokenizer.eos_token_id,
                tokenizer.convert_tokens_to_ids("<|eot_id|>"),
            ]

            outputs = model.generate(
                input_ids,
                max_new_tokens=256,
                eos_token_id=terminators,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                pad_token_id=tokenizer.eos_token_id,  # silences the missing-pad-token warning
            )

            # Decode only the newly generated tokens, not the echoed prompt
            response = outputs[0][input_ids.shape[-1]:]
            decoded_response = tokenizer.decode(response, skip_special_tokens=True)

            print(f"ORA: {decoded_response}\n")

            history.append({"role": "user", "content": user_input})
            history.append({"role": "assistant", "content": decoded_response})

        except KeyboardInterrupt:
            print("\nExiting...")
            break


if __name__ == "__main__":
    chat()