import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Settings
BASE_MODEL = "unsloth/Llama-3.2-1B-Instruct"
ADAPTER_PATH = "important/finetuning/models/ora_adapter"

def chat():
    print("Loading ORA (may take a minute)...")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Device: {device}")

    # 1. Load Base Model
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        device_map=device,
        low_cpu_mem_usage=True,
    )
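
    # Optional (not part of the original script): on memory-constrained GPUs the
    # base model could instead be loaded in 4-bit via bitsandbytes. A minimal
    # sketch, assuming bitsandbytes is installed:
    #
    #   from transformers import BitsAndBytesConfig
    #   bnb = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
    #   model = AutoModelForCausalLM.from_pretrained(
    #       BASE_MODEL, quantization_config=bnb, device_map="auto"
    #   )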
    # 2. Load Adapter
    print(f"Loading adapter from {ADAPTER_PATH}...")
    try:
        model = PeftModel.from_pretrained(model, ADAPTER_PATH)
        print("Adapter loaded successfully!")
    except Exception as e:
        print(f"Error loading adapter: {e}")
        print("Running with Base Model only.")
    # 3. Chat Loop
    print("\n" + "=" * 40)
    print("ORA: Peace be with you. How can I guide you today?")
    print("=" * 40 + "\n")

    history = []

    # System Prompt
    system_prompt = (
        "You are ORA, a spiritual assistant specializing in theological insights "
        "and biblical wisdom. Provide discerning, compassionate, and "
        "doctrine-aware responses."
    )
    while True:
        try:
            user_input = input("You: ")
            if user_input.lower() in ["quit", "exit"]:
                break

            # Construct Prompt (Llama 3 chat format)
            messages = [
                {"role": "system", "content": system_prompt},
            ]
            # Add history: last 4 messages = 2 user/assistant exchanges
            messages.extend(history[-4:])
            messages.append({"role": "user", "content": user_input})

            input_ids = tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True,
                return_tensors="pt",
            ).to(device)

            # Stop on either the generic EOS token or Llama 3's end-of-turn token
            terminators = [
                tokenizer.eos_token_id,
                tokenizer.convert_tokens_to_ids("<|eot_id|>"),
            ]
            outputs = model.generate(
                input_ids,
                max_new_tokens=256,
                eos_token_id=terminators,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                pad_token_id=tokenizer.eos_token_id,  # Llama has no pad token; avoids a generate() warning
            )
            # Slice off the prompt so only the newly generated tokens are decoded
            response = outputs[0][input_ids.shape[-1]:]
            decoded_response = tokenizer.decode(response, skip_special_tokens=True)

            print(f"ORA: {decoded_response}\n")

            history.append({"role": "user", "content": user_input})
            history.append({"role": "assistant", "content": decoded_response})
        except (KeyboardInterrupt, EOFError):
            # Ctrl-C or Ctrl-D (end of input) both exit cleanly
            print("\nExiting...")
            break


if __name__ == "__main__":
    chat()
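
# Optional (not part of the original script): print the reply token-by-token
# instead of all at once. transformers ships a TextStreamer that can be passed
# straight to generate(); a minimal sketch using the same tokenizer and model
# objects as above:
#
#   from transformers import TextStreamer
#   streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
#   outputs = model.generate(input_ids, streamer=streamer, max_new_tokens=256)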