import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import os
# Settings
BASE_MODEL = "unsloth/Llama-3.2-1B-Instruct"
ADAPTER_PATH = "important/finetuning/models/ora_adapter"
# Global model state, initialized once at startup
model = None
tokenizer = None
device = "cuda" if torch.cuda.is_available() else "cpu"


def load_model():
    global model, tokenizer
    print(f"Loading ORA Model on {device}...")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        device_map=device,
        low_cpu_mem_usage=True,
    )
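    # Attach the LoRA adapter when present; otherwise fall back to the base model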
    if os.path.exists(ADAPTER_PATH):
        print(f"Loading adapter from {ADAPTER_PATH}...")
        model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
    else:
        model = base_model
    print("Model Loaded.")


def chat_response(message, history):
    system_prompt = (
        "You are ORA, a spiritual assistant specializing in theological insights "
        "and biblical wisdom. Provide discerning, compassionate, and doctrine-aware responses."
    )
    # Rebuild the conversation from Gradio's tuple-style history: [[user, bot], ...]
    messages = [{"role": "system", "content": system_prompt}]
    for human, assistant in history:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(device)
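
    # Llama 3 marks end-of-turn with <|eot_id|>, so stop on either terminator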
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    outputs = model.generate(
        input_ids,
        max_new_tokens=256,
        eos_token_id=terminators,
        pad_token_id=tokenizer.eos_token_id,  # Llama has no pad token; reuse EOS to silence the generate warning
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
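
    # Keep only the newly generated tokens, i.e. everything after the prompt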
    response_tokens = outputs[0][input_ids.shape[-1]:]
    response = tokenizer.decode(response_tokens, skip_special_tokens=True)
    return response
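

# Note: the history loop above assumes Gradio's tuple-style history
# ([[user, bot], ...]). Newer Gradio releases also support
# type="messages", where history arrives as {"role", "content"} dicts;
# under that assumption the loop would instead be, roughly:
#     for turn in history:
#         messages.append({"role": turn["role"], "content": turn["content"]})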

# Load the model once at startup so the first chat request is not delayed
load_model()

# UI
with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple")) as demo:
    gr.Markdown("# ORA Spiritual Assistant")
    gr.ChatInterface(fn=chat_response)

if __name__ == "__main__":
    demo.launch(share=True)