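"""Gradio app for the ORA Spiritual Assistant.

Serves unsloth/Llama-3.2-1B-Instruct, optionally wrapped with the fine-tuned
ORA adapter via PEFT.
"""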
import os

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Settings
BASE_MODEL = "unsloth/Llama-3.2-1B-Instruct"
ADAPTER_PATH = "important/finetuning/models/ora_adapter"

# Global model state
model = None
tokenizer = None
device = "cuda" if torch.cuda.is_available() else "cpu"
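# Both globals are populated once by load_model() at startup; chat_response
# reads them on every request.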

def load_model():
    global model, tokenizer
    print(f"Loading ORA model on {device}...")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        # fp16 halves memory on GPU; CPU stays in fp32, where half precision
        # is poorly supported.
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        device_map=device,
        low_cpu_mem_usage=True,
    )
    if os.path.exists(ADAPTER_PATH):
        print(f"Loading adapter from {ADAPTER_PATH}...")
        model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
    else:
        # Fall back to the plain base model if no adapter is present.
        model = base_model
    print("Model loaded.")
def chat_response(message, history):
    system_prompt = (
        "You are ORA, a spiritual assistant specializing in theological "
        "insights and biblical wisdom. Provide discerning, compassionate, "
        "and doctrine-aware responses."
    )
    # Assumes Gradio's tuple-style history, [[user, bot], [user, bot], ...];
    # newer Gradio versions can instead pass message dicts when the
    # ChatInterface is created with type="messages".
    messages = [{"role": "system", "content": system_prompt}]
    for human, assistant in history:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(device)
    # Stop on either the generic EOS token or Llama 3's end-of-turn token.
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]
    with torch.no_grad():  # inference only; skip gradient bookkeeping
        outputs = model.generate(
            input_ids,
            max_new_tokens=256,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,  # Llama defines no pad token
        )
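
    # generate() returns the prompt followed by the completion; slice off the
    # prompt tokens so only the new reply is decoded.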
    response_tokens = outputs[0][input_ids.shape[-1]:]
    response = tokenizer.decode(response_tokens, skip_special_tokens=True)
    return response

# Load at startup so the first request doesn't pay the model-loading cost.
load_model()

# UI
with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple")) as demo:
    gr.Markdown("# ORA Spiritual Assistant")
    gr.ChatInterface(fn=chat_response)

if __name__ == "__main__":
    # share=True opens a temporary public gradio.live tunnel for local runs;
    # on Hugging Face Spaces the app is already hosted.
    demo.launch(share=True)