# TherapyEnglish / app.py
# Hugging Face Space (uploaded by Raemih — commit 7b3c687, verified)
import subprocess
import sys
# --- THE STABILIZER BLOCK ---
print("πŸ› οΈ Stabilizing environment and fixing Gradio compatibility...")
subprocess.check_call([
sys.executable, "-m", "pip", "install",
"tokenizers==0.20.1",
"transformers==4.45.2",
"huggingface-hub==0.24.7", # THE FIX: Pinning this prevents the HfFolder ImportError
"gradio==4.44.1"
])
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
# Merged fine-tuned weights served by this Space.
MODEL_REPO = "E-motionAssistant/llama-3.2-3b-english-therapy-merged"
# Tokenizer comes from the base instruct repo (assumes it matches the merged
# model's vocabulary — TODO confirm against the fine-tune config).
TOKENIZER_REPO = "unsloth/Llama-3.2-3B-Instruct"
# System message prepended to every conversation in chat().
SYSTEM_PROMPT = "You are an empathetic therapist. Provide supportive, caring responses."
# Module-level cache, populated lazily by load_model().
model = None
tokenizer = None
def load_model():
    """Initialise the tokenizer and CPU model once, caching both in module globals.

    Subsequent calls are no-ops: the cached `model` is reused.
    """
    global model, tokenizer
    if model is not None:
        return  # already initialised — nothing to do

    print(f"📥 Loading tokenizer from {TOKENIZER_REPO}...")
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_REPO)
    # Some tokenizers ship without a pad token; fall back to EOS for padding.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print(f"📥 Loading model weights (Full Precision for CPU)...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_REPO,
        torch_dtype=torch.float32,  # CPU kernels don't support 'Half' (float16)
        device_map="cpu",           # explicitly target CPU
        low_cpu_mem_usage=True,
    )
    print("✅ Success: System is online on CPU!")


load_model()
def chat(message, history):
    """Generate one assistant reply for a Gradio ChatInterface turn.

    Args:
        message: The user's latest message.
        history: List of (user_msg, bot_msg) pairs from ChatInterface.

    Returns:
        The generated reply as a plain string ("" for blank input, an
        apology string if generation fails).
    """
    if not message.strip():
        return ""
    try:
        prompt = _build_prompt(message, history, SYSTEM_PROMPT)
        # Tokenize and move to the exact same device as the model.
        inputs = tokenizer(
            prompt, return_tensors="pt", truncation=True, max_length=2048
        ).to(model.device)

        # FIX: stop on both the tokenizer's EOS and Llama-3's <|eot_id|>
        # end-of-turn marker (the prompt format ends each turn with it);
        # stopping on a single eos id can let generation run past the reply.
        terminators = [tokenizer.eos_token_id]
        eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
        if isinstance(eot_id, int) and eot_id >= 0 and eot_id not in terminators:
            terminators.append(eot_id)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=0.6,  # slightly lower for more stable therapy responses
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=terminators,
            )
        # Decode only the newly generated tokens (skip the echoed prompt).
        input_len = inputs.input_ids.shape[1]
        response = tokenizer.decode(outputs[0][input_len:], skip_special_tokens=True)
        return response.strip()
    except Exception as e:
        # Surface the failure to the user instead of crashing the UI.
        print(f"❌ Generation Error: {e}")
        return f"I'm sorry, I encountered an error: {str(e)}. Please try again."


def _build_prompt(message, history, system_prompt):
    """Assemble a Llama-3 Instruct chat prompt from the last 3 history turns."""
    prompt = (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        f"{system_prompt}<|eot_id|>"
    )
    for user_msg, bot_msg in history[-3:]:
        prompt += (
            f"<|start_header_id|>user<|end_header_id|>\n\n{user_msg}<|eot_id|>"
            f"<|start_header_id|>assistant<|end_header_id|>\n\n{bot_msg}<|eot_id|>"
        )
    prompt += (
        f"<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )
    return prompt
# Gradio chat UI: wires chat() into a standard chat interface.
demo = gr.ChatInterface(
    fn=chat,
    title="💚 E.motion Therapy Assistant",
    theme=gr.themes.Soft(),          # soft theme to match the supportive tone
    chatbot=gr.Chatbot(height=450),  # fixed-height transcript panel
)

# Standard script entry point (Spaces runs this file directly).
if __name__ == "__main__":
    demo.launch()