"""Gradio chat app serving a fine-tuned Llama-3.2-3B therapy model on CPU.

On import this script (1) pins compatible library versions via pip,
(2) downloads and loads the model/tokenizer, and (3) defines a
`gr.ChatInterface` that is launched under the `__main__` guard.
"""

import subprocess
import sys

# --- THE STABILIZER BLOCK ---
# Pin mutually-compatible versions before importing them below;
# huggingface-hub is pinned to avoid the HfFolder ImportError.
print("🛠️ Stabilizing environment and fixing Gradio compatibility...")
subprocess.check_call([
    sys.executable, "-m", "pip", "install",
    "tokenizers==0.20.1",
    "transformers==4.45.2",
    "huggingface-hub==0.24.7",  # THE FIX: Pinning this prevents the HfFolder ImportError
    "gradio==4.44.1",
])

import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

# Merged fine-tuned weights; tokenizer comes from the base instruct repo.
MODEL_REPO = "E-motionAssistant/llama-3.2-3b-english-therapy-merged"
TOKENIZER_REPO = "unsloth/Llama-3.2-3B-Instruct"
SYSTEM_PROMPT = "You are an empathetic therapist. Provide supportive, caring responses."

# Populated once by load_model(); chat() reads these globals.
model = None
tokenizer = None


def load_model():
    """Load tokenizer and model into module globals (idempotent).

    Loads in float32 on CPU: CPU kernels don't support 'Half' (float16).
    """
    global model, tokenizer
    if model is None:
        print(f"📥 Loading tokenizer from {TOKENIZER_REPO}...")
        tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_REPO)
        if tokenizer.pad_token is None:
            # Llama tokenizers ship without a pad token; reuse EOS.
            tokenizer.pad_token = tokenizer.eos_token

        print(f"📥 Loading model weights (Full Precision for CPU)...")
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_REPO,
            # CHANGE: Use float32 because CPU doesn't support 'Half' (float16)
            torch_dtype=torch.float32,
            device_map="cpu",  # Explicitly target CPU
            low_cpu_mem_usage=True,
        )
        print("✅ Success: System is online on CPU!")


load_model()


def chat(message, history):
    """Generate one assistant reply for `gr.ChatInterface`.

    Args:
        message: The new user message (str).
        history: Prior turns as (user, assistant) pairs; only the last 3
            are included in the prompt to bound context length.

    Returns:
        The model's reply as a stripped string, or an apology on error.
    """
    if not message.strip():
        return ""

    try:
        # Build prompt using Llama 3.2 Instruct format
        # This format helps the model understand it's a conversation
        prompt = (
            f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>"
            f"\n\n{SYSTEM_PROMPT}<|eot_id|>"
        )
        for user_msg, bot_msg in history[-3:]:
            prompt += (
                f"<|start_header_id|>user<|end_header_id|>\n\n{user_msg}<|eot_id|>"
                f"<|start_header_id|>assistant<|end_header_id|>\n\n{bot_msg}<|eot_id|>"
            )
        prompt += (
            f"<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|>"
            f"<|start_header_id|>assistant<|end_header_id|>\n\n"
        )

        # Tokenize and move to the exact same device as the model
        inputs = tokenizer(
            prompt, return_tensors="pt", truncation=True, max_length=2048
        ).to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=0.6,  # Slightly lower for more stable therapy responses
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Decode only the new tokens (skip the echoed prompt)
        input_len = inputs.input_ids.shape[1]
        response = tokenizer.decode(outputs[0][input_len:], skip_special_tokens=True)
        return response.strip()

    except Exception as e:
        # Boundary handler: surface the error in-chat instead of crashing the UI.
        print(f"❌ Generation Error: {e}")
        return f"I'm sorry, I encountered an error: {str(e)}. Please try again."


demo = gr.ChatInterface(
    fn=chat,
    title="💚 E.motion Therapy Assistant",
    theme=gr.themes.Soft(),
    chatbot=gr.Chatbot(height=450),
)

if __name__ == "__main__":
    demo.launch()