import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

base_model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
adapter_model = "Sanjay002/falcon-7b-mental-health-finetuned"

device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map=None,  # load on CPU first; moved to the target device explicitly below
    torch_dtype=torch.float32 if device == "cpu" else torch.bfloat16,
    trust_remote_code=True,
)

# Attach the LoRA adapter on top of the base weights. A PEFT adapter must have
# been trained against this base architecture for the weights to line up.
model = PeftModel.from_pretrained(model, adapter_model)
model.to(device)
model.eval()


def chat(message):
    # The message is passed to the model as a raw prompt, matching the original script.
    inputs = tokenizer(message, return_tensors="pt").to(device)
    with torch.inference_mode():  # no gradients needed for generation
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            do_sample=True,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,  # Llama-style tokenizers define no pad token
        )
    # Decode only the newly generated tokens so the user's prompt is not echoed back.
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)


gr.Interface(
    fn=chat,
    inputs="text",
    outputs="text",
    title="🧠 Mental Health Chatbot",
).queue().launch()
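# Usage note (a sketch, assuming the dependencies are installed via
# `pip install gradio torch transformers peft`):
#
#   python app.py
#
# launch() serves the Gradio UI at http://127.0.0.1:7860 by default; pass
# share=True to launch() if a temporary public link is needed.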