import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

# Automatically detect whether a GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"

model_name = "tanusrich/Mental_Health_Chatbot"

# Cap per-device memory only when a GPU is present; otherwise keep everything in RAM
max_memory = {0: "3.5GiB", "cpu": "12GiB"} if device == "cuda" else {"cpu": "12GiB"}

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # Half precision reduces memory usage on GPU; fall back to float32 on CPU,
    # where float16 generation is poorly supported
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto",       # Automatically assigns layers to the GPU if available
    low_cpu_mem_usage=True,  # Load weights incrementally to avoid a full extra copy in RAM
    max_memory=max_memory,
    offload_folder=None,     # No disk offload
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

'''
# Optionally save the model and tokenizer locally
model_save_path = "./model"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
'''

def generate_response(user_input, history=None):
    # gr.ChatInterface calls fn(message, history) and tracks the conversation
    # history itself, so `history` is accepted here but not used in the prompt.
    prompt = f"User: {user_input}\nChatbot:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=150,
            do_sample=True,  # Required for temperature/top_k/top_p to take effect
            temperature=0.7,
            top_k=50,
            top_p=0.9,
            repetition_penalty=1.2,
            pad_token_id=tokenizer.eos_token_id,
        )
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    # The decoded text includes the prompt; keep only the chatbot's latest reply
    chatbot_response = response.split("Chatbot:")[-1].strip()
    return chatbot_response

# Continuous console conversation loop (alternative to the Gradio UI)
'''
while True:
    user_input = input("You: ")  # Take user input
    if user_input.lower() in ["exit", "quit", "stop"]:
        print("Chatbot: Goodbye!")
        break
    response = generate_response(user_input)
    print("Chatbot:", response)
'''

# Initialize and launch the chat UI
chatbot = gr.ChatInterface(fn=generate_response, title="Mental Health Chatbot")
chatbot.launch()

'''
# Example
user_input = "I'm feeling suicidal."
response = generate_response(user_input)
print("Chatbot: ", response)
'''