"""Gradio chat front-end for a causal language model loaded with transformers.

Importing or running this module loads the tokenizer and model named by
``MODEL_ID`` and then launches a ``gr.ChatInterface`` backed by ``respond``.
"""
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr

# Initialize model and tokenizer
MODEL_ID = "abdelac/Mistral_Test"

# Speaker tags of the plain-text transcript the model is prompted with.
_USER_TAG = "Human"
_ASSISTANT_TAG = "Assistant"
# Text marking the start of a new user turn inside generated output.
_NEXT_TURN_MARKER = f"\n{_USER_TAG}:"


def load_model():
    """Load the tokenizer and model for ``MODEL_ID``.

    Returns:
        A ``(tokenizer, model)`` tuple. The model is requested with
        ``torch_dtype=torch.float16`` and ``device_map="auto"``.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    return tokenizer, model


tokenizer, model = load_model()


def _content_to_text(content):
    """Return the plain text carried by one chat message's content.

    Strings are returned unchanged. A dict contributes its ``"text"`` value
    and a list contributes the concatenation of its parts; anything else
    (``None``, file references, ...) yields an empty string.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        # NOTE(review): assumes structured content parts look like
        # {"type": "text", "text": ...} -- confirm against the installed
        # Gradio version.
        text = content.get("text")
        return text if isinstance(text, str) else ""
    if isinstance(content, list):
        return "".join(_content_to_text(part) for part in content)
    return ""


def _history_lines(history):
    """Yield one ``"<Speaker>: <text>\\n"`` transcript line per past turn.

    Accepts both history shapes a chat callback may receive: role/content
    dicts, or legacy ``(user_msg, assistant_msg)`` pairs. Turns without any
    text are skipped so they do not show up as "None" or blank lines.
    """
    for entry in history or []:
        if isinstance(entry, dict):
            role = entry.get("role")
            if role == "user":
                turns = [(_USER_TAG, entry.get("content"))]
            elif role == "assistant":
                turns = [(_ASSISTANT_TAG, entry.get("content"))]
            else:
                # Other roles have no counterpart in the transcript format.
                turns = []
        else:
            user_msg, assistant_msg = entry
            turns = [(_USER_TAG, user_msg), (_ASSISTANT_TAG, assistant_msg)]

        for tag, content in turns:
            text = _content_to_text(content)
            if text:
                yield f"{tag}: {text}\n"


def _build_prompt(message, history):
    """Assemble the transcript prompt, ending with an open assistant turn."""
    past_turns = "".join(_history_lines(history))
    return f"{past_turns}{_USER_TAG}: {message}\n{_ASSISTANT_TAG}:"


def respond(message, history):
    """Generate the assistant's reply to ``message``.

    Args:
        message: The user's newest message.
        history: Earlier turns of the conversation, either as a list of
            ``{"role": ..., "content": ...}`` dicts or as a list of
            ``(user_msg, assistant_msg)`` pairs.

    Returns:
        The newly generated text, with the prompt tokens removed, cut off
        before any "Human:" turn the model may have appended, and stripped
        of surrounding whitespace.
    """
    # Format chat history
    prompt = _build_prompt(message, history)

    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only what was generated: the output sequence starts with the
    # prompt tokens, so slice them off first.
    prompt_length = inputs.input_ids.shape[1]
    response = tokenizer.decode(
        outputs[0][prompt_length:],
        skip_special_tokens=True,
    )

    # The transcript format has no stop sequence, so the continuation may run
    # on into an invented next user turn; keep only the assistant's own text.
    return response.split(_NEXT_TURN_MARKER, 1)[0].strip()


# Create chat interface
# NOTE(review): the title/description say "TinyLlama" while MODEL_ID points at
# "abdelac/Mistral_Test" -- confirm which name should be shown to users.
demo = gr.ChatInterface(
    respond,
    title="TinyLlama Chat",
    description="Chat with TinyLlama model",
)
demo.launch()