"""Minimal local chat UI: a Gradio ChatInterface backed by TinyLlama.

The tokenizer and model are loaded once at import time and shared by every
request handled by :func:`chat`.
"""

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# When the prompt exceeds max_length, drop the OLDEST tokens. The default
# (right-side truncation) would cut off the newest user message and the
# trailing "<|assistant|>" cue, which are the parts the model needs most.
tokenizer.truncation_side = "left"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,
)

# Prompt tag emitted for each role when history arrives as role/content dicts.
_ROLE_TAGS = {"user": "<|user|>", "assistant": "<|assistant|>"}


def build_prompt(message, history):
    """Render the conversation so far plus the new message as one prompt.

    Args:
        message: The new user message.
        history: Previous turns, either as ``(user_msg, bot_msg)`` pairs or
            as ``{"role": ..., "content": ...}`` dicts. ``None`` or an empty
            sequence means there is no prior conversation.

    Returns:
        The prompt string, ending with an open ``<|assistant|>`` turn for the
        model to complete.
    """
    # NOTE(review): the model's own chat template may also expect end-of-turn
    # tokens; tokenizer.apply_chat_template could be a better fit -- confirm.
    parts = []
    for turn in history or []:
        if isinstance(turn, dict):
            tag = _ROLE_TAGS.get(turn.get("role"))
            content = turn.get("content")
            # Skip roles without a tag and non-text content; neither can be
            # rendered into this plain-text prompt.
            if tag is None or not isinstance(content, str):
                continue
            parts.append(f"{tag}\n{content}\n")
        else:
            user_msg, bot_msg = turn
            parts.append(f"<|user|>\n{user_msg}\n<|assistant|>\n{bot_msg}\n")
    parts.append(f"<|user|>\n{message}\n<|assistant|>\n")
    return "".join(parts)


def chat(message, history):
    """Generate the assistant's reply to ``message`` given ``history``.

    Args:
        message: The new user message.
        history: Prior conversation, in any format accepted by
            :func:`build_prompt`.

    Returns:
        The generated reply with surrounding whitespace stripped, or a fixed
        fallback sentence when the model produced no text.
    """
    prompt = build_prompt(message, history)
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=1024,
    )

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
        )

    # The returned sequence starts with the prompt tokens. Slice them off by
    # token count instead of by decoded-string length: decoding does not
    # necessarily round-trip the prompt text character for character.
    prompt_len = inputs["input_ids"].shape[-1]
    answer = tokenizer.decode(
        outputs[0][prompt_len:],
        skip_special_tokens=True,
    ).strip()

    return answer or "I am here. Ask me something."


demo = gr.ChatInterface(
    fn=chat,
    title="My Local LLM Chat",
    description="TinyLlama chatbot running locally without HF_TOKEN",
    examples=[
        "Hello",
        "Who are you?",
        "Explain pain in simple words",
        "Write a short Python code",
    ],
)

if __name__ == "__main__":
    demo.launch()