"""Gradio chat UI for the HuggingFaceTB/SmolLM3-3B causal language model.

Loads the model once at import time, wraps each user message in a fixed
system prompt, samples a reply with `model.generate`, and serves a
`gr.ChatInterface` with sampling-parameter sliders.
"""

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Model and tokenizer are loaded once at import time and shared (read-only)
# by every request handled by the Gradio app.
model_path = "HuggingFaceTB/SmolLM3-3B"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Some causal-LM tokenizers ship without a pad token; reuse EOS so that
# tokenizer(..., padding=True) and generate(pad_token_id=...) both work.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

SYSTEM_PROMPT = (
    "You are smollm3-3b, an AI assistant built on the SmolLM3 architecture. "
    "You are direct, efficient, and helpful.\n\n"
    "**Identity:** Concise and human-like conversation.\n\n"
    "**Response Rules:** "
    "- Give ONE clear, relevant response per query. "
    "- Stay strictly on topic. No tangents, filler, or repetition. "
    "- If unsure, say you don’t know instead of guessing. "
    "- Do not hallucinate information.\n\n"
    "**Response Style:** "
    "- Short, clear, natural. "
    "- Prioritize brevity and sense over detail. "
    "- Friendly by default, formal if asked.\n\n"
    "**Capabilities:** "
    "- Answer any topic sensibly."
)


def build_context(user_message):
    """Wrap a single user message in the system prompt and chat scaffold.

    NOTE(review): this hand-rolled "User:/Assistant:" format bypasses the
    model's own chat template; `tokenizer.apply_chat_template` would be the
    documented approach — confirm before changing, as it alters outputs.
    """
    return SYSTEM_PROMPT + "\n\nUser: " + user_message + "\n\nAssistant:"


def _clean_response(response):
    """Strip role labels and leftover special tokens from raw decoded text.

    The model sometimes echoes an "Assistant:" label or fabricates a
    follow-up "User:" turn; both are trimmed so only the reply remains.
    """
    # Remove any leftover ChatML end marker the decode step did not strip.
    response = response.replace("<|im_end|>", "").strip()
    lines = response.splitlines()
    first_line = lines[0].strip() if lines else ""

    # Fast path: the reply starts with an "Assistant:" label (any case).
    # The original checked "Assistant:" and "assistant:" separately, but the
    # comparison was case-insensitive either way — one check is equivalent.
    prefix = "assistant:"
    if first_line.lower().startswith(prefix):
        return first_line[len(prefix):].strip()

    # Otherwise cut each line at the first role label (dropping the label and
    # everything after it) and discard lines that end up empty.
    cleaned_lines = []
    for line in lines:
        for label in ("Assistant:", "assistant:", "User:", "user:"):
            if label in line:
                line = line.split(label)[0].strip()
        if line:
            cleaned_lines.append(line)
    return "\n".join(cleaned_lines)


def generate_response(
    prompt,
    max_tokens=300,
    temperature=0.45,
    top_p=0.95,
    repetition_penalty=1.1,
    top_k=35,
):
    """Generate one assistant reply for `prompt`.

    Args:
        prompt: The raw user message (system prompt is added internally).
        max_tokens: Maximum number of NEW tokens to sample.
        temperature / top_p / top_k / repetition_penalty: standard
            `model.generate` sampling controls.

    Returns:
        The cleaned reply text (role labels and special tokens removed).
    """
    formatted_input = build_context(prompt)
    inputs = tokenizer(
        formatted_input,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=1024,
    )
    # Keep input tensors on the model's device so this also works when the
    # model is loaded onto a GPU; a no-op for the default CPU load above.
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            repetition_penalty=repetition_penalty,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens, skipping the echoed prompt.
    new_tokens = outputs[0][input_ids.shape[-1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return _clean_response(response)


def respond(
    message,
    history,
    max_tokens,
    temperature,
    top_p,
    repetition_penalty,
    top_k,
):
    """Gradio ChatInterface callback.

    `history` is required by the ChatInterface signature but unused: each
    reply is generated single-turn from `message` alone.
    """
    return generate_response(
        message,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        top_k=top_k,
    )


# Slider values are passed positionally to `respond` after (message, history),
# so their order here must match the extra parameters of `respond`.
chatbot = gr.ChatInterface(
    respond,
    type="messages",
    title="Chat with smollm3-3b",
    description="Open-source AI model, beta, 0 restrictions, answers all topics.",
    additional_inputs=[
        gr.Slider(minimum=25, maximum=500, value=50, step=10, label="Max new tokens"),
        gr.Slider(minimum=0.01, maximum=1.0, value=0.2, step=0.01, label="Temperature"),
        gr.Slider(minimum=0.5, maximum=1.0, value=0.9, step=0.01, label="Top-p (nucleus sampling)"),
        gr.Slider(minimum=1.0, maximum=1.5, value=1.1, step=0.001, label="Repetition penalty"),
        gr.Slider(minimum=1, maximum=100, value=20, step=1, label="Top-k (prediction sampling)"),
    ],
)

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    chatbot.render()

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False, debug=True, mcp_server=True)