"""Gradio chat demo that serves a fine-tuned causal LM with greedy decoding."""

import re

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "basmala12/smollm_finetuning5"

# Limits applied to every reply by the conciseness filter.
MAX_SENTENCES = 2
MAX_WORDS = 40

# Matches the whitespace that follows sentence-ending punctuation.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+")

# Load tokenizer & model once at startup so every request reuses them.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model.eval()


def _make_concise(text, max_sentences=MAX_SENTENCES, max_words=MAX_WORDS):
    """Trim *text* to its first few sentences and cap the word count.

    Args:
        text: Decoded model output.
        max_sentences: Number of leading sentences to keep.
        max_words: Maximum number of whitespace-separated words to keep.

    Returns:
        The shortened text. When the word cap cuts the text, the result is
        terminated with exactly one sentence-ending punctuation mark.
    """
    sentences = _SENTENCE_BOUNDARY.split(text)
    shortened = " ".join(sentences[:max_sentences])

    words = shortened.split()
    if len(words) <= max_words:
        return shortened

    # Drop dangling clause punctuation, and only add a period when the cut
    # text does not already end a sentence (avoids "word.." / "word,.").
    clipped = " ".join(words[:max_words]).rstrip(",;:")
    if not clipped.endswith((".", "!", "?")):
        clipped += "."
    return clipped


def respond(message, history, system_message, max_tokens, temperature, top_p):
    """Generate a short, deterministic reply for the chat UI.

    The conversation is rendered with the tokenizer's chat template, decoded
    greedily (no sampling), and then passed through a generic conciseness
    filter. No answers are hardcoded for specific questions.

    Args:
        message: The current user message.
        history: Previous turns as a list of ``{"role", "content"}`` dicts.
        system_message: Text for the system turn that opens the conversation.
        max_tokens: Upper bound on newly generated tokens.
        temperature: Accepted for UI compatibility; unused (greedy decoding).
        top_p: Accepted for UI compatibility; unused (greedy decoding).

    Returns:
        The model's reply, trimmed to at most ``MAX_SENTENCES`` sentences and
        ``MAX_WORDS`` words.
    """
    # Build conversation for the chat template: system, past turns, new turn.
    messages = [{"role": "system", "content": system_message}]
    messages.extend(history or [])
    messages.append({"role": "user", "content": message})

    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # The rendered template already carries any special tokens it needs, so
    # the tokenizer must not add them a second time.
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
    inputs = inputs.to(model.device)

    # Greedy decoding; sampling knobs such as temperature are deliberately
    # not passed because they have no effect when do_sample=False.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=int(max_tokens),
            do_sample=False,
        )

    # Keep only the newly generated tokens (everything after the prompt).
    prompt_length = inputs["input_ids"].shape[1]
    generated_tokens = outputs[0][prompt_length:]
    answer = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    return _make_concise(answer)


chatbot = gr.ChatInterface(
    fn=respond,
    type="messages",
    additional_inputs=[
        gr.Textbox(
            value=(
                "Give short, factual answers with brief logical reasoning. "
                "If you are not sure, say you are not sure instead of guessing."
            ),
            label="System message",
        ),
        gr.Slider(1, 512, value=256, step=1, label="Max new tokens"),
        gr.Slider(
            0.1,
            4.0,
            value=0.7,
            step=0.1,
            label="Temperature (ignored in deterministic mode)",
        ),
        gr.Slider(
            0.1,
            1.0,
            value=0.9,
            step=0.05,
            label="Top-p (ignored in deterministic mode)",
        ),
    ],
)


if __name__ == "__main__":
    chatbot.launch()