import re

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
MODEL_NAME = "basmala12/smollm_finetuning5"
# Load tokenizer & model once at startup
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model.eval()
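
# Optional: run on GPU when one is available. This is an assumption about the
# runtime (the Space may well be CPU-only), in which case this is a no-op.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)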
def respond(message, history, system_message, max_tokens, temperature, top_p):
    """
    Safer, generic factual mode:
    - uses the chat template properly
    - deterministic decoding (no sampling)
    - generic conciseness filter (first 1–2 sentences, with a word cap)
    - NO hardcoded answers for specific questions
    """
    # Build the conversation for the chat template.
    messages = [{"role": "system", "content": system_message}]

    # With type="messages", history is a list of
    # {"role": "user" | "assistant", "content": str} dicts.
    messages.extend(history)

    # Add the current user message.
    messages.append({"role": "user", "content": message})

    # Render the conversation with the model's chat template.
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Deterministic (greedy) generation: safer and less prone to
    # hallucination than sampling for short factual answers. Passing
    # temperature here would only trigger a transformers warning when
    # do_sample=False, so the sampling parameters are intentionally unused.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=False,  # greedy decoding, no randomness
            pad_token_id=tokenizer.eos_token_id,  # silence the pad-token warning
        )

    # Keep only the newly generated tokens (everything after the prompt).
    generated_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    answer = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    # ---------- Generic conciseness: first 1–2 sentences, word cap ----------
    # Split on sentence-ending punctuation followed by whitespace,
    # then keep only the first two sentences.
    sentences = re.split(r'(?<=[.!?])\s+', answer)
    answer = " ".join(sentences[:2])

    # Hard word cap (~40 words) as a final safety net.
    words = answer.split()
    if len(words) > 40:
        answer = " ".join(words[:40]) + "."

    return answer
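
# Illustrative sanity check (hypothetical inputs, not part of the app flow):
#   respond("Who wrote 'Pride and Prejudice'?", [],
#           "Give short, factual answers.", 64, 0.7, 0.9)
# should return a one- to two-sentence answer such as "Jane Austen."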
chatbot = gr.ChatInterface(
    fn=respond,
    type="messages",
    additional_inputs=[
        gr.Textbox(
            value=(
                "Give short, factual answers with brief logical reasoning. "
                "If you are not sure, say you are not sure instead of guessing."
            ),
            label="System message",
        ),
        gr.Slider(1, 512, value=256, step=1, label="Max new tokens"),
        gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="Temperature (ignored in deterministic mode)"),
        gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p (ignored in deterministic mode)"),
    ],
)
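
# Note: on hosted Spaces it is common to enable request queuing with
# chatbot.queue() before launch (left out here; an optional assumption,
# since queuing defaults depend on the Gradio version).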
if __name__ == "__main__":
    chatbot.launch()