import re

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "basmala12/smollm_finetuning5"

# Load tokenizer & model once at startup
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model.eval()
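
# Move the model to a GPU when one is available; CPU remains the fallback
# (assumption: the hosting hardware varies, so it is detected at runtime).
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)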


def respond(message, history, system_message, max_tokens, temperature, top_p):
    """
    Safer, generic factual mode:
    - uses chat template properly
    - deterministic decoding (no sampling)
    - generic conciseness filter (1–2 sentences, word cap)
    - NO hardcoded answers for specific questions
    """

    # Build conversation for chat template
    messages = [{"role": "system", "content": system_message}]

    # history is a list of {"role": "user"/"assistant", "content": str}
    messages.extend(history)

    # Add current user message
    messages.append({"role": "user", "content": message})

    # Apply chat template
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
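
    # For SmolLM-style instruct models this typically renders as ChatML
    # (<|im_start|>role ... <|im_end|> blocks); the exact layout comes from
    # the tokenizer's own chat template, so it is not hardcoded here.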

    # The templated prompt already contains any special tokens, so avoid
    # adding them a second time when tokenizing.
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(model.device)

    # Deterministic (greedy) decoding: safer and less hallucination-prone
    # than sampling for a factual assistant.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=False,                      # greedy: no randomness
            pad_token_id=tokenizer.eos_token_id,  # avoid the missing-pad-token warning
        )
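
    # If sampled decoding is ever wanted, the Temperature and Top-p sliders
    # could be wired in like this instead (a sketch, intentionally disabled):
    #
    #     outputs = model.generate(
    #         **inputs,
    #         max_new_tokens=max_tokens,
    #         do_sample=True,
    #         temperature=temperature,
    #         top_p=top_p,
    #     )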

    # Take only the newly generated tokens (after the prompt)
    generated_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    answer = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    # ---------- Generic conciseness: first 1–2 sentences, word cap ----------
    # Keep only first 1–2 sentences
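    # (re.split with a lookbehind keeps each ., !, or ? attached to its sentence)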
    sentences = re.split(r'(?<=[.!?])\s+', answer)
    answer = " ".join(sentences[:2])

    # Hard cap at 40 words, closing the truncated answer with a period
    words = answer.split()
    if len(words) > 40:
        answer = " ".join(words[:40]) + "."

    return answer


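# Example direct call, bypassing the UI (handy for a quick smoke test):
#   respond("What is the capital of France?", [], "Be brief.", 64, 0.7, 0.9)
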
chatbot = gr.ChatInterface(
    fn=respond,
    type="messages",
    additional_inputs=[
        gr.Textbox(
            value=(
                "Give short, factual answers with brief logical reasoning. "
                "If you are not sure, say you are not sure instead of guessing."
            ),
            label="System message",
        ),
        gr.Slider(1, 512, value=256, step=1, label="Max new tokens"),
        gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="Temperature (ignored in deterministic mode)"),
        gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p (ignored in deterministic mode)"),
    ],
)

if __name__ == "__main__":
    chatbot.launch()