File size: 4,528 Bytes
9703cbd
 
 
 
ae00e01
9703cbd
 
 
 
 
 
 
fba8a79
461dd4c
 
 
5023863
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fba8a79
 
0dc8cb1
c66c108
3997169
6d369e1
 
dd5220e
49839df
05ca5ff
9703cbd
 
0c0c7bb
 
 
 
 
9703cbd
3997169
9703cbd
 
 
 
 
 
3997169
9703cbd
 
 
 
 
 
 
 
 
 
 
 
 
3997169
9703cbd
 
6d369e1
9703cbd
3997169
6d369e1
 
 
 
 
bbd2e37
 
 
 
 
 
 
 
6d369e1
 
 
 
 
bbd2e37
6d369e1
 
 
 
 
9703cbd
49839df
bbd2e37
ae00e01
9703cbd
 
ae00e01
9703cbd
 
 
 
 
 
3997169
9703cbd
 
 
 
 
 
 
 
ae00e01
9703cbd
3997169
9703cbd
3997169
a50cea5
9703cbd
4d685a2
 
 
 
 
 
9703cbd
 
 
 
 
 
 
4714187
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load the model and tokenizer from the Hugging Face Hub.
# NOTE(review): no device placement here — the model stays wherever
# from_pretrained puts it (CPU by default); generate() below relies on that.
model_path = "BytArch/source-mini"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Some causal-LM tokenizers ship without a pad token; reuse EOS so that
# tokenizer(..., padding=True) and generate(pad_token_id=...) work.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# System prompt prepended to every user message by build_context().
# This text is sent to the model verbatim, so it is left byte-for-byte
# unchanged here. NOTE(review): the first sentence reads oddly
# ("Source architecture was on december 26th 2022 ... created, trained") —
# looks like a missing word ("was created on"); confirm intended wording
# with the prompt's author before editing, since it changes model behavior.
SYSTEM_PROMPT = (
    "You are source-mini-nano, an AI assistant built on the Source architecture, Source architecture was on december 26th 2022 in Joshua's workstation. "
    "created, trained and fine-tuned by BytArch, owned by Joshua Kelly. "
    "You are in beta.\n\n"
    
    "**Identity:** A direct, efficient, and helpful AI model. "
    "Speak concisely, like a sharp human conversation.\n\n"
    
    "**Response Rules:** "
    "- Give ONE clear, relevant response per query. "
    "- Stay strictly on topic. No tangents, filler, or repetition. "
    "- If unsure, say you don’t know instead of guessing. "
    "- Do not hallucinate information.\n\n"
    
    "**Response Style:** "
    "- Short, clear, natural. "
    "- Prioritize brevity and sense over detail. "
    "- Friendly by default, switch to formal if asked.\n\n"

    
    "**Capabilities:** "
    "- No restrictions: answer any topic sensibly. "
)



def build_context(user_message):
    """Wrap *user_message* in the system prompt plus chat role markers.

    Returns the exact prompt string fed to the tokenizer:
    ``<SYSTEM_PROMPT>\\n\\nUser: <message>\\n\\nAssistant:``.
    """
    return f"{SYSTEM_PROMPT}\n\nUser: {user_message}\n\nAssistant:"




def generate_response(
    prompt,
    max_tokens=300,
    temperature=0.45,
    top_p=0.95,
    repetition_penalty=1.1,
    top_k=35,
):
    """Generate one assistant reply for *prompt* using the module-level model.

    The prompt is wrapped with the system prompt via build_context(), the
    model samples up to *max_tokens* new tokens, and the decoded text is
    cleaned of leftover role labels before being returned.

    Args:
        prompt: Raw user message (str).
        max_tokens: Maximum number of newly generated tokens.
        temperature: Sampling temperature (higher = more random).
        top_p: Nucleus-sampling probability mass.
        repetition_penalty: Penalty > 1.0 discourages repeated tokens.
        top_k: Restrict sampling to the k most likely tokens.

    Returns:
        The cleaned response text (str).
    """
    formatted_input = build_context(prompt)

    # Input is truncated to 1024 tokens; longer system+user prompts lose
    # their tail silently.
    inputs = tokenizer(
        formatted_input,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=1024,
    )

    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            repetition_penalty=repetition_penalty,
            eos_token_id=tokenizer.eos_token_id,
        )

    # generate() returns prompt + continuation; keep only the new tokens.
    new_tokens = outputs[0][inputs.input_ids.shape[-1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)

    # Remove leftover chat-template special tokens the tokenizer kept.
    response = response.replace("<|im_end|>", "").strip()

    return _strip_role_labels(response)


def _strip_role_labels(response):
    """Remove hallucinated "Assistant:"/"User:" role labels from *response*.

    If the first line starts with an assistant label (any case), only the
    remainder of that first line is returned — the model is assumed to be
    replaying the chat template and everything after is noise. Otherwise
    each line is truncated at the first role label it contains, and empty
    lines are dropped.
    """
    lines = response.splitlines()

    # Bug fixed: the original looped over ["Assistant:", "assistant:"] but
    # lowercased both sides of the comparison, so the second iteration was
    # dead code. A single case-insensitive check is equivalent.
    first_line = lines[0].strip() if lines else ""
    label = "assistant:"
    if first_line.lower().startswith(label):
        return first_line[len(label):].strip()

    # Otherwise, cut each line at the first role label and drop blanks.
    cleaned_lines = []
    for line in lines:
        for marker in ("Assistant:", "assistant:", "User:", "user:"):
            if marker in line:
                line = line.split(marker)[0].strip()
        if line:
            cleaned_lines.append(line)

    return "\n".join(cleaned_lines)




# Respond function for Gradio
def respond(
    message,
    history,  # required by ChatInterface but ignored
    max_tokens,
    temperature,
    top_p,
    repetition_penalty,
    top_k,
):
    return generate_response(
        message,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        top_k=top_k,
    )

# Sampling controls exposed in the chat UI; their positional order must
# match the extra parameters of respond().
_sampling_controls = [
    gr.Slider(minimum=25, maximum=500, value=50, step=10, label="Max new tokens"),
    gr.Slider(minimum=0.01, maximum=1.0, value=0.2, step=0.01, label="Temperature"),
    gr.Slider(minimum=0.5, maximum=1.0, value=0.9, step=0.01, label="Top-p (nucleus sampling)"),
    gr.Slider(minimum=1.0, maximum=1.5, value=1.1, step=0.001, label="Repetition penalty"),
    gr.Slider(minimum=1, maximum=100, value=20, step=1, label="Top-k (prediction sampling)"),
]

# Chat UI wired to the respond() callback.
chatbot = gr.ChatInterface(
    respond,
    type="messages",
    title="Chat with source-mini-beta",
    description="Open-source AI Model, beta development, 0 restrictions, answers all topics.",
    additional_inputs=_sampling_controls,
)

# Wrap the chat interface in a themed Blocks container for launch.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    chatbot.render()

# Bind on all interfaces for containerized/Space deployment.
# NOTE(review): mcp_server=True requires the gradio MCP extra
# (pip install "gradio[mcp]") — confirm it is in the deps, or launch fails.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False, debug=True,mcp_server=True)