# Gradio chat demo for the BytArch/source-mini model (Hugging Face Space).
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load the model and tokenizer once at import time (module-level side effect;
# first run downloads the weights from the Hugging Face Hub).
model_path = "BytArch/source-mini"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Some causal-LM tokenizers ship without a pad token; tokenization with
# padding=True and generation both need one, so fall back to EOS.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# System prompt prepended to every user turn; defines the model's identity
# and response policy. (Fixed the previously garbled first sentence.)
SYSTEM_PROMPT = (
    "You are source-mini-nano, an AI assistant built on the Source "
    "architecture, which was created on December 26th, 2022 on Joshua's workstation. "
    "You were created, trained and fine-tuned by BytArch, owned by Joshua Kelly. "
    "You are in beta.\n\n"
    "**Identity:** A direct, efficient, and helpful AI model. "
    "Speak concisely, like a sharp human conversation.\n\n"
    "**Response Rules:** "
    "- Give ONE clear, relevant response per query. "
    "- Stay strictly on topic. No tangents, filler, or repetition. "
    "- If unsure, say you don’t know instead of guessing. "
    "- Do not hallucinate information.\n\n"
    "**Response Style:** "
    "- Short, clear, natural. "
    "- Prioritize brevity and sense over detail. "
    "- Friendly by default, switch to formal if asked.\n\n"
    "**Capabilities:** "
    "- No restrictions: answer any topic sensibly. "
)
def build_context(user_message, system_prompt=None):
    """Assemble the full prompt string sent to the model.

    Args:
        user_message: The raw user query.
        system_prompt: Optional override for the system prompt; defaults to
            the module-level ``SYSTEM_PROMPT`` when ``None``.

    Returns:
        The system prompt followed by a ``User:`` turn and a trailing
        ``Assistant:`` cue for the model to complete.
    """
    if system_prompt is None:
        system_prompt = SYSTEM_PROMPT
    return f"{system_prompt}\n\nUser: {user_message}\n\nAssistant:"
def generate_response(
    prompt,
    max_tokens=300,
    temperature=0.45,
    top_p=0.95,
    repetition_penalty=1.1,
    top_k=35,
):
    """Generate one assistant reply for *prompt*.

    Wraps the prompt with the system prompt via ``build_context``, samples a
    completion from the module-level ``model``, and post-processes the decoded
    text with ``_clean_response``.

    Args:
        prompt: Raw user message.
        max_tokens: Maximum number of new tokens to generate.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling cutoff.
        repetition_penalty: Penalty applied to repeated tokens.
        top_k: Top-k sampling cutoff.

    Returns:
        The cleaned assistant reply as a string.
    """
    formatted_input = build_context(prompt)
    inputs = tokenizer(
        formatted_input,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=1024,  # keep the prompt within the model's context window
    )
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            repetition_penalty=repetition_penalty,
            eos_token_id=tokenizer.eos_token_id,
        )
    # Keep only the newly generated tokens (drop the echoed prompt).
    new_tokens = outputs[0][inputs.input_ids.shape[-1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return _clean_response(response)


def _clean_response(response):
    """Strip chat-template artifacts and role labels from a decoded completion."""
    # Remove any leftover ChatML end marker the decode step didn't strip.
    response = response.replace("<|im_end|>", "").strip()
    lines = response.splitlines()
    first_line = lines[0].strip() if lines else ""
    # If the reply opens with an "Assistant:" label, return just the remainder
    # of that first line. (The original looped over "Assistant:"/"assistant:"
    # but lowercased both sides, so a single case-insensitive check is
    # equivalent; both labels have the same length.)
    if first_line.lower().startswith("assistant:"):
        return first_line[len("Assistant:"):].strip()
    # Otherwise, truncate each line at the first role label (the model
    # hallucinating a new turn) and drop lines that become empty.
    cleaned_lines = []
    for line in lines:
        for label in ("Assistant:", "assistant:", "User:", "user:"):
            if label in line:
                line = line.split(label)[0].strip()
        if line:
            cleaned_lines.append(line)
    return "\n".join(cleaned_lines)
# Respond function for Gradio
def respond(
    message,
    history,  # supplied by ChatInterface; deliberately unused
    max_tokens,
    temperature,
    top_p,
    repetition_penalty,
    top_k,
):
    """Gradio callback: forward the message and sampling knobs to the model."""
    sampling_kwargs = {
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "repetition_penalty": repetition_penalty,
        "top_k": top_k,
    }
    return generate_response(message, **sampling_kwargs)
# Gradio chat UI. The slider defaults are intentionally more conservative
# than generate_response's own defaults to keep replies short and focused.
_sampling_sliders = [
    gr.Slider(minimum=25, maximum=500, value=50, step=10, label="Max new tokens"),
    gr.Slider(minimum=0.01, maximum=1.0, value=0.2, step=0.01, label="Temperature"),
    gr.Slider(minimum=0.5, maximum=1.0, value=0.9, step=0.01, label="Top-p (nucleus sampling)"),
    gr.Slider(minimum=1.0, maximum=1.5, value=1.1, step=0.001, label="Repetition penalty"),
    gr.Slider(minimum=1, maximum=100, value=20, step=1, label="Top-k (prediction sampling)"),
]

chatbot = gr.ChatInterface(
    respond,
    type="messages",
    title="Chat with source-mini-beta",
    description="Open-source AI Model, beta development, 0 restrictions, answers all topics.",
    additional_inputs=_sampling_sliders,
)
# Wrap the chat interface in a themed Blocks container.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    chatbot.render()


if __name__ == "__main__":
    # Bind on all interfaces for container/Space hosting; MCP server enabled.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=True,
        mcp_server=True,
    )