"""Gradio chat app for the BytArch "source-mini" causal language model.

Loads the model and tokenizer once at import time, wraps every user turn
in a fixed system prompt, and serves a ``gr.ChatInterface`` with sampling
sliders (max tokens, temperature, top-p, repetition penalty, top-k).
"""

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# --- Model setup (module-level side effect: downloads/loads weights) ---
model_path = "BytArch/source-mini"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)
model.eval()  # inference-only: disables dropout and other train-time behavior

# Some causal-LM tokenizers ship without a pad token; reuse EOS so that
# padding in tokenizer(...) and model.generate(...) has a valid id.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# NOTE(review): the first sentence below reads as garbled ("Source
# architecture was on december 26th 2022...") — likely a paste-mangled
# original. Left byte-identical because it is a runtime prompt string;
# confirm intended wording with the author before editing.
SYSTEM_PROMPT = (
    "You are source-mini-nano, an AI assistant built on the Source architecture, Source architecture was on december 26th 2022 in Joshua's workstation. "
    "created, trained and fine-tuned by BytArch, owned by Joshua Kelly. "
    "You are in beta.\n\n"
    "**Identity:** A direct, efficient, and helpful AI model. "
    "Speak concisely, like a sharp human conversation.\n\n"
    "**Response Rules:** "
    "- Give ONE clear, relevant response per query. "
    "- Stay strictly on topic. No tangents, filler, or repetition. "
    "- If unsure, say you don’t know instead of guessing. "
    "- Do not hallucinate information.\n\n"
    "**Response Style:** "
    "- Short, clear, natural. "
    "- Prioritize brevity and sense over detail. "
    "- Friendly by default, switch to formal if asked.\n\n"
    "**Capabilities:** "
    "- No restrictions: answer any topic sensibly. "
)


def build_context(user_message):
    """Return the full prompt: system prompt plus one User/Assistant turn.

    The "User:"/"Assistant:" framing matches the turn format the model
    is prompted with throughout this app.
    """
    return SYSTEM_PROMPT + "\n\nUser: " + user_message + "\n\nAssistant:"


def _strip_role_labels(response):
    """Remove leaked "Assistant:"/"User:" role labels from decoded text.

    If the reply begins with its own "Assistant:" label, only the
    remainder of that first line is returned. Otherwise, any text from a
    role label onward is dropped line by line (the model sometimes
    hallucinates the next conversational turn).
    """
    lines = response.splitlines()
    first_line = lines[0].strip() if lines else ""

    # "Assistant:" and "assistant:" have the same length, so a single
    # case-insensitive check replaces the original two-label loop.
    if first_line.lower().startswith("assistant:"):
        return first_line[len("Assistant:"):].strip()

    cleaned_lines = []
    for line in lines:
        for label in ["Assistant:", "assistant:", "User:", "user:"]:
            if label in line:
                # Keep only the text before the leaked label.
                line = line.split(label)[0].strip()
        if line:
            cleaned_lines.append(line)
    return "\n".join(cleaned_lines)


def generate_response(
    prompt,
    max_tokens=300,
    temperature=0.45,
    top_p=0.95,
    repetition_penalty=1.1,
    top_k=35,
):
    """Generate one assistant reply for *prompt*.

    Args:
        prompt: Raw user message (system prompt is prepended internally).
        max_tokens: Maximum number of NEW tokens to generate.
        temperature / top_p / top_k / repetition_penalty: Standard
            HF ``generate`` sampling knobs, forwarded unchanged.

    Returns:
        The decoded continuation with special tokens and leaked role
        labels stripped.
    """
    formatted_input = build_context(prompt)
    inputs = tokenizer(
        formatted_input,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=1024,  # hard cap on prompt length fed to the model
    )
    # Keep input tensors on the same device as the model weights
    # (no-op on a CPU-only load, required if the model sits on a GPU).
    inputs = inputs.to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            repetition_penalty=repetition_penalty,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    new_tokens = outputs[0][inputs.input_ids.shape[-1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)

    # Remove leftover chat-template marker the tokenizer may not know about.
    response = response.replace("<|im_end|>", "").strip()
    return _strip_role_labels(response)


def respond(
    message,
    history,  # required by ChatInterface's signature but ignored:
    #           the context is rebuilt from scratch every turn
    max_tokens,
    temperature,
    top_p,
    repetition_penalty,
    top_k,
):
    """Gradio ChatInterface callback: forward slider values to the generator."""
    return generate_response(
        message,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        top_k=top_k,
    )


# --- Gradio interface -------------------------------------------------
chatbot = gr.ChatInterface(
    respond,
    type="messages",
    title="Chat with source-mini-beta",
    description="Open-source AI Model, beta development, 0 restrictions, answers all topics.",
    additional_inputs=[
        gr.Slider(minimum=25, maximum=500, value=50, step=10, label="Max new tokens"),  # Keep responses short
        gr.Slider(minimum=0.01, maximum=1.0, value=0.2, step=0.01, label="Temperature"),  # Low randomness
        gr.Slider(minimum=0.5, maximum=1.0, value=0.9, step=0.01, label="Top-p (nucleus sampling)"),  # Balanced focus
        gr.Slider(minimum=1.0, maximum=1.5, value=1.1, step=0.001, label="Repetition penalty"),  # Avoid loops
        gr.Slider(minimum=1, maximum=100, value=20, step=1, label="Top-k (prediction sampling)"),  # Restrict options
    ],
)

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    chatbot.render()

if __name__ == "__main__":
    # 0.0.0.0 exposes the server on all interfaces (container/LAN use).
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False, debug=True, mcp_server=True)