"""Styled chat bot: Gradio UI over a 4-bit quantized Llama-3.2-1B GGUF model.

The model runs locally via llama-cpp-python; the Hugging Face tokenizer is
used only to render the conversation into a prompt string with the model's
chat template (it is not used for tokenization here).
"""

import gradio as gr
from llama_cpp import Llama
from transformers import AutoTokenizer

MODEL_REPO = "simonper/Llama-3.2-1B-bnb-4bit_untrained_gguf_4bit"
MODEL_FILE = "Llama-3.2-1B.Q4_K_M.gguf"
TOKENIZER_ID = "chthees/lora_model_full_finetome-tokenizer"

print("Loading Tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_ID)

print("Loading Model...")
llm = Llama.from_pretrained(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,
    n_ctx=2048,  # context window (tokens) — history trimming below assumes this
    n_threads=2,
    verbose=False,
)


# --- SYSTEM PROMPT LOGIC ---
def get_system_prompt(style_mode):
    """Return the system prompt for the selected persona.

    Args:
        style_mode: One of "Normal", "Professional", "Shakespeare",
            "Funny/Ironic". Any unknown value falls back to "Normal".

    Returns:
        The system-prompt string for that persona.
    """
    base_instruction = "You are a helpful and intelligent AI assistant."
    prompts = {
        "Normal": f"{base_instruction} Answer clearly and concisely.",
        "Professional": (
            f"{base_instruction} You are a senior corporate executive. "
            "Your tone is strictly professional, polite, and business-oriented."
        ),
        "Shakespeare": (
            f"{base_instruction} You are William Shakespeare. "
            "Speak only in Early Modern English (thee, thou, hath). Be poetic and dramatic."
        ),
        "Funny/Ironic": (
            f"{base_instruction} You are a sarcastic comedian. "
            "Wrap your answers in dry humor, irony, and witty remarks."
        ),
    }
    return prompts.get(style_mode, prompts["Normal"])


# --- CORE RESPONSE FUNCTION ---
def respond(
    message,
    history: list[dict],
    system_message_dummy,  # unused: fed by the hidden textbox in the UI
    max_tokens,
    temperature,
    top_p,
    repetition_penalty,
    style_mode,
):
    """Generate one assistant reply for the Gradio ChatInterface.

    Args:
        message: The current user message (string).
        history: Prior conversation as ``type="messages"`` dicts, each with
            ``role`` and ``content`` keys.
        system_message_dummy: Ignored; the real system prompt comes from
            ``style_mode`` via :func:`get_system_prompt`.
        max_tokens / temperature / top_p / repetition_penalty: Sampling
            settings from the UI sliders (coerced to int/float below).
        style_mode: Persona selected in the dropdown.

    Returns:
        The model's generated text, stripped of surrounding whitespace.
    """
    # System persona goes first.
    messages = [{"role": "system", "content": get_system_prompt(style_mode)}]

    # Keep only the last 10 history *messages* (~5 user/assistant exchanges)
    # so the rendered prompt stays within the 2048-token context window.
    for turn in history[-10:]:
        messages.append({"role": turn["role"], "content": turn["content"]})

    # Current user message last.
    messages.append({"role": "user", "content": message})

    # Render the message list into a single prompt string using the model's
    # chat template, ending with the assistant-turn header.
    prompt_str = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Some tokenizers define no eos_token (None); filter falsy entries so the
    # stop list never contains None.
    stop_tokens = [tok for tok in (tokenizer.eos_token, "<|eot_id|>") if tok]

    # Generate the response.
    output = llm(
        prompt_str,
        max_tokens=int(max_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
        repeat_penalty=float(repetition_penalty),
        stop=stop_tokens,
        echo=False,  # return only the completion, not the prompt
    )
    return output["choices"][0]["text"].strip()


# --- GUI SETUP ---
chatbot = gr.ChatInterface(
    respond,
    type="messages",
    additional_inputs=[
        # Hidden placeholder so `respond` keeps its system_message_dummy slot.
        gr.Textbox(value="", label="System Prompt (Hidden)", visible=False),
        gr.Slider(minimum=1, maximum=1024, value=512, label="Max New Tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.9, label="Top-p"),
        gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.05, label="Repetition Penalty"),
        gr.Dropdown(
            choices=["Normal", "Professional", "Shakespeare", "Funny/Ironic"],
            value="Normal",
            label="Choose the Style / Tone",
        ),
    ],
)

with gr.Blocks() as demo:
    gr.Markdown("# Styled Chat Bot")
    with gr.Sidebar():
        gr.LoginButton()
    chatbot.render()

if __name__ == "__main__":
    demo.launch()