# plain_untuned / app.py
# (Hugging Face Space file-listing header, kept as a comment: uploaded by
#  simonper — "Update app.py" — commit 83cbea2, verified)
import gradio as gr
from llama_cpp import Llama
from transformers import AutoTokenizer

# 4-bit quantized (Q4_K_M) GGUF build of Llama-3.2-1B, pulled from the HF Hub.
MODEL_REPO = "simonper/Llama-3.2-1B-bnb-4bit_untrained_gguf_4bit"
MODEL_FILE = "Llama-3.2-1B.Q4_K_M.gguf"
# Tokenizer comes from a separate repo; it is used only to render the chat
# template into a prompt string — generation itself runs through llama.cpp.
TOKENIZER_ID = "chthees/lora_model_full_finetome-tokenizer"

print("Loading Tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_ID)

print("Loading Model...")
# NOTE(review): n_ctx=2048 / n_threads=2 look sized for a small CPU Space —
# confirm against the deployment hardware before changing.
llm = Llama.from_pretrained(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,
    n_ctx=2048,       # context window in tokens
    n_threads=2,
    verbose=False
)
# --- SYSTEM PROMPT LOGIC ---
def get_system_prompt(style_mode):
    """Build the system prompt for the selected persona style.

    Every persona shares one base instruction and appends a style-specific
    directive. Unrecognized style names fall back to the "Normal" persona.
    """
    base = "You are a helpful and intelligent AI assistant."
    persona_directives = {
        "Normal": "Answer clearly and concisely.",
        "Professional": (
            "You are a senior corporate executive. "
            "Your tone is strictly professional, polite, and business-oriented."
        ),
        "Shakespeare": (
            "You are William Shakespeare. "
            "Speak only in Early Modern English (thee, thou, hath). Be poetic and dramatic."
        ),
        "Funny/Ironic": (
            "You are a sarcastic comedian. "
            "Wrap your answers in dry humor, irony, and witty remarks."
        ),
    }
    directive = persona_directives.get(style_mode, persona_directives["Normal"])
    return f"{base} {directive}"
# --- CORE RESPONSE FUNCTION ---
def respond(
    message,
    history: list[dict],
    system_message_dummy,
    max_tokens,
    temperature,
    top_p,
    repetition_penalty,
    style_mode,
):
    """Generate one assistant reply for the Gradio ChatInterface.

    `system_message_dummy` is intentionally unused — it is the hidden textbox
    value that ChatInterface passes through as an additional input; the real
    system prompt is derived from `style_mode`.
    """
    # Message order: persona system prompt, then the 10 most recent history
    # turns (capped so the prompt stays within the 2048-token context), then
    # the new user message.
    chat = [{"role": "system", "content": get_system_prompt(style_mode)}]
    chat.extend(
        {"role": turn["role"], "content": turn["content"]}
        for turn in history[-10:]
    )
    chat.append({"role": "user", "content": message})

    # Render the message list into a single prompt string; generation happens
    # through llama.cpp, so the tokenizer is used only for templating.
    prompt = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
    )

    result = llm(
        prompt,
        max_tokens=int(max_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
        repeat_penalty=float(repetition_penalty),
        stop=[tokenizer.eos_token, "<|eot_id|>"],
        echo=False,  # return only the completion, not the prompt
    )
    return result["choices"][0]["text"].strip()
# --- GUI SETUP ---
# ChatInterface wires `respond` to a chat widget; each additional_inputs entry
# maps positionally onto respond's parameters after (message, history).
chatbot = gr.ChatInterface(
    respond,
    type="messages",  # history arrives as a list of {"role", "content"} dicts
    additional_inputs=[
        # Hidden placeholder: keeps the signature slot for a system message
        # even though `respond` derives the real prompt from the style dropdown.
        gr.Textbox(value="", label="System Prompt (Hidden)", visible=False),
        gr.Slider(minimum=1, maximum=1024, value=512, label="Max New Tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.9, label="Top-p"),
        gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.05, label="Repetition Penalty"),
        # Must match the keys handled by get_system_prompt.
        gr.Dropdown(
            choices=["Normal", "Professional", "Shakespeare", "Funny/Ironic"],
            value="Normal",
            label="Choose the Style / Tone"
        )
    ],
)
# Page layout: title, a sidebar with a login button, and the chat interface
# rendered inside the Blocks context.
with gr.Blocks() as demo:
    gr.Markdown("# Styled Chat Bot")
    with gr.Sidebar():
        gr.LoginButton()
    chatbot.render()

if __name__ == "__main__":
    demo.launch()