import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch
from threading import Thread
# Model configuration - using a smaller model that works well on CPU
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Global variables for model and tokenizer
model = None
tokenizer = None
def load_model():
"""Load the model and tokenizer"""
global model, tokenizer
if model is None:
print("Loading model... This may take a moment on CPU.")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
torch_dtype=torch.float32,
device_map="cpu",
low_cpu_mem_usage=True
)
print("Model loaded successfully!")
return model, tokenizer
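# Note: float32 is the safe default on CPU. As a sketch, bfloat16 can roughly
# halve memory, assuming the CPU and PyTorch build support it:
#   AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16, device_map="cpu")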
# Default system prompts
SYSTEM_PROMPTS = {
"Default Assistant": "You are a helpful, harmless, and honest AI assistant. Provide clear, accurate, and thoughtful responses.",
"Creative Writer": "You are a creative writing assistant. Help users with storytelling, poetry, and imaginative content. Be expressive and artistic.",
"Code Helper": "You are an expert programmer. Help users write, debug, and understand code. Provide clear explanations and best practices.",
"Socratic Teacher": "You are a Socratic teacher. Instead of giving direct answers, guide users to discover answers through thoughtful questions.",
"Friendly Chat": "You are a friendly conversational partner. Be warm, engaging, and personable. Use casual language and show genuine interest.",
"Custom": ""
}
def format_chat_prompt(messages, system_prompt):
"""Format messages for TinyLlama chat format"""
formatted = f"<|system|>\n{system_prompt}</s>\n"
for msg in messages:
if msg["role"] == "user":
formatted += f"<|user|>\n{msg['content']}</s>\n"
elif msg["role"] == "assistant":
formatted += f"<|assistant|>\n{msg['content']}</s>\n"
formatted += "<|assistant|>\n"
return formatted
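# For reference, a single-turn conversation renders as:
#
#   <|system|>
#   You are a helpful assistant.</s>
#   <|user|>
#   Hello!</s>
#   <|assistant|>
#
# The model's bundled chat template should produce an equivalent string via
# tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True),
# assuming the system message is included in `messages`.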
def chat(message, history, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p):
"""Main chat function with streaming support"""
global model, tokenizer
# Load model if not loaded
if model is None:
yield "⏳ Loading model for the first time... Please wait (this may take 1-2 minutes on CPU)..."
load_model()
# Determine system prompt
if system_prompt_choice == "Custom":
system_content = custom_system_prompt if custom_system_prompt.strip() else SYSTEM_PROMPTS["Default Assistant"]
else:
system_content = SYSTEM_PROMPTS.get(system_prompt_choice, SYSTEM_PROMPTS["Default Assistant"])
# Build messages list
messages = []
for msg in history:
if msg["role"] in ["user", "assistant"]:
messages.append({"role": msg["role"], "content": msg["content"]})
messages.append({"role": "user", "content": message})
try:
# Format the prompt
prompt = format_chat_prompt(messages, system_content)
        # Tokenize, truncating so prompt + new tokens fit TinyLlama's 2048-token context
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048 - int(max_tokens))
# Set up streamer
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
        # Generation parameters: greedy decoding when temperature is 0,
        # nucleus sampling otherwise (sampling flags are only passed when
        # do_sample is True, avoiding transformers warnings)
        do_sample = temperature > 0
        generation_kwargs = {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "max_new_tokens": max_tokens,
            "do_sample": do_sample,
            "streamer": streamer,
            "pad_token_id": tokenizer.eos_token_id,
            "eos_token_id": tokenizer.eos_token_id,
        }
        if do_sample:
            generation_kwargs["temperature"] = temperature
            generation_kwargs["top_p"] = top_p
# Run generation in a separate thread
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
# Stream the response
response = ""
for new_text in streamer:
response += new_text
# Clean up any remaining special tokens
clean_response = response.replace("</s>", "").strip()
yield clean_response
thread.join()
except Exception as e:
yield f"❌ Error: {str(e)}\n\nPlease try again with a shorter message or lower max tokens."
def clear_chat():
"""Clear the chat history"""
return [], ""
def export_chat(history):
"""Export chat history as text"""
if not history:
return "No chat history to export."
export_text = "# Chat Export\n\n"
for msg in history:
role = "👤 User" if msg["role"] == "user" else "🤖 Assistant"
export_text += f"## {role}\n{msg['content']}\n\n---\n\n"
return export_text
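# Example export_chat output for a single exchange (illustrative content):
#
#   # Chat Export
#
#   ## 👤 User
#   Hello!
#
#   ---
#
#   ## 🤖 Assistant
#   Hi! How can I help?
#
#   ---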
# Custom CSS
css = """
.header-container {
text-align: center;
padding: 20px;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
border-radius: 12px;
margin-bottom: 20px;
}
.header-container h1 {
color: white;
margin: 0;
font-size: 2em;
}
.header-container p {
color: rgba(255,255,255,0.9);
margin: 10px 0 0 0;
}
.header-container a {
color: #ffd700;
text-decoration: none;
font-weight: bold;
}
.header-container a:hover {
text-decoration: underline;
}
.info-box {
background: var(--background-fill-secondary);
padding: 10px 15px;
border-radius: 8px;
margin: 10px 0;
border-left: 4px solid #667eea;
}
.chatbot-container {
min-height: 500px;
}
"""
# Build the interface
with gr.Blocks(
    title="TinyLlama Chatbot (CPU)",
    theme=gr.themes.Soft(),
    css=css,
    fill_height=True
) as demo:
# Header
gr.HTML("""
<div class="header-container">
<h1>🦙 TinyLlama Chatbot</h1>
<p>Powered by TinyLlama-1.1B-Chat - Running locally on CPU</p>
<p><a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank">Built with anycoder</a></p>
</div>
""")
gr.HTML("""
<div class="info-box">
ℹ️ <strong>CPU Mode:</strong> This chatbot runs entirely on CPU without any API calls.
First response may take longer as the model loads. Responses are generated locally.
</div>
""")
with gr.Row():
# Main chat column
with gr.Column(scale=3):
chatbot = gr.Chatbot(
label="Chat",
height=500,
type="messages",
show_copy_button=True,
render_markdown=True,
elem_classes=["chatbot-container"]
)
with gr.Row():
msg = gr.Textbox(
placeholder="Type your message here... (Press Enter to send)",
label="Message",
scale=4,
lines=2,
max_lines=5,
autofocus=True
)
send_btn = gr.Button("Send 📤", variant="primary", scale=1)
with gr.Row():
clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")
regenerate_btn = gr.Button("🔄 Regenerate", variant="secondary")
export_btn = gr.Button("📥 Export", variant="secondary")
# Settings sidebar
with gr.Column(scale=1):
gr.Markdown("### ⚙️ Settings")
with gr.Accordion("System Prompt", open=True):
system_prompt_choice = gr.Dropdown(
choices=list(SYSTEM_PROMPTS.keys()),
value="Default Assistant",
label="Preset Prompts",
interactive=True
)
custom_system_prompt = gr.Textbox(
label="Custom System Prompt",
placeholder="Enter your custom system prompt here...",
lines=4,
visible=False
)
with gr.Accordion("Generation Parameters", open=False):
temperature = gr.Slider(
minimum=0.0,
maximum=2.0,
value=0.7,
step=0.1,
label="Temperature",
info="Higher = more creative, Lower = more focused"
)
max_tokens = gr.Slider(
minimum=32,
maximum=512,
value=256,
step=32,
label="Max Tokens",
info="Maximum response length (lower = faster on CPU)"
)
top_p = gr.Slider(
minimum=0.0,
maximum=1.0,
value=0.9,
step=0.05,
label="Top P",
info="Nucleus sampling parameter"
)
# Export output
export_output = gr.Textbox(
label="Exported Chat",
lines=10,
visible=False,
show_copy_button=True
)
# Examples
gr.Markdown("### 💡 Example Prompts")
gr.Examples(
examples=[
["Explain what machine learning is in simple terms"],
["Write a short poem about the ocean"],
["What are three tips for staying productive?"],
["Tell me a fun fact about space"],
["How do I make a simple pasta dish?"],
],
inputs=msg,
label=""
)
# Event handlers
def toggle_custom_prompt(choice):
return gr.Textbox(visible=(choice == "Custom"))
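    # Returning a component with only the changed property is Gradio 4's update
    # idiom; gr.update(visible=choice == "Custom") would work equivalently here.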
system_prompt_choice.change(
toggle_custom_prompt,
inputs=[system_prompt_choice],
outputs=[custom_system_prompt]
)
def user_message(message, history):
if message.strip():
history.append({"role": "user", "content": message})
return "", history
    def bot_response(history, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p):
        # Only respond when the last entry is a user message (an empty
        # submission leaves history unchanged, ending on an assistant turn)
        if not history or history[-1]["role"] != "user":
            yield history
            return
        user_msg = history[-1]["content"]
        history_for_api = history[:-1]
        history.append({"role": "assistant", "content": ""})
for response in chat(user_msg, history_for_api, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p):
history[-1]["content"] = response
yield history
    def regenerate(history, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p):
        # Need at least one full exchange, ending with an assistant reply
        if len(history) >= 2 and history[-1]["role"] == "assistant":
            # Drop the last assistant message and replay the last user message
            history = history[:-1]
            user_msg = history[-1]["content"]
            history_for_api = history[:-1]
            history.append({"role": "assistant", "content": ""})
for response in chat(user_msg, history_for_api, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p):
history[-1]["content"] = response
yield history
else:
yield history
def show_export(history):
export_text = export_chat(history)
return gr.Textbox(visible=True, value=export_text)
# Wire up events
msg.submit(
user_message,
inputs=[msg, chatbot],
outputs=[msg, chatbot],
queue=False
).then(
bot_response,
inputs=[chatbot, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p],
outputs=[chatbot]
)
send_btn.click(
user_message,
inputs=[msg, chatbot],
outputs=[msg, chatbot],
queue=False
).then(
bot_response,
inputs=[chatbot, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p],
outputs=[chatbot]
)
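    # Both submit paths share a two-step pattern: user_message appends the turn
    # immediately (queue=False bypasses the queue), then bot_response streams
    # the reply into the final history entry.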
clear_btn.click(
clear_chat,
outputs=[chatbot, msg]
)
regenerate_btn.click(
regenerate,
inputs=[chatbot, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p],
outputs=[chatbot]
)
export_btn.click(
show_export,
inputs=[chatbot],
outputs=[export_output]
)
if __name__ == "__main__":
    # The model loads lazily on the first message; call load_model() here
    # instead to pre-load it at startup (slower start, faster first reply).
    print("Starting TinyLlama Chatbot...")
demo.launch()