🦙 TinyLlama Chatbot


import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch
from threading import Thread
import re

# Model configuration - using a smaller model that works well on CPU
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Global variables for model and tokenizer
model = None
tokenizer = None

def load_model():
    """Load the model and tokenizer"""
    global model, tokenizer
    
    if model is None:
        print("Loading model... This may take a moment on CPU.")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float32,
            device_map="cpu",
            low_cpu_mem_usage=True
        )
        print("Model loaded successfully!")
    
    return model, tokenizer

# Default system prompts
SYSTEM_PROMPTS = {
    "Default Assistant": "You are a helpful, harmless, and honest AI assistant. Provide clear, accurate, and thoughtful responses.",
    "Creative Writer": "You are a creative writing assistant. Help users with storytelling, poetry, and imaginative content. Be expressive and artistic.",
    "Code Helper": "You are an expert programmer. Help users write, debug, and understand code. Provide clear explanations and best practices.",
    "Socratic Teacher": "You are a Socratic teacher. Instead of giving direct answers, guide users to discover answers through thoughtful questions.",
    "Friendly Chat": "You are a friendly conversational partner. Be warm, engaging, and personable. Use casual language and show genuine interest.",
    "Custom": ""
}

def format_chat_prompt(messages, system_prompt):
    """Format messages for TinyLlama chat format"""
    formatted = f"<|system|>\n{system_prompt}</s>\n"
    
    for msg in messages:
        if msg["role"] == "user":
            formatted += f"<|user|>\n{msg['content']}</s>\n"
        elif msg["role"] == "assistant":
            formatted += f"<|assistant|>\n{msg['content']}</s>\n"
    
    formatted += "<|assistant|>\n"
    return formatted

def chat(message, history, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p):
    """Main chat function with streaming support"""
    global model, tokenizer
    
    # Load model if not loaded
    if model is None:
        yield "⏳ Loading model for the first time... Please wait (this may take 1-2 minutes on CPU)..."
        load_model()
    
    # Determine system prompt
    if system_prompt_choice == "Custom":
        system_content = custom_system_prompt if custom_system_prompt.strip() else SYSTEM_PROMPTS["Default Assistant"]
    else:
        system_content = SYSTEM_PROMPTS.get(system_prompt_choice, SYSTEM_PROMPTS["Default Assistant"])
    
    # Build messages list
    messages = []
    for msg in history:
        if msg["role"] in ["user", "assistant"]:
            messages.append({"role": msg["role"], "content": msg["content"]})
    
    messages.append({"role": "user", "content": message})
    
    try:
        # Format the prompt
        prompt = format_chat_prompt(messages, system_content)
        
        # Tokenize
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
        
        # Set up streamer
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
        
        # Generation parameters
        generation_kwargs = {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "max_new_tokens": max_tokens,
            "temperature": temperature if temperature > 0 else 0.1,
            "top_p": top_p,
            "do_sample": temperature > 0,
            "streamer": streamer,
            "pad_token_id": tokenizer.eos_token_id,
            "eos_token_id": tokenizer.eos_token_id,
        }
        
        # Run generation in a separate thread
        thread = Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()
        
        # Stream the response
        response = ""
        for new_text in streamer:
            response += new_text
            # Clean up any remaining special tokens
            clean_response = response.replace("</s>", "").strip()
            yield clean_response
        
        thread.join()
        
    except Exception as e:
        yield f"❌ Error: {str(e)}\n\nPlease try again with a shorter message or lower max tokens."

def clear_chat():
    """Clear the chat history"""
    return [], ""

def export_chat(history):
    """Export chat history as text"""
    if not history:
        return "No chat history to export."
    
    export_text = "# Chat Export\n\n"
    for msg in history:
        role = "👤 User" if msg["role"] == "user" else "🤖 Assistant"
        export_text += f"## {role}\n{msg['content']}\n\n---\n\n"
    
    return export_text

# Custom CSS
css = """
.header-container {
    text-align: center;
    padding: 20px;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    border-radius: 12px;
    margin-bottom: 20px;
}
.header-container h1 {
    color: white;
    margin: 0;
    font-size: 2em;
}
.header-container p {
    color: rgba(255,255,255,0.9);
    margin: 10px 0 0 0;
}
.header-container a {
    color: #ffd700;
    text-decoration: none;
    font-weight: bold;
}
.header-container a:hover {
    text-decoration: underline;
}
.info-box {
    background: var(--background-fill-secondary);
    padding: 10px 15px;
    border-radius: 8px;
    margin: 10px 0;
    border-left: 4px solid #667eea;
}
.chatbot-container {
    min-height: 500px;
}
"""

# Build the interface
with gr.Blocks(
    title="TinyLlama Chatbot (CPU)",
    theme=gr.themes.Soft(),
    css=css,
    fill_height=True,
    footer_links=[
        {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"},
        {"label": "Model", "url": "https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0"}
    ]
) as demo:
    
    # Header
    gr.HTML("""
        <div class="header-container">
            <h1>🦙 TinyLlama Chatbot</h1>
            <p>Powered by TinyLlama-1.1B-Chat - Running locally on CPU</p>
            <p><a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank">Built with anycoder</a></p>
        </div>
    """)
    
    gr.HTML("""
        <div class="info-box">
            ℹ️ <strong>CPU Mode:</strong> This chatbot runs entirely on CPU without any API calls. 
            First response may take longer as the model loads. Responses are generated locally.
        </div>
    """)
    
    with gr.Row():
        # Main chat column
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(
                label="Chat",
                height=500,
                type="messages",
                show_copy_button=True,
                render_markdown=True,
                elem_classes=["chatbot-container"]
            )
            
            with gr.Row():
                msg = gr.Textbox(
                    placeholder="Type your message here... (Press Enter to send)",
                    label="Message",
                    scale=4,
                    lines=2,
                    max_lines=5,
                    autofocus=True
                )
                send_btn = gr.Button("Send 📤", variant="primary", scale=1)
            
            with gr.Row():
                clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")
                regenerate_btn = gr.Button("🔄 Regenerate", variant="secondary")
                export_btn = gr.Button("📥 Export", variant="secondary")
        
        # Settings sidebar
        with gr.Column(scale=1):
            gr.Markdown("### ⚙️ Settings")
            
            with gr.Accordion("System Prompt", open=True):
                system_prompt_choice = gr.Dropdown(
                    choices=list(SYSTEM_PROMPTS.keys()),
                    value="Default Assistant",
                    label="Preset Prompts",
                    interactive=True
                )
                
                custom_system_prompt = gr.Textbox(
                    label="Custom System Prompt",
                    placeholder="Enter your custom system prompt here...",
                    lines=4,
                    visible=False
                )
            
            with gr.Accordion("Generation Parameters", open=False):
                temperature = gr.Slider(
                    minimum=0.0,
                    maximum=2.0,
                    value=0.7,
                    step=0.1,
                    label="Temperature",
                    info="Higher = more creative, Lower = more focused"
                )
                
                max_tokens = gr.Slider(
                    minimum=32,
                    maximum=512,
                    value=256,
                    step=32,
                    label="Max Tokens",
                    info="Maximum response length (lower = faster on CPU)"
                )
                
                top_p = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.9,
                    step=0.05,
                    label="Top P",
                    info="Nucleus sampling parameter"
                )
            
            # Export output
            export_output = gr.Textbox(
                label="Exported Chat",
                lines=10,
                visible=False,
                show_copy_button=True
            )
    
    # Examples
    gr.Markdown("### 💡 Example Prompts")
    gr.Examples(
        examples=[
            ["Explain what machine learning is in simple terms"],
            ["Write a short poem about the ocean"],
            ["What are three tips for staying productive?"],
            ["Tell me a fun fact about space"],
            ["How do I make a simple pasta dish?"],
        ],
        inputs=msg,
        label=""
    )
    
    # Event handlers
    def toggle_custom_prompt(choice):
        return gr.Textbox(visible=(choice == "Custom"))
    
    system_prompt_choice.change(
        toggle_custom_prompt,
        inputs=[system_prompt_choice],
        outputs=[custom_system_prompt]
    )
    
    def user_message(message, history):
        if message.strip():
            history.append({"role": "user", "content": message})
        return "", history
    
    def bot_response(history, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p):
        if not history:
            yield history
            return
            
        user_msg = history[-1]["content"]
        history_for_api = history[:-1]
        
        history.append({"role": "assistant", "content": ""})
        
        for response in chat(user_msg, history_for_api, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p):
            history[-1]["content"] = response
            yield history
    
    def regenerate(history, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p):
        if len(history) >= 2:
            # Remove last assistant message
            history = history[:-1]
            # Get last user message
            user_msg = history[-1]["content"]
            history_for_api = history[:-1]
            
            history.append({"role": "assistant", "content": ""})
            
            for response in chat(user_msg, history_for_api, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p):
                history[-1]["content"] = response
                yield history
        else:
            yield history
    
    def show_export(history):
        export_text = export_chat(history)
        return gr.Textbox(visible=True, value=export_text)
    
    # Wire up events
    msg.submit(
        user_message,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
        queue=False
    ).then(
        bot_response,
        inputs=[chatbot, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p],
        outputs=[chatbot]
    )
    
    send_btn.click(
        user_message,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
        queue=False
    ).then(
        bot_response,
        inputs=[chatbot, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p],
        outputs=[chatbot]
    )
    
    clear_btn.click(
        clear_chat,
        outputs=[chatbot, msg]
    )
    
    regenerate_btn.click(
        regenerate,
        inputs=[chatbot, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p],
        outputs=[chatbot]
    )
    
    export_btn.click(
        show_export,
        inputs=[chatbot],
        outputs=[export_output]
    )

if __name__ == "__main__":
    # Pre-load model on startup (optional - can be commented out for faster startup)
    print("Starting TinyLlama Chatbot...")
    demo.launch()