import gradio as gr from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer import torch from threading import Thread import re # Model configuration - using a smaller model that works well on CPU MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" # Global variables for model and tokenizer model = None tokenizer = None def load_model(): """Load the model and tokenizer""" global model, tokenizer if model is None: print("Loading model... This may take a moment on CPU.") tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) model = AutoModelForCausalLM.from_pretrained( MODEL_NAME, torch_dtype=torch.float32, device_map="cpu", low_cpu_mem_usage=True ) print("Model loaded successfully!") return model, tokenizer # Default system prompts SYSTEM_PROMPTS = { "Default Assistant": "You are a helpful, harmless, and honest AI assistant. Provide clear, accurate, and thoughtful responses.", "Creative Writer": "You are a creative writing assistant. Help users with storytelling, poetry, and imaginative content. Be expressive and artistic.", "Code Helper": "You are an expert programmer. Help users write, debug, and understand code. Provide clear explanations and best practices.", "Socratic Teacher": "You are a Socratic teacher. Instead of giving direct answers, guide users to discover answers through thoughtful questions.", "Friendly Chat": "You are a friendly conversational partner. Be warm, engaging, and personable. Use casual language and show genuine interest.", "Custom": "" } def format_chat_prompt(messages, system_prompt): """Format messages for TinyLlama chat format""" formatted = f"<|system|>\n{system_prompt}\n" for msg in messages: if msg["role"] == "user": formatted += f"<|user|>\n{msg['content']}\n" elif msg["role"] == "assistant": formatted += f"<|assistant|>\n{msg['content']}\n" formatted += "<|assistant|>\n" return formatted def chat(message, history, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p): """Main chat function with streaming support""" global model, tokenizer # Load model if not loaded if model is None: yield "âŗ Loading model for the first time... Please wait (this may take 1-2 minutes on CPU)..." load_model() # Determine system prompt if system_prompt_choice == "Custom": system_content = custom_system_prompt if custom_system_prompt.strip() else SYSTEM_PROMPTS["Default Assistant"] else: system_content = SYSTEM_PROMPTS.get(system_prompt_choice, SYSTEM_PROMPTS["Default Assistant"]) # Build messages list messages = [] for msg in history: if msg["role"] in ["user", "assistant"]: messages.append({"role": msg["role"], "content": msg["content"]}) messages.append({"role": "user", "content": message}) try: # Format the prompt prompt = format_chat_prompt(messages, system_content) # Tokenize inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048) # Set up streamer streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) # Generation parameters generation_kwargs = { "input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "max_new_tokens": max_tokens, "temperature": temperature if temperature > 0 else 0.1, "top_p": top_p, "do_sample": temperature > 0, "streamer": streamer, "pad_token_id": tokenizer.eos_token_id, "eos_token_id": tokenizer.eos_token_id, } # Run generation in a separate thread thread = Thread(target=model.generate, kwargs=generation_kwargs) thread.start() # Stream the response response = "" for new_text in streamer: response += new_text # Clean up any remaining special tokens clean_response = response.replace("", "").strip() yield clean_response thread.join() except Exception as e: yield f"❌ Error: {str(e)}\n\nPlease try again with a shorter message or lower max tokens." def clear_chat(): """Clear the chat history""" return [], "" def export_chat(history): """Export chat history as text""" if not history: return "No chat history to export." export_text = "# Chat Export\n\n" for msg in history: role = "👤 User" if msg["role"] == "user" else "🤖 Assistant" export_text += f"## {role}\n{msg['content']}\n\n---\n\n" return export_text # Custom CSS css = """ .header-container { text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 12px; margin-bottom: 20px; } .header-container h1 { color: white; margin: 0; font-size: 2em; } .header-container p { color: rgba(255,255,255,0.9); margin: 10px 0 0 0; } .header-container a { color: #ffd700; text-decoration: none; font-weight: bold; } .header-container a:hover { text-decoration: underline; } .info-box { background: var(--background-fill-secondary); padding: 10px 15px; border-radius: 8px; margin: 10px 0; border-left: 4px solid #667eea; } .chatbot-container { min-height: 500px; } """ # Build the interface with gr.Blocks( title="TinyLlama Chatbot (CPU)", theme=gr.themes.Soft(), css=css, fill_height=True, footer_links=[ {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"}, {"label": "Model", "url": "https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0"} ] ) as demo: # Header gr.HTML("""

đŸĻ™ TinyLlama Chatbot

Powered by TinyLlama-1.1B-Chat - Running locally on CPU

Built with anycoder

""") gr.HTML("""
â„šī¸ CPU Mode: This chatbot runs entirely on CPU without any API calls. First response may take longer as the model loads. Responses are generated locally.
""") with gr.Row(): # Main chat column with gr.Column(scale=3): chatbot = gr.Chatbot( label="Chat", height=500, type="messages", show_copy_button=True, render_markdown=True, elem_classes=["chatbot-container"] ) with gr.Row(): msg = gr.Textbox( placeholder="Type your message here... (Press Enter to send)", label="Message", scale=4, lines=2, max_lines=5, autofocus=True ) send_btn = gr.Button("Send 📤", variant="primary", scale=1) with gr.Row(): clear_btn = gr.Button("đŸ—‘ī¸ Clear Chat", variant="secondary") regenerate_btn = gr.Button("🔄 Regenerate", variant="secondary") export_btn = gr.Button("đŸ“Ĩ Export", variant="secondary") # Settings sidebar with gr.Column(scale=1): gr.Markdown("### âš™ī¸ Settings") with gr.Accordion("System Prompt", open=True): system_prompt_choice = gr.Dropdown( choices=list(SYSTEM_PROMPTS.keys()), value="Default Assistant", label="Preset Prompts", interactive=True ) custom_system_prompt = gr.Textbox( label="Custom System Prompt", placeholder="Enter your custom system prompt here...", lines=4, visible=False ) with gr.Accordion("Generation Parameters", open=False): temperature = gr.Slider( minimum=0.0, maximum=2.0, value=0.7, step=0.1, label="Temperature", info="Higher = more creative, Lower = more focused" ) max_tokens = gr.Slider( minimum=32, maximum=512, value=256, step=32, label="Max Tokens", info="Maximum response length (lower = faster on CPU)" ) top_p = gr.Slider( minimum=0.0, maximum=1.0, value=0.9, step=0.05, label="Top P", info="Nucleus sampling parameter" ) # Export output export_output = gr.Textbox( label="Exported Chat", lines=10, visible=False, show_copy_button=True ) # Examples gr.Markdown("### 💡 Example Prompts") gr.Examples( examples=[ ["Explain what machine learning is in simple terms"], ["Write a short poem about the ocean"], ["What are three tips for staying productive?"], ["Tell me a fun fact about space"], ["How do I make a simple pasta dish?"], ], inputs=msg, label="" ) # Event handlers def toggle_custom_prompt(choice): return gr.Textbox(visible=(choice == "Custom")) system_prompt_choice.change( toggle_custom_prompt, inputs=[system_prompt_choice], outputs=[custom_system_prompt] ) def user_message(message, history): if message.strip(): history.append({"role": "user", "content": message}) return "", history def bot_response(history, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p): if not history: yield history return user_msg = history[-1]["content"] history_for_api = history[:-1] history.append({"role": "assistant", "content": ""}) for response in chat(user_msg, history_for_api, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p): history[-1]["content"] = response yield history def regenerate(history, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p): if len(history) >= 2: # Remove last assistant message history = history[:-1] # Get last user message user_msg = history[-1]["content"] history_for_api = history[:-1] history.append({"role": "assistant", "content": ""}) for response in chat(user_msg, history_for_api, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p): history[-1]["content"] = response yield history else: yield history def show_export(history): export_text = export_chat(history) return gr.Textbox(visible=True, value=export_text) # Wire up events msg.submit( user_message, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=False ).then( bot_response, inputs=[chatbot, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p], outputs=[chatbot] ) send_btn.click( user_message, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=False ).then( bot_response, inputs=[chatbot, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p], outputs=[chatbot] ) clear_btn.click( clear_chat, outputs=[chatbot, msg] ) regenerate_btn.click( regenerate, inputs=[chatbot, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p], outputs=[chatbot] ) export_btn.click( show_export, inputs=[chatbot], outputs=[export_output] ) if __name__ == "__main__": # Pre-load model on startup (optional - can be commented out for faster startup) print("Starting TinyLlama Chatbot...") demo.launch()