import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch
from threading import Thread
import re
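# Assumed dependencies: this file is expected to ship with a requirements.txt
# (contents below are an assumption, not taken from the Space). "accelerate" is
# listed because from_pretrained() is called with device_map, which needs it.
#   gradio
#   transformers
#   torch
#   accelerate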
# Model configuration - using a smaller model that works well on CPU
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Global variables for model and tokenizer
model = None
tokenizer = None

def load_model():
    """Load the model and tokenizer"""
    global model, tokenizer
    if model is None:
        print("Loading model... This may take a moment on CPU.")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float32,
            device_map="cpu",
            low_cpu_mem_usage=True
        )
        print("Model loaded successfully!")
    return model, tokenizer
# Default system prompts
SYSTEM_PROMPTS = {
    "Default Assistant": "You are a helpful, harmless, and honest AI assistant. Provide clear, accurate, and thoughtful responses.",
    "Creative Writer": "You are a creative writing assistant. Help users with storytelling, poetry, and imaginative content. Be expressive and artistic.",
    "Code Helper": "You are an expert programmer. Help users write, debug, and understand code. Provide clear explanations and best practices.",
    "Socratic Teacher": "You are a Socratic teacher. Instead of giving direct answers, guide users to discover answers through thoughtful questions.",
    "Friendly Chat": "You are a friendly conversational partner. Be warm, engaging, and personable. Use casual language and show genuine interest.",
    "Custom": ""
}
def format_chat_prompt(messages, system_prompt):
    """Format messages for TinyLlama chat format"""
    formatted = f"<|system|>\n{system_prompt}</s>\n"
    for msg in messages:
        if msg["role"] == "user":
            formatted += f"<|user|>\n{msg['content']}</s>\n"
        elif msg["role"] == "assistant":
            formatted += f"<|assistant|>\n{msg['content']}</s>\n"
    formatted += "<|assistant|>\n"
    return formatted
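# For illustration, a single-turn conversation formatted by format_chat_prompt()
# looks like this (the user text is an invented example):
#
#   <|system|>
#   You are a helpful, harmless, and honest AI assistant. ...</s>
#   <|user|>
#   Hello!</s>
#   <|assistant|>
#
# If the TinyLlama tokenizer ships a chat template (as most chat models on the Hub
# do), an equivalent alternative would be this sketch, not what the app currently
# does:
#
#   prompt = tokenizer.apply_chat_template(
#       [{"role": "system", "content": system_prompt}] + messages,
#       tokenize=False,
#       add_generation_prompt=True,
#   )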
def chat(message, history, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p):
    """Main chat function with streaming support"""
    global model, tokenizer

    # Load model if not loaded
    if model is None:
        yield "⏳ Loading model for the first time... Please wait (this may take 1-2 minutes on CPU)..."
        load_model()

    # Determine system prompt
    if system_prompt_choice == "Custom":
        system_content = custom_system_prompt if custom_system_prompt.strip() else SYSTEM_PROMPTS["Default Assistant"]
    else:
        system_content = SYSTEM_PROMPTS.get(system_prompt_choice, SYSTEM_PROMPTS["Default Assistant"])

    # Build messages list
    messages = []
    for msg in history:
        if msg["role"] in ["user", "assistant"]:
            messages.append({"role": msg["role"], "content": msg["content"]})
    messages.append({"role": "user", "content": message})

    try:
        # Format the prompt
        prompt = format_chat_prompt(messages, system_content)

        # Tokenize
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)

        # Set up streamer
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

        # Generation parameters
        generation_kwargs = {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "max_new_tokens": max_tokens,
            "temperature": temperature if temperature > 0 else 0.1,
            "top_p": top_p,
            "do_sample": temperature > 0,
            "streamer": streamer,
            "pad_token_id": tokenizer.eos_token_id,
            "eos_token_id": tokenizer.eos_token_id,
        }

        # Run generation in a separate thread
        thread = Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()

        # Stream the response
        response = ""
        for new_text in streamer:
            response += new_text
            # Clean up any remaining special tokens
            clean_response = response.replace("</s>", "").strip()
            yield clean_response

        thread.join()
    except Exception as e:
        yield f"❌ Error: {str(e)}\n\nPlease try again with a shorter message or lower max tokens."
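# Illustrative smoke test for chat() outside the UI. The argument values are
# arbitrary examples, not defaults used elsewhere in the app; running this will
# trigger a full model download/load:
#
#   last = ""
#   for last in chat("Hello!", [], "Default Assistant", "", 0.7, 64, 0.9):
#       pass
#   print(last)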
def clear_chat():
    """Clear the chat history"""
    return [], ""

def export_chat(history):
    """Export chat history as text"""
    if not history:
        return "No chat history to export."
    export_text = "# Chat Export\n\n"
    for msg in history:
        role = "👤 User" if msg["role"] == "user" else "🤖 Assistant"
        export_text += f"## {role}\n{msg['content']}\n\n---\n\n"
    return export_text
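# For reference, a one-exchange history exports as Markdown along these lines
# (the message text is an invented example):
#
#   # Chat Export
#
#   ## 👤 User
#   Hello!
#
#   ---
#
#   ## 🤖 Assistant
#   Hi there! How can I help?
#
#   ---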
# Custom CSS
css = """
.header-container {
    text-align: center;
    padding: 20px;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    border-radius: 12px;
    margin-bottom: 20px;
}
.header-container h1 {
    color: white;
    margin: 0;
    font-size: 2em;
}
.header-container p {
    color: rgba(255,255,255,0.9);
    margin: 10px 0 0 0;
}
.header-container a {
    color: #ffd700;
    text-decoration: none;
    font-weight: bold;
}
.header-container a:hover {
    text-decoration: underline;
}
.info-box {
    background: var(--background-fill-secondary);
    padding: 10px 15px;
    border-radius: 8px;
    margin: 10px 0;
    border-left: 4px solid #667eea;
}
.chatbot-container {
    min-height: 500px;
}
"""
# Build the interface
# Note: "footer_links" is not a gr.Blocks parameter in current Gradio releases and
# raises a TypeError at startup (a likely cause of this Space's runtime error), so
# those links are surfaced in the header HTML below instead.
with gr.Blocks(
    title="TinyLlama Chatbot (CPU)",
    theme=gr.themes.Soft(),
    css=css,
    fill_height=True
) as demo:
    # Header
    gr.HTML("""
    <div class="header-container">
        <h1>🦙 TinyLlama Chatbot</h1>
        <p>Powered by TinyLlama-1.1B-Chat - Running locally on CPU</p>
        <p>
            <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank">Built with anycoder</a>
            &nbsp;·&nbsp;
            <a href="https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0" target="_blank">Model card</a>
        </p>
    </div>
    """)

    gr.HTML("""
    <div class="info-box">
        ℹ️ <strong>CPU Mode:</strong> This chatbot runs entirely on CPU without any API calls.
        First response may take longer as the model loads. Responses are generated locally.
    </div>
    """)
    with gr.Row():
        # Main chat column
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(
                label="Chat",
                height=500,
                type="messages",
                show_copy_button=True,
                render_markdown=True,
                elem_classes=["chatbot-container"]
            )
            with gr.Row():
                msg = gr.Textbox(
                    placeholder="Type your message here... (Press Enter to send)",
                    label="Message",
                    scale=4,
                    lines=2,
                    max_lines=5,
                    autofocus=True
                )
                send_btn = gr.Button("Send 📤", variant="primary", scale=1)
            with gr.Row():
                clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")
                regenerate_btn = gr.Button("🔄 Regenerate", variant="secondary")
                export_btn = gr.Button("📥 Export", variant="secondary")

        # Settings sidebar
        with gr.Column(scale=1):
            gr.Markdown("### ⚙️ Settings")

            with gr.Accordion("System Prompt", open=True):
                system_prompt_choice = gr.Dropdown(
                    choices=list(SYSTEM_PROMPTS.keys()),
                    value="Default Assistant",
                    label="Preset Prompts",
                    interactive=True
                )
                custom_system_prompt = gr.Textbox(
                    label="Custom System Prompt",
                    placeholder="Enter your custom system prompt here...",
                    lines=4,
                    visible=False
                )

            with gr.Accordion("Generation Parameters", open=False):
                temperature = gr.Slider(
                    minimum=0.0,
                    maximum=2.0,
                    value=0.7,
                    step=0.1,
                    label="Temperature",
                    info="Higher = more creative, Lower = more focused"
                )
                max_tokens = gr.Slider(
                    minimum=32,
                    maximum=512,
                    value=256,
                    step=32,
                    label="Max Tokens",
                    info="Maximum response length (lower = faster on CPU)"
                )
                top_p = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.9,
                    step=0.05,
                    label="Top P",
                    info="Nucleus sampling parameter"
                )

            # Export output
            export_output = gr.Textbox(
                label="Exported Chat",
                lines=10,
                visible=False,
                show_copy_button=True
            )

    # Examples
    gr.Markdown("### 💡 Example Prompts")
    gr.Examples(
        examples=[
            ["Explain what machine learning is in simple terms"],
            ["Write a short poem about the ocean"],
            ["What are three tips for staying productive?"],
            ["Tell me a fun fact about space"],
            ["How do I make a simple pasta dish?"],
        ],
        inputs=msg,
        label=""
    )
    # Event handlers
    def toggle_custom_prompt(choice):
        return gr.Textbox(visible=(choice == "Custom"))

    system_prompt_choice.change(
        toggle_custom_prompt,
        inputs=[system_prompt_choice],
        outputs=[custom_system_prompt]
    )

    def user_message(message, history):
        if message.strip():
            history.append({"role": "user", "content": message})
        return "", history

    def bot_response(history, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p):
        # Only generate when the last entry is a user message; this also skips
        # empty submissions, which never add a user turn in user_message()
        if not history or history[-1]["role"] != "user":
            yield history
            return
        user_msg = history[-1]["content"]
        history_for_api = history[:-1]
        history.append({"role": "assistant", "content": ""})
        for response in chat(user_msg, history_for_api, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p):
            history[-1]["content"] = response
            yield history

    def regenerate(history, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p):
        # Need at least one exchange, with the assistant reply as the last entry
        if len(history) >= 2 and history[-1]["role"] == "assistant":
            # Remove last assistant message
            history = history[:-1]
            # Get last user message
            user_msg = history[-1]["content"]
            history_for_api = history[:-1]
            history.append({"role": "assistant", "content": ""})
            for response in chat(user_msg, history_for_api, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p):
                history[-1]["content"] = response
                yield history
        else:
            yield history

    def show_export(history):
        export_text = export_chat(history)
        return gr.Textbox(visible=True, value=export_text)
    # Wire up events
    msg.submit(
        user_message,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
        queue=False
    ).then(
        bot_response,
        inputs=[chatbot, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p],
        outputs=[chatbot]
    )

    send_btn.click(
        user_message,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
        queue=False
    ).then(
        bot_response,
        inputs=[chatbot, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p],
        outputs=[chatbot]
    )

    clear_btn.click(
        clear_chat,
        outputs=[chatbot, msg]
    )

    regenerate_btn.click(
        regenerate,
        inputs=[chatbot, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p],
        outputs=[chatbot]
    )

    export_btn.click(
        show_export,
        inputs=[chatbot],
        outputs=[export_output]
    )
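# Streaming generator outputs are served through Gradio's queue, which is enabled
# by default on recent Gradio releases. If streaming stalls on an older version,
# enabling it explicitly is a reasonable thing to try (assumption, not part of the
# original app):
# demo.queue()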
if __name__ == "__main__":
    print("Starting TinyLlama Chatbot...")
    # Optionally pre-load the model here so the first chat request is faster;
    # leaving it commented out keeps startup quick and loads lazily on first use.
    # load_model()
    demo.launch()