import gradio as gr from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer import torch from threading import Thread import re # Model configuration - using a smaller model that works well on CPU MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" # Global variables for model and tokenizer model = None tokenizer = None def load_model(): """Load the model and tokenizer""" global model, tokenizer if model is None: print("Loading model... This may take a moment on CPU.") tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) model = AutoModelForCausalLM.from_pretrained( MODEL_NAME, torch_dtype=torch.float32, device_map="cpu", low_cpu_mem_usage=True ) print("Model loaded successfully!") return model, tokenizer # Default system prompts SYSTEM_PROMPTS = { "Default Assistant": "You are a helpful, harmless, and honest AI assistant. Provide clear, accurate, and thoughtful responses.", "Creative Writer": "You are a creative writing assistant. Help users with storytelling, poetry, and imaginative content. Be expressive and artistic.", "Code Helper": "You are an expert programmer. Help users write, debug, and understand code. Provide clear explanations and best practices.", "Socratic Teacher": "You are a Socratic teacher. Instead of giving direct answers, guide users to discover answers through thoughtful questions.", "Friendly Chat": "You are a friendly conversational partner. Be warm, engaging, and personable. Use casual language and show genuine interest.", "Custom": "" } def format_chat_prompt(messages, system_prompt): """Format messages for TinyLlama chat format""" formatted = f"<|system|>\n{system_prompt}\n" for msg in messages: if msg["role"] == "user": formatted += f"<|user|>\n{msg['content']}\n" elif msg["role"] == "assistant": formatted += f"<|assistant|>\n{msg['content']}\n" formatted += "<|assistant|>\n" return formatted def chat(message, history, system_prompt_choice, custom_system_prompt, temperature, max_tokens, top_p): """Main chat function with streaming support""" global model, tokenizer # Load model if not loaded if model is None: yield "âŗ Loading model for the first time... Please wait (this may take 1-2 minutes on CPU)..." load_model() # Determine system prompt if system_prompt_choice == "Custom": system_content = custom_system_prompt if custom_system_prompt.strip() else SYSTEM_PROMPTS["Default Assistant"] else: system_content = SYSTEM_PROMPTS.get(system_prompt_choice, SYSTEM_PROMPTS["Default Assistant"]) # Build messages list messages = [] for msg in history: if msg["role"] in ["user", "assistant"]: messages.append({"role": msg["role"], "content": msg["content"]}) messages.append({"role": "user", "content": message}) try: # Format the prompt prompt = format_chat_prompt(messages, system_content) # Tokenize inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048) # Set up streamer streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) # Generation parameters generation_kwargs = { "input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "max_new_tokens": max_tokens, "temperature": temperature if temperature > 0 else 0.1, "top_p": top_p, "do_sample": temperature > 0, "streamer": streamer, "pad_token_id": tokenizer.eos_token_id, "eos_token_id": tokenizer.eos_token_id, } # Run generation in a separate thread thread = Thread(target=model.generate, kwargs=generation_kwargs) thread.start() # Stream the response response = "" for new_text in streamer: response += new_text # Clean up any remaining special tokens clean_response = response.replace("", "").strip() yield clean_response thread.join() except Exception as e: yield f"â Error: {str(e)}\n\nPlease try again with a shorter message or lower max tokens." def clear_chat(): """Clear the chat history""" return [], "" def export_chat(history): """Export chat history as text""" if not history: return "No chat history to export." export_text = "# Chat Export\n\n" for msg in history: role = "đ¤ User" if msg["role"] == "user" else "đ¤ Assistant" export_text += f"## {role}\n{msg['content']}\n\n---\n\n" return export_text # Custom CSS css = """ .header-container { text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 12px; margin-bottom: 20px; } .header-container h1 { color: white; margin: 0; font-size: 2em; } .header-container p { color: rgba(255,255,255,0.9); margin: 10px 0 0 0; } .header-container a { color: #ffd700; text-decoration: none; font-weight: bold; } .header-container a:hover { text-decoration: underline; } .info-box { background: var(--background-fill-secondary); padding: 10px 15px; border-radius: 8px; margin: 10px 0; border-left: 4px solid #667eea; } .chatbot-container { min-height: 500px; } """ # Build the interface with gr.Blocks( title="TinyLlama Chatbot (CPU)", theme=gr.themes.Soft(), css=css, fill_height=True, footer_links=[ {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"}, {"label": "Model", "url": "https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0"} ] ) as demo: # Header gr.HTML("""
""") gr.HTML("""