"""Gradio GUI for local text generation with a small chat LLM.

Loads TinyLlama-1.1B-Chat on demand (button-triggered) and serves a simple
text-generation interface with tunable sampling parameters. Designed for
CPU inference (float32 weights).
"""

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import time
import random

# Model configuration - compact model chosen for efficient CPU inference.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Global model components, populated by load_model(); None until loaded.
tokenizer = None
model = None
text_generator = None


def load_model():
    """Load the tokenizer/model and build the text-generation pipeline.

    Returns:
        A status string (success or error) suitable for display in the UI.
    """
    global tokenizer, model, text_generator
    try:
        print(f"Loading model: {MODEL_NAME}")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        # BUGFIX: assign the pad token *before* building the pipeline so the
        # pipeline actually picks it up (the original assigned it afterwards,
        # too late to take effect).
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float32,  # float32 for CPU inference
            # NOTE(review): device_map="auto" requires `accelerate` to be
            # installed - confirm it is in the deployment requirements.
            device_map="auto",
        )
        # Default generation settings; per-request values are passed as call
        # kwargs in generate_text() and override these.
        text_generator = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.95,
            do_sample=True,
        )
        return "✅ Model loaded successfully!"
    except Exception as e:
        # Surface the failure in the status box instead of crashing the app.
        return f"❌ Error loading model: {str(e)}"


def format_prompt(prompt, system_prompt=None):
    """Wrap a user prompt in the model's chat-template delimiter tokens.

    Args:
        prompt: The user's message text.
        system_prompt: Optional system instruction; omitted from the template
            when falsy (None or empty string).

    Returns:
        The delimited prompt string ending with the assistant turn marker.
    """
    if system_prompt:
        return f"<|system|>\n{system_prompt}\n<|user|>\n{prompt}\n<|assistant|>"
    return f"<|user|>\n{prompt}\n<|assistant|>"


def generate_text(
    prompt,
    max_length=200,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.1,
    system_prompt="You are a helpful AI assistant. Provide clear and concise answers."
):
    """Generate a response for `prompt` using the loaded pipeline.

    Args:
        prompt: User input text; blank input yields a warning message.
        max_length: Maximum number of new tokens to generate.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        repetition_penalty: Penalty applied to repeated tokens.
        system_prompt: System instruction prepended via the chat template.

    Returns:
        Markdown-formatted response text (or a warning/error message).
    """
    global text_generator

    # Guard clauses: model not loaded yet, or empty prompt.
    if text_generator is None:
        return "⚠️ Please load the model first using the 'Load Model' button."
    if not prompt.strip():
        return "⚠️ Please enter a prompt."

    try:
        formatted_prompt = format_prompt(prompt, system_prompt)

        # BUGFIX: removed the original's assignments to attributes of the
        # pipeline object (text_generator.max_new_tokens = ... etc.) - they
        # have no effect; generation parameters are the call kwargs below.
        start_time = time.time()
        result = text_generator(
            formatted_prompt,
            max_new_tokens=max_length,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
        generation_time = time.time() - start_time

        # The pipeline echoes the prompt; keep only the assistant's turn.
        generated_text = result[0]["generated_text"]
        if "<|assistant|>" in generated_text:
            response = generated_text.split("<|assistant|>")[-1].strip()
        else:
            response = generated_text

        return (
            f"**Response:**\n{response}\n\n---\n"
            f"*Generated in {generation_time:.2f} seconds*"
        )
    except Exception as e:
        return f"❌ Error during generation: {str(e)}"


def clear_chat():
    """Reset the prompt box and the output panel (two outputs)."""
    return "", ""


# Custom theme for the interface.
custom_theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="indigo",
    neutral_hue="slate",
    font=gr.themes.GoogleFont("Inter"),
    text_size="lg",
    spacing_size="lg",
    radius_size="md",
).set(
    button_primary_background_fill="*primary_600",
    button_primary_background_fill_hover="*primary_700",
    block_title_text_weight="600",
)

# Build the Gradio interface.
# BUGFIX: the theme must be passed to gr.Blocks(), not to demo.launch();
# the original built custom_theme but never applied it.
with gr.Blocks(theme=custom_theme) as demo:
    gr.Markdown(
        """
        # 🤖 Smol LLM Inference GUI

        **Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)** - Efficient text generation using TinyLlama

        This application runs a compact language model locally for text generation.
        Perfect for chat, completion tasks, and creative writing.
        """
    )

    with gr.Row():
        with gr.Column(scale=2):
            # Model loading section.
            with gr.Group():
                gr.Markdown("### 📦 Model Management")
                model_status = gr.Textbox(
                    label="Model Status",
                    value="Model not loaded. Click 'Load Model' to start.",
                    interactive=False,
                )
                load_btn = gr.Button(
                    "🔄 Load Model",
                    variant="primary",
                    size="lg",
                )

            # Generation parameters.
            gr.Markdown("### ⚙️ Generation Parameters")
            with gr.Row():
                max_length = gr.Slider(
                    minimum=50,
                    maximum=1024,
                    value=200,
                    step=50,
                    label="Max Tokens",
                )
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=2.0,
                    value=0.7,
                    step=0.1,
                    label="Temperature",
                )
            with gr.Row():
                top_p = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.95,
                    step=0.05,
                    label="Top-p",
                )
                repetition_penalty = gr.Slider(
                    minimum=1.0,
                    maximum=2.0,
                    value=1.1,
                    step=0.1,
                    label="Repetition Penalty",
                )

            system_prompt = gr.Textbox(
                label="System Prompt",
                value="You are a helpful AI assistant. Provide clear and concise answers.",
                lines=3,
                placeholder="Enter a system prompt to guide the model's behavior...",
            )

        with gr.Column(scale=3):
            # Main generation interface.
            with gr.Group():
                gr.Markdown("### 💬 Text Generation")
                prompt_input = gr.Textbox(
                    label="Enter your prompt",
                    placeholder="Type your message here...",
                    lines=4,
                    autofocus=True,
                )
                with gr.Row():
                    generate_btn = gr.Button(
                        "🚀 Generate",
                        variant="primary",
                        size="lg",
                    )
                    clear_btn = gr.Button(
                        "🗑️ Clear",
                        variant="secondary",
                    )
                output_text = gr.Markdown(
                    label="Generated Response",
                    value="*Response will appear here...*",
                )

            # Example prompts.
            with gr.Accordion("📝 Example Prompts", open=False):
                gr.Examples(
                    examples=[
                        ["Write a short story about a robot discovering music."],
                        ["Explain quantum computing in simple terms."],
                        ["Create a poem about the changing seasons."],
                        ["What are the benefits of renewable energy?"],
                        ["Write a Python function to calculate fibonacci numbers."],
                        ["Describe the perfect day in your own words."],
                        ["Explain the concept of machine learning to a beginner."],
                        ["Create a dialogue between two friends planning a trip."],
                    ],
                    inputs=[prompt_input],
                    label="Click an example to get started",
                )

    # Footer links (the original passed a non-existent `footer_links` kwarg
    # to launch(); rendered here as markdown instead so the links survive).
    gr.Markdown(
        "[Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder) | "
        "[TinyLlama Model](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0) | "
        "[Gradio](https://gradio.app)"
    )

    # Event handlers.
    # BUGFIX: removed the invalid `api_visibility` kwarg from every event
    # binding - Gradio event methods do not accept it and raise TypeError.
    load_btn.click(
        fn=load_model,
        outputs=[model_status],
    )

    generate_btn.click(
        fn=generate_text,
        inputs=[
            prompt_input,
            max_length,
            temperature,
            top_p,
            repetition_penalty,
            system_prompt,
        ],
        outputs=[output_text],
    )

    # BUGFIX: clear_chat returns two values; wire both the prompt box and
    # the output panel (the original wired only prompt_input).
    clear_btn.click(
        fn=clear_chat,
        outputs=[prompt_input, output_text],
    )

    # Allow Enter key to trigger generation.
    prompt_input.submit(
        fn=generate_text,
        inputs=[
            prompt_input,
            max_length,
            temperature,
            top_p,
            repetition_penalty,
            system_prompt,
        ],
        outputs=[output_text],
    )

# Launch the application.
demo.launch(
    share=False,
    show_error=True,
)