Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import torch | |
| from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline | |
| import time | |
| import random | |
# Hugging Face model id; TinyLlama is used for efficient CPU inference.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Model components shared across handlers; all stay None until load_model()
# assigns them.
tokenizer = model = text_generator = None
def load_model():
    """Load the tokenizer, model and text-generation pipeline for MODEL_NAME.

    All components are built in local variables and published to the
    module-level globals only after every step has succeeded, so a failed
    load never leaves the globals partially replaced. If a pipeline is
    already loaded the function returns immediately instead of loading the
    model a second time.

    Returns:
        str: A status message for the UI, either the success message or an
        error message that includes the exception text.
    """
    global tokenizer, model, text_generator

    if text_generator is not None:
        # Already initialised: avoid re-loading the weights on repeat clicks.
        return "β Model loaded successfully!"

    try:
        print(f"Loading model: {MODEL_NAME}")
        loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

        # Set pad token if not present, before the pipeline is constructed
        # so the pipeline sees the final tokenizer configuration.
        if loaded_tokenizer.pad_token is None:
            loaded_tokenizer.pad_token = loaded_tokenizer.eos_token

        loaded_model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float32,  # Use float32 for CPU
            device_map="auto",
        )

        # Create text generation pipeline with default sampling settings;
        # generate_text() overrides these per request.
        generator = pipeline(
            "text-generation",
            model=loaded_model,
            tokenizer=loaded_tokenizer,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.95,
            do_sample=True,
        )
    except Exception as e:
        # Surface the failure in the status box rather than crashing the UI.
        return f"β Error loading model: {str(e)}"

    # Publish everything at once now that every step has succeeded.
    tokenizer, model, text_generator = loaded_tokenizer, loaded_model, generator
    return "β Model loaded successfully!"
def format_prompt(prompt, system_prompt=None, eos_token="</s>"):
    """Format a single-turn conversation as a chat prompt string.

    Each completed turn is terminated with ``eos_token`` followed by a
    newline, and the prompt ends with an open ``<|assistant|>`` turn for the
    model to continue.

    Args:
        prompt: The user's message.
        system_prompt: Optional system instruction. ``None``, an empty string
            or a whitespace-only string omits the system turn entirely.
        eos_token: End-of-turn marker appended after each completed turn.

    Returns:
        str: The formatted prompt.
    """
    turns = []
    # Skip blank system prompts so no empty system turn is emitted.
    if system_prompt and system_prompt.strip():
        turns.append(f"<|system|>\n{system_prompt}{eos_token}\n")
    turns.append(f"<|user|>\n{prompt}{eos_token}\n")
    # Open assistant turn: generation continues from here.
    turns.append("<|assistant|>\n")
    return "".join(turns)
def generate_text(
    prompt,
    max_length=200,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.1,
    system_prompt="You are a helpful AI assistant. Provide clear and concise answers."
):
    """Generate a response to ``prompt`` with the loaded pipeline.

    Args:
        prompt: The user's message; ``None``, empty or whitespace-only input
            is rejected with a warning message.
        max_length: Maximum number of new tokens to generate.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        repetition_penalty: Penalty applied to repeated tokens.
        system_prompt: System instruction passed to ``format_prompt``.

    Returns:
        str: Markdown containing the response and the generation time, or a
        warning/error message if the model is not loaded, the prompt is
        blank, or generation raises.
    """
    if text_generator is None:
        return "β οΈ Please load the model first using the 'Load Model' button."
    if not prompt or not prompt.strip():
        return "β οΈ Please enter a prompt."

    try:
        # Format the prompt
        formatted_prompt = format_prompt(prompt, system_prompt)

        # Generation settings are passed per call; the shared pipeline object
        # is not mutated, so requests cannot affect each other's settings.
        started = time.perf_counter()
        result = text_generator(
            formatted_prompt,
            # max_new_tokens must be an integer; coerce in case a float is
            # passed in.
            max_new_tokens=int(max_length),
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            # Ask the pipeline for the completion only, so the prompt never
            # has to be stripped out of the result by string matching.
            return_full_text=False,
        )
        generation_time = time.perf_counter() - started

        # Extract the assistant's response
        response = result[0]["generated_text"].strip()

        # Format output with metadata
        return f"**Response:**\n{response}\n\n---\n*Generated in {generation_time:.2f} seconds*"
    except Exception as e:
        # Report the failure in the output pane instead of raising.
        return f"β Error during generation: {str(e)}"
def clear_chat():
    """Return a pair of empty strings used to reset the chat interface."""
    blank = ""
    return blank, blank
# Custom theme: start from the Soft preset with the app's colours, font and
# sizing...
custom_theme = gr.themes.Soft(
    radius_size="md",
    spacing_size="lg",
    text_size="lg",
    font=gr.themes.GoogleFont("Inter"),
    neutral_hue="slate",
    secondary_hue="indigo",
    primary_hue="blue",
)
# ...then override the primary-button fills and the block title weight.
custom_theme = custom_theme.set(
    block_title_text_weight="600",
    button_primary_background_fill_hover="*primary_700",
    button_primary_background_fill="*primary_600",
)
# Build the Gradio interface: a settings column (model loading, sampling
# parameters, system prompt) next to a generation column (prompt, buttons,
# response pane, example prompts), followed by the event wiring.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # π€ Smol LLM Inference GUI
        **Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)** -
        Efficient text generation using TinyLlama
        This application runs a compact language model locally for text generation.
        Perfect for chat, completion tasks, and creative writing.
        """
    )
    with gr.Row():
        with gr.Column(scale=2):
            # Model loading section
            with gr.Group():
                gr.Markdown("### π¦ Model Management")
                model_status = gr.Textbox(
                    label="Model Status",
                    value="Model not loaded. Click 'Load Model' to start.",
                    interactive=False
                )
                load_btn = gr.Button(
                    "π Load Model",
                    variant="primary",
                    size="lg"
                )
            # Generation parameters
            gr.Markdown("### βοΈ Generation Parameters")
            with gr.Row():
                max_length = gr.Slider(
                    minimum=50,
                    maximum=1024,
                    value=200,
                    step=50,
                    label="Max Tokens"
                )
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=2.0,
                    value=0.7,
                    step=0.1,
                    label="Temperature"
                )
            with gr.Row():
                top_p = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.95,
                    step=0.05,
                    label="Top-p"
                )
                repetition_penalty = gr.Slider(
                    minimum=1.0,
                    maximum=2.0,
                    value=1.1,
                    step=0.1,
                    label="Repetition Penalty"
                )
            system_prompt = gr.Textbox(
                label="System Prompt",
                value="You are a helpful AI assistant. Provide clear and concise answers.",
                lines=3,
                placeholder="Enter a system prompt to guide the model's behavior..."
            )
        with gr.Column(scale=3):
            # Main interface
            with gr.Group():
                gr.Markdown("### π¬ Text Generation")
                prompt_input = gr.Textbox(
                    label="Enter your prompt",
                    placeholder="Type your message here...",
                    lines=4,
                    autofocus=True
                )
                with gr.Row():
                    generate_btn = gr.Button(
                        "π Generate",
                        variant="primary",
                        size="lg"
                    )
                    clear_btn = gr.Button(
                        "ποΈ Clear",
                        variant="secondary"
                    )
                output_text = gr.Markdown(
                    label="Generated Response",
                    value="*Response will appear here...*"
                )
            # Example prompts
            with gr.Accordion("π Example Prompts", open=False):
                gr.Examples(
                    examples=[
                        ["Write a short story about a robot discovering music."],
                        ["Explain quantum computing in simple terms."],
                        ["Create a poem about the changing seasons."],
                        ["What are the benefits of renewable energy?"],
                        ["Write a Python function to calculate fibonacci numbers."],
                        ["Describe the perfect day in your own words."],
                        ["Explain the concept of machine learning to a beginner."],
                        ["Create a dialogue between two friends planning a trip."]
                    ],
                    inputs=[prompt_input],
                    label="Click an example to get started"
                )

    # Event handlers

    # Inputs shared by the Generate button and the Enter-key submit, in the
    # positional order generate_text() expects.
    generation_inputs = [
        prompt_input,
        max_length,
        temperature,
        top_p,
        repetition_penalty,
        system_prompt,
    ]

    load_btn.click(
        fn=load_model,
        outputs=[model_status],
        api_visibility="public"
    )
    generate_btn.click(
        fn=generate_text,
        inputs=generation_inputs,
        outputs=[output_text],
        api_visibility="public"
    )
    # clear_chat() returns two empty strings, so two components must be
    # wired: one resets the prompt box, the other the response pane.
    clear_btn.click(
        fn=clear_chat,
        outputs=[prompt_input, output_text],
        api_visibility="private"
    )
    # Allow Enter key to generate
    prompt_input.submit(
        fn=generate_text,
        inputs=generation_inputs,
        outputs=[output_text],
        api_visibility="public"
    )
# Start the app with the custom theme and attribution links in the footer;
# no public share link is created and errors are shown in the UI.
demo.launch(
    share=False,
    show_error=True,
    theme=custom_theme,
    footer_links=[
        dict(label="Built with anycoder", url="https://huggingface.co/spaces/akhaliq/anycoder"),
        dict(label="TinyLlama Model", url="https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0"),
        dict(label="Gradio", url="https://gradio.app"),
    ],
)