# TobDeBer's picture
# Upload folder using huggingface_hub
# 6ae5993 verified
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import time
import random
# Model configuration - using TinyLlama for efficient CPU inference
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Global variables for model components.
# All three stay None until the user clicks "Load Model" in the UI,
# which calls load_model() to populate them.
tokenizer = None
model = None
text_generator = None
def load_model():
    """Load the TinyLlama model, tokenizer, and text-generation pipeline.

    Populates the module-level ``tokenizer``, ``model`` and
    ``text_generator`` globals used by ``generate_text``.

    Returns:
        str: Human-readable status message (success or error) shown in the UI.
    """
    global tokenizer, model, text_generator
    try:
        print(f"Loading model: {MODEL_NAME}")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        # Fix: ensure a pad token exists BEFORE building the pipeline.
        # The original assigned it after pipeline() had already captured
        # the tokenizer configuration, which was too late.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float32,  # float32 is the safe dtype for CPU
            device_map="auto",  # NOTE(review): requires `accelerate` — confirm it is installed
        )
        # Defaults only; per-request values are passed at call time in generate_text().
        text_generator = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.95,
            do_sample=True,
        )
        return "βœ… Model loaded successfully!"
    except Exception as e:
        # Surface the failure to the UI instead of crashing the app.
        return f"❌ Error loading model: {str(e)}"
def format_prompt(prompt, system_prompt=None):
    """Wrap *prompt* in TinyLlama's chat-template tags.

    A truthy *system_prompt* is prepended inside a ``<|system|>`` section;
    the ``<|assistant|>`` tag is always left open so the model continues
    from there.
    """
    segments = []
    if system_prompt:
        segments.append(f"<|system|>\n{system_prompt}")
    segments.append(f"<|user|>\n{prompt}")
    segments.append("<|assistant|>")
    return "\n".join(segments)
def generate_text(
    prompt,
    max_length=200,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.1,
    system_prompt="You are a helpful AI assistant. Provide clear and concise answers."
):
    """Generate a chat-style completion for *prompt* with the loaded pipeline.

    Parameters
    ----------
    prompt : str
        The user's message.
    max_length : int
        Maximum number of new tokens to generate.
    temperature, top_p, repetition_penalty : float
        Sampling parameters forwarded to the pipeline call.
    system_prompt : str
        Instruction prepended via the chat template.

    Returns
    -------
    str
        Markdown-formatted response, or a warning/error message.
    """
    global text_generator
    # Guard clauses: model not loaded yet, or nothing to generate from.
    if text_generator is None:
        return "⚠️ Please load the model first using the 'Load Model' button."
    if not prompt.strip():
        return "⚠️ Please enter a prompt."
    try:
        formatted_prompt = format_prompt(prompt, system_prompt)
        # Fix: removed the `text_generator.max_new_tokens = ...` attribute
        # assignments from the original — transformers pipelines do not read
        # arbitrary instance attributes, so they were dead code. The real
        # generation parameters are the keyword arguments passed below.
        start_time = time.time()
        result = text_generator(
            formatted_prompt,
            max_new_tokens=max_length,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
        generation_time = time.time() - start_time
        generated_text = result[0]["generated_text"]
        # The pipeline echoes the prompt; keep only the assistant's part.
        if "<|assistant|>" in generated_text:
            response = generated_text.split("<|assistant|>")[-1].strip()
        else:
            response = generated_text
        # Format output with timing metadata for the UI.
        return f"**Response:**\n{response}\n\n---\n*Generated in {generation_time:.2f} seconds*"
    except Exception as e:
        # Report failures to the UI rather than crashing the Gradio worker.
        return f"❌ Error during generation: {str(e)}"
def clear_chat():
    """Return empty strings to reset the chat input and output widgets."""
    blank = ""
    return blank, blank
# Create custom theme.
# Built on gradio's Soft theme, then tweaked with .set() overrides.
# NOTE: this theme takes effect only if passed to gr.Blocks(theme=...);
# Blocks.launch() does not accept a theme argument.
custom_theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="indigo",
    neutral_hue="slate",
    font=gr.themes.GoogleFont("Inter"),
    text_size="lg",
    spacing_size="lg",
    radius_size="md"
).set(
    # "*primary_600" etc. reference the theme's color-scale CSS variables.
    button_primary_background_fill="*primary_600",
    button_primary_background_fill_hover="*primary_700",
    block_title_text_weight="600",
)
# Build the Gradio interface.
# Fix: the custom theme must be passed to the gr.Blocks constructor —
# Blocks.launch() does not accept a `theme` argument, so the original
# theme was silently (or fatally) dropped.
with gr.Blocks(theme=custom_theme) as demo:
    gr.Markdown(
        """
        # πŸ€– Smol LLM Inference GUI
        **Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)** -
        Efficient text generation using TinyLlama
        This application runs a compact language model locally for text generation.
        Perfect for chat, completion tasks, and creative writing.
        """
    )
    with gr.Row():
        # Left column: model management + generation parameters.
        with gr.Column(scale=2):
            with gr.Group():
                gr.Markdown("### πŸ“¦ Model Management")
                model_status = gr.Textbox(
                    label="Model Status",
                    value="Model not loaded. Click 'Load Model' to start.",
                    interactive=False
                )
                load_btn = gr.Button(
                    "πŸ”„ Load Model",
                    variant="primary",
                    size="lg"
                )
            gr.Markdown("### βš™οΈ Generation Parameters")
            with gr.Row():
                max_length = gr.Slider(
                    minimum=50,
                    maximum=1024,
                    value=200,
                    step=50,
                    label="Max Tokens"
                )
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=2.0,
                    value=0.7,
                    step=0.1,
                    label="Temperature"
                )
            with gr.Row():
                top_p = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.95,
                    step=0.05,
                    label="Top-p"
                )
                repetition_penalty = gr.Slider(
                    minimum=1.0,
                    maximum=2.0,
                    value=1.1,
                    step=0.1,
                    label="Repetition Penalty"
                )
            system_prompt = gr.Textbox(
                label="System Prompt",
                value="You are a helpful AI assistant. Provide clear and concise answers.",
                lines=3,
                placeholder="Enter a system prompt to guide the model's behavior..."
            )
        # Right column: prompt input, actions, and rendered output.
        with gr.Column(scale=3):
            with gr.Group():
                gr.Markdown("### πŸ’¬ Text Generation")
                prompt_input = gr.Textbox(
                    label="Enter your prompt",
                    placeholder="Type your message here...",
                    lines=4,
                    autofocus=True
                )
                with gr.Row():
                    generate_btn = gr.Button(
                        "πŸš€ Generate",
                        variant="primary",
                        size="lg"
                    )
                    clear_btn = gr.Button(
                        "πŸ—‘οΈ Clear",
                        variant="secondary"
                    )
                output_text = gr.Markdown(
                    label="Generated Response",
                    value="*Response will appear here...*"
                )
    # Example prompts (collapsed by default).
    with gr.Accordion("πŸ“ Example Prompts", open=False):
        gr.Examples(
            examples=[
                ["Write a short story about a robot discovering music."],
                ["Explain quantum computing in simple terms."],
                ["Create a poem about the changing seasons."],
                ["What are the benefits of renewable energy?"],
                ["Write a Python function to calculate fibonacci numbers."],
                ["Describe the perfect day in your own words."],
                ["Explain the concept of machine learning to a beginner."],
                ["Create a dialogue between two friends planning a trip."]
            ],
            inputs=[prompt_input],
            label="Click an example to get started"
        )
    # Event handlers.
    # NOTE(review): `api_visibility` is only available in recent Gradio
    # releases — confirm the pinned gradio version supports it.
    load_btn.click(
        fn=load_model,
        outputs=[model_status],
        api_visibility="public"
    )
    generate_btn.click(
        fn=generate_text,
        inputs=[
            prompt_input,
            max_length,
            temperature,
            top_p,
            repetition_penalty,
            system_prompt
        ],
        outputs=[output_text],
        api_visibility="public"
    )
    # Fix: clear_chat returns TWO values ("", ""), but the original listed
    # only one output component, which makes Gradio raise a "too many
    # output values" error. Both widgets are now cleared.
    clear_btn.click(
        fn=clear_chat,
        outputs=[prompt_input, output_text],
        api_visibility="private"
    )
    # Allow Enter key to generate.
    prompt_input.submit(
        fn=generate_text,
        inputs=[
            prompt_input,
            max_length,
            temperature,
            top_p,
            repetition_penalty,
            system_prompt
        ],
        outputs=[output_text],
        api_visibility="public"
    )
# Launch the application.
# Fix: Blocks.launch() accepts neither `theme` (that belongs to the
# gr.Blocks(...) constructor) nor `footer_links` (not a launch parameter),
# so the original call raised TypeError at startup. Both were removed;
# the credit links remain visible in the header Markdown above.
demo.launch(
    share=False,      # no public gradio.live tunnel
    show_error=True   # surface Python exceptions in the UI
)