multiturn 1

app.py CHANGED
@@ -7,43 +7,29 @@ from threading import Thread
 import sys
 import os
 
-# Model configuration
-# Check for command line argument for local model path
+# Model configuration
 if len(sys.argv) > 1 and os.path.exists(sys.argv[1]):
     MODEL_NAME = sys.argv[1]
     print(f"Using local model from: {MODEL_NAME}")
 else:
     MODEL_NAME = "HuggingFaceTB/SmolLM2-135M-Instruct"
 
-# Global variables
+# Global variables
 tokenizer = None
 model = None
-text_generator = None
 
 def load_model():
     """Load the Smol LLM model and tokenizer"""
-    global tokenizer, model, text_generator
+    global tokenizer, model
     try:
         print(f"Loading model: {MODEL_NAME}")
         tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
         model = AutoModelForCausalLM.from_pretrained(
             MODEL_NAME,
-            dtype=torch.float32,
+            dtype=torch.float32,
             device_map="auto"
         )
 
-        # Create text generation pipeline (still useful for non-streaming checks if needed, but we use model.generate for streaming)
-        text_generator = pipeline(
-            "text-generation",
-            model=model,
-            tokenizer=tokenizer,
-            max_new_tokens=512,
-            temperature=0.7,
-            top_p=0.95,
-            do_sample=True
-        )
-
-        # Set pad token if not present
         if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
 
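
Hunk 1 keeps the command-line override, so the same file serves the hosted Space or a local checkout. Illustrative invocations (the local path is hypothetical):

    python app.py                          # default: HuggingFaceTB/SmolLM2-135M-Instruct
    python app.py ./SmolLM2-135M-Instruct  # any existing local path takes precedence
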
@@ -51,36 +37,32 @@ def load_model():
     except Exception as e:
         return f"❌ Error loading model: {str(e)}"
 
-def
-    """
-    messages = []
-    if system_prompt:
-        messages.append({"role": "system", "content": system_prompt})
-    messages.append({"role": "user", "content": prompt})
-    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
-def generate_text(
-    prompt,
-    max_length=200,
-    temperature=0.7,
-    top_p=0.95,
-    repetition_penalty=1.1,
-    system_prompt="You are a helpful AI assistant. Provide clear and concise answers."
-):
-    """Generate text using the loaded model with streaming"""
+def chat_predict(message, history, max_length, temperature, top_p, repetition_penalty, system_prompt):
+    """Generate text using the loaded model with streaming and history"""
     global model, tokenizer
 
     if model is None or tokenizer is None:
         yield "⚠️ Please wait for the model to finish loading..."
         return
 
-    if not prompt.strip():
+    if not message.strip():
         yield "⚠️ Please enter a prompt."
         return
 
     try:
+        # Build conversation history
+        messages = []
+        if system_prompt:
+            messages.append({"role": "system", "content": system_prompt})
+
+        for user_msg, assistant_msg in history:
+            messages.append({"role": "user", "content": user_msg})
+            messages.append({"role": "assistant", "content": assistant_msg})
+
+        messages.append({"role": "user", "content": message})
+
         # Format the prompt
-        formatted_prompt =
+        formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
 
         # Setup streamer
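
The streamer setup (new lines 69-92) is unchanged by this commit and elided from the diff. For orientation, a minimal sketch of what that section plausibly contains, inferred from the "from threading import Thread" context, the "# Setup streamer" comment, and the streamer, start_time, generated_text, and token_count names used in the next hunk; the exact generation kwargs are assumptions:

    import time
    from threading import Thread
    from transformers import TextIteratorStreamer

    # Stream decoded text as it is produced, skipping the echoed prompt.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_length,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
    )
    # generate() runs in a background thread so this generator function can
    # consume the streamer and yield partial text to Gradio.
    Thread(target=model.generate, kwargs=generation_kwargs).start()

    start_time = time.time()
    generated_text = ""
    token_count = 0
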
@@ -111,25 +93,18 @@ def generate_text(
         for new_text in streamer:
             generated_text += new_text
             token_count += 1
-
-
-
-
-
-
-
-
-            stats = "\n\n---\n*Starting generation...*"
-
-            yield f"**Response:**\n{generated_text}{stats}"
+            yield generated_text
+
+        # Append stats after generation is complete
+        elapsed_time = time.time() - start_time
+        if elapsed_time > 0:
+            tps = token_count / elapsed_time
+            stats = f"\n\n---\n*Generated {token_count} tokens in {elapsed_time:.2f}s ({tps:.2f} t/s)*"
+            yield generated_text + stats
 
     except Exception as e:
         yield f"❌ Error during generation: {str(e)}"
 
-def clear_chat():
-    """Clear the chat interface"""
-    return "", "*Response will appear here...*"
-
 # Create custom theme
 custom_theme = gr.themes.Soft(
     primary_hue="blue",
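
One behavioral note on the new chat_predict: it unpacks each history entry as a (user_msg, assistant_msg) pair, which matches the tuple-style history that gr.ChatInterface passes by default. If the interface were created with type="messages", history entries would arrive as role/content dicts instead, and the unpacking loop would need adjusting. A sketch of calling the function directly, with illustrative strings:

    # Each yield is the full response so far, so keep only the last chunk.
    history = [("Hi!", "Hello! How can I help?")]
    last = ""
    for partial in chat_predict(
        "Tell me about SmolLM2.",
        history,
        max_length=200,
        temperature=0.7,
        top_p=0.95,
        repetition_penalty=1.1,
        system_prompt="You are a helpful AI assistant. Provide clear and concise answers.",
    ):
        last = partial
    print(last)
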
@@ -149,119 +124,51 @@ custom_theme = gr.themes.Soft(
 with gr.Blocks(theme=custom_theme) as demo:
     gr.Markdown(
         """
-        # 🤖 Smol LLM
+        # 🤖 Smol LLM Chat
 
-
-
-        This application runs a compact language model locally for text generation.
-        Perfect for chat, completion tasks, and creative writing.
+        Multi-turn chat with SmolLM2-135M.
         """
     )
 
-    #
-
-
-
-
-        label="Enter your prompt",
-        placeholder="Type your message here...",
-        lines=4,
-        autofocus=True
-    )
-
-    with gr.Row():
-        generate_btn = gr.Button(
-            "🚀 Generate",
-            variant="primary",
-            size="lg"
-        )
-        clear_btn = gr.Button(
-            "🗑️ Clear",
-            variant="secondary"
-        )
-
-    output_text = gr.Markdown(
-        label="Generated Response",
-        value="*Response will appear here...*"
-    )
-
-    # Settings
-    with gr.Accordion("⚙️ Settings", open=False):
-        # Generation parameters
-        gr.Markdown("### ⚙️ Generation Parameters")
-
-        with gr.Row():
-            max_length = gr.Slider(
+    # Chat Interface
+    chat_interface = gr.ChatInterface(
+        fn=chat_predict,
+        additional_inputs=[
+            gr.Slider(
                 minimum=50,
                 maximum=1024,
                 value=200,
                 step=50,
                 label="Max Tokens"
-            )
-
+            ),
+            gr.Slider(
                 minimum=0.1,
                 maximum=2.0,
                 value=0.7,
                 step=0.1,
                 label="Temperature"
-            )
-
-        with gr.Row():
-            top_p = gr.Slider(
+            ),
+            gr.Slider(
                 minimum=0.1,
                 maximum=1.0,
                 value=0.95,
                 step=0.05,
                 label="Top-p"
-            )
-
+            ),
+            gr.Slider(
                 minimum=1.0,
                 maximum=2.0,
                 value=1.1,
                 step=0.1,
                 label="Repetition Penalty"
+            ),
+            gr.Textbox(
+                label="System Prompt",
+                value="You are a helpful AI assistant. Provide clear and concise answers.",
+                lines=2
             )
-
-        system_prompt = gr.Textbox(
-            label="System Prompt",
-            value="You are a helpful AI assistant. Provide clear and concise answers.",
-            lines=3,
-            placeholder="Enter a system prompt to guide the model's behavior..."
-        )
-
-    # Event handlers
-
-
-    generate_btn.click(
-        fn=generate_text,
-        inputs=[
-            prompt_input,
-            max_length,
-            temperature,
-            top_p,
-            repetition_penalty,
-            system_prompt
-        ],
-        outputs=[output_text]
-    )
-
-    clear_btn.click(
-        fn=clear_chat,
-        outputs=[prompt_input, output_text]
-    )
-
-    # Allow Enter key to generate
-    prompt_input.submit(
-        fn=generate_text,
-        inputs=[
-            prompt_input,
-            max_length,
-            temperature,
-            top_p,
-            repetition_penalty,
-            system_prompt
         ],
-
+        additional_inputs_accordion=gr.Accordion("⚙️ Generation Parameters", open=False),
     )
 
 # Auto-load the model at startup

@@ -272,4 +179,4 @@ print(f"Startup load status: {load_status}")
 demo.launch(
     share=False,
     show_error=True
-)
+)