Spaces:

ajsbsd
/

smollm2-zerocpu-demo

Running

App Files Community

ajsbsd committed on Jun 16

Commit

9656c26

verified ·

1 Parent(s): d32f90c

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -19

app.py CHANGED Viewed

@@ -54,7 +54,7 @@ def load_model_for_zerocpu():
             print(f"Falling back to standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU (will be slower without GGUF quantization).")
     else:
         print("WARNING: ctransformers is not available. Will load standard Hugging Face model directly.")
     print(f"Loading standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU...")
     try:
         model = AutoModelForCausalLM.from_pretrained(ORIGINAL_MODEL_ID)
@@ -75,7 +75,6 @@ def predict_chat(message: str, history: list):
         yield "Error: Model or tokenizer failed to load. Please check the Space logs for details."
         return
-    # history contains [user_message, bot_message] tuples, convert to messages format for apply_chat_template
     messages = [{"role": "system", "content": "You are a friendly chatbot."}]
     for human_msg, ai_msg in history:
         messages.append({"role": "user", "content": human_msg})
@@ -124,7 +123,7 @@ def predict_chat(message: str, history: list):
         )
         generated_text = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True).strip()
         yield generated_text
     end_time = time.time()
     print(f"Inference Time for this turn: {end_time - start_time:.2f} seconds")
@@ -137,10 +136,9 @@ if __name__ == "__main__":
         "environment for efficient demonstration. How can I help you today?"
     )
-    # Use gr.Chatbot with type='messages' to avoid the deprecation warning
     chatbot_component = gr.Chatbot(height=500, type='messages')
-    with gr.Blocks(theme="soft") as demo: # Use gr.Blocks to lay out components
         gr.Markdown(
             f"# SmolLM2-360M-Instruct (or TinyLlama GGUF) on ZeroCPU\n"
             f"This Space demonstrates an LLM for efficient CPU-only inference. "
@@ -148,34 +146,30 @@ if __name__ == "__main__":
             f"like TinyLlama) due to better CPU performance than `{ORIGINAL_MODEL_ID}` "
             f"without GGUF. Expect varied responses each run due to randomized generation."
         )
-        chatbot_component.render() # Render the chatbot
-        # Use gr.ChatInterface for the core chat functionality
-        # It handles the textbox, send button, and history implicitly
         chat_interface = gr.ChatInterface(
             fn=predict_chat,
-            chatbot=chatbot_component, # Link to the rendered chatbot component
             textbox=gr.Textbox(
                 placeholder="Ask me a question...",
                 container=False,
                 scale=7
             ),
-            # clear_btn is removed from ChatInterface constructor
             examples=[
                 ["What is the capital of France?"],
                 ["Can you tell me a fun fact about outer space?"],
                 ["What's the best way to stay motivated?"],
             ],
-            cache_examples=False,
-            # initial_chatbot_message will be set after chat_interface is rendered
         )
-        # Manually add a clear button that links to the chatbot component
         gr.ClearButton(components=[chatbot_component])
-        # Set the initial message for the chatbot
-        # This needs to be done *after* the chatbot_component is defined
         chatbot_component.value = [[None, initial_chatbot_message]]

             print(f"Falling back to standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU (will be slower without GGUF quantization).")
     else:
         print("WARNING: ctransformers is not available. Will load standard Hugging Face model directly.")
     print(f"Loading standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU...")
     try:
         model = AutoModelForCausalLM.from_pretrained(ORIGINAL_MODEL_ID)
         yield "Error: Model or tokenizer failed to load. Please check the Space logs for details."
         return
     messages = [{"role": "system", "content": "You are a friendly chatbot."}]
     for human_msg, ai_msg in history:
         messages.append({"role": "user", "content": human_msg})
         )
         generated_text = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True).strip()
         yield generated_text
     end_time = time.time()
     print(f"Inference Time for this turn: {end_time - start_time:.2f} seconds")
         "environment for efficient demonstration. How can I help you today?"
     )
     chatbot_component = gr.Chatbot(height=500, type='messages')
+    with gr.Blocks(theme="soft") as demo:
         gr.Markdown(
             f"# SmolLM2-360M-Instruct (or TinyLlama GGUF) on ZeroCPU\n"
             f"This Space demonstrates an LLM for efficient CPU-only inference. "
             f"like TinyLlama) due to better CPU performance than `{ORIGINAL_MODEL_ID}` "
             f"without GGUF. Expect varied responses each run due to randomized generation."
         )
+        # This is the key change: explicitly placing the chat_interface component
         chat_interface = gr.ChatInterface(
             fn=predict_chat,
+            chatbot=chatbot_component,
             textbox=gr.Textbox(
                 placeholder="Ask me a question...",
                 container=False,
                 scale=7
             ),
             examples=[
                 ["What is the capital of France?"],
                 ["Can you tell me a fun fact about outer space?"],
                 ["What's the best way to stay motivated?"],
             ],
+            cache_examples=False,
         )
+        # Now explicitly place the chat_interface component into the Blocks layout
+        chat_interface.render()
+        # The clear button is typically below the chat interface
         gr.ClearButton(components=[chatbot_component])
         chatbot_component.value = [[None, initial_chatbot_message]]