Spaces:
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -54,7 +54,7 @@ def load_model_for_zerocpu():
|
|
| 54 |
print(f"Falling back to standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU (will be slower without GGUF quantization).")
|
| 55 |
else:
|
| 56 |
print("WARNING: ctransformers is not available. Will load standard Hugging Face model directly.")
|
| 57 |
-
|
| 58 |
print(f"Loading standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU...")
|
| 59 |
try:
|
| 60 |
model = AutoModelForCausalLM.from_pretrained(ORIGINAL_MODEL_ID)
|
|
@@ -75,7 +75,6 @@ def predict_chat(message: str, history: list):
|
|
| 75 |
yield "Error: Model or tokenizer failed to load. Please check the Space logs for details."
|
| 76 |
return
|
| 77 |
|
| 78 |
-
# history contains [user_message, bot_message] tuples; convert them to the messages format for apply_chat_template
|
| 79 |
messages = [{"role": "system", "content": "You are a friendly chatbot."}]
|
| 80 |
for human_msg, ai_msg in history:
|
| 81 |
messages.append({"role": "user", "content": human_msg})
|
|
@@ -124,7 +123,7 @@ def predict_chat(message: str, history: list):
|
|
| 124 |
)
|
| 125 |
generated_text = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True).strip()
|
| 126 |
yield generated_text
|
| 127 |
-
|
| 128 |
end_time = time.time()
|
| 129 |
print(f"Inference Time for this turn: {end_time - start_time:.2f} seconds")
|
| 130 |
|
|
@@ -137,10 +136,9 @@ if __name__ == "__main__":
|
|
| 137 |
"environment for efficient demonstration. How can I help you today?"
|
| 138 |
)
|
| 139 |
|
| 140 |
-
# Use gr.Chatbot with type='messages' to avoid the deprecation warning
|
| 141 |
chatbot_component = gr.Chatbot(height=500, type='messages')
|
| 142 |
-
|
| 143 |
-
with gr.Blocks(theme="soft") as demo:
|
| 144 |
gr.Markdown(
|
| 145 |
f"# SmolLM2-360M-Instruct (or TinyLlama GGUF) on ZeroCPU\n"
|
| 146 |
f"This Space demonstrates an LLM for efficient CPU-only inference. "
|
|
@@ -148,34 +146,30 @@ if __name__ == "__main__":
|
|
| 148 |
f"like TinyLlama) due to better CPU performance than `{ORIGINAL_MODEL_ID}` "
|
| 149 |
f"without GGUF. Expect varied responses each run due to randomized generation."
|
| 150 |
)
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
# Use gr.ChatInterface for the core chat functionality
|
| 155 |
-
# It handles the textbox, send button, and history implicitly
|
| 156 |
chat_interface = gr.ChatInterface(
|
| 157 |
fn=predict_chat,
|
| 158 |
-
chatbot=chatbot_component,
|
| 159 |
textbox=gr.Textbox(
|
| 160 |
placeholder="Ask me a question...",
|
| 161 |
container=False,
|
| 162 |
scale=7
|
| 163 |
),
|
| 164 |
-
# clear_btn is removed from the ChatInterface constructor
|
| 165 |
examples=[
|
| 166 |
["What is the capital of France?"],
|
| 167 |
["Can you tell me a fun fact about outer space?"],
|
| 168 |
["What's the best way to stay motivated?"],
|
| 169 |
],
|
| 170 |
-
cache_examples=False,
|
| 171 |
-
# initial_chatbot_message will be set after chat_interface is rendered
|
| 172 |
)
|
| 173 |
-
|
| 174 |
-
#
|
|
|
|
|
|
|
|
|
|
| 175 |
gr.ClearButton(components=[chatbot_component])
|
| 176 |
|
| 177 |
-
# Set the initial message for the chatbot
|
| 178 |
-
# This needs to be done *after* the chatbot_component is defined
|
| 179 |
chatbot_component.value = [[None, initial_chatbot_message]]
|
| 180 |
|
| 181 |
|
|
|
|
| 54 |
print(f"Falling back to standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU (will be slower without GGUF quantization).")
|
| 55 |
else:
|
| 56 |
print("WARNING: ctransformers is not available. Will load standard Hugging Face model directly.")
|
| 57 |
+
|
| 58 |
print(f"Loading standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU...")
|
| 59 |
try:
|
| 60 |
model = AutoModelForCausalLM.from_pretrained(ORIGINAL_MODEL_ID)
|
|
|
|
| 75 |
yield "Error: Model or tokenizer failed to load. Please check the Space logs for details."
|
| 76 |
return
|
| 77 |
|
|
|
|
| 78 |
messages = [{"role": "system", "content": "You are a friendly chatbot."}]
|
| 79 |
for human_msg, ai_msg in history:
|
| 80 |
messages.append({"role": "user", "content": human_msg})
|
|
|
|
| 123 |
)
|
| 124 |
generated_text = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True).strip()
|
| 125 |
yield generated_text
|
| 126 |
+
|
| 127 |
end_time = time.time()
|
| 128 |
print(f"Inference Time for this turn: {end_time - start_time:.2f} seconds")
|
| 129 |
|
|
|
|
| 136 |
"environment for efficient demonstration. How can I help you today?"
|
| 137 |
)
|
| 138 |
|
|
|
|
| 139 |
chatbot_component = gr.Chatbot(height=500, type='messages')
|
| 140 |
+
|
| 141 |
+
with gr.Blocks(theme="soft") as demo:
|
| 142 |
gr.Markdown(
|
| 143 |
f"# SmolLM2-360M-Instruct (or TinyLlama GGUF) on ZeroCPU\n"
|
| 144 |
f"This Space demonstrates an LLM for efficient CPU-only inference. "
|
|
|
|
| 146 |
f"like TinyLlama) due to better CPU performance than `{ORIGINAL_MODEL_ID}` "
|
| 147 |
f"without GGUF. Expect varied responses each run due to randomized generation."
|
| 148 |
)
|
| 149 |
+
|
| 150 |
+
# This is the key change: explicitly placing the chat_interface component
|
|
|
|
|
|
|
|
|
|
| 151 |
chat_interface = gr.ChatInterface(
|
| 152 |
fn=predict_chat,
|
| 153 |
+
chatbot=chatbot_component,
|
| 154 |
textbox=gr.Textbox(
|
| 155 |
placeholder="Ask me a question...",
|
| 156 |
container=False,
|
| 157 |
scale=7
|
| 158 |
),
|
|
|
|
| 159 |
examples=[
|
| 160 |
["What is the capital of France?"],
|
| 161 |
["Can you tell me a fun fact about outer space?"],
|
| 162 |
["What's the best way to stay motivated?"],
|
| 163 |
],
|
| 164 |
+
cache_examples=False,
|
|
|
|
| 165 |
)
|
| 166 |
+
|
| 167 |
+
# Now explicitly place the chat_interface component into the Blocks layout
|
| 168 |
+
chat_interface.render()
|
| 169 |
+
|
| 170 |
+
# The clear button is typically below the chat interface
|
| 171 |
gr.ClearButton(components=[chatbot_component])
|
| 172 |
|
|
|
|
|
|
|
| 173 |
chatbot_component.value = [[None, initial_chatbot_message]]
|
| 174 |
|
| 175 |
|