Spaces:

PioTio
/

AIMan

Sleeping

App Files Community

PioTio committed on Feb 16

Commit

e989a17

verified ·

1 Parent(s): c39961e

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +25 -0

app.py CHANGED Viewed

@@ -28,6 +28,7 @@ except Exception:
 # Config / defaults
 # ---------------------------------------------------------------------------
 DEFAULT_MODEL = "PioTio/Nanbeige2.5"
 DEFAULT_SYSTEM_PROMPT = "You are a helpful, honest assistant. Answer succinctly unless asked otherwise."
 # globals populated by load_model()
@@ -306,6 +307,18 @@ def submit_message(user_message: str, history, system_prompt: str, temperature:
     prompt = build_prompt(pairs[:-1], user_message, system_prompt, max_history)
     if stream:
         # stream partial assistant outputs
         for partial in _generate_stream(prompt, temperature, top_p, top_k, max_new_tokens):
@@ -363,6 +376,7 @@ with gr.Blocks(title="Nanbeige2.5 — Chat UI") as demo:
     with gr.Row():
         model_input = gr.Textbox(value=DEFAULT_MODEL, label="Model repo (HF)", interactive=True)
         load_btn = gr.Button("Load model")
         model_status = gr.Textbox(value="Model not loaded", label="Status", interactive=False)
     with gr.Row():
@@ -427,6 +441,17 @@ with gr.Blocks(title="Nanbeige2.5 — Chat UI") as demo:
     else:
         model_status.value = _bg_initial_load()
     gr.Markdown("---\n**Tips:** select GPU hardware for smoother streaming and enable 4-bit bitsandbytes by installing `bitsandbytes` in `requirements.txt`.")

 # Config / defaults
 # ---------------------------------------------------------------------------
 DEFAULT_MODEL = "PioTio/Nanbeige2.5"
+CPU_DEMO_MODEL = "distilgpt2"  # fast, small CPU-friendly fallback for demos
 DEFAULT_SYSTEM_PROMPT = "You are a helpful, honest assistant. Answer succinctly unless asked otherwise."
 # globals populated by load_model()
     prompt = build_prompt(pairs[:-1], user_message, system_prompt, max_history)
+    # If user is running the full Nanbeige model on CPU, warn and suggest options
+    if MODEL_NAME == DEFAULT_MODEL and DEVICE == "cpu":
+        warning = (
+            "⚠️ **Nanbeige is too large for CPU inference and will be extremely slow.**\n\n"
+            "Options:\n"
+            "- Enable GPU in Space settings (recommended)\n"
+            f"- Click **Load fast CPU demo ({CPU_DEMO_MODEL})** for a quick, low-cost demo\n"
+            "- Or set `ALLOW_CPU_NANBEIGE=1` in the server env to force CPU generation (not recommended)")
+        pairs[-1] = (user_message, warning)
+        yield pairs, ""
+        return
     if stream:
         # stream partial assistant outputs
         for partial in _generate_stream(prompt, temperature, top_p, top_k, max_new_tokens):
     with gr.Row():
         model_input = gr.Textbox(value=DEFAULT_MODEL, label="Model repo (HF)", interactive=True)
         load_btn = gr.Button("Load model")
+        model_demo_btn = gr.Button(f"Load fast CPU demo ({CPU_DEMO_MODEL})")
         model_status = gr.Textbox(value="Model not loaded", label="Status", interactive=False)
     with gr.Row():
     else:
         model_status.value = _bg_initial_load()
+    # CPU warning / demo hint (visible in UI)
+    gr.Markdown("""
+**⚠️ If this Space is running on CPU, `PioTio/Nanbeige2.5` will be extremely slow.**
+- Enable GPU in Space Settings for real-time use.
+- Or click **Load fast CPU demo (distilgpt2)** for an immediate, low-cost demo reply.
+""")
+    # wire demo button
+    model_demo_btn.click(fn=lambda: load_model_ui(CPU_DEMO_MODEL), inputs=None, outputs=model_status)
     gr.Markdown("---\n**Tips:** select GPU hardware for smoother streaming and enable 4-bit bitsandbytes by installing `bitsandbytes` in `requirements.txt`.")