Alikestocode committed on
Commit
03689e3
·
1 Parent(s): 06aef1b

Fix Gradio UI structure and add comprehensive fallback logging

Browse files

- Fix UI structure: move buttons to separate Row, fix Column nesting
- Add detailed fallback chain logging with emoji indicators
- Show clear progression: vLLM → Transformers AWQ → BitsAndBytes → FP16/FP32
- Improve error messages to show which fallback path is being used
- All fallback paths now properly logged for debugging

Files changed (1) hide show
  1. app.py +81 -59
app.py CHANGED
@@ -212,41 +212,54 @@ def load_awq_pipeline(repo: str, tokenizer):
212
 
213
 
214
  def load_pipeline(model_name: str):
215
- """Load model with vLLM (preferred) or Transformers (fallback)."""
 
 
 
 
 
 
 
 
216
  # Try vLLM first (best performance with native AWQ support via llm-compressor)
217
  # vLLM handles AWQ natively, so AutoAWQ deprecation doesn't affect us
218
  if VLLM_AVAILABLE:
219
  try:
220
- print(f"Attempting to load {model_name} with vLLM (native AWQ support)...")
221
  return load_vllm_model(model_name)
222
  except Exception as exc:
223
- print(f"⚠️ vLLM load failed, falling back to Transformers: {exc}")
 
224
  import traceback
225
  traceback.print_exc()
226
 
227
  # Fallback to Transformers pipeline
228
  if model_name in PIPELINES:
 
229
  return PIPELINES[model_name]
230
 
231
  repo = MODELS[model_name]["repo_id"]
232
  tokenizer = get_tokenizer(repo)
233
 
234
- # Try AWQ first if available
235
  if AWQ_AVAILABLE:
236
  try:
237
- print(f"Loading {repo} with AWQ quantization...")
238
  pipe = load_awq_pipeline(repo, tokenizer)
239
  PIPELINES[model_name] = pipe
240
  _schedule_background_warm(model_name)
241
  # Warm kernels immediately after loading
242
  Thread(target=lambda: _warm_kernels(model_name), daemon=True).start()
 
243
  return pipe
244
  except Exception as exc:
245
- print(f"AWQ load failed for {repo}: {exc}. Falling back to BitsAndBytes.")
 
246
 
247
  # Fallback to BitsAndBytes 8-bit
248
  if BITSANDBYTES_AVAILABLE:
249
  try:
 
250
  quant_config = BitsAndBytesConfig(load_in_8bit=True)
251
  model_kwargs = {"quantization_config": quant_config}
252
  if FLASH_ATTN_AVAILABLE:
@@ -275,13 +288,17 @@ def load_pipeline(model_name: str):
275
 
276
  PIPELINES[model_name] = pipe
277
  _schedule_background_warm(model_name)
 
278
  return pipe
279
  except Exception as exc:
280
- print(f"8-bit load failed for {repo}: {exc}. Falling back to higher precision.")
 
281
 
282
- # Fallback to bfloat16/fp16/fp32
283
  for dtype in (torch.bfloat16, torch.float16, torch.float32):
 
284
  try:
 
285
  model_kwargs = {}
286
  if FLASH_ATTN_AVAILABLE:
287
  model_kwargs["attn_implementation"] = "flash_attention_2"
@@ -308,11 +325,14 @@ def load_pipeline(model_name: str):
308
 
309
  PIPELINES[model_name] = pipe
310
  _schedule_background_warm(model_name)
 
311
  return pipe
312
- except Exception:
 
313
  continue
314
 
315
- # Final fallback
 
316
  model_kwargs = {}
317
  if FLASH_ATTN_AVAILABLE:
318
  model_kwargs["attn_implementation"] = "flash_attention_2"
@@ -338,6 +358,7 @@ def load_pipeline(model_name: str):
338
 
339
  PIPELINES[model_name] = pipe
340
  _schedule_background_warm(model_name)
 
341
  return pipe
342
 
343
 
@@ -788,56 +809,57 @@ def build_ui():
788
  """) as demo:
789
  gr.Markdown("# πŸ›°οΈ Router Control Room β€” ZeroGPU" )
790
  gr.Markdown(description)
791
-
792
- with gr.Row():
793
- with gr.Column(scale=3):
794
- user_task = gr.Textbox(
795
- label="User Task / Problem Statement",
796
- placeholder="Describe the homework-style query that needs routing...",
797
- lines=8,
798
- value="Explain how to solve a constrained optimization homework problem that mixes calculus and coding steps.",
799
- )
800
- context = gr.Textbox(
801
- label="Supporting Context (optional)",
802
- placeholder="Paste any retrieved evidence, PDFs, or rubric notes.",
803
- lines=4,
804
- )
805
- acceptance = gr.Textbox(
806
- label="Acceptance Criteria",
807
- placeholder="Bullet list of 'definition of done' checks.",
808
- lines=3,
809
- value="- Provide citations for every claim.\n- Ensure /math verifies /code output.",
810
- )
811
- extra_guidance = gr.Textbox(
812
- label="Additional Guidance",
813
- placeholder="Special constraints, tools to avoid, etc.",
814
- lines=3,
815
- )
816
- with gr.Column(scale=2):
817
- model_choice = gr.Dropdown(
818
- label="Router Checkpoint",
819
- choices=list(MODELS.keys()),
820
- value=list(MODELS.keys())[0] if MODELS else None,
821
- allow_custom_value=False,
822
- )
823
- difficulty = gr.Radio(
824
- label="Difficulty Tier",
825
- choices=["introductory", "intermediate", "advanced"],
826
- value="advanced",
827
- interactive=True,
828
- )
829
- tags = gr.Textbox(
830
- label="Tags",
831
- placeholder="Comma-separated e.g. calculus, optimization, python",
832
- value="calculus, optimization, python",
833
- )
834
- max_new_tokens = gr.Slider(256, 20000, value=16000, step=32, label="Max New Tokens")
835
- temperature = gr.Slider(0.0, 1.5, value=0.2, step=0.05, label="Temperature")
836
- top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
837
- gpu_duration = gr.Slider(60, 1800, value=600, step=60, label="GPU Duration (seconds)", info="Maximum GPU time allocation for this request")
838
 
839
- generate_btn = gr.Button("Generate Router Plan", variant="primary")
840
- clear_btn = gr.Button("Clear", variant="secondary")
 
841
 
842
  with gr.Row():
843
  raw_output = gr.Textbox(label="Raw Model Output", lines=12)
 
212
 
213
 
214
  def load_pipeline(model_name: str):
215
+ """Load model with vLLM (preferred) or Transformers (fallback).
216
+
217
+ Fallback chain:
218
+ 1. vLLM with AWQ (best performance, continuous batching)
219
+ 2. vLLM with FP16 (if AWQ not available)
220
+ 3. Transformers with AWQ (via AutoAWQ - deprecated but functional)
221
+ 4. Transformers with BitsAndBytes 8-bit
222
+ 5. Transformers with FP16/FP32
223
+ """
224
  # Try vLLM first (best performance with native AWQ support via llm-compressor)
225
  # vLLM handles AWQ natively, so AutoAWQ deprecation doesn't affect us
226
  if VLLM_AVAILABLE:
227
  try:
228
+ print(f"πŸ”„ Attempting to load {model_name} with vLLM (native AWQ support)...")
229
  return load_vllm_model(model_name)
230
  except Exception as exc:
231
+ print(f"⚠️ vLLM load failed: {exc}")
232
+ print(f" β†’ Falling back to Transformers pipeline...")
233
  import traceback
234
  traceback.print_exc()
235
 
236
  # Fallback to Transformers pipeline
237
  if model_name in PIPELINES:
238
+ print(f"βœ… Using cached Transformers pipeline for {model_name}")
239
  return PIPELINES[model_name]
240
 
241
  repo = MODELS[model_name]["repo_id"]
242
  tokenizer = get_tokenizer(repo)
243
 
244
+ # Try AWQ first if available (Transformers fallback path)
245
  if AWQ_AVAILABLE:
246
  try:
247
+ print(f"πŸ”„ Loading {repo} with Transformers + AutoAWQ (fallback path)...")
248
  pipe = load_awq_pipeline(repo, tokenizer)
249
  PIPELINES[model_name] = pipe
250
  _schedule_background_warm(model_name)
251
  # Warm kernels immediately after loading
252
  Thread(target=lambda: _warm_kernels(model_name), daemon=True).start()
253
+ print(f"βœ… Transformers + AutoAWQ pipeline loaded: {model_name}")
254
  return pipe
255
  except Exception as exc:
256
+ print(f"⚠️ AutoAWQ load failed for {repo}: {exc}")
257
+ print(f" β†’ Falling back to BitsAndBytes 8-bit...")
258
 
259
  # Fallback to BitsAndBytes 8-bit
260
  if BITSANDBYTES_AVAILABLE:
261
  try:
262
+ print(f"πŸ”„ Loading {repo} with BitsAndBytes 8-bit quantization...")
263
  quant_config = BitsAndBytesConfig(load_in_8bit=True)
264
  model_kwargs = {"quantization_config": quant_config}
265
  if FLASH_ATTN_AVAILABLE:
 
288
 
289
  PIPELINES[model_name] = pipe
290
  _schedule_background_warm(model_name)
291
+ print(f"βœ… BitsAndBytes 8-bit pipeline loaded: {model_name}")
292
  return pipe
293
  except Exception as exc:
294
+ print(f"⚠️ BitsAndBytes 8-bit load failed for {repo}: {exc}")
295
+ print(f" β†’ Falling back to FP16/FP32...")
296
 
297
+ # Fallback to bfloat16/fp16/fp32 (unquantized)
298
  for dtype in (torch.bfloat16, torch.float16, torch.float32):
299
+ dtype_name = {torch.bfloat16: "bfloat16", torch.float16: "float16", torch.float32: "float32"}[dtype]
300
  try:
301
+ print(f"πŸ”„ Loading {repo} with {dtype_name} precision...")
302
  model_kwargs = {}
303
  if FLASH_ATTN_AVAILABLE:
304
  model_kwargs["attn_implementation"] = "flash_attention_2"
 
325
 
326
  PIPELINES[model_name] = pipe
327
  _schedule_background_warm(model_name)
328
+ print(f"βœ… {dtype_name} pipeline loaded: {model_name}")
329
  return pipe
330
+ except Exception as exc:
331
+ print(f"⚠️ {dtype_name} load failed: {exc}")
332
  continue
333
 
334
+ # Final fallback (no quantization, no FlashAttention)
335
+ print(f"⚠️ All quantization methods failed, using basic pipeline...")
336
  model_kwargs = {}
337
  if FLASH_ATTN_AVAILABLE:
338
  model_kwargs["attn_implementation"] = "flash_attention_2"
 
358
 
359
  PIPELINES[model_name] = pipe
360
  _schedule_background_warm(model_name)
361
+ print(f"βœ… Basic pipeline loaded: {model_name}")
362
  return pipe
363
 
364
 
 
809
  """) as demo:
810
  gr.Markdown("# πŸ›°οΈ Router Control Room β€” ZeroGPU" )
811
  gr.Markdown(description)
812
+
813
+ with gr.Row():
814
+ with gr.Column(scale=3):
815
+ user_task = gr.Textbox(
816
+ label="User Task / Problem Statement",
817
+ placeholder="Describe the homework-style query that needs routing...",
818
+ lines=8,
819
+ value="Explain how to solve a constrained optimization homework problem that mixes calculus and coding steps.",
820
+ )
821
+ context = gr.Textbox(
822
+ label="Supporting Context (optional)",
823
+ placeholder="Paste any retrieved evidence, PDFs, or rubric notes.",
824
+ lines=4,
825
+ )
826
+ acceptance = gr.Textbox(
827
+ label="Acceptance Criteria",
828
+ placeholder="Bullet list of 'definition of done' checks.",
829
+ lines=3,
830
+ value="- Provide citations for every claim.\n- Ensure /math verifies /code output.",
831
+ )
832
+ extra_guidance = gr.Textbox(
833
+ label="Additional Guidance",
834
+ placeholder="Special constraints, tools to avoid, etc.",
835
+ lines=3,
836
+ )
837
+ with gr.Column(scale=2):
838
+ model_choice = gr.Dropdown(
839
+ label="Router Checkpoint",
840
+ choices=list(MODELS.keys()),
841
+ value=list(MODELS.keys())[0] if MODELS else None,
842
+ allow_custom_value=False,
843
+ )
844
+ difficulty = gr.Radio(
845
+ label="Difficulty Tier",
846
+ choices=["introductory", "intermediate", "advanced"],
847
+ value="advanced",
848
+ interactive=True,
849
+ )
850
+ tags = gr.Textbox(
851
+ label="Tags",
852
+ placeholder="Comma-separated e.g. calculus, optimization, python",
853
+ value="calculus, optimization, python",
854
+ )
855
+ max_new_tokens = gr.Slider(256, 20000, value=16000, step=32, label="Max New Tokens")
856
+ temperature = gr.Slider(0.0, 1.5, value=0.2, step=0.05, label="Temperature")
857
+ top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
858
+ gpu_duration = gr.Slider(60, 1800, value=600, step=60, label="GPU Duration (seconds)", info="Maximum GPU time allocation for this request")
859
 
860
+ with gr.Row():
861
+ generate_btn = gr.Button("Generate Router Plan", variant="primary", scale=1)
862
+ clear_btn = gr.Button("Clear", variant="secondary", scale=1)
863
 
864
  with gr.Row():
865
  raw_output = gr.Textbox(label="Raw Model Output", lines=12)