Luigi committed on
Commit
6604bf5
·
1 Parent(s): fec94b5

feat: add CPU thread configuration UI

Browse files

Add hardware configuration section in Advanced Settings with:
- CPU Thread Preset dropdown: HF Spaces Free (2 vCPUs), CPU Upgrade (8 vCPUs), or Custom
- Custom thread count slider (1-32, visible only when Custom selected)
- Dynamic thread count passing through load_model() and summarize_streaming()

Enables users to optimize performance for local deployment or specific HF Spaces tiers.

Files changed (1) hide show
  1. app.py +55 -10
app.py CHANGED
@@ -363,18 +363,19 @@ AVAILABLE_MODELS = {
363
  DEFAULT_MODEL_KEY = "qwen3_600m_q4"
364
 
365
 
366
- def load_model(model_key: str = None) -> Tuple[Llama, str]:
367
  """
368
  Load model with CPU optimizations. Only reloads if model changes.
369
-
370
  Args:
371
  model_key: Model identifier from AVAILABLE_MODELS
372
-
 
373
  Returns:
374
  Tuple of (loaded_model, info_message)
375
  """
376
  global llm, converter, current_model_key
377
-
378
  # Default to current or default model
379
  if model_key is None:
380
  model_key = current_model_key if current_model_key else DEFAULT_MODEL_KEY
@@ -423,8 +424,8 @@ def load_model(model_key: str = None) -> Tuple[Llama, str]:
423
  filename=model["filename"],
424
  n_ctx=n_ctx,
425
  n_batch=min(2048, n_ctx), # Batch size for throughput
426
- n_threads=2, # Match 2 vCPUs
427
- n_threads_batch=2, # Parallel batch processing
428
  n_gpu_layers=n_gpu_layers, # 0=CPU only, -1=all GPU layers (if available)
429
  verbose=False,
430
  seed=1337,
@@ -660,6 +661,8 @@ def summarize_streaming(
660
  top_p: float = None,
661
  top_k: int = None,
662
  output_language: str = "en",
 
 
663
  ) -> Generator[Tuple[str, str, str, dict], None, None]:
664
  """
665
  Stream summary generation from uploaded file.
@@ -696,7 +699,16 @@ def summarize_streaming(
696
  "truncation_info": {},
697
  }
698
  global llm, converter
699
-
 
 
 
 
 
 
 
 
 
700
  model = AVAILABLE_MODELS[model_key]
701
  usable_max = min(model["max_context"], MAX_USABLE_CTX)
702
 
@@ -775,7 +787,7 @@ def summarize_streaming(
775
  # Load model (no-op if already loaded) with timing
776
  model_load_start = time.time()
777
  try:
778
- llm, load_msg = load_model(model_key)
779
  logger.info(load_msg)
780
  metrics["model_load_time_ms"] = (time.time() - model_load_start) * 1000
781
  except Exception as e:
@@ -1176,6 +1188,29 @@ def create_interface():
1176
 
1177
  with gr.Accordion("⚙️ Advanced Settings", open=False):
1178
  with gr.Group(elem_classes=["advanced-settings"]):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1179
  temperature_slider = gr.Slider(
1180
  minimum=0.0,
1181
  maximum=2.0,
@@ -1267,7 +1302,7 @@ def create_interface():
1267
  # Event handlers
1268
  submit_btn.click(
1269
  fn=summarize_streaming,
1270
- inputs=[file_input, model_dropdown, enable_reasoning, max_tokens, temperature_slider, top_p, top_k, language_selector],
1271
  outputs=[thinking_output, summary_output, info_output, metrics_state],
1272
  show_progress="full"
1273
  )
@@ -1278,7 +1313,17 @@ def create_interface():
1278
  inputs=[model_dropdown],
1279
  outputs=[temperature_slider, top_p, top_k, info_output]
1280
  )
1281
-
 
 
 
 
 
 
 
 
 
 
1282
  # Copy buttons
1283
  copy_summary_btn.click(
1284
  fn=lambda x: x,
 
363
  DEFAULT_MODEL_KEY = "qwen3_600m_q4"
364
 
365
 
366
+ def load_model(model_key: str = None, n_threads: int = 2) -> Tuple[Llama, str]:
367
  """
368
  Load model with CPU optimizations. Only reloads if model changes.
369
+
370
  Args:
371
  model_key: Model identifier from AVAILABLE_MODELS
372
+ n_threads: Number of CPU threads to use for inference
373
+
374
  Returns:
375
  Tuple of (loaded_model, info_message)
376
  """
377
  global llm, converter, current_model_key
378
+
379
  # Default to current or default model
380
  if model_key is None:
381
  model_key = current_model_key if current_model_key else DEFAULT_MODEL_KEY
 
424
  filename=model["filename"],
425
  n_ctx=n_ctx,
426
  n_batch=min(2048, n_ctx), # Batch size for throughput
427
+ n_threads=n_threads, # Configurable thread count
428
+ n_threads_batch=n_threads, # Parallel batch processing
429
  n_gpu_layers=n_gpu_layers, # 0=CPU only, -1=all GPU layers (if available)
430
  verbose=False,
431
  seed=1337,
 
661
  top_p: float = None,
662
  top_k: int = None,
663
  output_language: str = "en",
664
+ thread_config: str = "free",
665
+ custom_threads: int = 4,
666
  ) -> Generator[Tuple[str, str, str, dict], None, None]:
667
  """
668
  Stream summary generation from uploaded file.
 
699
  "truncation_info": {},
700
  }
701
  global llm, converter
702
+
703
+ # Determine thread count based on configuration preset
704
+ thread_preset_map = {
705
+ "free": 2, # HF Spaces Free Tier: 2 vCPUs
706
+ "upgrade": 8, # HF Spaces CPU Upgrade: 8 vCPUs
707
+ "custom": custom_threads, # User-specified thread count
708
+ }
709
+ n_threads = thread_preset_map.get(thread_config, 2)
710
+ logger.info(f"Using {n_threads} threads (config: {thread_config})")
711
+
712
  model = AVAILABLE_MODELS[model_key]
713
  usable_max = min(model["max_context"], MAX_USABLE_CTX)
714
 
 
787
  # Load model (no-op if already loaded) with timing
788
  model_load_start = time.time()
789
  try:
790
+ llm, load_msg = load_model(model_key, n_threads=n_threads)
791
  logger.info(load_msg)
792
  metrics["model_load_time_ms"] = (time.time() - model_load_start) * 1000
793
  except Exception as e:
 
1188
 
1189
  with gr.Accordion("⚙️ Advanced Settings", open=False):
1190
  with gr.Group(elem_classes=["advanced-settings"]):
1191
+ gr.HTML('<div class="section-header" style="margin-top: 10px;"><span class="section-icon">🖥️</span> Hardware Configuration</div>')
1192
+
1193
+ thread_config_dropdown = gr.Dropdown(
1194
+ choices=[
1195
+ ("HF Spaces Free Tier (2 vCPUs)", "free"),
1196
+ ("HF Spaces CPU Upgrade (8 vCPUs)", "upgrade"),
1197
+ ("Custom (manual)", "custom"),
1198
+ ],
1199
+ value="free",
1200
+ label="CPU Thread Preset",
1201
+ info="Select hardware tier or specify custom thread count"
1202
+ )
1203
+
1204
+ custom_threads_slider = gr.Slider(
1205
+ minimum=1,
1206
+ maximum=32,
1207
+ value=4,
1208
+ step=1,
1209
+ label="Custom Thread Count",
1210
+ info="Number of CPU threads for model inference (1-32)",
1211
+ visible=False
1212
+ )
1213
+
1214
  temperature_slider = gr.Slider(
1215
  minimum=0.0,
1216
  maximum=2.0,
 
1302
  # Event handlers
1303
  submit_btn.click(
1304
  fn=summarize_streaming,
1305
+ inputs=[file_input, model_dropdown, enable_reasoning, max_tokens, temperature_slider, top_p, top_k, language_selector, thread_config_dropdown, custom_threads_slider],
1306
  outputs=[thinking_output, summary_output, info_output, metrics_state],
1307
  show_progress="full"
1308
  )
 
1313
  inputs=[model_dropdown],
1314
  outputs=[temperature_slider, top_p, top_k, info_output]
1315
  )
1316
+
1317
+ # Show/hide custom thread slider based on selection
1318
+ def toggle_custom_threads(thread_config):
1319
+ return gr.update(visible=(thread_config == "custom"))
1320
+
1321
+ thread_config_dropdown.change(
1322
+ fn=toggle_custom_threads,
1323
+ inputs=[thread_config_dropdown],
1324
+ outputs=[custom_threads_slider]
1325
+ )
1326
+
1327
  # Copy buttons
1328
  copy_summary_btn.click(
1329
  fn=lambda x: x,