Luigi committed on
Commit
6604bf5
·
1 Parent(s): fec94b5

feat: add CPU thread configuration UI

Browse files

Add hardware configuration section in Advanced Settings with:
- CPU Thread Preset dropdown: HF Spaces Free (2 vCPUs), CPU Upgrade (8 vCPUs), or Custom
- Custom thread count slider (1-32, visible only when Custom selected)
- Dynamic thread count passing through load_model() and summarize_streaming()

Enables users to optimize performance for local deployment or specific HF Spaces tiers.

Files changed (1) hide show
  1. app.py +55 -10
app.py CHANGED
@@ -363,18 +363,19 @@ AVAILABLE_MODELS = {
363
  DEFAULT_MODEL_KEY = "qwen3_600m_q4"
364
 
365
 
366
- def load_model(model_key: str = None) -> Tuple[Llama, str]:
367
  """
368
  Load model with CPU optimizations. Only reloads if model changes.
369
-
370
  Args:
371
  model_key: Model identifier from AVAILABLE_MODELS
372
-
 
373
  Returns:
374
  Tuple of (loaded_model, info_message)
375
  """
376
  global llm, converter, current_model_key
377
-
378
  # Default to current or default model
379
  if model_key is None:
380
  model_key = current_model_key if current_model_key else DEFAULT_MODEL_KEY
@@ -423,8 +424,8 @@ def load_model(model_key: str = None) -> Tuple[Llama, str]:
423
  filename=model["filename"],
424
  n_ctx=n_ctx,
425
  n_batch=min(2048, n_ctx), # Batch size for throughput
426
- n_threads=2, # Match 2 vCPUs
427
- n_threads_batch=2, # Parallel batch processing
428
  n_gpu_layers=n_gpu_layers, # 0=CPU only, -1=all GPU layers (if available)
429
  verbose=False,
430
  seed=1337,
@@ -660,6 +661,8 @@ def summarize_streaming(
660
  top_p: float = None,
661
  top_k: int = None,
662
  output_language: str = "en",
 
 
663
  ) -> Generator[Tuple[str, str, str, dict], None, None]:
664
  """
665
  Stream summary generation from uploaded file.
@@ -696,7 +699,16 @@ def summarize_streaming(
696
  "truncation_info": {},
697
  }
698
  global llm, converter
699
-
 
 
 
 
 
 
 
 
 
700
  model = AVAILABLE_MODELS[model_key]
701
  usable_max = min(model["max_context"], MAX_USABLE_CTX)
702
 
@@ -775,7 +787,7 @@ def summarize_streaming(
775
  # Load model (no-op if already loaded) with timing
776
  model_load_start = time.time()
777
  try:
778
- llm, load_msg = load_model(model_key)
779
  logger.info(load_msg)
780
  metrics["model_load_time_ms"] = (time.time() - model_load_start) * 1000
781
  except Exception as e:
@@ -1176,6 +1188,29 @@ def create_interface():
1176
 
1177
  with gr.Accordion("⚙️ Advanced Settings", open=False):
1178
  with gr.Group(elem_classes=["advanced-settings"]):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1179
  temperature_slider = gr.Slider(
1180
  minimum=0.0,
1181
  maximum=2.0,
@@ -1267,7 +1302,7 @@ def create_interface():
1267
  # Event handlers
1268
  submit_btn.click(
1269
  fn=summarize_streaming,
1270
- inputs=[file_input, model_dropdown, enable_reasoning, max_tokens, temperature_slider, top_p, top_k, language_selector],
1271
  outputs=[thinking_output, summary_output, info_output, metrics_state],
1272
  show_progress="full"
1273
  )
@@ -1278,7 +1313,17 @@ def create_interface():
1278
  inputs=[model_dropdown],
1279
  outputs=[temperature_slider, top_p, top_k, info_output]
1280
  )
1281
-
 
 
 
 
 
 
 
 
 
 
1282
  # Copy buttons
1283
  copy_summary_btn.click(
1284
  fn=lambda x: x,
 
363
  DEFAULT_MODEL_KEY = "qwen3_600m_q4"
364
 
365
 
366
+ def load_model(model_key: str = None, n_threads: int = 2) -> Tuple[Llama, str]:
367
  """
368
  Load model with CPU optimizations. Only reloads if model changes.
369
+
370
  Args:
371
  model_key: Model identifier from AVAILABLE_MODELS
372
+ n_threads: Number of CPU threads to use for inference
373
+
374
  Returns:
375
  Tuple of (loaded_model, info_message)
376
  """
377
  global llm, converter, current_model_key
378
+
379
  # Default to current or default model
380
  if model_key is None:
381
  model_key = current_model_key if current_model_key else DEFAULT_MODEL_KEY
 
424
  filename=model["filename"],
425
  n_ctx=n_ctx,
426
  n_batch=min(2048, n_ctx), # Batch size for throughput
427
+ n_threads=n_threads, # Configurable thread count
428
+ n_threads_batch=n_threads, # Parallel batch processing
429
  n_gpu_layers=n_gpu_layers, # 0=CPU only, -1=all GPU layers (if available)
430
  verbose=False,
431
  seed=1337,
 
661
  top_p: float = None,
662
  top_k: int = None,
663
  output_language: str = "en",
664
+ thread_config: str = "free",
665
+ custom_threads: int = 4,
666
  ) -> Generator[Tuple[str, str, str, dict], None, None]:
667
  """
668
  Stream summary generation from uploaded file.
 
699
  "truncation_info": {},
700
  }
701
  global llm, converter
702
+
703
+ # Determine thread count based on configuration preset
704
+ thread_preset_map = {
705
+ "free": 2, # HF Spaces Free Tier: 2 vCPUs
706
+ "upgrade": 8, # HF Spaces CPU Upgrade: 8 vCPUs
707
+ "custom": custom_threads, # User-specified thread count
708
+ }
709
+ n_threads = thread_preset_map.get(thread_config, 2)
710
+ logger.info(f"Using {n_threads} threads (config: {thread_config})")
711
+
712
  model = AVAILABLE_MODELS[model_key]
713
  usable_max = min(model["max_context"], MAX_USABLE_CTX)
714
 
 
787
  # Load model (no-op if already loaded) with timing
788
  model_load_start = time.time()
789
  try:
790
+ llm, load_msg = load_model(model_key, n_threads=n_threads)
791
  logger.info(load_msg)
792
  metrics["model_load_time_ms"] = (time.time() - model_load_start) * 1000
793
  except Exception as e:
 
1188
 
1189
  with gr.Accordion("⚙️ Advanced Settings", open=False):
1190
  with gr.Group(elem_classes=["advanced-settings"]):
1191
+ gr.HTML('<div class="section-header" style="margin-top: 10px;"><span class="section-icon">🖥️</span> Hardware Configuration</div>')
1192
+
1193
+ thread_config_dropdown = gr.Dropdown(
1194
+ choices=[
1195
+ ("HF Spaces Free Tier (2 vCPUs)", "free"),
1196
+ ("HF Spaces CPU Upgrade (8 vCPUs)", "upgrade"),
1197
+ ("Custom (manual)", "custom"),
1198
+ ],
1199
+ value="free",
1200
+ label="CPU Thread Preset",
1201
+ info="Select hardware tier or specify custom thread count"
1202
+ )
1203
+
1204
+ custom_threads_slider = gr.Slider(
1205
+ minimum=1,
1206
+ maximum=32,
1207
+ value=4,
1208
+ step=1,
1209
+ label="Custom Thread Count",
1210
+ info="Number of CPU threads for model inference (1-32)",
1211
+ visible=False
1212
+ )
1213
+
1214
  temperature_slider = gr.Slider(
1215
  minimum=0.0,
1216
  maximum=2.0,
 
1302
  # Event handlers
1303
  submit_btn.click(
1304
  fn=summarize_streaming,
1305
+ inputs=[file_input, model_dropdown, enable_reasoning, max_tokens, temperature_slider, top_p, top_k, language_selector, thread_config_dropdown, custom_threads_slider],
1306
  outputs=[thinking_output, summary_output, info_output, metrics_state],
1307
  show_progress="full"
1308
  )
 
1313
  inputs=[model_dropdown],
1314
  outputs=[temperature_slider, top_p, top_k, info_output]
1315
  )
1316
+
1317
+ # Show/hide custom thread slider based on selection
1318
+ def toggle_custom_threads(thread_config):
1319
+ return gr.update(visible=(thread_config == "custom"))
1320
+
1321
+ thread_config_dropdown.change(
1322
+ fn=toggle_custom_threads,
1323
+ inputs=[thread_config_dropdown],
1324
+ outputs=[custom_threads_slider]
1325
+ )
1326
+
1327
  # Copy buttons
1328
  copy_summary_btn.click(
1329
  fn=lambda x: x,