Spaces: Running
feat: add CPU thread configuration UI
Browse files

Add hardware configuration section in Advanced Settings with:
- CPU Thread Preset dropdown: HF Spaces Free (2 vCPUs), CPU Upgrade (8 vCPUs), or Custom
- Custom thread count slider (1-32, visible only when Custom selected)
- Dynamic thread count passing through load_model() and summarize_streaming()
Enables users to optimize performance for local deployment or specific HF Spaces tiers.
app.py
CHANGED
|
@@ -363,18 +363,19 @@ AVAILABLE_MODELS = {
|
|
| 363 |
DEFAULT_MODEL_KEY = "qwen3_600m_q4"
|
| 364 |
|
| 365 |
|
| 366 |
-
def load_model(model_key: str = None) -> Tuple[Llama, str]:
|
| 367 |
"""
|
| 368 |
Load model with CPU optimizations. Only reloads if model changes.
|
| 369 |
-
|
| 370 |
Args:
|
| 371 |
model_key: Model identifier from AVAILABLE_MODELS
|
| 372 |
-
|
|
|
|
| 373 |
Returns:
|
| 374 |
Tuple of (loaded_model, info_message)
|
| 375 |
"""
|
| 376 |
global llm, converter, current_model_key
|
| 377 |
-
|
| 378 |
# Default to current or default model
|
| 379 |
if model_key is None:
|
| 380 |
model_key = current_model_key if current_model_key else DEFAULT_MODEL_KEY
|
|
@@ -423,8 +424,8 @@ def load_model(model_key: str = None) -> Tuple[Llama, str]:
|
|
| 423 |
filename=model["filename"],
|
| 424 |
n_ctx=n_ctx,
|
| 425 |
n_batch=min(2048, n_ctx), # Batch size for throughput
|
| 426 |
-
n_threads=
|
| 427 |
-
n_threads_batch=
|
| 428 |
n_gpu_layers=n_gpu_layers, # 0=CPU only, -1=all GPU layers (if available)
|
| 429 |
verbose=False,
|
| 430 |
seed=1337,
|
|
@@ -660,6 +661,8 @@ def summarize_streaming(
|
|
| 660 |
top_p: float = None,
|
| 661 |
top_k: int = None,
|
| 662 |
output_language: str = "en",
|
|
|
|
|
|
|
| 663 |
) -> Generator[Tuple[str, str, str, dict], None, None]:
|
| 664 |
"""
|
| 665 |
Stream summary generation from uploaded file.
|
|
@@ -696,7 +699,16 @@ def summarize_streaming(
|
|
| 696 |
"truncation_info": {},
|
| 697 |
}
|
| 698 |
global llm, converter
|
| 699 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 700 |
model = AVAILABLE_MODELS[model_key]
|
| 701 |
usable_max = min(model["max_context"], MAX_USABLE_CTX)
|
| 702 |
|
|
@@ -775,7 +787,7 @@ def summarize_streaming(
|
|
| 775 |
# Load model (no-op if already loaded) with timing
|
| 776 |
model_load_start = time.time()
|
| 777 |
try:
|
| 778 |
-
llm, load_msg = load_model(model_key)
|
| 779 |
logger.info(load_msg)
|
| 780 |
metrics["model_load_time_ms"] = (time.time() - model_load_start) * 1000
|
| 781 |
except Exception as e:
|
|
@@ -1176,6 +1188,29 @@ def create_interface():
|
|
| 1176 |
|
| 1177 |
with gr.Accordion("⚙️ Advanced Settings", open=False):
|
| 1178 |
with gr.Group(elem_classes=["advanced-settings"]):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1179 |
temperature_slider = gr.Slider(
|
| 1180 |
minimum=0.0,
|
| 1181 |
maximum=2.0,
|
|
@@ -1267,7 +1302,7 @@ def create_interface():
|
|
| 1267 |
# Event handlers
|
| 1268 |
submit_btn.click(
|
| 1269 |
fn=summarize_streaming,
|
| 1270 |
-
inputs=[file_input, model_dropdown, enable_reasoning, max_tokens, temperature_slider, top_p, top_k, language_selector],
|
| 1271 |
outputs=[thinking_output, summary_output, info_output, metrics_state],
|
| 1272 |
show_progress="full"
|
| 1273 |
)
|
|
@@ -1278,7 +1313,17 @@ def create_interface():
|
|
| 1278 |
inputs=[model_dropdown],
|
| 1279 |
outputs=[temperature_slider, top_p, top_k, info_output]
|
| 1280 |
)
|
| 1281 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1282 |
# Copy buttons
|
| 1283 |
copy_summary_btn.click(
|
| 1284 |
fn=lambda x: x,
|
|
|
|
| 363 |
DEFAULT_MODEL_KEY = "qwen3_600m_q4"
|
| 364 |
|
| 365 |
|
| 366 |
+
def load_model(model_key: str = None, n_threads: int = 2) -> Tuple[Llama, str]:
|
| 367 |
"""
|
| 368 |
Load model with CPU optimizations. Only reloads if model changes.
|
| 369 |
+
|
| 370 |
Args:
|
| 371 |
model_key: Model identifier from AVAILABLE_MODELS
|
| 372 |
+
n_threads: Number of CPU threads to use for inference
|
| 373 |
+
|
| 374 |
Returns:
|
| 375 |
Tuple of (loaded_model, info_message)
|
| 376 |
"""
|
| 377 |
global llm, converter, current_model_key
|
| 378 |
+
|
| 379 |
# Default to current or default model
|
| 380 |
if model_key is None:
|
| 381 |
model_key = current_model_key if current_model_key else DEFAULT_MODEL_KEY
|
|
|
|
| 424 |
filename=model["filename"],
|
| 425 |
n_ctx=n_ctx,
|
| 426 |
n_batch=min(2048, n_ctx), # Batch size for throughput
|
| 427 |
+
n_threads=n_threads, # Configurable thread count
|
| 428 |
+
n_threads_batch=n_threads, # Parallel batch processing
|
| 429 |
n_gpu_layers=n_gpu_layers, # 0=CPU only, -1=all GPU layers (if available)
|
| 430 |
verbose=False,
|
| 431 |
seed=1337,
|
|
|
|
| 661 |
top_p: float = None,
|
| 662 |
top_k: int = None,
|
| 663 |
output_language: str = "en",
|
| 664 |
+
thread_config: str = "free",
|
| 665 |
+
custom_threads: int = 4,
|
| 666 |
) -> Generator[Tuple[str, str, str, dict], None, None]:
|
| 667 |
"""
|
| 668 |
Stream summary generation from uploaded file.
|
|
|
|
| 699 |
"truncation_info": {},
|
| 700 |
}
|
| 701 |
global llm, converter
|
| 702 |
+
|
| 703 |
+
# Determine thread count based on configuration preset
|
| 704 |
+
thread_preset_map = {
|
| 705 |
+
"free": 2, # HF Spaces Free Tier: 2 vCPUs
|
| 706 |
+
"upgrade": 8, # HF Spaces CPU Upgrade: 8 vCPUs
|
| 707 |
+
"custom": custom_threads, # User-specified thread count
|
| 708 |
+
}
|
| 709 |
+
n_threads = thread_preset_map.get(thread_config, 2)
|
| 710 |
+
logger.info(f"Using {n_threads} threads (config: {thread_config})")
|
| 711 |
+
|
| 712 |
model = AVAILABLE_MODELS[model_key]
|
| 713 |
usable_max = min(model["max_context"], MAX_USABLE_CTX)
|
| 714 |
|
|
|
|
| 787 |
# Load model (no-op if already loaded) with timing
|
| 788 |
model_load_start = time.time()
|
| 789 |
try:
|
| 790 |
+
llm, load_msg = load_model(model_key, n_threads=n_threads)
|
| 791 |
logger.info(load_msg)
|
| 792 |
metrics["model_load_time_ms"] = (time.time() - model_load_start) * 1000
|
| 793 |
except Exception as e:
|
|
|
|
| 1188 |
|
| 1189 |
with gr.Accordion("⚙️ Advanced Settings", open=False):
|
| 1190 |
with gr.Group(elem_classes=["advanced-settings"]):
|
| 1191 |
+
gr.HTML('<div class="section-header" style="margin-top: 10px;"><span class="section-icon">🖥️</span> Hardware Configuration</div>')
|
| 1192 |
+
|
| 1193 |
+
thread_config_dropdown = gr.Dropdown(
|
| 1194 |
+
choices=[
|
| 1195 |
+
("HF Spaces Free Tier (2 vCPUs)", "free"),
|
| 1196 |
+
("HF Spaces CPU Upgrade (8 vCPUs)", "upgrade"),
|
| 1197 |
+
("Custom (manual)", "custom"),
|
| 1198 |
+
],
|
| 1199 |
+
value="free",
|
| 1200 |
+
label="CPU Thread Preset",
|
| 1201 |
+
info="Select hardware tier or specify custom thread count"
|
| 1202 |
+
)
|
| 1203 |
+
|
| 1204 |
+
custom_threads_slider = gr.Slider(
|
| 1205 |
+
minimum=1,
|
| 1206 |
+
maximum=32,
|
| 1207 |
+
value=4,
|
| 1208 |
+
step=1,
|
| 1209 |
+
label="Custom Thread Count",
|
| 1210 |
+
info="Number of CPU threads for model inference (1-32)",
|
| 1211 |
+
visible=False
|
| 1212 |
+
)
|
| 1213 |
+
|
| 1214 |
temperature_slider = gr.Slider(
|
| 1215 |
minimum=0.0,
|
| 1216 |
maximum=2.0,
|
|
|
|
| 1302 |
# Event handlers
|
| 1303 |
submit_btn.click(
|
| 1304 |
fn=summarize_streaming,
|
| 1305 |
+
inputs=[file_input, model_dropdown, enable_reasoning, max_tokens, temperature_slider, top_p, top_k, language_selector, thread_config_dropdown, custom_threads_slider],
|
| 1306 |
outputs=[thinking_output, summary_output, info_output, metrics_state],
|
| 1307 |
show_progress="full"
|
| 1308 |
)
|
|
|
|
| 1313 |
inputs=[model_dropdown],
|
| 1314 |
outputs=[temperature_slider, top_p, top_k, info_output]
|
| 1315 |
)
|
| 1316 |
+
|
| 1317 |
+
# Show/hide custom thread slider based on selection
|
| 1318 |
+
def toggle_custom_threads(thread_config):
|
| 1319 |
+
return gr.update(visible=(thread_config == "custom"))
|
| 1320 |
+
|
| 1321 |
+
thread_config_dropdown.change(
|
| 1322 |
+
fn=toggle_custom_threads,
|
| 1323 |
+
inputs=[thread_config_dropdown],
|
| 1324 |
+
outputs=[custom_threads_slider]
|
| 1325 |
+
)
|
| 1326 |
+
|
| 1327 |
# Copy buttons
|
| 1328 |
copy_summary_btn.click(
|
| 1329 |
fn=lambda x: x,
|