- Updated default model to qwen3.0-14b.
Browse files- Added automatic model registration in language_checker.py
- Dockerfile +4 -2
- backend/language_checker.py +320 -89
- backend/runtime_config.py +13 -151
- download_models.py +0 -90
- model_paths.py +9 -0
- requirements.txt +4 -3
- server.py +66 -0
Dockerfile
CHANGED
|
@@ -18,9 +18,10 @@ RUN npm run build
|
|
| 18 |
# -----------------------------------------------------------------------------
|
| 19 |
FROM python:3.9-slim
|
| 20 |
|
| 21 |
-
# System deps (git for Hugging Face Hub downloads)
|
| 22 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 23 |
git \
|
|
|
|
| 24 |
&& rm -rf /var/lib/apt/lists/*
|
| 25 |
|
| 26 |
# Create a non-root user with UID 1000 (mandatory in Spaces)
|
|
@@ -53,4 +54,5 @@ COPY --chown=user:users --from=frontend /app/client/dist ./client/dist
|
|
| 53 |
ENV FORCE_INT8=1
|
| 54 |
|
| 55 |
EXPOSE 7860
|
| 56 |
-
|
|
|
|
|
|
| 18 |
# -----------------------------------------------------------------------------
|
| 19 |
FROM python:3.9-slim
|
| 20 |
|
| 21 |
+
# System deps (git for Hugging Face Hub downloads, build-essential for triton/AWQ CUDA kernel compilation)
|
| 22 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 23 |
git \
|
| 24 |
+
build-essential \
|
| 25 |
&& rm -rf /var/lib/apt/lists/*
|
| 26 |
|
| 27 |
# Create a non-root user with UID 1000 (mandatory in Spaces)
|
|
|
|
| 54 |
ENV FORCE_INT8=1
|
| 55 |
|
| 56 |
EXPOSE 7860
|
| 57 |
+
|
| 58 |
+
CMD ["python", "server.py", "--model", "qwen3.0-14b", "--address", "0.0.0.0", "--port", "7860"]
|
backend/language_checker.py
CHANGED
|
@@ -2,10 +2,11 @@ import os
|
|
| 2 |
import time
|
| 3 |
import torch
|
| 4 |
import gc
|
|
|
|
| 5 |
from typing import Dict, List, Optional, Tuple, Callable
|
| 6 |
|
| 7 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 8 |
-
from .class_register import register_model
|
| 9 |
from .runtime_config import load_runtime_config
|
| 10 |
from model_paths import MODEL_PATHS
|
| 11 |
|
|
@@ -61,7 +62,41 @@ class DeviceManager:
|
|
| 61 |
time_str = f"{int(load_time // 60)}m{load_time % 60:.1f}s"
|
| 62 |
|
| 63 |
print(f"✅ 模型加载完成 [大小: {size_str}, 耗时: {time_str}, 速度: {load_speed_mb_per_sec:.1f}MB/s]")
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
class AbstractLanguageChecker:
|
| 67 |
"""
|
|
@@ -168,6 +203,7 @@ class QwenLM(AbstractLanguageChecker):
|
|
| 168 |
# 判断加载策略
|
| 169 |
# ============================================================
|
| 170 |
use_int8 = False
|
|
|
|
| 171 |
device_map = None
|
| 172 |
dtype = None
|
| 173 |
use_low_cpu_mem = False
|
|
@@ -179,7 +215,24 @@ class QwenLM(AbstractLanguageChecker):
|
|
| 179 |
force_int8 = os.environ.get('FORCE_INT8')
|
| 180 |
force_bfloat16 = os.environ.get('CPU_FORCE_BFLOAT16')
|
| 181 |
|
| 182 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
print("🔧 CPU 模式:手动控制设备分配")
|
| 184 |
|
| 185 |
if force_int8:
|
|
@@ -227,52 +280,49 @@ class QwenLM(AbstractLanguageChecker):
|
|
| 227 |
print("🔧 device_map: auto")
|
| 228 |
|
| 229 |
# ============================================================
|
| 230 |
-
#
|
| 231 |
# ============================================================
|
| 232 |
model_load_start_time = time.perf_counter()
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
load_in_8bit=True,
|
| 239 |
-
device_map=device_map,
|
| 240 |
-
trust_remote_code=True,
|
| 241 |
-
low_cpu_mem_usage=True,
|
| 242 |
-
local_files_only=local_files_only
|
| 243 |
-
),
|
| 244 |
-
load_description
|
| 245 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
elif device_map:
|
| 247 |
-
# GPU/MPS
|
| 248 |
-
self.model =
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
device_map=device_map,
|
| 252 |
-
dtype=dtype,
|
| 253 |
-
trust_remote_code=True,
|
| 254 |
-
low_cpu_mem_usage=use_low_cpu_mem,
|
| 255 |
-
local_files_only=local_files_only
|
| 256 |
-
),
|
| 257 |
-
load_description
|
| 258 |
)
|
| 259 |
else:
|
| 260 |
-
# CPU
|
| 261 |
-
self.model =
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
dtype=dtype,
|
| 265 |
-
trust_remote_code=True,
|
| 266 |
-
low_cpu_mem_usage=use_low_cpu_mem,
|
| 267 |
-
local_files_only=local_files_only
|
| 268 |
-
).to(self.device),
|
| 269 |
-
load_description
|
| 270 |
)
|
| 271 |
|
| 272 |
DeviceManager.print_model_load_stats(self.model, time.perf_counter() - model_load_start_time)
|
| 273 |
|
| 274 |
self.model.eval()
|
| 275 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
# ============================================================
|
| 277 |
# 关于 torch.compile() 的性能优化讨论结论:
|
| 278 |
#
|
|
@@ -302,6 +352,207 @@ class QwenLM(AbstractLanguageChecker):
|
|
| 302 |
device_name = DeviceManager.get_device_name(self.device)
|
| 303 |
print(f"✓ {model_display_name} 模型已加载 ({device_name})")
|
| 304 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
def _load_runtime_config(self, model_name: Optional[str]):
|
| 306 |
"""
|
| 307 |
加载运行时配置:基于模型和平台的四层配置合并
|
|
@@ -536,6 +787,11 @@ class QwenLM(AbstractLanguageChecker):
|
|
| 536 |
DeviceManager.clear_cache(self.device)
|
| 537 |
gc.collect()
|
| 538 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 539 |
return {'bpe_strings': bpe_strings}
|
| 540 |
|
| 541 |
except Exception as e:
|
|
@@ -547,57 +803,32 @@ class QwenLM(AbstractLanguageChecker):
|
|
| 547 |
# _cleanup_tensors 方法已被移除,因为不再需要显式清理小张量
|
| 548 |
|
| 549 |
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
Qwen3-4B 模型支持
|
| 576 |
-
使用 Qwen3-4B Base 模型
|
| 577 |
-
"""
|
| 578 |
-
def __init__(self):
|
| 579 |
-
# model_name 和 model_path 会自动从配置获取
|
| 580 |
-
super().__init__()
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
@register_model(name='qwen3.0-8b')
|
| 584 |
-
class QwenLM_3_0_8B(QwenLM):
|
| 585 |
-
"""
|
| 586 |
-
Qwen3-8B 模型支持
|
| 587 |
-
使用 Qwen3-8B Base 模型
|
| 588 |
-
"""
|
| 589 |
-
def __init__(self):
|
| 590 |
-
# model_name 和 model_path 会自动从配置获取
|
| 591 |
-
super().__init__()
|
| 592 |
-
|
| 593 |
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
"""
|
| 597 |
-
Qwen3-14B 模型支持
|
| 598 |
-
使用 Qwen3-14B Base 模型
|
| 599 |
-
"""
|
| 600 |
-
def __init__(self):
|
| 601 |
-
# model_name 和 model_path 会自动从配置获取
|
| 602 |
-
super().__init__()
|
| 603 |
|
|
|
|
| 2 |
import time
|
| 3 |
import torch
|
| 4 |
import gc
|
| 5 |
+
import warnings
|
| 6 |
from typing import Dict, List, Optional, Tuple, Callable
|
| 7 |
|
| 8 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 9 |
+
from .class_register import register_model, REGISTERED_MODELS
|
| 10 |
from .runtime_config import load_runtime_config
|
| 11 |
from model_paths import MODEL_PATHS
|
| 12 |
|
|
|
|
| 62 |
time_str = f"{int(load_time // 60)}m{load_time % 60:.1f}s"
|
| 63 |
|
| 64 |
print(f"✅ 模型加载完成 [大小: {size_str}, 耗时: {time_str}, 速度: {load_speed_mb_per_sec:.1f}MB/s]")
|
| 65 |
+
|
| 66 |
+
@staticmethod
|
| 67 |
+
def print_cuda_memory_summary(title="GPU 内存统计", device=0):
|
| 68 |
+
"""打印详细的 CUDA 内存统计信息"""
|
| 69 |
+
if not torch.cuda.is_available():
|
| 70 |
+
return
|
| 71 |
+
|
| 72 |
+
print(f"\n{'='*60}")
|
| 73 |
+
print(f"🔍 {title}")
|
| 74 |
+
print(f"{'='*60}")
|
| 75 |
+
|
| 76 |
+
# 基本统计
|
| 77 |
+
allocated = torch.cuda.memory_allocated(device) / 1024**3
|
| 78 |
+
reserved = torch.cuda.memory_reserved(device) / 1024**3
|
| 79 |
+
max_allocated = torch.cuda.max_memory_allocated(device) / 1024**3
|
| 80 |
+
total = torch.cuda.get_device_properties(device).total_memory / 1024**3
|
| 81 |
+
|
| 82 |
+
print(f"📊 总显存: {total:.2f} GB")
|
| 83 |
+
print(f"✅ 已分配 (allocated): {allocated:.2f} GB ({allocated/total*100:.1f}%)")
|
| 84 |
+
print(f"📦 已预留 (reserved): {reserved:.2f} GB ({reserved/total*100:.1f}%)")
|
| 85 |
+
print(f"📈 峰值分配: {max_allocated:.2f} GB")
|
| 86 |
+
print(f"💚 可用空间: {total - reserved:.2f} GB ({(total-reserved)/total*100:.1f}%)")
|
| 87 |
+
print(f"🔸 碎片化: {reserved - allocated:.2f} GB")
|
| 88 |
+
|
| 89 |
+
# 详细统计(简化版)
|
| 90 |
+
try:
|
| 91 |
+
stats = torch.cuda.memory_stats(device)
|
| 92 |
+
num_allocs = stats.get("num_alloc_retries", 0)
|
| 93 |
+
num_ooms = stats.get("num_ooms", 0)
|
| 94 |
+
if num_allocs > 0 or num_ooms > 0:
|
| 95 |
+
print(f"⚠️ 分配重试: {num_allocs} 次, OOM: {num_ooms} 次")
|
| 96 |
+
except:
|
| 97 |
+
pass
|
| 98 |
+
|
| 99 |
+
print(f"{'='*60}\n")
|
| 100 |
|
| 101 |
class AbstractLanguageChecker:
|
| 102 |
"""
|
|
|
|
| 203 |
# 判断加载策略
|
| 204 |
# ============================================================
|
| 205 |
use_int8 = False
|
| 206 |
+
use_awq = False
|
| 207 |
device_map = None
|
| 208 |
dtype = None
|
| 209 |
use_low_cpu_mem = False
|
|
|
|
| 215 |
force_int8 = os.environ.get('FORCE_INT8')
|
| 216 |
force_bfloat16 = os.environ.get('CPU_FORCE_BFLOAT16')
|
| 217 |
|
| 218 |
+
# 检测是否为 AWQ 模型(自动检测)
|
| 219 |
+
is_awq_model = self._is_awq_model(model_path)
|
| 220 |
+
if is_awq_model:
|
| 221 |
+
# AWQ 仅支持 CUDA 环境(Docker 部署)
|
| 222 |
+
if self.device.type != "cuda":
|
| 223 |
+
raise RuntimeError(
|
| 224 |
+
f"❌ AWQ 量化模型仅支持 CUDA 环境\n"
|
| 225 |
+
f" 当前设备: {self.device.type.upper()}\n"
|
| 226 |
+
f" 请在 Docker + CUDA 环境中运行,或使用非量化模型"
|
| 227 |
+
)
|
| 228 |
+
|
| 229 |
+
use_awq = True
|
| 230 |
+
load_description = "模型(AWQ 量化,Transformers 内置支持)"
|
| 231 |
+
print("✓ 检测到 AWQ 量化模型,使用 Transformers 内置支持加载")
|
| 232 |
+
print("🔧 CUDA 模式:自动设备分配")
|
| 233 |
+
print("🔧 device_map: auto")
|
| 234 |
+
|
| 235 |
+
elif self.device.type == "cpu":
|
| 236 |
print("🔧 CPU 模式:手动控制设备分配")
|
| 237 |
|
| 238 |
if force_int8:
|
|
|
|
| 280 |
print("🔧 device_map: auto")
|
| 281 |
|
| 282 |
# ============================================================
|
| 283 |
+
# 执行模型加载:根据硬件环境选择最优加载策略
|
| 284 |
# ============================================================
|
| 285 |
model_load_start_time = time.perf_counter()
|
| 286 |
+
|
| 287 |
+
if use_awq:
|
| 288 |
+
# 场景0:AWQ 量化 - W4/W8 权重量化(优先级最高)
|
| 289 |
+
self.model = self._load_model_with_awq(
|
| 290 |
+
model_path, load_component, load_description
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 291 |
)
|
| 292 |
+
elif use_int8:
|
| 293 |
+
# 场景1:INT8 量化 - 内存受限环境
|
| 294 |
+
if self.device.type == "cuda":
|
| 295 |
+
# CUDA: INT8 量化加载
|
| 296 |
+
self.model = self._load_model_with_int8_cuda(
|
| 297 |
+
model_path, load_component, load_description
|
| 298 |
+
)
|
| 299 |
+
else:
|
| 300 |
+
# CPU: 纯 CPU 量化
|
| 301 |
+
self.model = self._load_model_with_int8_cpu(
|
| 302 |
+
model_path, load_component, load_description
|
| 303 |
+
)
|
| 304 |
elif device_map:
|
| 305 |
+
# 场景2:GPU/MPS - 自动设备分配(充分利用加速器)
|
| 306 |
+
self.model = self._load_model_with_device_map(
|
| 307 |
+
model_path, load_component, load_description,
|
| 308 |
+
device_map, dtype, use_low_cpu_mem
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 309 |
)
|
| 310 |
else:
|
| 311 |
+
# 场景3:CPU - 手动设备控制(无加速器可用)
|
| 312 |
+
self.model = self._load_model_on_cpu(
|
| 313 |
+
model_path, load_component, load_description,
|
| 314 |
+
dtype, use_low_cpu_mem
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 315 |
)
|
| 316 |
|
| 317 |
DeviceManager.print_model_load_stats(self.model, time.perf_counter() - model_load_start_time)
|
| 318 |
|
| 319 |
self.model.eval()
|
| 320 |
|
| 321 |
+
# 打印模型加载后的内存统计
|
| 322 |
+
if self.device.type == "cuda":
|
| 323 |
+
device_idx = self.device.index if self.device.index is not None else 0
|
| 324 |
+
DeviceManager.print_cuda_memory_summary(device=device_idx)
|
| 325 |
+
|
| 326 |
# ============================================================
|
| 327 |
# 关于 torch.compile() 的性能优化讨论结论:
|
| 328 |
#
|
|
|
|
| 352 |
device_name = DeviceManager.get_device_name(self.device)
|
| 353 |
print(f"✓ {model_display_name} 模型已加载 ({device_name})")
|
| 354 |
|
| 355 |
+
def _load_model_with_int8_cuda(
|
| 356 |
+
self,
|
| 357 |
+
model_path: str,
|
| 358 |
+
load_component: Callable,
|
| 359 |
+
load_description: str
|
| 360 |
+
):
|
| 361 |
+
"""
|
| 362 |
+
INT8 量化加载模式 - CUDA GPU 环境
|
| 363 |
+
|
| 364 |
+
技术细节:
|
| 365 |
+
- 使用 bitsandbytes 8bit 量化
|
| 366 |
+
- device_map="auto" 让 Transformers 自动分配层
|
| 367 |
+
|
| 368 |
+
性能特点:
|
| 369 |
+
- GPU 层推理速度快(INT8 Tensor Core 加速)
|
| 370 |
+
"""
|
| 371 |
+
return load_component(
|
| 372 |
+
lambda local_files_only: AutoModelForCausalLM.from_pretrained(
|
| 373 |
+
model_path,
|
| 374 |
+
load_in_8bit=True,
|
| 375 |
+
device_map="auto",
|
| 376 |
+
trust_remote_code=True,
|
| 377 |
+
low_cpu_mem_usage=True,
|
| 378 |
+
local_files_only=local_files_only
|
| 379 |
+
),
|
| 380 |
+
load_description
|
| 381 |
+
)
|
| 382 |
+
|
| 383 |
+
def _load_model_with_int8_cpu(
|
| 384 |
+
self,
|
| 385 |
+
model_path: str,
|
| 386 |
+
load_component: Callable,
|
| 387 |
+
load_description: str
|
| 388 |
+
):
|
| 389 |
+
"""
|
| 390 |
+
INT8 量化加载模式 - CPU 环境
|
| 391 |
+
|
| 392 |
+
适用场景:
|
| 393 |
+
- 无 GPU 可用,纯 CPU 推理
|
| 394 |
+
- 内存受限,无法加载 float32 模型
|
| 395 |
+
|
| 396 |
+
技术细节:
|
| 397 |
+
- 使用 bitsandbytes 8bit 量化
|
| 398 |
+
- device_map="cpu" 指定所有层在 CPU 上
|
| 399 |
+
- 量化可减少约 4 倍内存占用(相比 float32)
|
| 400 |
+
|
| 401 |
+
性能特点:
|
| 402 |
+
- 推理速度比 GPU 慢 10-100 倍
|
| 403 |
+
- 内存占用低,适合资源受限环境
|
| 404 |
+
- 实验性功能,可能在某些情况下降低性能
|
| 405 |
+
|
| 406 |
+
注意事项:
|
| 407 |
+
- 仅在 FORCE_INT8=1 时启用
|
| 408 |
+
- 优先考虑使用 GPU 或扩大内存
|
| 409 |
+
"""
|
| 410 |
+
print("⚙️ 纯 CPU INT8 量化")
|
| 411 |
+
|
| 412 |
+
return load_component(
|
| 413 |
+
lambda local_files_only: AutoModelForCausalLM.from_pretrained(
|
| 414 |
+
model_path,
|
| 415 |
+
load_in_8bit=True,
|
| 416 |
+
device_map="cpu",
|
| 417 |
+
trust_remote_code=True,
|
| 418 |
+
low_cpu_mem_usage=True,
|
| 419 |
+
local_files_only=local_files_only
|
| 420 |
+
),
|
| 421 |
+
load_description
|
| 422 |
+
)
|
| 423 |
+
|
| 424 |
+
def _load_model_with_device_map(
|
| 425 |
+
self,
|
| 426 |
+
model_path: str,
|
| 427 |
+
load_component: Callable,
|
| 428 |
+
load_description: str,
|
| 429 |
+
device_map: str,
|
| 430 |
+
dtype: torch.dtype,
|
| 431 |
+
use_low_cpu_mem: bool
|
| 432 |
+
):
|
| 433 |
+
"""
|
| 434 |
+
GPU/MPS 自动设备分配模式
|
| 435 |
+
|
| 436 |
+
适用场景:
|
| 437 |
+
- 有可用的硬件加速器(CUDA GPU 或 Apple Silicon MPS)
|
| 438 |
+
- 显存充足,不需要量化
|
| 439 |
+
|
| 440 |
+
技术细节:
|
| 441 |
+
- device_map="auto" 让 Transformers 自动分配设备
|
| 442 |
+
- 使用 float16/bfloat16 精度(充分利用 Tensor Core 加速)
|
| 443 |
+
- 支持多 GPU 自动分片(如果有多个 GPU)
|
| 444 |
+
|
| 445 |
+
性能优势:
|
| 446 |
+
- GPU float16 比 float32 快 2-3 倍
|
| 447 |
+
- 自动负载均衡(多 GPU 环境)
|
| 448 |
+
- 无需手动管理设备转移
|
| 449 |
+
"""
|
| 450 |
+
return load_component(
|
| 451 |
+
lambda local_files_only: AutoModelForCausalLM.from_pretrained(
|
| 452 |
+
model_path,
|
| 453 |
+
device_map=device_map,
|
| 454 |
+
dtype=dtype,
|
| 455 |
+
trust_remote_code=True,
|
| 456 |
+
low_cpu_mem_usage=use_low_cpu_mem,
|
| 457 |
+
local_files_only=local_files_only
|
| 458 |
+
),
|
| 459 |
+
load_description
|
| 460 |
+
)
|
| 461 |
+
|
| 462 |
+
def _load_model_on_cpu(
|
| 463 |
+
self,
|
| 464 |
+
model_path: str,
|
| 465 |
+
load_component: Callable,
|
| 466 |
+
load_description: str,
|
| 467 |
+
dtype: torch.dtype,
|
| 468 |
+
use_low_cpu_mem: bool
|
| 469 |
+
):
|
| 470 |
+
"""
|
| 471 |
+
CPU 手动设备控制模式
|
| 472 |
+
|
| 473 |
+
适用场景:
|
| 474 |
+
- 无硬件加速器可用
|
| 475 |
+
- 开发/测试环境
|
| 476 |
+
|
| 477 |
+
技术细节:
|
| 478 |
+
- 不使用 device_map(避免 CPU 环境的非预期行为)
|
| 479 |
+
- 手动 .to(device) 转移到 CPU
|
| 480 |
+
- 默认使用 float32(CPU 上 float16 没有加速优势)
|
| 481 |
+
- 可选 bfloat16(需要硬件支持,通过 CPU_FORCE_BFLOAT16 启用)
|
| 482 |
+
|
| 483 |
+
注意事项:
|
| 484 |
+
- CPU 推理速度较慢(比 GPU 慢 10-100 倍)
|
| 485 |
+
- float32 占用内存是 float16 的 2 倍
|
| 486 |
+
"""
|
| 487 |
+
return load_component(
|
| 488 |
+
lambda local_files_only: AutoModelForCausalLM.from_pretrained(
|
| 489 |
+
model_path,
|
| 490 |
+
dtype=dtype,
|
| 491 |
+
trust_remote_code=True,
|
| 492 |
+
low_cpu_mem_usage=use_low_cpu_mem,
|
| 493 |
+
local_files_only=local_files_only
|
| 494 |
+
).to(self.device),
|
| 495 |
+
load_description
|
| 496 |
+
)
|
| 497 |
+
|
| 498 |
+
def _load_model_with_awq(
|
| 499 |
+
self,
|
| 500 |
+
model_path: str,
|
| 501 |
+
load_component: Callable,
|
| 502 |
+
load_description: str
|
| 503 |
+
):
|
| 504 |
+
"""
|
| 505 |
+
AWQ 量化加载(使用 Transformers 内置支持,W4/W8,仅支持 CUDA)
|
| 506 |
+
|
| 507 |
+
- 显存占用:约为 FP16 的 1/4(W4)或 1/2(W8)
|
| 508 |
+
- 推理速度:优于 INT8
|
| 509 |
+
- 仅支持 Docker + CUDA 环境
|
| 510 |
+
- Transformers 会自动识别 quantization_config.json 并加载 AWQ 模型
|
| 511 |
+
"""
|
| 512 |
+
def awq_loader(local_files_only):
|
| 513 |
+
return AutoModelForCausalLM.from_pretrained(
|
| 514 |
+
model_path,
|
| 515 |
+
device_map="auto", # 自动分配到 GPU
|
| 516 |
+
trust_remote_code=True,
|
| 517 |
+
local_files_only=local_files_only
|
| 518 |
+
)
|
| 519 |
+
|
| 520 |
+
return load_component(awq_loader, load_description)
|
| 521 |
+
|
| 522 |
+
@staticmethod
|
| 523 |
+
def _is_awq_model(model_path: str) -> bool:
|
| 524 |
+
"""
|
| 525 |
+
检测模型是否为 AWQ 量化模型
|
| 526 |
+
|
| 527 |
+
检测策略:
|
| 528 |
+
1. 路径名包含 'awq' 关键字(如 'model-awq', 'Qwen-AWQ')
|
| 529 |
+
2. 检查模型目录下是否存在 quantization_config.json 且包含 'awq' 配置
|
| 530 |
+
|
| 531 |
+
Args:
|
| 532 |
+
model_path: 模型路径(本地路径或 HuggingFace 模型 ID)
|
| 533 |
+
|
| 534 |
+
Returns:
|
| 535 |
+
是否为 AWQ 模型
|
| 536 |
+
"""
|
| 537 |
+
# 策略1:路径名检测
|
| 538 |
+
if 'awq' in model_path.lower():
|
| 539 |
+
return True
|
| 540 |
+
|
| 541 |
+
# 策略2:配置文件检测(仅对本地路径)
|
| 542 |
+
if os.path.isdir(model_path):
|
| 543 |
+
import json
|
| 544 |
+
config_path = os.path.join(model_path, 'quantization_config.json')
|
| 545 |
+
if os.path.exists(config_path):
|
| 546 |
+
try:
|
| 547 |
+
with open(config_path, 'r') as f:
|
| 548 |
+
config = json.load(f)
|
| 549 |
+
if config.get('quant_method') == 'awq':
|
| 550 |
+
return True
|
| 551 |
+
except Exception:
|
| 552 |
+
pass
|
| 553 |
+
|
| 554 |
+
return False
|
| 555 |
+
|
| 556 |
def _load_runtime_config(self, model_name: Optional[str]):
|
| 557 |
"""
|
| 558 |
加载运行时配置:基于模型和平台的四层配置合并
|
|
|
|
| 787 |
DeviceManager.clear_cache(self.device)
|
| 788 |
gc.collect()
|
| 789 |
|
| 790 |
+
# 打印分析任务完成后的内存统计
|
| 791 |
+
if self.device.type == "cuda":
|
| 792 |
+
device_idx = self.device.index if self.device.index is not None else 0
|
| 793 |
+
DeviceManager.print_cuda_memory_summary(device=device_idx)
|
| 794 |
+
|
| 795 |
return {'bpe_strings': bpe_strings}
|
| 796 |
|
| 797 |
except Exception as e:
|
|
|
|
| 803 |
# _cleanup_tensors 方法已被移除,因为不再需要显式清理小张量
|
| 804 |
|
| 805 |
|
| 806 |
+
# ============================================================
|
| 807 |
+
# 自动注册:根据 MODEL_PATHS 自动注册所有模型
|
| 808 |
+
# ============================================================
|
| 809 |
+
# 只需要在 model_paths.py 中添加模型路径,即可自动注册
|
| 810 |
+
# 无需手动创建子类,实现 DRY 原则
|
| 811 |
+
def _auto_register_models():
|
| 812 |
+
"""自动注册 MODEL_PATHS 中的所有模型"""
|
| 813 |
+
for model_name in MODEL_PATHS.keys():
|
| 814 |
+
if model_name not in REGISTERED_MODELS:
|
| 815 |
+
# 动态创建模型类并注册
|
| 816 |
+
# 使用闭包捕获当前 model_name
|
| 817 |
+
def make_init():
|
| 818 |
+
def __init__(self):
|
| 819 |
+
QwenLM.__init__(self)
|
| 820 |
+
return __init__
|
| 821 |
+
|
| 822 |
+
model_class = type(
|
| 823 |
+
f'QwenLM_{model_name.replace(".", "_").replace("-", "_")}',
|
| 824 |
+
(QwenLM,),
|
| 825 |
+
{
|
| 826 |
+
'__init__': make_init(),
|
| 827 |
+
'__doc__': f'{model_name} 模型支持(自动注册)'
|
| 828 |
+
}
|
| 829 |
+
)
|
| 830 |
+
register_model(model_name)(model_class)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 831 |
|
| 832 |
+
# 执行自动注册
|
| 833 |
+
_auto_register_models()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 834 |
|
backend/runtime_config.py
CHANGED
|
@@ -16,7 +16,6 @@
|
|
| 16 |
import os
|
| 17 |
import torch
|
| 18 |
import sys
|
| 19 |
-
import multiprocessing
|
| 20 |
from typing import Dict, Optional
|
| 21 |
|
| 22 |
|
|
@@ -257,187 +256,51 @@ def validate_platform_config(platform: str, chunk_size: int, verbose: bool = Tru
|
|
| 257 |
print(f"✓ MPS 平台安全检查通过: chunk_size={chunk_size} (上限={MPS_TOPK_BUG_THRESHOLD})")
|
| 258 |
|
| 259 |
|
| 260 |
-
def
|
| 261 |
"""
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
容器环境中,multiprocessing.cpu_count() 会返回宿主机 CPU 数,
|
| 265 |
-
而非容器实际可用的 CPU 配额,导致线程过度订阅。
|
| 266 |
-
|
| 267 |
-
Returns:
|
| 268 |
-
实际可用的 CPU 核心数,如果读取失败或无限制返回 None
|
| 269 |
-
"""
|
| 270 |
-
try:
|
| 271 |
-
# CGroup v2 (较新的 Docker/K8s)
|
| 272 |
-
quota_file_v2 = "/sys/fs/cgroup/cpu.max"
|
| 273 |
-
|
| 274 |
-
if os.path.exists(quota_file_v2):
|
| 275 |
-
# CGroup v2 格式: "quota period"
|
| 276 |
-
with open(quota_file_v2) as f:
|
| 277 |
-
parts = f.read().strip().split()
|
| 278 |
-
if parts[0] == "max":
|
| 279 |
-
return None # 无限制
|
| 280 |
-
quota = int(parts[0])
|
| 281 |
-
period = int(parts[1])
|
| 282 |
-
return max(1, int(quota / period))
|
| 283 |
-
|
| 284 |
-
# CGroup v1
|
| 285 |
-
quota_file = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us"
|
| 286 |
-
period_file = "/sys/fs/cgroup/cpu/cpu.cfs_period_us"
|
| 287 |
-
|
| 288 |
-
if os.path.exists(quota_file) and os.path.exists(period_file):
|
| 289 |
-
with open(quota_file) as f:
|
| 290 |
-
quota = int(f.read().strip())
|
| 291 |
-
with open(period_file) as f:
|
| 292 |
-
period = int(f.read().strip())
|
| 293 |
-
|
| 294 |
-
if quota == -1:
|
| 295 |
-
return None # 无限制
|
| 296 |
-
|
| 297 |
-
return max(1, int(quota / period))
|
| 298 |
-
|
| 299 |
-
except Exception:
|
| 300 |
-
pass
|
| 301 |
-
|
| 302 |
-
return None
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
def _get_cpu_info() -> tuple[Optional[str], Optional[str]]:
|
| 306 |
-
"""
|
| 307 |
-
读取 CPU 供应商和型号信息(仅用于显示)
|
| 308 |
|
| 309 |
Returns:
|
| 310 |
-
|
| 311 |
"""
|
| 312 |
-
vendor = None
|
| 313 |
model_name = None
|
| 314 |
|
| 315 |
try:
|
| 316 |
if sys.platform == 'linux':
|
| 317 |
with open('/proc/cpuinfo', 'r') as f:
|
| 318 |
for line in f:
|
| 319 |
-
# 读取 vendor_id
|
| 320 |
-
if vendor is None and 'vendor_id' in line.lower():
|
| 321 |
-
vendor_str = line.split(':', 1)[1].strip()
|
| 322 |
-
if 'AUTHENTICAMD' in vendor_str.upper() or 'AMD' in vendor_str.upper():
|
| 323 |
-
vendor = 'AMD'
|
| 324 |
-
elif 'GENUINEINTEL' in vendor_str.upper() or 'INTEL' in vendor_str.upper():
|
| 325 |
-
vendor = 'Intel'
|
| 326 |
-
else:
|
| 327 |
-
vendor = vendor_str
|
| 328 |
-
|
| 329 |
# 读取 model name
|
| 330 |
if model_name is None and 'model name' in line.lower():
|
| 331 |
model_name = line.split(':', 1)[1].strip()
|
| 332 |
|
| 333 |
# 如果已经读取到所需信息,可以提前退出
|
| 334 |
-
if
|
| 335 |
break
|
| 336 |
except Exception:
|
| 337 |
pass
|
| 338 |
|
| 339 |
-
return
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
def _configure_cpu_threads() -> None:
|
| 343 |
-
"""
|
| 344 |
-
在 CPU 平台智能配置线程数,避免容器环境中的线程过度订阅
|
| 345 |
-
|
| 346 |
-
关键问题:
|
| 347 |
-
- multiprocessing.cpu_count() 在容器中返回宿主机 CPU 数(如 64)
|
| 348 |
-
- PyTorch 据此设置过多线程(如 inter-op=32),导致严重竞争
|
| 349 |
-
- 实际容器配额可能只有 8 vCPU,线程竞争导致性能暴跌
|
| 350 |
-
|
| 351 |
-
解决方案:
|
| 352 |
-
- 优先级 1: 读取环境变量 MAX_CPU_COUNT(显式指定,优先级最高)
|
| 353 |
-
- 优先级 2: 从 cgroup 读取真实 CPU 配额(容器环境自动检测)
|
| 354 |
-
- 优先级 3: 回退到 multiprocessing.cpu_count()
|
| 355 |
-
- intra-op: 单操作内并行(矩阵运算),设为实际 CPU 数
|
| 356 |
-
- inter-op: 操作间并行,对 LLM 推理设为 1 最优(避免竞争)
|
| 357 |
-
"""
|
| 358 |
-
# 优先级 1: 从环境变量读取(用户显式指定)
|
| 359 |
-
max_cpu_env = os.getenv('MAX_CPU_COUNT')
|
| 360 |
-
if max_cpu_env is not None:
|
| 361 |
-
try:
|
| 362 |
-
actual_cpus = int(max_cpu_env)
|
| 363 |
-
print(f"📌 使用环境变量 MAX_CPU_COUNT={actual_cpus}")
|
| 364 |
-
except ValueError:
|
| 365 |
-
print(f"⚠️ 环境变量 MAX_CPU_COUNT={max_cpu_env} 无效,忽略")
|
| 366 |
-
actual_cpus = None
|
| 367 |
-
else:
|
| 368 |
-
actual_cpus = None
|
| 369 |
-
|
| 370 |
-
# 优先级 2: 从 cgroup 读取实际 CPU 配额(容器环境)
|
| 371 |
-
if actual_cpus is None:
|
| 372 |
-
actual_cpus = _get_container_cpu_quota()
|
| 373 |
-
|
| 374 |
-
# 优先级 3: 回退到系统报告的 CPU 数
|
| 375 |
-
if actual_cpus is None:
|
| 376 |
-
actual_cpus = multiprocessing.cpu_count()
|
| 377 |
-
|
| 378 |
-
# 🎯 关键配��
|
| 379 |
-
# intra-op: 单个操作内的并行(矩阵运算等)
|
| 380 |
-
intra_threads = actual_cpus
|
| 381 |
-
|
| 382 |
-
# inter-op: 不同操作间的并行
|
| 383 |
-
# 对于序列化的 LLM 推理,设为 1 通常最优(避免操作间竞争)
|
| 384 |
-
inter_threads = 1
|
| 385 |
-
|
| 386 |
-
torch.set_num_threads(intra_threads)
|
| 387 |
-
torch.set_num_interop_threads(inter_threads)
|
| 388 |
-
|
| 389 |
-
print(f"🔧 已配置 CPU 线程: intra={intra_threads}, inter={inter_threads} (实际使用 {actual_cpus} vCPU)")
|
| 390 |
|
| 391 |
|
| 392 |
def _print_cpu_info() -> None:
|
| 393 |
"""
|
| 394 |
-
打印 CPU
|
| 395 |
"""
|
| 396 |
try:
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
else:
|
| 402 |
-
vendor_display = cpu_vendor if cpu_vendor else "未知"
|
| 403 |
-
model_display = cpu_model if cpu_model else "未知"
|
| 404 |
-
print(f"💻 CPU 供应商: {vendor_display}")
|
| 405 |
-
print(f"💻 CPU 型号: {model_display}")
|
| 406 |
except Exception as e:
|
| 407 |
print(f"⚠️ CPU 信息获取失败: {e}")
|
| 408 |
|
| 409 |
|
| 410 |
def _print_cpu_thread_info() -> None:
|
| 411 |
-
"""
|
| 412 |
-
打印 CPU 线程配置信息(仅用于调试,不修改配置)
|
| 413 |
-
|
| 414 |
-
帮助判断 PyTorch 是否合理利用了 CPU 资源
|
| 415 |
-
"""
|
| 416 |
try:
|
| 417 |
-
# 获取系统信息
|
| 418 |
-
num_cores = multiprocessing.cpu_count()
|
| 419 |
-
|
| 420 |
-
# 获取 PyTorch 当前线程配置
|
| 421 |
intra_threads = torch.get_num_threads()
|
| 422 |
inter_threads = torch.get_num_interop_threads()
|
| 423 |
-
|
| 424 |
-
# 环境变量信息
|
| 425 |
-
omp_threads = os.getenv('OMP_NUM_THREADS', '未设置')
|
| 426 |
-
mkl_threads = os.getenv('MKL_NUM_THREADS', '未设置')
|
| 427 |
-
|
| 428 |
-
# 打印调试信息
|
| 429 |
-
print(f"🧵 CPU 线程配置信息:")
|
| 430 |
-
print(f" - cpu_count(): {num_cores}")
|
| 431 |
-
print(f" - PyTorch intra-op 线程: {intra_threads}")
|
| 432 |
-
print(f" - PyTorch inter-op 线程: {inter_threads}")
|
| 433 |
-
print(f" - 环境变量 OMP_NUM_THREADS: {omp_threads}")
|
| 434 |
-
print(f" - 环境变量 MKL_NUM_THREADS: {mkl_threads}")
|
| 435 |
-
|
| 436 |
-
# 分析建议
|
| 437 |
-
if intra_threads < num_cores and omp_threads == '未设置':
|
| 438 |
-
print(f" 💡 提示: PyTorch 使用 {intra_threads} 线程,但系统有 {num_cores} 个核心")
|
| 439 |
-
print(f" 可考虑设置 OMP_NUM_THREADS={num_cores} 来充分利用 CPU")
|
| 440 |
-
|
| 441 |
except Exception as e:
|
| 442 |
print(f"⚠️ CPU 线程信息获取失败: {e}")
|
| 443 |
|
|
@@ -478,9 +341,8 @@ def load_runtime_config(model_name: str, verbose: bool = False) -> tuple[str, in
|
|
| 478 |
# 5. 打印 CPU 信息(所有平台都打印)
|
| 479 |
_print_cpu_info()
|
| 480 |
|
| 481 |
-
# 6. CPU
|
| 482 |
if "cpu" in platform.lower():
|
| 483 |
-
_configure_cpu_threads() # 配置线程数,避免容器环境过度订阅
|
| 484 |
_print_cpu_thread_info() # 打印调试信息
|
| 485 |
|
| 486 |
# 7. 打印配置摘要
|
|
|
|
| 16 |
import os
|
| 17 |
import torch
|
| 18 |
import sys
|
|
|
|
| 19 |
from typing import Dict, Optional
|
| 20 |
|
| 21 |
|
|
|
|
| 256 |
print(f"✓ MPS 平台安全检查通过: chunk_size={chunk_size} (上限={MPS_TOPK_BUG_THRESHOLD})")
|
| 257 |
|
| 258 |
|
| 259 |
+
def _get_cpu_info() -> Optional[str]:
|
| 260 |
"""
|
| 261 |
+
读取 CPU 型号信息(仅用于显示)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
|
| 263 |
Returns:
|
| 264 |
+
model_name, if None, return "未知"
|
| 265 |
"""
|
|
|
|
| 266 |
model_name = None
|
| 267 |
|
| 268 |
try:
|
| 269 |
if sys.platform == 'linux':
|
| 270 |
with open('/proc/cpuinfo', 'r') as f:
|
| 271 |
for line in f:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
# 读取 model name
|
| 273 |
if model_name is None and 'model name' in line.lower():
|
| 274 |
model_name = line.split(':', 1)[1].strip()
|
| 275 |
|
| 276 |
# 如果已经读取到所需信息,可以提前退出
|
| 277 |
+
if model_name:
|
| 278 |
break
|
| 279 |
except Exception:
|
| 280 |
pass
|
| 281 |
|
| 282 |
+
return model_name
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 283 |
|
| 284 |
|
| 285 |
def _print_cpu_info() -> None:
|
| 286 |
"""
|
| 287 |
+
打印 CPU 型号信息(所有平台都打印)
|
| 288 |
"""
|
| 289 |
try:
|
| 290 |
+
cpu_model = _get_cpu_info()
|
| 291 |
+
model = cpu_model or "未知"
|
| 292 |
+
|
| 293 |
+
print(f"💻 CPU 型号: {model}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
except Exception as e:
|
| 295 |
print(f"⚠️ CPU 信息获取失败: {e}")
|
| 296 |
|
| 297 |
|
| 298 |
def _print_cpu_thread_info() -> None:
|
| 299 |
+
"""打印 CPU 线程配置信息(PyTorch 默认配置)"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 300 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
intra_threads = torch.get_num_threads()
|
| 302 |
inter_threads = torch.get_num_interop_threads()
|
| 303 |
+
print(f"🧵 PyTorch 线程配置: intra-op={intra_threads}, inter-op={inter_threads}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 304 |
except Exception as e:
|
| 305 |
print(f"⚠️ CPU 线程信息获取失败: {e}")
|
| 306 |
|
|
|
|
| 341 |
# 5. 打印 CPU 信息(所有平台都打印)
|
| 342 |
_print_cpu_info()
|
| 343 |
|
| 344 |
+
# 6. CPU 线程配置信息打印(仅针对 CPU 平台)
|
| 345 |
if "cpu" in platform.lower():
|
|
|
|
| 346 |
_print_cpu_thread_info() # 打印调试信息
|
| 347 |
|
| 348 |
# 7. 打印配置摘要
|
download_models.py
DELETED
|
@@ -1,90 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
预下载模型脚本
|
| 4 |
-
用于在Docker构建阶段预先下载模型到容器镜像中
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
import os
|
| 8 |
-
import sys
|
| 9 |
-
import gc
|
| 10 |
-
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 11 |
-
from model_paths import MODEL_PATHS
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
def download_model(model_name: str):
|
| 15 |
-
"""下载指定的模型"""
|
| 16 |
-
if model_name not in MODEL_PATHS:
|
| 17 |
-
print(f"❌ 未知的模型名称: {model_name}")
|
| 18 |
-
print(f"可用模型: {', '.join(MODEL_PATHS.keys())}")
|
| 19 |
-
return False
|
| 20 |
-
|
| 21 |
-
model_path = MODEL_PATHS[model_name]
|
| 22 |
-
print(f"\n{'='*60}")
|
| 23 |
-
print(f"📥 开始下载模型: {model_name}")
|
| 24 |
-
print(f"🔗 HuggingFace 路径: {model_path}")
|
| 25 |
-
print(f"{'='*60}\n")
|
| 26 |
-
|
| 27 |
-
try:
|
| 28 |
-
# 下载 tokenizer
|
| 29 |
-
print(f"📦 下载 tokenizer...")
|
| 30 |
-
tokenizer = AutoTokenizer.from_pretrained(
|
| 31 |
-
model_path,
|
| 32 |
-
trust_remote_code=True
|
| 33 |
-
)
|
| 34 |
-
print(f"✅ Tokenizer 下载完成")
|
| 35 |
-
|
| 36 |
-
# 下载模型
|
| 37 |
-
print(f"📦 下载模型权重(这可能需要几分钟)...")
|
| 38 |
-
model = AutoModelForCausalLM.from_pretrained(
|
| 39 |
-
model_path,
|
| 40 |
-
trust_remote_code=True,
|
| 41 |
-
dtype='auto'
|
| 42 |
-
)
|
| 43 |
-
print(f"✅ 模型下载完成")
|
| 44 |
-
|
| 45 |
-
# 清理内存
|
| 46 |
-
del model
|
| 47 |
-
del tokenizer
|
| 48 |
-
gc.collect()
|
| 49 |
-
|
| 50 |
-
print(f"\n✅ 模型 {model_name} 预下载成功!\n")
|
| 51 |
-
return True
|
| 52 |
-
|
| 53 |
-
except Exception as e:
|
| 54 |
-
print(f"\n❌ 模型 {model_name} 下载失败: {e}\n")
|
| 55 |
-
return False
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
def main():
    """CLI entry point: download every model named on the command line.

    Exits with a non-zero status when no model is given or when at least
    one download fails (so a Docker build step can abort on failure).
    """
    if len(sys.argv) < 2:
        print("用法: python download_models.py <model1> [model2] ...")
        print(f"可用模型: {', '.join(MODEL_PATHS.keys())}")
        sys.exit(1)

    models_to_download = sys.argv[1:]

    # Show where HuggingFace will place the downloaded files.
    cache_dir = os.environ.get('HF_HOME', os.path.expanduser('~/.cache/huggingface'))
    print(f"\n🗂️ HuggingFace 缓存目录: {cache_dir}\n")

    # Download sequentially, counting successes as we go.
    success_count = sum(1 for name in models_to_download if download_model(name))

    separator = '=' * 60
    print(f"\n{separator}")
    print(f"📊 下载完成: {success_count}/{len(models_to_download)} 个模型成功")
    print(f"{separator}\n")

    # Propagate failure to the caller via the exit code.
    if success_count < len(models_to_download):
        sys.exit(1)
|
| 87 |
-
|
| 88 |
-
if __name__ == '__main__':
|
| 89 |
-
main()
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
model_paths.py
CHANGED
|
@@ -5,11 +5,20 @@
|
|
| 5 |
|
| 6 |
# 所有可用模型的 HuggingFace 路径映射
|
| 7 |
MODEL_PATHS = {
|
|
|
|
| 8 |
'qwen2.5-0.5b': 'Qwen/Qwen2.5-0.5B',
|
| 9 |
'qwen3.0-0.6b': 'Qwen/Qwen3-0.6B-Base',
|
| 10 |
'qwen3.0-1.7b': 'Qwen/Qwen3-1.7B-Base',
|
| 11 |
'qwen3.0-4b': 'Qwen/Qwen3-4B-Base',
|
| 12 |
'qwen3.0-8b': 'Qwen/Qwen3-8B-Base',
|
| 13 |
'qwen3.0-14b': 'Qwen/Qwen3-14B-Base',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
}
|
| 15 |
|
|
|
|
| 5 |
|
| 6 |
# Maps every supported model alias to its HuggingFace Hub repository path.
MODEL_PATHS = {
    # Standard models (FP16/BF16).
    'qwen2.5-0.5b': 'Qwen/Qwen2.5-0.5B',
    'qwen3.0-0.6b': 'Qwen/Qwen3-0.6B-Base',
    'qwen3.0-1.7b': 'Qwen/Qwen3-1.7B-Base',
    'qwen3.0-4b': 'Qwen/Qwen3-4B-Base',
    'qwen3.0-8b': 'Qwen/Qwen3-8B-Base',
    'qwen3.0-14b': 'Qwen/Qwen3-14B-Base',
    'qwen3.0-30b-a3b': 'Qwen/Qwen3-30B-A3B-Base',
    'qwen2.5-32b': 'Qwen/Qwen2.5-32B',
    'qwen2.5-72b': 'Qwen/Qwen2.5-72B',

    # AWQ-quantized models (W4A16, roughly 1/4 the VRAM of the standard weights).
    # Detected automatically; only supported in a Docker + CUDA environment.
    # NOTE: Qwen3-14B-AWQ scores poorly in evaluation because it is built on
    # the instruct variant rather than the base model.
    'qwen3.0-14b-awq': 'Qwen/Qwen3-14B-AWQ',
}
| 24 |
|
requirements.txt
CHANGED
|
@@ -1,10 +1,11 @@
|
|
| 1 |
-
transformers>=4.
|
| 2 |
-
torch>=2.0.0
|
| 3 |
numpy>=1.24.0
|
| 4 |
connexion[flask,swagger-ui,uvicorn]>=3.0.0
|
| 5 |
flask>=3.0.0
|
| 6 |
PyYAML>=6.0
|
| 7 |
flask-cors>=4.0.0
|
| 8 |
-
accelerate>=0.
|
| 9 |
bitsandbytes>=0.41.0
|
| 10 |
hf-transfer>=0.1.0
|
|
|
|
|
|
| 1 |
+
transformers>=4.51.0
|
| 2 |
+
torch>=2.1.0,<2.5.0
|
| 3 |
numpy>=1.24.0
|
| 4 |
connexion[flask,swagger-ui,uvicorn]>=3.0.0
|
| 5 |
flask>=3.0.0
|
| 6 |
PyYAML>=6.0
|
| 7 |
flask-cors>=4.0.0
|
| 8 |
+
accelerate>=0.30.0,<1.0.0
|
| 9 |
bitsandbytes>=0.41.0
|
| 10 |
hf-transfer>=0.1.0
|
| 11 |
+
autoawq>=0.2.0 # Transformers AWQ 支持所需的后端库(已弃用但必需)
|
server.py
CHANGED
|
@@ -3,6 +3,72 @@
|
|
| 3 |
import argparse
|
| 4 |
import os
|
| 5 |
import logging
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
# Allow an early --force-cpu flag to be passed so we can set the device
|
| 8 |
# before importing backend modules that may instantiate models.
|
|
|
|
| 3 |
import argparse
|
| 4 |
import os
|
| 5 |
import logging
|
| 6 |
+
import multiprocessing
|
| 7 |
+
|
| 8 |
+
# ---------------------------------------------------------------------------
# Diagnose and repair OMP_NUM_THREADS / MKL_NUM_THREADS. Must run before
# importing any library that may initialise OpenMP (e.g. bitsandbytes).
# ---------------------------------------------------------------------------
def _diagnose_and_fix_thread_env_vars() -> None:
    """
    Diagnose and repair the OMP_NUM_THREADS / MKL_NUM_THREADS environment
    variables to pinpoint the libgomp failure.

    HF Space CUDA containers may preset these variables to invalid values
    (an empty string, or something like '3500m'), which makes libgomp
    error out while bitsandbytes initialises.

    Repair strategy: every invalid value is replaced with the actual
    CPU core count.
    """
    actual_cores = multiprocessing.cpu_count()

    # Variables to inspect (and repair when broken).
    checked_vars = ['OMP_NUM_THREADS', 'MKL_NUM_THREADS']

    # The banner line is printed once, just before the first repair.
    header_printed = False

    for var_name in checked_vars:
        raw = os.environ.get(var_name)

        # Unset variables are fine — nothing to report.
        if raw is None:
            continue

        # Validate: non-empty, purely numeric, strictly positive.
        cleaned = raw.strip()
        valid = False
        problem = ""

        if not cleaned:
            problem = "值为空字符串"
        elif not cleaned.isdigit():
            problem = f"包含非数字字符: {repr(cleaned)}"
        else:
            try:
                parsed = int(cleaned)
                if parsed <= 0:
                    problem = f"值 <= 0: {parsed}"
                else:
                    valid = True
            except ValueError:
                # isdigit() can accept characters (e.g. superscripts) that
                # int() still rejects, so keep this guard.
                problem = f"无法转换为整数: {repr(cleaned)}"

        if valid:
            continue

        if not header_printed:
            print(f"🔍 检测到无效的线程环境变量(实际 CPU 核数: {actual_cores}):")
            header_printed = True

        # Repair: force the variable to the real core count.
        os.environ[var_name] = str(actual_cores)

        # Report what was wrong and what we did about it.
        print(f"  {var_name}:")
        print(f"    - 原始值: {repr(raw)}")
        print(f"    - 问题: {problem}")
        print(f"    - 🔧 已自动修复: {var_name}={actual_cores} (设置为实际 CPU 核数)")
        if var_name == 'OMP_NUM_THREADS':
            print(f"    - ⚠️ 无效值可能导致 libgomp 报错: 'Invalid value for environment variable OMP_NUM_THREADS'")


_diagnose_and_fix_thread_env_vars()
# --------------------------- end of diagnostics ----------------------------
|
| 72 |
|
| 73 |
# Allow an early --force-cpu flag to be passed so we can set the device
|
| 74 |
# before importing backend modules that may instantiate models.
|