Spaces:

dqy08
/

InfoRadar

Running

App Files Files Community

dqy08 committed on Jan 6

Commit

870433d

1 Parent(s): cad45cf

- Updated default model to qwen3.0-14b.

Browse files

- Added automatic model registration in language_checker.py

Files changed (7) hide show

Dockerfile +4 -2
backend/language_checker.py +320 -89
backend/runtime_config.py +13 -151
download_models.py +0 -90
model_paths.py +9 -0
requirements.txt +4 -3
server.py +66 -0

Dockerfile CHANGED Viewed

@@ -18,9 +18,10 @@ RUN npm run build
 # -----------------------------------------------------------------------------
 FROM python:3.9-slim
-# System deps (git for Hugging Face Hub downloads)
 RUN apt-get update && apt-get install -y --no-install-recommends \
     git \
   && rm -rf /var/lib/apt/lists/*
 # Create a non-root user with UID 1000 (mandatory in Spaces)
@@ -53,4 +54,5 @@ COPY --chown=user:users --from=frontend /app/client/dist ./client/dist
 ENV FORCE_INT8=1
 EXPOSE 7860
-CMD ["python", "server.py", "--model", "qwen3.0-8b", "--address", "0.0.0.0", "--port", "7860"]

 # -----------------------------------------------------------------------------
 FROM python:3.9-slim
+# System deps (git for Hugging Face Hub downloads, build-essential for triton/AWQ CUDA kernel compilation)
 RUN apt-get update && apt-get install -y --no-install-recommends \
     git \
+    build-essential \
   && rm -rf /var/lib/apt/lists/*
 # Create a non-root user with UID 1000 (mandatory in Spaces)
 ENV FORCE_INT8=1
 EXPOSE 7860
+CMD ["python", "server.py", "--model", "qwen3.0-14b", "--address", "0.0.0.0", "--port", "7860"]

backend/language_checker.py CHANGED Viewed

@@ -2,10 +2,11 @@ import os
 import time
 import torch
 import gc
 from typing import Dict, List, Optional, Tuple, Callable
 from transformers import AutoTokenizer, AutoModelForCausalLM
-from .class_register import register_model
 from .runtime_config import load_runtime_config
 from model_paths import MODEL_PATHS
@@ -61,7 +62,41 @@ class DeviceManager:
             time_str = f"{int(load_time // 60)}m{load_time % 60:.1f}s"
         print(f"✅ 模型加载完成 [大小: {size_str}, 耗时: {time_str}, 速度: {load_speed_mb_per_sec:.1f}MB/s]")
 class AbstractLanguageChecker:
     """
@@ -168,6 +203,7 @@ class QwenLM(AbstractLanguageChecker):
         # 判断加载策略
         # ============================================================
         use_int8 = False
         device_map = None
         dtype = None
         use_low_cpu_mem = False
@@ -179,7 +215,24 @@ class QwenLM(AbstractLanguageChecker):
         force_int8 = os.environ.get('FORCE_INT8')
         force_bfloat16 = os.environ.get('CPU_FORCE_BFLOAT16')
-        if self.device.type == "cpu":
             print("🔧 CPU 模式：手动控制设备分配")
             if force_int8:
@@ -227,52 +280,49 @@ class QwenLM(AbstractLanguageChecker):
             print("🔧 device_map: auto")
         # ============================================================
-        # 执行模型加载
         # ============================================================
         model_load_start_time = time.perf_counter()
-        if use_int8:
-            # INT8 量化加载
-            self.model = load_component(
-                lambda local_files_only: AutoModelForCausalLM.from_pretrained(
-                    model_path,
-                    load_in_8bit=True,
-                    device_map=device_map,
-                    trust_remote_code=True,
-                    low_cpu_mem_usage=True,
-                    local_files_only=local_files_only
-                ),
-                load_description
             )
         elif device_map:
-            # GPU/MPS: 使用 device_map="auto"
-            self.model = load_component(
-                lambda local_files_only: AutoModelForCausalLM.from_pretrained(
-                    model_path,
-                    device_map=device_map,
-                    dtype=dtype,
-                    trust_remote_code=True,
-                    low_cpu_mem_usage=use_low_cpu_mem,
-                    local_files_only=local_files_only
-                ),
-                load_description
             )
         else:
-            # CPU: 手动控制设备
-            self.model = load_component(
-                lambda local_files_only: AutoModelForCausalLM.from_pretrained(
-                    model_path,
-                    dtype=dtype,
-                    trust_remote_code=True,
-                    low_cpu_mem_usage=use_low_cpu_mem,
-                    local_files_only=local_files_only
-                ).to(self.device),
-                load_description
             )
         DeviceManager.print_model_load_stats(self.model, time.perf_counter() - model_load_start_time)
         self.model.eval()
         # ============================================================
         # 关于 torch.compile() 的性能优化讨论结论：
         #
@@ -302,6 +352,207 @@ class QwenLM(AbstractLanguageChecker):
         device_name = DeviceManager.get_device_name(self.device)
         print(f"✓ {model_display_name} 模型已加载 ({device_name})")
     def _load_runtime_config(self, model_name: Optional[str]):
         """
         加载运行时配置：基于模型和平台的四层配置合并
@@ -536,6 +787,11 @@ class QwenLM(AbstractLanguageChecker):
             DeviceManager.clear_cache(self.device)
             gc.collect()
             return {'bpe_strings': bpe_strings}
         except Exception as e:
@@ -547,57 +803,32 @@ class QwenLM(AbstractLanguageChecker):
     # _cleanup_tensors 方法已被移除，因为不再需要显式清理小张量
-@register_model(name='qwen3.0-0.6b')
-class QwenLM_3_0_6B(QwenLM):
-    """
-    Qwen3-0.6B 模型支持
-    使用 Qwen3-0.6B Base 模型
-    """
-    def __init__(self):
-        # model_name 和 model_path 会自动从配置获取
-        super().__init__()
-@register_model(name='qwen3.0-1.7b')
-class QwenLM_3_0_1_7B(QwenLM):
-    """
-    Qwen3-1.7B 模型支持
-    使用 Qwen3-1.7B Base 模型
-    """
-    def __init__(self):
-        # model_name 和 model_path 会自动从配置获取
-        super().__init__()
-@register_model(name='qwen3.0-4b')
-class QwenLM_3_0_4B(QwenLM):
-    """
-    Qwen3-4B 模型支持
-    使用 Qwen3-4B Base 模型
-    """
-    def __init__(self):
-        # model_name 和 model_path 会自动从配置获取
-        super().__init__()
-@register_model(name='qwen3.0-8b')
-class QwenLM_3_0_8B(QwenLM):
-    """
-    Qwen3-8B 模型支持
-    使用 Qwen3-8B Base 模型
-    """
-    def __init__(self):
-        # model_name 和 model_path 会自动从配置获取
-        super().__init__()
-@register_model(name='qwen3.0-14b')
-class QwenLM_3_0_14B(QwenLM):
-    """
-    Qwen3-14B 模型支持
-    使用 Qwen3-14B Base 模型
-    """
-    def __init__(self):
-        # model_name 和 model_path 会自动从配置获取
-        super().__init__()

 import time
 import torch
 import gc
+import warnings
 from typing import Dict, List, Optional, Tuple, Callable
 from transformers import AutoTokenizer, AutoModelForCausalLM
+from .class_register import register_model, REGISTERED_MODELS
 from .runtime_config import load_runtime_config
 from model_paths import MODEL_PATHS
             time_str = f"{int(load_time // 60)}m{load_time % 60:.1f}s"
         print(f"✅ 模型加载完成 [大小: {size_str}, 耗时: {time_str}, 速度: {load_speed_mb_per_sec:.1f}MB/s]")
+    @staticmethod
+    def print_cuda_memory_summary(title="GPU 内存统计", device=0):
+        """打印详细的 CUDA 内存统计信息"""
+        if not torch.cuda.is_available():
+            return
+        print(f"\n{'='*60}")
+        print(f"🔍 {title}")
+        print(f"{'='*60}")
+        # 基本统计
+        allocated = torch.cuda.memory_allocated(device) / 1024**3
+        reserved = torch.cuda.memory_reserved(device) / 1024**3
+        max_allocated = torch.cuda.max_memory_allocated(device) / 1024**3
+        total = torch.cuda.get_device_properties(device).total_memory / 1024**3
+        print(f"📊 总显存: {total:.2f} GB")
+        print(f"✅ 已分配 (allocated): {allocated:.2f} GB  ({allocated/total*100:.1f}%)")
+        print(f"📦 已预留 (reserved): {reserved:.2f} GB  ({reserved/total*100:.1f}%)")
+        print(f"📈 峰值分配: {max_allocated:.2f} GB")
+        print(f"💚 可用空间: {total - reserved:.2f} GB  ({(total-reserved)/total*100:.1f}%)")
+        print(f"🔸 碎片化: {reserved - allocated:.2f} GB")
+        # 详细统计（简化版）
+        try:
+            stats = torch.cuda.memory_stats(device)
+            num_allocs = stats.get("num_alloc_retries", 0)
+            num_ooms = stats.get("num_ooms", 0)
+            if num_allocs > 0 or num_ooms > 0:
+                print(f"⚠️  分配重试: {num_allocs} 次, OOM: {num_ooms} 次")
+        except:
+            pass
+        print(f"{'='*60}\n")
 class AbstractLanguageChecker:
     """
         # 判断加载策略
         # ============================================================
         use_int8 = False
+        use_awq = False
         device_map = None
         dtype = None
         use_low_cpu_mem = False
         force_int8 = os.environ.get('FORCE_INT8')
         force_bfloat16 = os.environ.get('CPU_FORCE_BFLOAT16')
+        # 检测是否为 AWQ 模型（自动检测）
+        is_awq_model = self._is_awq_model(model_path)
+        if is_awq_model:
+            # AWQ 仅支持 CUDA 环境（Docker 部署）
+            if self.device.type != "cuda":
+                raise RuntimeError(
+                    f"❌ AWQ 量化模型仅支持 CUDA 环境\n"
+                    f"   当前设备: {self.device.type.upper()}\n"
+                    f"   请在 Docker + CUDA 环境中运行，或使用非量化模型"
+                )
+            use_awq = True
+            load_description = "模型（AWQ 量化，Transformers 内置支持）"
+            print("✓ 检测到 AWQ 量化模型，使用 Transformers 内置支持加载")
+            print("🔧 CUDA 模式：自动设备分配")
+            print("🔧 device_map: auto")
+        elif self.device.type == "cpu":
             print("🔧 CPU 模式：手动控制设备分配")
             if force_int8:
             print("🔧 device_map: auto")
         # ============================================================
+        # 执行模型加载：根据硬件环境选择最优加载策略
         # ============================================================
         model_load_start_time = time.perf_counter()
+        if use_awq:
+            # 场景0：AWQ 量化 - W4/W8 权重量化（优先级最高）
+            self.model = self._load_model_with_awq(
+                model_path, load_component, load_description
             )
+        elif use_int8:
+            # 场景1：INT8 量化 - 内存受限环境
+            if self.device.type == "cuda":
+                # CUDA: INT8 量化加载
+                self.model = self._load_model_with_int8_cuda(
+                    model_path, load_component, load_description
+                )
+            else:
+                # CPU: 纯 CPU 量化
+                self.model = self._load_model_with_int8_cpu(
+                    model_path, load_component, load_description
+                )
         elif device_map:
+            # 场景2：GPU/MPS - 自动设备分配（充分利用加速器）
+            self.model = self._load_model_with_device_map(
+                model_path, load_component, load_description,
+                device_map, dtype, use_low_cpu_mem
             )
         else:
+            # 场景3：CPU - 手动设备控制（无加速器可用）
+            self.model = self._load_model_on_cpu(
+                model_path, load_component, load_description,
+                dtype, use_low_cpu_mem
             )
         DeviceManager.print_model_load_stats(self.model, time.perf_counter() - model_load_start_time)
         self.model.eval()
+        # 打印模型加载后的内存统计
+        if self.device.type == "cuda":
+            device_idx = self.device.index if self.device.index is not None else 0
+            DeviceManager.print_cuda_memory_summary(device=device_idx)
         # ============================================================
         # 关于 torch.compile() 的性能优化讨论结论：
         #
         device_name = DeviceManager.get_device_name(self.device)
         print(f"✓ {model_display_name} 模型已加载 ({device_name})")
+    def _load_model_with_int8_cuda(
+        self,
+        model_path: str,
+        load_component: Callable,
+        load_description: str
+    ):
+        """
+        INT8 量化加载模式 - CUDA GPU 环境
+        技术细节：
+        - 使用 bitsandbytes 8bit 量化
+        - device_map="auto" 让 Transformers 自动分配层
+        性能特点：
+        - GPU 层推理速度快（INT8 Tensor Core 加速）
+        """
+        return load_component(
+            lambda local_files_only: AutoModelForCausalLM.from_pretrained(
+                model_path,
+                load_in_8bit=True,
+                device_map="auto",
+                trust_remote_code=True,
+                low_cpu_mem_usage=True,
+                local_files_only=local_files_only
+            ),
+            load_description
+        )
+    def _load_model_with_int8_cpu(
+        self,
+        model_path: str,
+        load_component: Callable,
+        load_description: str
+    ):
+        """
+        INT8 量化加载模式 - CPU 环境
+        适用场景：
+        - 无 GPU 可用，纯 CPU 推理
+        - 内存受限，无法加载 float32 模型
+        技术细节：
+        - 使用 bitsandbytes 8bit 量化
+        - device_map="cpu" 指定所有层在 CPU 上
+        - 量化可减少约 4 倍内存占用（相比 float32）
+        性能特点：
+        - 推理速度比 GPU 慢 10-100 倍
+        - 内存占用低，适合资源受限环境
+        - 实验性功能，可能在某些情况下降低性能
+        注意事项：
+        - 仅在 FORCE_INT8=1 时启用
+        - 优先考虑使用 GPU 或扩大内存
+        """
+        print("⚙️  纯 CPU INT8 量化")
+        return load_component(
+            lambda local_files_only: AutoModelForCausalLM.from_pretrained(
+                model_path,
+                load_in_8bit=True,
+                device_map="cpu",
+                trust_remote_code=True,
+                low_cpu_mem_usage=True,
+                local_files_only=local_files_only
+            ),
+            load_description
+        )
+    def _load_model_with_device_map(
+        self,
+        model_path: str,
+        load_component: Callable,
+        load_description: str,
+        device_map: str,
+        dtype: torch.dtype,
+        use_low_cpu_mem: bool
+    ):
+        """
+        GPU/MPS 自动设备分配模式
+        适用场景：
+        - 有可用的硬件加速器（CUDA GPU 或 Apple Silicon MPS）
+        - 显存充足，不需要量化
+        技术细节：
+        - device_map="auto" 让 Transformers 自动分配设备
+        - 使用 float16/bfloat16 精度（充分利用 Tensor Core 加速）
+        - 支持多 GPU 自动分片（如果有多个 GPU）
+        性能优势：
+        - GPU float16 比 float32 快 2-3 倍
+        - 自动负载均衡（多 GPU 环境）
+        - 无需手动管理设备转移
+        """
+        return load_component(
+            lambda local_files_only: AutoModelForCausalLM.from_pretrained(
+                model_path,
+                device_map=device_map,
+                dtype=dtype,
+                trust_remote_code=True,
+                low_cpu_mem_usage=use_low_cpu_mem,
+                local_files_only=local_files_only
+            ),
+            load_description
+        )
+    def _load_model_on_cpu(
+        self,
+        model_path: str,
+        load_component: Callable,
+        load_description: str,
+        dtype: torch.dtype,
+        use_low_cpu_mem: bool
+    ):
+        """
+        CPU 手动设备控制模式
+        适用场景：
+        - 无硬件加速器可用
+        - 开发/测试环境
+        技术细节：
+        - 不使用 device_map（避免 CPU 环境的非预期行为）
+        - 手动 .to(device) 转移到 CPU
+        - 默认使用 float32（CPU 上 float16 没有加速优势）
+        - 可选 bfloat16（需要硬件支持，通过 CPU_FORCE_BFLOAT16 启用）
+        注意事项：
+        - CPU 推理速度较慢（比 GPU 慢 10-100 倍）
+        - float32 占用内存是 float16 的 2 倍
+        """
+        return load_component(
+            lambda local_files_only: AutoModelForCausalLM.from_pretrained(
+                model_path,
+                dtype=dtype,
+                trust_remote_code=True,
+                low_cpu_mem_usage=use_low_cpu_mem,
+                local_files_only=local_files_only
+            ).to(self.device),
+            load_description
+        )
+    def _load_model_with_awq(
+        self,
+        model_path: str,
+        load_component: Callable,
+        load_description: str
+    ):
+        """
+        AWQ 量化加载（使用 Transformers 内置支持，W4/W8，仅支持 CUDA）
+        - 显存占用：约为 FP16 的 1/4（W4）或 1/2（W8）
+        - 推理速度：优于 INT8
+        - 仅支持 Docker + CUDA 环境
+        - Transformers 会自动识别 quantization_config.json 并加载 AWQ 模型
+        """
+        def awq_loader(local_files_only):
+            return AutoModelForCausalLM.from_pretrained(
+                model_path,
+                device_map="auto",  # 自动分配到 GPU
+                trust_remote_code=True,
+                local_files_only=local_files_only
+            )
+        return load_component(awq_loader, load_description)
+    @staticmethod
+    def _is_awq_model(model_path: str) -> bool:
+        """
+        检测模型是否为 AWQ 量化模型
+        检测策略：
+        1. 路径名包含 'awq' 关键字（如 'model-awq', 'Qwen-AWQ'）
+        2. 检查模型目录下是否存在 quantization_config.json 且包含 'awq' 配置
+        Args:
+            model_path: 模型路径（本地路径或 HuggingFace 模型 ID）
+        Returns:
+            是否为 AWQ 模型
+        """
+        # 策略1：路径名检测
+        if 'awq' in model_path.lower():
+            return True
+        # 策略2：配置文件检测（仅对本地路径）
+        if os.path.isdir(model_path):
+            import json
+            config_path = os.path.join(model_path, 'quantization_config.json')
+            if os.path.exists(config_path):
+                try:
+                    with open(config_path, 'r') as f:
+                        config = json.load(f)
+                        if config.get('quant_method') == 'awq':
+                            return True
+                except Exception:
+                    pass
+        return False
     def _load_runtime_config(self, model_name: Optional[str]):
         """
         加载运行时配置：基于模型和平台的四层配置合并
             DeviceManager.clear_cache(self.device)
             gc.collect()
+            # 打印分析任务完成后的内存统计
+            if self.device.type == "cuda":
+                device_idx = self.device.index if self.device.index is not None else 0
+                DeviceManager.print_cuda_memory_summary(device=device_idx)
             return {'bpe_strings': bpe_strings}
         except Exception as e:
     # _cleanup_tensors 方法已被移除，因为不再需要显式清理小张量
+# ============================================================
+# 自动注册：根据 MODEL_PATHS 自动注册所有模型
+# ============================================================
+# 只需要在 model_paths.py 中添加模型路径，即可自动注册
+# 无需手动创建子类，实现 DRY 原则
+def _auto_register_models():
+    """自动注册 MODEL_PATHS 中的所有模型"""
+    for model_name in MODEL_PATHS.keys():
+        if model_name not in REGISTERED_MODELS:
+            # 动态创建模型类并注册
+            # Build a fresh __init__ per class via a factory (note: it does not actually capture model_name)
+            def make_init():
+                def __init__(self):
+                    QwenLM.__init__(self)
+                return __init__
+            model_class = type(
+                f'QwenLM_{model_name.replace(".", "_").replace("-", "_")}',
+                (QwenLM,),
+                {
+                    '__init__': make_init(),
+                    '__doc__': f'{model_name} 模型支持（自动注册）'
+                }
+            )
+            register_model(model_name)(model_class)
+# 执行自动注册
+_auto_register_models()

backend/runtime_config.py CHANGED Viewed

@@ -16,7 +16,6 @@
 import os
 import torch
 import sys
-import multiprocessing
 from typing import Dict, Optional
@@ -257,187 +256,51 @@ def validate_platform_config(platform: str, chunk_size: int, verbose: bool = Tru
             print(f"✓ MPS 平台安全检查通过: chunk_size={chunk_size} (上限={MPS_TOPK_BUG_THRESHOLD})")
-def _get_container_cpu_quota() -> Optional[int]:
     """
-    从 cgroup 读取容器的实际 CPU 配额（适用于 Docker/K8s）
-    容器环境中，multiprocessing.cpu_count() 会返回宿主机 CPU 数，
-    而非容器实际可用的 CPU 配额，导致线程过度订阅。
-    Returns:
-        实际可用的 CPU 核心数，如果读取失败或无限制返回 None
-    """
-    try:
-        # CGroup v2 (较新的 Docker/K8s)
-        quota_file_v2 = "/sys/fs/cgroup/cpu.max"
-        if os.path.exists(quota_file_v2):
-            # CGroup v2 格式: "quota period"
-            with open(quota_file_v2) as f:
-                parts = f.read().strip().split()
-                if parts[0] == "max":
-                    return None  # 无限制
-                quota = int(parts[0])
-                period = int(parts[1])
-                return max(1, int(quota / period))
-        # CGroup v1
-        quota_file = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us"
-        period_file = "/sys/fs/cgroup/cpu/cpu.cfs_period_us"
-        if os.path.exists(quota_file) and os.path.exists(period_file):
-            with open(quota_file) as f:
-                quota = int(f.read().strip())
-            with open(period_file) as f:
-                period = int(f.read().strip())
-            if quota == -1:
-                return None  # 无限制
-            return max(1, int(quota / period))
-    except Exception:
-        pass
-    return None
-def _get_cpu_info() -> tuple[Optional[str], Optional[str]]:
-    """
-    读取 CPU 供应商和型号信息（仅用于显示）
     Returns:
-        (vendor, model_name) 元组，如果无法读取则返回 (None, None)
     """
-    vendor = None
     model_name = None
     try:
         if sys.platform == 'linux':
             with open('/proc/cpuinfo', 'r') as f:
                 for line in f:
-                    # 读取 vendor_id
-                    if vendor is None and 'vendor_id' in line.lower():
-                        vendor_str = line.split(':', 1)[1].strip()
-                        if 'AUTHENTICAMD' in vendor_str.upper() or 'AMD' in vendor_str.upper():
-                            vendor = 'AMD'
-                        elif 'GENUINEINTEL' in vendor_str.upper() or 'INTEL' in vendor_str.upper():
-                            vendor = 'Intel'
-                        else:
-                            vendor = vendor_str
                     # 读取 model name
                     if model_name is None and 'model name' in line.lower():
                         model_name = line.split(':', 1)[1].strip()
                     # 如果已经读取到所需信息，可以提前退出
-                    if vendor and model_name:
                         break
     except Exception:
         pass
-    return (vendor, model_name)
-def _configure_cpu_threads() -> None:
-    """
-    在 CPU 平台智能配置线程数，避免容器环境中的线程过度订阅
-    关键问题：
-    - multiprocessing.cpu_count() 在容器中返回宿主机 CPU 数（如 64）
-    - PyTorch 据此设置过多线程（如 inter-op=32），导致严重竞争
-    - 实际容器配额可能只有 8 vCPU，线程竞争导致性能暴跌
-    解决方案：
-    - 优先级 1: 读取环境变量 MAX_CPU_COUNT（显式指定，优先级最高）
-    - 优先级 2: 从 cgroup 读取真实 CPU 配额（容器环境自动检测）
-    - 优先级 3: 回退到 multiprocessing.cpu_count()
-    - intra-op: 单操作内并行（矩阵运算），设为实际 CPU 数
-    - inter-op: 操作间并行，对 LLM 推理设为 1 最优（避免竞争）
-    """
-    # 优先级 1: 从环境变量读取（用户显式指定）
-    max_cpu_env = os.getenv('MAX_CPU_COUNT')
-    if max_cpu_env is not None:
-        try:
-            actual_cpus = int(max_cpu_env)
-            print(f"📌 使用环境变量 MAX_CPU_COUNT={actual_cpus}")
-        except ValueError:
-            print(f"⚠️  环境变量 MAX_CPU_COUNT={max_cpu_env} 无效，忽略")
-            actual_cpus = None
-    else:
-        actual_cpus = None
-    # 优先级 2: 从 cgroup 读取实际 CPU 配额（容器环境）
-    if actual_cpus is None:
-        actual_cpus = _get_container_cpu_quota()
-    # 优先级 3: 回退到系统报告的 CPU 数
-    if actual_cpus is None:
-        actual_cpus = multiprocessing.cpu_count()
-    # 🎯 关键配置
-    # intra-op: 单个操作内的并行（矩阵运算等）
-    intra_threads = actual_cpus
-    # inter-op: 不同操作间的并行
-    # 对于序列化的 LLM 推理，设为 1 通常最优（避免操作间竞争）
-    inter_threads = 1
-    torch.set_num_threads(intra_threads)
-    torch.set_num_interop_threads(inter_threads)
-    print(f"🔧 已配置 CPU 线程: intra={intra_threads}, inter={inter_threads} (实际使用 {actual_cpus} vCPU)")
 def _print_cpu_info() -> None:
     """
-    打印 CPU 供应商和型号信息（所有平台都打印）
     """
     try:
-        cpu_vendor, cpu_model = _get_cpu_info()
-        # 始终打印 CPU 信息
-        if cpu_vendor and cpu_model:
-            print(f"💻 CPU: {cpu_vendor} {cpu_model}")
-        else:
-            vendor_display = cpu_vendor if cpu_vendor else "未知"
-            model_display = cpu_model if cpu_model else "未知"
-            print(f"💻 CPU 供应商: {vendor_display}")
-            print(f"💻 CPU 型号: {model_display}")
     except Exception as e:
         print(f"⚠️  CPU 信息获取失败: {e}")
 def _print_cpu_thread_info() -> None:
-    """
-    打印 CPU 线程配置信息（仅用于调试，不修改配置）
-    帮助判断 PyTorch 是否合理利用了 CPU 资源
-    """
     try:
-        # 获取系统信息
-        num_cores = multiprocessing.cpu_count()
-        # 获取 PyTorch 当前线程配置
         intra_threads = torch.get_num_threads()
         inter_threads = torch.get_num_interop_threads()
-        # 环境变量信息
-        omp_threads = os.getenv('OMP_NUM_THREADS', '未设置')
-        mkl_threads = os.getenv('MKL_NUM_THREADS', '未设置')
-        # 打印调试信息
-        print(f"🧵 CPU 线程配置信息:")
-        print(f"   - cpu_count(): {num_cores}")
-        print(f"   - PyTorch intra-op 线程: {intra_threads}")
-        print(f"   - PyTorch inter-op 线程: {inter_threads}")
-        print(f"   - 环境变量 OMP_NUM_THREADS: {omp_threads}")
-        print(f"   - 环境变量 MKL_NUM_THREADS: {mkl_threads}")
-        # 分析建议
-        if intra_threads < num_cores and omp_threads == '未设置':
-            print(f"   💡 提示: PyTorch 使用 {intra_threads} 线程，但系统有 {num_cores} 个核心")
-            print(f"        可考虑设置 OMP_NUM_THREADS={num_cores} 来充分利用 CPU")
     except Exception as e:
         print(f"⚠️  CPU 线程信息获取失败: {e}")
@@ -478,9 +341,8 @@ def load_runtime_config(model_name: str, verbose: bool = False) -> tuple[str, in
     # 5. 打印 CPU 信息（所有平台都打印）
     _print_cpu_info()
-    # 6. CPU 线程配置与信息打印（仅针对 CPU 平台）
     if "cpu" in platform.lower():
-        _configure_cpu_threads()  # 配置线程数，避免容器环境过度订阅
         _print_cpu_thread_info()  # 打印调试信息
     # 7. 打印配置摘要

 import os
 import torch
 import sys
 from typing import Dict, Optional
             print(f"✓ MPS 平台安全检查通过: chunk_size={chunk_size} (上限={MPS_TOPK_BUG_THRESHOLD})")
+def _get_cpu_info() -> Optional[str]:
     """
+    读取 CPU 型号信息（仅用于显示）
     Returns:
+        The CPU model name, or None if it cannot be read (the caller displays "未知" in that case)
     """
     model_name = None
     try:
         if sys.platform == 'linux':
             with open('/proc/cpuinfo', 'r') as f:
                 for line in f:
                     # 读取 model name
                     if model_name is None and 'model name' in line.lower():
                         model_name = line.split(':', 1)[1].strip()
                     # 如果已经读取到所需信息，可以提前退出
+                    if model_name:
                         break
     except Exception:
         pass
+    return model_name
 def _print_cpu_info() -> None:
     """
+    打印 CPU 型号信息（所有平台都打印）
     """
     try:
+        cpu_model = _get_cpu_info()
+        model = cpu_model or "未知"
+        print(f"💻 CPU 型号: {model}")
     except Exception as e:
         print(f"⚠️  CPU 信息获取失败: {e}")
 def _print_cpu_thread_info() -> None:
+    """打印 CPU 线程配置信息（PyTorch 默认配置）"""
     try:
         intra_threads = torch.get_num_threads()
         inter_threads = torch.get_num_interop_threads()
+        print(f"🧵 PyTorch 线程配置: intra-op={intra_threads}, inter-op={inter_threads}")
     except Exception as e:
         print(f"⚠️  CPU 线程信息获取失败: {e}")
     # 5. 打印 CPU 信息（所有平台都打印）
     _print_cpu_info()
+    # 6. CPU 线程配置信息打印（仅针对 CPU 平台）
     if "cpu" in platform.lower():
         _print_cpu_thread_info()  # 打印调试信息
     # 7. 打印配置摘要

download_models.py DELETED Viewed

@@ -1,90 +0,0 @@
-#!/usr/bin/env python3
-"""
-预下载模型脚本
-用于在Docker构建阶段预先下载模型到容器镜像中
-"""
-import os
-import sys
-import gc
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from model_paths import MODEL_PATHS
-def download_model(model_name: str):
-    """下载指定的模型"""
-    if model_name not in MODEL_PATHS:
-        print(f"❌ 未知的模型名称: {model_name}")
-        print(f"可用模型: {', '.join(MODEL_PATHS.keys())}")
-        return False
-    model_path = MODEL_PATHS[model_name]
-    print(f"\n{'='*60}")
-    print(f"📥 开始下载模型: {model_name}")
-    print(f"🔗 HuggingFace 路径: {model_path}")
-    print(f"{'='*60}\n")
-    try:
-        # 下载 tokenizer
-        print(f"📦 下载 tokenizer...")
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_path,
-            trust_remote_code=True
-        )
-        print(f"✅ Tokenizer 下载完成")
-        # 下载模型
-        print(f"📦 下载模型权重（这可能需要几分钟）...")
-        model = AutoModelForCausalLM.from_pretrained(
-            model_path,
-            trust_remote_code=True,
-            dtype='auto'
-        )
-        print(f"✅ 模型下载完成")
-        # 清理内存
-        del model
-        del tokenizer
-        gc.collect()
-        print(f"\n✅ 模型 {model_name} 预下载成功！\n")
-        return True
-    except Exception as e:
-        print(f"\n❌ 模型 {model_name} 下载失败: {e}\n")
-        return False
-def main():
-    """主函数"""
-    # 从命令行参数获取要下载的模型列表
-    if len(sys.argv) < 2:
-        print("用法: python download_models.py <model1> [model2] ...")
-        print(f"可用模型: {', '.join(MODEL_PATHS.keys())}")
-        sys.exit(1)
-    models_to_download = sys.argv[1:]
-    # 显示缓存目录信息
-    cache_dir = os.environ.get('HF_HOME', os.path.expanduser('~/.cache/huggingface'))
-    print(f"\n🗂️  HuggingFace 缓存目录: {cache_dir}\n")
-    # 下载每个模型
-    success_count = 0
-    for model_name in models_to_download:
-        if download_model(model_name):
-            success_count += 1
-    # 总结
-    print(f"\n{'='*60}")
-    print(f"📊 下载完成: {success_count}/{len(models_to_download)} 个模型成功")
-    print(f"{'='*60}\n")
-    # 如果有模型下载失败，返回非零退出码
-    if success_count < len(models_to_download):
-        sys.exit(1)
-if __name__ == '__main__':
-    main()

model_paths.py CHANGED Viewed

@@ -5,11 +5,20 @@
 # 所有可用模型的 HuggingFace 路径映射
 MODEL_PATHS = {
     'qwen2.5-0.5b': 'Qwen/Qwen2.5-0.5B',
     'qwen3.0-0.6b': 'Qwen/Qwen3-0.6B-Base',
     'qwen3.0-1.7b': 'Qwen/Qwen3-1.7B-Base',
     'qwen3.0-4b': 'Qwen/Qwen3-4B-Base',
     'qwen3.0-8b': 'Qwen/Qwen3-8B-Base',
     'qwen3.0-14b': 'Qwen/Qwen3-14B-Base',
 }

 # 所有可用模型的 HuggingFace 路径映射
 MODEL_PATHS = {
+    # 标准模型（FP16/BF16）
     'qwen2.5-0.5b': 'Qwen/Qwen2.5-0.5B',
     'qwen3.0-0.6b': 'Qwen/Qwen3-0.6B-Base',
     'qwen3.0-1.7b': 'Qwen/Qwen3-1.7B-Base',
     'qwen3.0-4b': 'Qwen/Qwen3-4B-Base',
     'qwen3.0-8b': 'Qwen/Qwen3-8B-Base',
     'qwen3.0-14b': 'Qwen/Qwen3-14B-Base',
+    'qwen3.0-30b-a3b': 'Qwen/Qwen3-30B-A3B-Base',
+    'qwen2.5-32b': 'Qwen/Qwen2.5-32B',
+    'qwen2.5-72b': 'Qwen/Qwen2.5-72B',
+    # AWQ 量化模型（W4A16，显存占用约为标准模型的 1/4）
+    # 自动检测，仅支持 Docker + CUDA 环境
+    # Qwen3-14B-AWQ评估质量差，因为基于instruct版本而不是base版本
+    'qwen3.0-14b-awq': 'Qwen/Qwen3-14B-AWQ'
 }

requirements.txt CHANGED Viewed

@@ -1,10 +1,11 @@
-transformers>=4.30.0
-torch>=2.0.0
 numpy>=1.24.0
 connexion[flask,swagger-ui,uvicorn]>=3.0.0
 flask>=3.0.0
 PyYAML>=6.0
 flask-cors>=4.0.0
-accelerate>=0.20.0
 bitsandbytes>=0.41.0
 hf-transfer>=0.1.0

+transformers>=4.51.0
+torch>=2.1.0,<2.5.0
 numpy>=1.24.0
 connexion[flask,swagger-ui,uvicorn]>=3.0.0
 flask>=3.0.0
 PyYAML>=6.0
 flask-cors>=4.0.0
+accelerate>=0.30.0,<1.0.0
 bitsandbytes>=0.41.0
 hf-transfer>=0.1.0
+autoawq>=0.2.0  # Transformers AWQ 支持所需的后端库（已弃用但必需）

server.py CHANGED Viewed

@@ -3,6 +3,72 @@
 import argparse
 import os
 import logging
 # Allow an early --force-cpu flag to be passed so we can set the device
 # before importing backend modules that may instantiate models.

 import argparse
 import os
 import logging
+import multiprocessing
+# ============= 诊断并修复 OMP_NUM_THREADS 和 MKL_NUM_THREADS 环境变量（必须在导入任何可能使用 OpenMP 的库之前）=============
+def _diagnose_and_fix_thread_env_vars() -> None:
+    """
+    诊断并修复 OMP_NUM_THREADS 和 MKL_NUM_THREADS 环境变量，定位 libgomp 报错问题
+    在 HF Space 的 CUDA 容器中，可能预设了无效的环境变量值（如空字符串或 '3500m'）
+    这会导致 bitsandbytes 库初始化时 libgomp 报错
+    修复策略：所有无效值统一设置为实际 CPU 核数
+    """
+    # 获取实际 CPU 核数
+    actual_cores = multiprocessing.cpu_count()
+    # 需要诊断和修复的环境变量列表
+    env_vars = ['OMP_NUM_THREADS', 'MKL_NUM_THREADS']
+    is_first_fix = True  # 标记是否是第一次修复（用于打印标题）
+    for env_var in env_vars:
+        value = os.environ.get(env_var)
+        # 如果未设置，跳过（不打印）
+        if value is None:
+            continue
+        # 检查是否为有效值
+        stripped = value.strip()
+        is_valid = False
+        reason = ""
+        if not stripped:
+            reason = "值为空字符串"
+        elif not stripped.isdigit():
+            reason = f"包含非数字字符: {repr(stripped)}"
+        else:
+            try:
+                int_value = int(stripped)
+                if int_value <= 0:
+                    reason = f"值 <= 0: {int_value}"
+                else:
+                    is_valid = True
+            except ValueError:
+                reason = f"无法转换为整数: {repr(stripped)}"
+        # 只有在无效时才修复并立即打印
+        if not is_valid:
+            # 如果是第一次修复，先打印标题
+            if is_first_fix:
+                print(f"🔍 检测到无效的线程环境变量（实际 CPU 核数: {actual_cores}）:")
+                is_first_fix = False
+            # 统一修复为实际核数
+            os.environ[env_var] = str(actual_cores)
+            # 立即打印修复信息
+            print(f"   {env_var}:")
+            print(f"      - 原始值: {repr(value)}")
+            print(f"      - 问题: {reason}")
+            print(f"      - 🔧 已自动修复: {env_var}={actual_cores} (设置为实际 CPU 核数)")
+            if env_var == 'OMP_NUM_THREADS':
+                print(f"      - ⚠️  无效值可能导致 libgomp 报错: 'Invalid value for environment variable OMP_NUM_THREADS'")
+_diagnose_and_fix_thread_env_vars()
+# ============= 诊断结束 =============
 # Allow an early --force-cpu flag to be passed so we can set the device
 # before importing backend modules that may instantiate models.