Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -115,13 +115,13 @@ engine = None # InferEngine 对象
|
|
| 115 |
os.makedirs(LOCAL_MODEL_PATH, exist_ok=True)
|
| 116 |
|
| 117 |
def load_llm_model():
|
| 118 |
-
"""使用 ms-swift 的 PtEngine
|
| 119 |
global engine
|
| 120 |
if engine is not None:
|
| 121 |
return
|
| 122 |
|
| 123 |
try:
|
| 124 |
-
#
|
| 125 |
if not os.path.exists(os.path.join(LOCAL_MODEL_PATH, "config.json")):
|
| 126 |
print(f"正在从 HuggingFace 下载模型到 {LOCAL_MODEL_PATH}...")
|
| 127 |
snapshot_download(
|
|
@@ -134,21 +134,25 @@ def load_llm_model():
|
|
| 134 |
else:
|
| 135 |
print(f"✅ 模型已存在: {LOCAL_MODEL_PATH}")
|
| 136 |
|
| 137 |
-
#
|
| 138 |
-
print("正在使用 ms-swift PtEngine
|
| 139 |
|
| 140 |
-
# 🔥 关键修改:使用 PtEngine
|
| 141 |
engine = PtEngine(
|
| 142 |
model_id_or_path=LOCAL_MODEL_PATH,
|
| 143 |
-
torch_dtype=torch.float16,
|
| 144 |
-
max_batch_size=1,
|
| 145 |
-
device_map='cpu',
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
model_kwargs={
|
| 147 |
-
'low_cpu_mem_usage': True,
|
|
|
|
| 148 |
}
|
| 149 |
)
|
| 150 |
|
| 151 |
-
print("✅ ms-swift PtEngine
|
| 152 |
|
| 153 |
except Exception as e:
|
| 154 |
print(f"❌ 模型加载失败: {e}")
|
|
|
|
| 115 |
os.makedirs(LOCAL_MODEL_PATH, exist_ok=True)
|
| 116 |
|
| 117 |
def load_llm_model():
|
| 118 |
+
"""使用 ms-swift 的 PtEngine 加载量化模型(int4/int8)"""
|
| 119 |
global engine
|
| 120 |
if engine is not None:
|
| 121 |
return
|
| 122 |
|
| 123 |
try:
|
| 124 |
+
# 检查模型是否存在
|
| 125 |
if not os.path.exists(os.path.join(LOCAL_MODEL_PATH, "config.json")):
|
| 126 |
print(f"正在从 HuggingFace 下载模型到 {LOCAL_MODEL_PATH}...")
|
| 127 |
snapshot_download(
|
|
|
|
| 134 |
else:
|
| 135 |
print(f"✅ 模型已存在: {LOCAL_MODEL_PATH}")
|
| 136 |
|
| 137 |
+
# 🔥 关键修改:使用量化 + 内存优化
|
| 138 |
+
print("正在使用 ms-swift PtEngine 加载模型(量化模式)...")
|
| 139 |
|
|
|
|
| 140 |
engine = PtEngine(
|
| 141 |
model_id_or_path=LOCAL_MODEL_PATH,
|
| 142 |
+
torch_dtype=torch.float16,
|
| 143 |
+
max_batch_size=1,
|
| 144 |
+
device_map='cpu',
|
| 145 |
+
# 🔥 添加量化参数
|
| 146 |
+
quant_method='bnb', # 使用 bitsandbytes 量化
|
| 147 |
+
quant_bits=4, # 4-bit 量化(也可以用8)
|
| 148 |
+
bnb_4bit_compute_dtype=torch.float16,
|
| 149 |
model_kwargs={
|
| 150 |
+
'low_cpu_mem_usage': True,
|
| 151 |
+
'max_memory': {'cpu': '12GB'}, # 限制最大内存使用
|
| 152 |
}
|
| 153 |
)
|
| 154 |
|
| 155 |
+
print("✅ ms-swift PtEngine 加载完成(已启用量化)")
|
| 156 |
|
| 157 |
except Exception as e:
|
| 158 |
print(f"❌ 模型加载失败: {e}")
|