Spaces:
Sleeping
Sleeping
CPU-optimized version: remove bitsandbytes, reduce max_length
Browse files
- app.py +12 -16
- requirements.txt +0 -1
app.py
CHANGED
|
@@ -4,25 +4,21 @@ HuggingFace Spaces 推理应用
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
import gradio as gr
|
| 7 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
| 8 |
import torch
|
| 9 |
|
| 10 |
# 模型配置
|
| 11 |
MODEL_NAME = "zhman/llama-SFT-GRPO"
|
| 12 |
|
| 13 |
-
#
|
| 14 |
-
quantization_config = BitsAndBytesConfig(
|
| 15 |
-
load_in_8bit=True,
|
| 16 |
-
llm_int8_threshold=6.0
|
| 17 |
-
)
|
| 18 |
-
|
| 19 |
-
# 加载模型和分词器
|
| 20 |
print("🔄 加载模型...")
|
| 21 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
|
|
|
|
|
|
| 22 |
model = AutoModelForCausalLM.from_pretrained(
|
| 23 |
MODEL_NAME,
|
| 24 |
-
|
| 25 |
-
|
| 26 |
)
|
| 27 |
print("✅ 模型加载完成!")
|
| 28 |
|
|
@@ -50,7 +46,7 @@ def solve_math_problem(question, max_length=512, temperature=0.7, top_p=0.9):
|
|
| 50 |
with torch.no_grad():
|
| 51 |
outputs = model.generate(
|
| 52 |
**inputs,
|
| 53 |
-
|
| 54 |
temperature=temperature,
|
| 55 |
top_p=top_p,
|
| 56 |
do_sample=True,
|
|
@@ -81,8 +77,8 @@ demo = gr.Interface(
|
|
| 81 |
),
|
| 82 |
gr.Slider(
|
| 83 |
minimum=50,
|
| 84 |
-
maximum=
|
| 85 |
-
value=
|
| 86 |
step=50,
|
| 87 |
label="📏 最大长度"
|
| 88 |
),
|
|
@@ -120,9 +116,9 @@ demo = gr.Interface(
|
|
| 120 |
- 模型作者:zhman
|
| 121 |
""",
|
| 122 |
examples=[
|
| 123 |
-
["2+2等于多少?",
|
| 124 |
-
["一个长方形的长是8厘米,宽是5厘米,它的周长是多少?",
|
| 125 |
-
["小明有5个苹果,小红给了他3个,小明现在有多少个苹果?",
|
| 126 |
],
|
| 127 |
cache_examples=False, # 禁用示例缓存,避免启动时卡住
|
| 128 |
theme=gr.themes.Soft()
|
|
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
import gradio as gr
|
| 7 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 8 |
import torch
|
| 9 |
|
| 10 |
# 模型配置
|
| 11 |
MODEL_NAME = "zhman/llama-SFT-GRPO"
|
| 12 |
|
| 13 |
+
# 加载模型和分词器(CPU 优化版本)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
print("🔄 加载模型...")
|
| 15 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
| 16 |
+
|
| 17 |
+
# CPU 模式:使用 float32,不使用量化
|
| 18 |
model = AutoModelForCausalLM.from_pretrained(
|
| 19 |
MODEL_NAME,
|
| 20 |
+
torch_dtype=torch.float32, # CPU 友好的数据类型
|
| 21 |
+
low_cpu_mem_usage=True, # 降低内存使用
|
| 22 |
)
|
| 23 |
print("✅ 模型加载完成!")
|
| 24 |
|
|
|
|
| 46 |
with torch.no_grad():
|
| 47 |
outputs = model.generate(
|
| 48 |
**inputs,
|
| 49 |
+
max_new_tokens=min(max_length, 256), # 限制生成长度以加快速度
|
| 50 |
temperature=temperature,
|
| 51 |
top_p=top_p,
|
| 52 |
do_sample=True,
|
|
|
|
| 77 |
),
|
| 78 |
gr.Slider(
|
| 79 |
minimum=50,
|
| 80 |
+
maximum=512,
|
| 81 |
+
value=256, # 降低默认值以加快 CPU 推理
|
| 82 |
step=50,
|
| 83 |
label="📏 最大长度"
|
| 84 |
),
|
|
|
|
| 116 |
- 模型作者:zhman
|
| 117 |
""",
|
| 118 |
examples=[
|
| 119 |
+
["2+2等于多少?", 256, 0.7, 0.9],
|
| 120 |
+
["一个长方形的长是8厘米,宽是5厘米,它的周长是多少?", 256, 0.7, 0.9],
|
| 121 |
+
["小明有5个苹果,小红给了他3个,小明现在有多少个苹果?", 256, 0.7, 0.9]
|
| 122 |
],
|
| 123 |
cache_examples=False, # 禁用示例缓存,避免启动时卡住
|
| 124 |
theme=gr.themes.Soft()
|
requirements.txt
CHANGED
|
@@ -2,4 +2,3 @@ transformers>=4.30.0
|
|
| 2 |
torch>=2.0.0
|
| 3 |
gradio>=4.44.0
|
| 4 |
accelerate>=0.20.0
|
| 5 |
-
bitsandbytes
|
|
|
|
| 2 |
torch>=2.0.0
|
| 3 |
gradio>=4.44.0
|
| 4 |
accelerate>=0.20.0
|
|
|