Spaces:

zhman
/

llama-math-solver

Sleeping

App Files Community

zhman committed on Jan 7

Commit

40cee53

1 Parent(s): 7afc078

CPU-optimized version: remove bitsandbytes, reduce max_length

Browse files

Files changed (2) hide show

app.py +12 -16
requirements.txt +0 -1

app.py CHANGED Viewed

@@ -4,25 +4,21 @@ HuggingFace Spaces 推理应用
 """
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 import torch
 # 模型配置
 MODEL_NAME = "zhman/llama-SFT-GRPO"
-# 配置 8-bit 量化以提升速度和减少内存占用
-quantization_config = BitsAndBytesConfig(
-    load_in_8bit=True,
-    llm_int8_threshold=6.0
-)
-# 加载模型和分词器
 print("🔄 加载模型...")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_NAME,
-    quantization_config=quantization_config,  # 使用 8-bit 量化
-    device_map="auto"
 )
 print("✅ 模型加载完成！")
@@ -50,7 +46,7 @@ def solve_math_problem(question, max_length=512, temperature=0.7, top_p=0.9):
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
-            max_length=max_length,
             temperature=temperature,
             top_p=top_p,
             do_sample=True,
@@ -81,8 +77,8 @@ demo = gr.Interface(
         ),
         gr.Slider(
             minimum=50,
-            maximum=2048,
-            value=512,
             step=50,
             label="📏 最大长度"
         ),
@@ -120,9 +116,9 @@ demo = gr.Interface(
     - 模型作者：zhman
     """,
     examples=[
-        ["2+2等于多少？", 512, 0.7, 0.9],
-        ["一个长方形的长是8厘米，宽是5厘米，它的周长是多少？", 512, 0.7, 0.9],
-        ["小明有5个苹果，小红给了他3个，小明现在有多少个苹果？", 512, 0.7, 0.9]
     ],
     cache_examples=False,  # 禁用示例缓存，避免启动时卡住
     theme=gr.themes.Soft()

 """
 import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 # 模型配置
 MODEL_NAME = "zhman/llama-SFT-GRPO"
+# 加载模型和分词器（CPU 优化版本）
 print("🔄 加载模型...")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+# CPU 模式：使用 float32，不使用量化
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_NAME,
+    torch_dtype=torch.float32,  # CPU 友好的数据类型
+    low_cpu_mem_usage=True,     # 降低内存使用
 )
 print("✅ 模型加载完成！")
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
+            max_new_tokens=min(max_length, 256),  # 限制生成长度以加快速度
             temperature=temperature,
             top_p=top_p,
             do_sample=True,
         ),
         gr.Slider(
             minimum=50,
+            maximum=512,
+            value=256,  # 降低默认值以加快 CPU 推理
             step=50,
             label="📏 最大长度"
         ),
     - 模型作者：zhman
     """,
     examples=[
+        ["2+2等于多少？", 256, 0.7, 0.9],
+        ["一个长方形的长是8厘米，宽是5厘米，它的周长是多少？", 256, 0.7, 0.9],
+        ["小明有5个苹果，小红给了他3个，小明现在有多少个苹果？", 256, 0.7, 0.9]
     ],
     cache_examples=False,  # 禁用示例缓存，避免启动时卡住
     theme=gr.themes.Soft()

requirements.txt CHANGED Viewed

@@ -2,4 +2,3 @@ transformers>=4.30.0
 torch>=2.0.0
 gradio>=4.44.0
 accelerate>=0.20.0
-bitsandbytes

 torch>=2.0.0
 gradio>=4.44.0
 accelerate>=0.20.0