File size: 7,496 Bytes
0695d0b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 |
import os
# ================= 🔧 强制单卡模式 (最优先执行) =================
# 这行代码必须在 import torch 之前!
# 它可以解决 "RuntimeError: ... cuda:1 and cuda:0" 的双卡冲突问题
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# ==============================================================
import torch
import time
import gc
import shutil
import psutil
from transformers import AutoTokenizer, AutoModel, BitsAndBytesConfig
# ===================== ⚙️ 用户配置区域 =====================
# 1. 模型路径
MODEL_PATH = "/home/nashen/deepseek-ocr/DeepSeek-OCR-master/DeepSeek-OCR-vllm/model/"
# 2. 测试图片 (请确保同目录下有这张图,随便找张图改名即可)
TEST_IMAGE = "./chart_with_line.jpg"
# 3. 结果保存位置
RESULT_ROOT = "./benchmark_output_chart_4bit"
# ==========================================================
class VRAMMonitor:
    """GPU memory (VRAM) monitoring helper.

    Every query degrades to a no-op returning 0 when CUDA is unavailable,
    so the benchmark script can still run (without VRAM data) on CPU.
    """

    # Divisor to convert bytes reported by torch into GiB.
    _BYTES_PER_GIB = 1024 ** 3

    def __init__(self):
        # Probe once at construction time; all later calls branch on this.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def get_current_mem(self):
        """Return the VRAM currently allocated by torch, in GiB (0 on CPU)."""
        if self.device != "cpu":
            return torch.cuda.memory_allocated() / self._BYTES_PER_GIB
        return 0

    def reset_peak(self):
        """Clear the peak-allocation counter so a fresh peak can be measured."""
        if self.device != "cpu":
            torch.cuda.reset_peak_memory_stats()

    def get_peak_mem(self):
        """Return the peak VRAM allocated since the last reset, in GiB (0 on CPU)."""
        if self.device != "cpu":
            return torch.cuda.max_memory_allocated() / self._BYTES_PER_GIB
        return 0
def run_evaluation(mode="original"):
    """Run one benchmark pass: load the model, OCR the test image, report metrics.

    Args:
        mode: "original" for full-precision BF16 loading, or "quantized" for
            4-bit NF4 loading via bitsandbytes.

    Returns:
        dict with keys "mode", "static_vram" (GB after load), "infer_time" (s),
        "peak_vram" (GB during inference) and "result_head" (first 100 chars of
        the OCR output), or None when loading or inference failed.
    """
    # Recreate a clean per-mode output directory so runs never mix results.
    output_dir = os.path.join(RESULT_ROOT, mode)
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir, exist_ok=True)
    monitor = VRAMMonitor()
    print(f"\n{'='*40}")
    print(f"🚀 正在启动评测: [{mode.upper()}] 模式")
    print(f"📂 结果将保存至: {output_dir}")
    print(f"{'='*40}")
    # 1. Environment cleanup, so VRAM figures are not polluted by a prior run.
    gc.collect()
    torch.cuda.empty_cache()
    monitor.reset_peak()
    # 2. Load tokenizer + model.
    print("⏳ [1/3] 正在加载模型...")
    start_load_time = time.time()
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
        # Patch: silence the pad_token warning by aliasing it to EOS.
        if tokenizer.pad_token_id is None:
            tokenizer.pad_token_id = tokenizer.eos_token_id
        if mode == "quantized":
            # === Quantized configuration (4-bit NF4) ===
            q_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
                # Key: keep the vision and output layers in full precision
                # to avoid accuracy collapse.
                llm_int8_skip_modules=[
                    # 1. SAM model — processes 4D image tensors; quantizing
                    #    it triggers the runtime error this list fixes.
                    "sam_model",
                    "model.sam_model",
                    # 2. Vision backbone
                    "vision_model",
                    "model.vision_model",
                    # 3. Projection layer — critical for preserving table
                    #    vertical rules in the OCR output.
                    "projector",
                    "model.projector",
                    # 4. Base LLM protection
                    "lm_head",
                    "embed_tokens"
                ]
            )
            model = AutoModel.from_pretrained(
                MODEL_PATH,
                trust_remote_code=True,
                quantization_config=q_config,
                device_map="auto"  # lands on the single visible CUDA device
            )
        else:
            # === Original configuration (BF16) ===
            model = AutoModel.from_pretrained(
                MODEL_PATH,
                trust_remote_code=True,
                torch_dtype=torch.bfloat16,
                device_map="auto"
            )
        # Make sure the KV cache is enabled for generation speed.
        model.config.use_cache = True
        model.eval()
    except Exception as e:
        import traceback
        traceback.print_exc()
        print(f"❌ 加载失败: {e}")
        return None
    load_time = time.time() - start_load_time
    static_vram = monitor.get_current_mem()
    print(f" ✅ 加载耗时: {load_time:.2f}s | 静态显存: {static_vram:.2f} GB")
    # 3. Inference test.
    print("⏳ [2/3] 正在运行 OCR 推理...")
    result = None
    try:
        if not os.path.exists(TEST_IMAGE):
            print(f"❌ 错误: 没找到测试图片 {TEST_IMAGE}!")
            return None  # the finally block below still frees the model
        monitor.reset_peak()
        prompt = "<image>\nConvert the image to markdown format. Preserve all table structures and separators carefully."
        start_infer_time = time.time()
        with torch.no_grad():
            res = model.infer(
                tokenizer,
                prompt=prompt,
                image_file=TEST_IMAGE,
                base_size=1024,
                image_size=640,
                crop_mode=True,
                # Key argument: prevents output_path errors inside infer()
                output_path=output_dir,
                save_results=True
            )
        infer_time = time.time() - start_infer_time
        peak_vram = monitor.get_peak_mem()
        print(f" ✅ 推理耗时: {infer_time:.2f}s | 峰值显存: {peak_vram:.2f} GB")
        # 4. Persist the text result.
        text_save_path = os.path.join(output_dir, "full_result.md")
        with open(text_save_path, "w", encoding="utf-8") as f:
            f.write(res)
        print(f" 💾 结果已保存: {text_save_path}")
        result = {
            "mode": mode,
            "static_vram": static_vram,
            "infer_time": infer_time,
            "peak_vram": peak_vram,
            "result_head": res[:100].replace('\n', ' ') + "..."
        }
    except Exception as e:
        import traceback
        traceback.print_exc()
        print(f"❌ 推理过程报错: {e}")
    finally:
        # 5. Always release memory for the next round — the original version
        # skipped this cleanup when inference raised, leaving the model
        # resident and skewing the following benchmark run.
        del model
        del tokenizer
        gc.collect()
        torch.cuda.empty_cache()
    return result
# ===================== 🏆 Main program =====================
if __name__ == "__main__":
    if not os.path.exists(TEST_IMAGE):
        print(f"⚠️ 请准备一张图片,重命名为 {TEST_IMAGE} 放在脚本同目录下!")
        # FIX: exit with a non-zero status on missing input (exit() returned 0).
        raise SystemExit(1)
    print("🏁 开始 A/B 对比测试...")
    # 1. Quantized run first.
    res_quant = run_evaluation("quantized")
    # 2. Then the original BF16 run.
    res_origin = run_evaluation("original")
    if res_quant and res_origin:
        def _saved_pct(quant_val, orig_val):
            """Percent saved vs. baseline; 0.0 when baseline is 0 (CPU fallback)."""
            # FIX: the original divided unconditionally and crashed with
            # ZeroDivisionError when VRAMMonitor reported 0 GB (no CUDA).
            return (1 - quant_val / orig_val) * 100 if orig_val else 0.0

        mem_saved = _saved_pct(res_quant['peak_vram'], res_origin['peak_vram'])
        static_saved = _saved_pct(res_quant['static_vram'], res_origin['static_vram'])
        print("\n\n")
        print(f"{'='*30} 📊 最终数据对比报告 {'='*30}")
        print(f"{'指标':<20} | {'原版 (Original)':<18} | {'量化版 (Quantized)':<18} | {'变化'}")
        print("-" * 90)
        print(f"{'静态显存 (Static)':<20} | {res_origin['static_vram']:.2f} GB {'':<9} | {res_quant['static_vram']:.2f} GB {'':<9} | 📉 节省 {static_saved:.1f}%")
        print(f"{'峰值显存 (Peak)':<20} | {res_origin['peak_vram']:.2f} GB {'':<9} | {res_quant['peak_vram']:.2f} GB {'':<9} | 📉 节省 {mem_saved:.1f}%")
        print(f"{'推理时间 (Time)':<20} | {res_origin['infer_time']:.2f} s {'':<10} | {res_quant['infer_time']:.2f} s {'':<10} | {'🟢' if res_quant['infer_time']<res_origin['infer_time'] else '🟡'} 差 {abs(res_origin['infer_time']-res_quant['infer_time']):.2f}s")
        print("-" * 90)
|