File size: 7,496 Bytes
0695d0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
import os

# ================= 🔧 Force single-GPU mode (must run first) =================
# This line MUST execute before `import torch`!
# It fixes the "RuntimeError: ... cuda:1 and cuda:0" dual-GPU conflict by
# exposing only physical GPU 1, which torch then sees as cuda:0.
os.environ["CUDA_VISIBLE_DEVICES"] = "1" 
# ==============================================================

import torch
import time
import gc
import shutil
import psutil
from transformers import AutoTokenizer, AutoModel, BitsAndBytesConfig

# ===================== ⚙️ User configuration =====================
# 1. Model path (local DeepSeek-OCR checkpoint directory)
MODEL_PATH = "/home/nashen/deepseek-ocr/DeepSeek-OCR-master/DeepSeek-OCR-vllm/model/"

# 2. Test image (make sure this file exists in the same directory as the
#    script — any image renamed to this filename will do)
TEST_IMAGE = "./chart_with_line.jpg" 

# 3. Where the benchmark results are written
RESULT_ROOT = "./benchmark_output_chart_4bit"
# ==========================================================

class VRAMMonitor:
    """Small helper for reading CUDA memory counters (reports 0 on CPU-only hosts)."""

    def __init__(self):
        # Fall back to "cpu" when no CUDA device is visible.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def get_current_mem(self):
        """Return the currently allocated CUDA memory in GiB (0 on CPU)."""
        if self.device == "cpu":
            return 0
        gib = 1024 ** 3
        return torch.cuda.memory_allocated() / gib

    def reset_peak(self):
        """Reset the CUDA peak-memory counter (no-op on CPU)."""
        if self.device == "cpu":
            return
        torch.cuda.reset_peak_memory_stats()

    def get_peak_mem(self):
        """Return the peak allocated CUDA memory in GiB since the last reset (0 on CPU)."""
        if self.device == "cpu":
            return 0
        gib = 1024 ** 3
        return torch.cuda.max_memory_allocated() / gib

def run_evaluation(mode="original"):
    """Run one full load-and-infer benchmark pass and return its metrics.

    Args:
        mode: "quantized" loads the model with 4-bit NF4 quantization;
            any other value (default "original") loads it in bfloat16.

    Returns:
        On success, a dict with keys "mode", "static_vram" (GB after load),
        "infer_time" (seconds), "peak_vram" (GB during inference) and
        "result_head" (first 100 chars of the OCR output). Returns None if
        loading or inference failed (the traceback is printed).
    """
    # Prepare the output directory (wipe leftovers from a previous run).
    output_dir = os.path.join(RESULT_ROOT, mode)
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir, exist_ok=True)

    monitor = VRAMMonitor()
    print(f"\n{'='*40}")
    print(f"🚀 正在启动评测: [{mode.upper()}] 模式")
    print(f"📂 结果将保存至: {output_dir}")
    print(f"{'='*40}")

    # 1. Environment cleanup — release Python garbage and cached CUDA
    #    blocks so the static/peak numbers reflect this run only.
    gc.collect()
    torch.cuda.empty_cache()
    monitor.reset_peak()

    # 2. Load the model
    print("⏳ [1/3] 正在加载模型...")
    start_load_time = time.time()
    
    try:
        # Load the tokenizer
        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
        # Patch: silence the missing-pad_token warning by reusing EOS.
        if tokenizer.pad_token_id is None:
            tokenizer.pad_token_id = tokenizer.eos_token_id

        if mode == "quantized":
            # === Quantized configuration (4-bit NF4) ===
            q_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
                # Key: keep the vision and output layers un-quantized to
                # avoid precision collapse.
                llm_int8_skip_modules=[
                    # 1. Key to fixing the runtime error (the SAM model
                    #    processes 4D image tensors)
                    "sam_model",
                    "model.sam_model",
                    
                    # 2. Vision backbone
                    "vision_model",
                    "model.vision_model",
                    
                    # 3. Key to preserving table separators (projection layer)
                    "projector",
                    "model.projector",
                    
                    # 4. Base LLM protection
                    "lm_head",
                    "embed_tokens"
                    
                ]
            )
            model = AutoModel.from_pretrained(
                MODEL_PATH, 
                trust_remote_code=True, 
                quantization_config=q_config, 
                device_map="auto" # auto-maps onto the single visible GPU (CUDA:0)
            )
        else:
            # === Original configuration (BF16) ===
            model = AutoModel.from_pretrained(
                MODEL_PATH, 
                trust_remote_code=True, 
                torch_dtype=torch.bfloat16, 
                device_map="auto"
            )

        # Make sure the KV cache (use_cache) is enabled for generation
        model.config.use_cache = True
        model.eval()
        
    except Exception as e:
        import traceback
        traceback.print_exc()
        print(f"❌ 加载失败: {e}")
        return None

    load_time = time.time() - start_load_time
    static_vram = monitor.get_current_mem()
    print(f"   ✅ 加载耗时: {load_time:.2f}s | 静态显存: {static_vram:.2f} GB")

    # 3. Inference test
    print("⏳ [2/3] 正在运行 OCR 推理...")
    if not os.path.exists(TEST_IMAGE):
        print(f"❌ 错误: 没找到测试图片 {TEST_IMAGE}!")
        return None

    monitor.reset_peak()
    
    prompt = "<image>\nConvert the image to markdown format. Preserve all table structures and separators carefully."
    
    try:
        start_infer_time = time.time()
        with torch.no_grad():
            # NOTE(review): `infer` is provided by the model's remote code
            # (trust_remote_code=True); its exact signature lives there.
            res = model.infer(
                tokenizer, 
                prompt=prompt, 
                image_file=TEST_IMAGE, 
                base_size=1024, 
                image_size=640, 
                crop_mode=True, 
                
                # Key arguments: avoid an output_path-related error in infer()
                output_path=output_dir, 
                save_results=True 
            )
        infer_time = time.time() - start_infer_time
        peak_vram = monitor.get_peak_mem()
        
        print(f"   ✅ 推理耗时: {infer_time:.2f}s | 峰值显存: {peak_vram:.2f} GB")

        # 4. Save the text result
        text_save_path = os.path.join(output_dir, "full_result.md")
        with open(text_save_path, "w", encoding="utf-8") as f:
            f.write(res)
        print(f"   💾 结果已保存: {text_save_path}")

    except Exception as e:
        import traceback
        traceback.print_exc()
        print(f"❌ 推理过程报错: {e}")
        return None

    # 5. Free memory to make room for the next benchmark round
    del model
    del tokenizer
    gc.collect()
    torch.cuda.empty_cache()

    return {
        "mode": mode,
        "static_vram": static_vram,
        "infer_time": infer_time,
        "peak_vram": peak_vram,
        "result_head": res[:100].replace('\n', ' ') + "..."
    }

# ===================== 🏆 Main program =====================
if __name__ == "__main__":
    if not os.path.exists(TEST_IMAGE):
        print(f"⚠️  请准备一张图片,重命名为 {TEST_IMAGE} 放在脚本同目录下!")
        # Fix: exit() is the interactive `site` helper and is not guaranteed
        # to exist in every runtime context; raise SystemExit instead, with
        # a non-zero status so callers can detect the failed precondition.
        raise SystemExit(1)

    print("🏁 开始 A/B 对比测试...")

    # 1. Run the quantized variant first
    res_quant = run_evaluation("quantized")
    
    # 2. Then the original (bf16) variant
    res_origin = run_evaluation("original")

    if res_quant and res_origin:
        def _saved_pct(orig_gb, quant_gb):
            """Percent of memory saved vs *orig_gb*; 0.0 when orig_gb is 0.

            Fix: on CPU-only hosts VRAMMonitor reports 0 GB, and the original
            inline expressions divided by it, raising ZeroDivisionError.
            """
            return (1 - quant_gb / orig_gb) * 100 if orig_gb else 0.0

        static_saved = _saved_pct(res_origin['static_vram'], res_quant['static_vram'])
        mem_saved = _saved_pct(res_origin['peak_vram'], res_quant['peak_vram'])
        
        print("\n\n")
        print(f"{'='*30} 📊 最终数据对比报告 {'='*30}")
        print(f"{'指标':<20} | {'原版 (Original)':<18} | {'量化版 (Quantized)':<18} | {'变化'}")
        print("-" * 90)
        print(f"{'静态显存 (Static)':<20} | {res_origin['static_vram']:.2f} GB {'':<9} | {res_quant['static_vram']:.2f} GB {'':<9} | 📉 节省 {static_saved:.1f}%")
        print(f"{'峰值显存 (Peak)':<20} | {res_origin['peak_vram']:.2f} GB {'':<9} | {res_quant['peak_vram']:.2f} GB {'':<9} | 📉 节省 {mem_saved:.1f}%")
        print(f"{'推理时间 (Time)':<20} | {res_origin['infer_time']:.2f} s {'':<10} | {res_quant['infer_time']:.2f} s {'':<10} | {'🟢' if res_quant['infer_time']<res_origin['infer_time'] else '🟡'}{abs(res_origin['infer_time']-res_quant['infer_time']):.2f}s")
        print("-" * 90)