import os

# ================= 🔧 Force single-GPU mode (must run first) =================
# This line must execute BEFORE `import torch`!
# It avoids the dual-GPU conflict "RuntimeError: ... cuda:1 and cuda:0".
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# =============================================================================

import torch
import time
import gc
import shutil
import psutil
from transformers import AutoTokenizer, AutoModel, BitsAndBytesConfig

# ===================== ⚙️ User configuration =====================
# 1. Model path
MODEL_PATH = "/home/nashen/deepseek-ocr/DeepSeek-OCR-master/DeepSeek-OCR-vllm/model/"
# 2. Test image (make sure this file exists next to the script)
TEST_IMAGE = "./chart_with_line.jpg"
# 3. Where results are written
RESULT_ROOT = "./benchmark_output_chart_4bit"
# ================================================================


class VRAMMonitor:
    """GPU VRAM monitoring helper.

    Falls back to reporting 0 on CPU-only machines so callers
    never need to special-case the no-CUDA path.
    """

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def get_current_mem(self):
        """Return currently allocated VRAM in GiB (0 on CPU)."""
        if self.device == "cpu":
            return 0
        return torch.cuda.memory_allocated() / (1024 ** 3)

    def reset_peak(self):
        """Reset the peak-allocation counter so the next measurement is clean."""
        if self.device != "cpu":
            torch.cuda.reset_peak_memory_stats()

    def get_peak_mem(self):
        """Return peak allocated VRAM in GiB since the last reset (0 on CPU)."""
        if self.device == "cpu":
            return 0
        return torch.cuda.max_memory_allocated() / (1024 ** 3)


def run_evaluation(mode="original"):
    """Run one OCR benchmark pass and return its metrics.

    Args:
        mode: "quantized" loads the model with 4-bit NF4 quantization;
              anything else loads the original BF16 weights.

    Returns:
        dict with keys "mode", "static_vram", "infer_time", "peak_vram",
        "result_head", or None if loading/inference failed.
    """
    # Prepare a fresh output directory for this mode.
    output_dir = os.path.join(RESULT_ROOT, mode)
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir, exist_ok=True)

    monitor = VRAMMonitor()

    print(f"\n{'='*40}")
    print(f"🚀 正在启动评测: [{mode.upper()}] 模式")
    print(f"📂 结果将保存至: {output_dir}")
    print(f"{'='*40}")

    # 1. Environment cleanup so measurements start from a clean slate.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    monitor.reset_peak()

    # 2. Load model
    print("⏳ [1/3] 正在加载模型...")
    start_load_time = time.time()
    try:
        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
        # Patch: silence the pad_token warning.
        if tokenizer.pad_token_id is None:
            tokenizer.pad_token_id = tokenizer.eos_token_id

        if mode == "quantized":
            # === Quantized configuration (4-bit NF4) ===
            q_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
                # Key point: skip vision layers and output head so
                # quantization does not wreck OCR accuracy.
                llm_int8_skip_modules=[
                    # 1. Critical to avoid the 4D-image crash (SAM branch)
                    "sam_model", "model.sam_model",
                    # 2. Vision backbone
                    "vision_model", "model.vision_model",
                    # 3. Critical for table vertical-rule fidelity (projector)
                    "projector", "model.projector",
                    # 4. Base LLM protection
                    "lm_head", "embed_tokens",
                ],
            )
            model = AutoModel.from_pretrained(
                MODEL_PATH,
                trust_remote_code=True,
                quantization_config=q_config,
                device_map="auto",  # maps onto the single visible GPU (CUDA:0)
            )
        else:
            # === Original configuration (BF16) ===
            model = AutoModel.from_pretrained(
                MODEL_PATH,
                trust_remote_code=True,
                torch_dtype=torch.bfloat16,
                device_map="auto",
            )

        # Make sure the KV cache is enabled for generation speed.
        model.config.use_cache = True
        model.eval()
    except Exception as e:
        import traceback
        traceback.print_exc()
        print(f"❌ 加载失败: {e}")
        return None

    load_time = time.time() - start_load_time
    static_vram = monitor.get_current_mem()
    print(f" ✅ 加载耗时: {load_time:.2f}s | 静态显存: {static_vram:.2f} GB")

    # 3. Inference benchmark
    print("⏳ [2/3] 正在运行 OCR 推理...")
    if not os.path.exists(TEST_IMAGE):
        print(f"❌ 错误: 没找到测试图片 {TEST_IMAGE}!")
        return None

    monitor.reset_peak()
    prompt = "\nConvert the image to markdown format. Preserve all table structures and separators carefully."

    try:
        start_infer_time = time.time()
        with torch.no_grad():
            # NOTE(review): `infer` is a trust_remote_code method of this
            # checkpoint; its return is assumed to be a string — confirm.
            res = model.infer(
                tokenizer,
                prompt=prompt,
                image_file=TEST_IMAGE,
                base_size=1024,
                image_size=640,
                crop_mode=True,
                # Key args: avoid the output_path error inside infer()
                output_path=output_dir,
                save_results=True,
            )
        infer_time = time.time() - start_infer_time
        peak_vram = monitor.get_peak_mem()
        print(f" ✅ 推理耗时: {infer_time:.2f}s | 峰值显存: {peak_vram:.2f} GB")

        # 4. Save the text result next to the image outputs.
        text_save_path = os.path.join(output_dir, "full_result.md")
        with open(text_save_path, "w", encoding="utf-8") as f:
            f.write(res)
        print(f" 💾 结果已保存: {text_save_path}")
    except Exception as e:
        import traceback
        traceback.print_exc()
        print(f"❌ 推理过程报错: {e}")
        return None

    # 5. Free memory so the next round starts clean.
    del model
    del tokenizer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return {
        "mode": mode,
        "static_vram": static_vram,
        "infer_time": infer_time,
        "peak_vram": peak_vram,
        "result_head": res[:100].replace('\n', ' ') + "...",
    }


# ===================== 🏆 Main program =====================
if __name__ == "__main__":
    if not os.path.exists(TEST_IMAGE):
        print(f"⚠️ 请准备一张图片,重命名为 {TEST_IMAGE} 放在脚本同目录下!")
        exit()

    print("🏁 开始 A/B 对比测试...")

    # 1. Quantized run first (so its VRAM numbers are not polluted
    #    by leftovers from the BF16 run).
    res_quant = run_evaluation("quantized")
    # 2. Original BF16 run
    res_origin = run_evaluation("original")

    if res_quant and res_origin:
        mem_saved = (1 - res_quant['peak_vram'] / res_origin['peak_vram']) * 100
        print("\n\n")
        print(f"{'='*30} 📊 最终数据对比报告 {'='*30}")
        print(f"{'指标':<20} | {'原版 (Original)':<18} | {'量化版 (Quantized)':<18} | {'变化'}")
        print("-" * 90)
        print(f"{'静态显存 (Static)':<20} | {res_origin['static_vram']:.2f} GB {'':<9} | {res_quant['static_vram']:.2f} GB {'':<9} | 📉 节省 {100*(1-res_quant['static_vram']/res_origin['static_vram']):.1f}%")
        print(f"{'峰值显存 (Peak)':<20} | {res_origin['peak_vram']:.2f} GB {'':<9} | {res_quant['peak_vram']:.2f} GB {'':<9} | 📉 节省 {mem_saved:.1f}%")
        # NOTE(review): the source file was truncated mid-expression on this
        # line; the comparison indicator below is a minimal reconstruction —
        # confirm against the original script.
        print(f"{'推理时间 (Time)':<20} | {res_origin['infer_time']:.2f} s {'':<10} | {res_quant['infer_time']:.2f} s {'':<10} | {'🟢' if res_quant['infer_time'] <= res_origin['infer_time'] else '🔴'}")