Larer committed on Feb 25

Commit

5c19a88

1 Parent(s): 65e04f2

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

diffusion-dpo-ocr/check_video_resolution.py +194 -0
diffusion-dpo-ocr/prepare_roadtext.py +625 -0
diffusion-dpo-ocr/results/roadtext_eval_results_BSRGAN.json +17 -0
diffusion-dpo-ocr/results/roadtext_eval_results_DP2O-SR.json +17 -0
diffusion-dpo-ocr/results/roadtext_eval_results_DiT4SR.json +17 -0
diffusion-dpo-ocr/results/roadtext_eval_results_DiffBIR.json +17 -0
diffusion-dpo-ocr/results/roadtext_eval_results_FaithDiff.json +17 -0
diffusion-dpo-ocr/results/roadtext_eval_results_Ours.json +17 -0
diffusion-dpo-ocr/results/roadtext_eval_results_Real-ESRGAN.json +17 -0
diffusion-dpo-ocr/results/roadtext_eval_results_SUPSR.json +17 -0
diffusion-dpo-ocr/results/roadtext_eval_results_SeeSR.json +17 -0
diffusion-dpo-ocr/results/roadtext_eval_results_StableSR.json +17 -0
diffusion-dpo-ocr/results/roadtext_eval_results_SwinIR.json +17 -0
diffusion-dpo-ocr/results/roadtext_eval_results_gt.json +17 -0
diffusion-dpo-ocr/results/roadtext_eval_results_sample00.json +17 -0
diffusion-dpo-ocr/results/roadtext_eval_results_zoomlr.json +17 -0
diffusion-dpo-ocr/roadtext_eval_results_output.json +17 -0
diffusion-dpo-ocr/test_roadtext.py +514 -0
diffusion-dpo-ocr/verify_roadtext_annotations.py +223 -0
diffusion-dpo-test/DIAGNOSTIC_CHECKLIST.md +297 -0
diffusion-dpo-test/DIV2K-val/sobolev-400/0000843-seed-0.png +0 -0
diffusion-dpo-test/__pycache__/color_fix.cpython-310.pyc +0 -0
diffusion-dpo-test/analyze_lora_magnitude.py +179 -0
diffusion-dpo-test/check_lora_keys.py +76 -0
diffusion-dpo-test/color_fix.py +119 -0
diffusion-dpo-test/compare.py +73 -0
diffusion-dpo-test/compare_checkpoints.py +147 -0
diffusion-dpo-test/data_val/0000009-seed-0.png +0 -0
diffusion-dpo-test/data_val/0000010-seed-0.png +0 -0
diffusion-dpo-test/fix_lora_keys.py +132 -0
diffusion-dpo-test/inspect_safetensor.py +115 -0
diffusion-dpo-test/metrics.json +142 -0
diffusion-dpo-test/results-test/DIV2K-val-epoch10/0000263-seed-0.png +0 -0
diffusion-dpo-test/results-test/DIV2K-val-epoch10/0000463-seed-0.png +0 -0
diffusion-dpo-test/results-test/DIV2K-val-epoch10/0000563-seed-0.png +0 -0
diffusion-dpo-test/results-test/DIV2K-val-epoch10/0000763-seed-0.png +0 -0
diffusion-dpo-test/results-test/DIV2K-val-epoch10/0000863-seed-0.png +0 -0
diffusion-dpo-test/results-test/DrealSR/sony_160_x4.png +0 -0
diffusion-dpo-test/results-test/DrealSR/sony_189_x4.png +0 -0
diffusion-dpo-test/src/flux/__pycache__/block.cpython-310.pyc +0 -0
diffusion-dpo-test/src/flux/__pycache__/block.cpython-311.pyc +0 -0
diffusion-dpo-test/src/flux/__pycache__/condition.cpython-310.pyc +0 -0
diffusion-dpo-test/src/flux/__pycache__/condition.cpython-311.pyc +0 -0
diffusion-dpo-test/src/flux/__pycache__/generate.cpython-310.pyc +0 -0
diffusion-dpo-test/src/flux/__pycache__/generate.cpython-311.pyc +0 -0
diffusion-dpo-test/src/flux/__pycache__/lora_controller.cpython-310.pyc +0 -0
diffusion-dpo-test/src/flux/__pycache__/lora_controller.cpython-311.pyc +0 -0
diffusion-dpo-test/src/flux/__pycache__/pipeline_tools.cpython-310.pyc +0 -0
diffusion-dpo-test/src/flux/__pycache__/pipeline_tools.cpython-311.pyc +0 -0
diffusion-dpo-test/src/flux/__pycache__/transformer.cpython-310.pyc +0 -0

diffusion-dpo-ocr/check_video_resolution.py ADDED Viewed

	@@ -0,0 +1,194 @@

+"""
+检查 RoadText1K Videos 目录下所有视频的分辨率
+"""
+import os
+from pathlib import Path
+from collections import Counter
+import cv2
+from tqdm import tqdm
+# ============================================================================
+# Configuration - edit the values below
+# ============================================================================
+CONFIG = {
+    # Directory containing the RoadText1K videos
+    'videos_dir': '/home/wanghongbo06/baipurui/DATA/RoadText1k/Videos',
+    # Target resolution (optional; used to check whether each video matches it)
+    'target_resolution': (1920, 1080),  # e.g. (1920, 1080), or None to only collect statistics
+    # Which dataset split to check
+    'split': 'test',  # 'test', 'train', 'val', or 'all' to check every split
+}
+# ============================================================================
+def get_video_resolution(video_path: Path) -> "dict | None":
+    """
+    Read basic stream properties of a video with OpenCV.
+    Args:
+        video_path: Path to the video file.
+    Returns:
+        A dict with the keys 'width', 'height', 'fps' and 'frame_count',
+        or None if the video cannot be opened.
+    """
+    cap = cv2.VideoCapture(str(video_path))
+    try:
+        if not cap.isOpened():
+            return None
+        # Width, height and frame count are cast to int; fps is kept as returned.
+        return {
+            'width': int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
+            'height': int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
+            'fps': cap.get(cv2.CAP_PROP_FPS),
+            'frame_count': int(cap.get(cv2.CAP_PROP_FRAME_COUNT)),
+        }
+    finally:
+        # Release the capture handle on every path, including the failure path.
+        cap.release()
+def main():
+    """
+    Scan the configured split(s) for videos and report their resolutions.
+    Reads CONFIG['videos_dir'], CONFIG['split'] and CONFIG['target_resolution'],
+    prints the resolution distribution (and, when a target resolution is set,
+    the videos that differ from it), and writes a JSON report named
+    'video_resolution_report.json' into the parent of the videos directory.
+    """
+    videos_dir = Path(CONFIG['videos_dir'])
+    split = CONFIG['split']
+    target_res = CONFIG['target_resolution']
+    print("="*60)
+    print("Video Resolution Checker")
+    print("="*60)
+    print(f"Videos directory: {videos_dir}")
+    print(f"Split: {split}")
+    if target_res:
+        print(f"Target resolution: {target_res[0]}x{target_res[1]}")
+    print()
+    # Decide which split directories to check
+    if split == 'all':
+        check_dirs = ['test', 'train', 'val']
+    else:
+        check_dirs = [split]
+    all_resolutions = []  # one record per readable video
+    resolution_stats = Counter()  # (width, height) -> number of videos
+    mismatched_videos = []  # videos whose resolution differs from target_res
+    total_videos = 0
+    for split_name in check_dirs:
+        split_dir = videos_dir / split_name
+        if not split_dir.exists():
+            print(f"Warning: Directory not found: {split_dir}")
+            continue
+        print(f"\nChecking {split_name}...")
+        # Collect video files located directly in the split directory or in
+        # one of its immediate sub-directories (the search is not recursive)
+        video_files = []
+        for subdir in sorted(split_dir.iterdir()):
+            if subdir.is_dir():
+                # Videos one level down, inside this sub-directory
+                video_files.extend(subdir.glob('*.mp4'))
+                video_files.extend(subdir.glob('*.avi'))
+            elif subdir.suffix in ['.mp4', '.avi']:
+                # Video file sitting directly in the split directory
+                video_files.append(subdir)
+        video_files = sorted(video_files)
+        print(f"Found {len(video_files)} videos")
+        for video_path in tqdm(video_files, desc=f"Processing {split_name}"):
+            info = get_video_resolution(video_path)
+            if info is None:
+                print(f"Warning: Cannot read {video_path.name}")
+                continue
+            width = info['width']
+            height = info['height']
+            resolution = (width, height)
+            all_resolutions.append({
+                'file': video_path.name,
+                'split': split_name,
+                'width': width,
+                'height': height,
+                'fps': info['fps'],
+                'frame_count': info['frame_count'],
+            })
+            resolution_stats[resolution] += 1
+            # Check whether the video matches the target resolution
+            if target_res:
+                if resolution != target_res:
+                    mismatched_videos.append({
+                        'file': video_path.name,
+                        'split': split_name,
+                        'resolution': f"{width}x{height}",
+                        'expected': f"{target_res[0]}x{target_res[1]}",
+                    })
+            total_videos += 1
+    # Print the statistics
+    print("\n" + "="*60)
+    print("STATISTICS")
+    print("="*60)
+    print(f"Total videos checked: {total_videos}")
+    print(f"Unique resolutions: {len(resolution_stats)}")
+    print()
+    print("Resolution distribution:")
+    print("-" * 60)
+    for (width, height), count in resolution_stats.most_common():
+        # total_videos is non-zero here: it is incremented in the same
+        # iteration that adds an entry to resolution_stats
+        percentage = count / total_videos * 100
+        print(f"  {width:4d} x {height:4d}: {count:4d} videos ({percentage:5.1f}%)")
+    # Print the most common resolution
+    if resolution_stats:
+        most_common = resolution_stats.most_common(1)[0]
+        print(f"\nMost common resolution: {most_common[0][0]}x{most_common[0][1]} ({most_common[1]} videos)")
+    # When a target resolution is configured, list the videos that differ from it
+    if target_res:
+        print("\n" + "="*60)
+        print("RESOLUTION MATCHING")
+        print("="*60)
+        if mismatched_videos:
+            print(f"Videos NOT matching target resolution: {len(mismatched_videos)}")
+            print("\nMismatched videos (first 20):")
+            for item in mismatched_videos[:20]:
+                print(f"  {item['split']}/{item['file']}: {item['resolution']} (expected {item['expected']})")
+            if len(mismatched_videos) > 20:
+                print(f"  ... and {len(mismatched_videos) - 20} more")
+        else:
+            print(f"✓ All videos match target resolution {target_res[0]}x{target_res[1]}")
+    # Save the detailed report next to the videos directory
+    output_file = videos_dir.parent / 'video_resolution_report.json'
+    import json
+    report = {
+        'total_videos': total_videos,
+        # Tuple keys are rendered as "WxH" strings for the JSON report
+        'resolution_distribution': {f"{w}x{h}": count for (w, h), count in resolution_stats.items()},
+        'videos': all_resolutions,
+    }
+    if target_res:
+        report['target_resolution'] = f"{target_res[0]}x{target_res[1]}"
+        report['mismatched_count'] = len(mismatched_videos)
+        report['mismatched_videos'] = mismatched_videos
+    with open(output_file, 'w') as f:
+        json.dump(report, f, indent=2)
+    print(f"\nDetailed report saved to: {output_file}")
+if __name__ == '__main__':
+    main()

diffusion-dpo-ocr/prepare_roadtext.py ADDED Viewed

	@@ -0,0 +1,625 @@

+"""
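+RoadText1K preprocessing script
+Extracts annotated frames from the videos, crops a 512x512 region from each frame, and merges the Localisation and Text_Transcription annotations
+Workflow:
+1. Run this script to generate the GT images (512x512) and the merged annotations
+2. Generate the LR and SR images with your own pipeline
+3. Run test_roadtext.py to evaluate OCR performance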
+"""
+import os
+import json
+import random
+from pathlib import Path
+from typing import Dict, List, Tuple
+from PIL import Image
+import numpy as np
+from tqdm import tqdm
+import cv2
+# ============================================================================
+# Configuration - edit the values below
+# ============================================================================
+CONFIG = {
+    # RoadText1K root directory
+    'roadtext_root': '/home/wanghongbo06/baipurui/DATA/RoadText1k',
+    # Which dataset split to use (test/train/val)
+    'split': 'test',
+    # Output directory
+    'output_dir': '/home/wanghongbo06/baipurui/DATA/RoadText1k_patch_crop',
+    # Crop size (a square region of this size is cropped from the original frame)
+    'crop_size': 512,
+    # Crop strategy: 'center', 'random', 'text_center' (centred on the text regions)
+    'crop_strategy': 'text_center',
+    # Minimum fraction of a text box that must lie inside the crop for it to be kept
+    # 1.0 keeps only boxes that are fully inside the crop (recommended; avoids coordinate problems)
+    # 0.7 keeps boxes with at least 70% inside the crop (coordinates may then exceed the border)
+    'min_box_overlap': 1.0,
+    # Minimum number of valid text boxes a crop must contain
+    'min_text_boxes': 1,
+    # Upper limit on the number of output images
+    'max_frames': 1000,
+    # Random seed
+    'seed': 42,
+}
+# ============================================================================
+def load_localisation_annotations(root_dir: str) -> Dict:
+    """
+    Load every Localisation annotation file under <root_dir>/Ground_truths/Localisation.
+    Each JSON file is expected to hold a list of items; items are indexed by
+    their non-empty 'name' field. Files that fail to load are reported and skipped.
+    Returns:
+        {item_name: item}
+    """
+    annotations_by_name = {}
+    print("Loading Localisation annotations...")
+    paths = sorted((Path(root_dir) / 'Ground_truths' / 'Localisation').glob('*.json'))
+    print(f"Found {len(paths)} JSON files")
+    for path in paths:
+        try:
+            with open(path, 'r', encoding='utf-8') as handle:
+                payload = json.load(handle)
+            # The payload is expected to be a list of annotation items
+            if not isinstance(payload, list):
+                print(f"Warning: {path.name} is not a list format")
+                continue
+            for entry in payload:
+                name = entry.get('name', '')
+                if name:
+                    annotations_by_name[name] = entry
+        except Exception as e:
+            print(f"Error loading {path.name}: {e}")
+    print(f"Loaded {len(annotations_by_name)} image annotations")
+    return annotations_by_name
+def load_text_transcriptions(root_dir: str) -> Dict:
+    """
+    Load every Text_Transcription file under <root_dir>/Ground_truths/Text_Transcription.
+    Each JSON file is expected to hold a dict {video_name: {label_id: text}};
+    entries for the same video found in several files are merged.
+    Files that fail to load are reported and skipped.
+    Returns:
+        {video_name: {label_id: text}}
+    """
+    text_dir = Path(root_dir) / 'Ground_truths' / 'Text_Transcription'
+    all_texts = {}
+    print("Loading Text_Transcription annotations...")
+    # '*.json' already matches double-extension files such as 'x.json.json',
+    # so one glob is enough; globbing both patterns listed (and parsed) those
+    # files twice and inflated the reported file count.
+    json_files = sorted(text_dir.glob('*.json'))
+    print(f"Found {len(json_files)} JSON files")
+    for json_file in json_files:
+        try:
+            with open(json_file, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+            # data is expected to be a dict {video_name: {label_id: text}}
+            if not isinstance(data, dict):
+                print(f"Warning: {json_file.name} is not a dict format")
+                continue
+            for video_name, texts in data.items():
+                video_texts = all_texts.setdefault(video_name, {})
+                if isinstance(texts, dict):
+                    video_texts.update(texts)
+        except Exception as e:
+            print(f"Error loading {json_file.name}: {e}")
+    print(f"Loaded texts for {len(all_texts)} videos")
+    return all_texts
+def get_box_bounds(box: List[float]) -> Tuple[float, float, float, float]:
+    """Return the axis-aligned bounds (x1, y1, x2, y2) of a flat polygon [x, y, x, y, ...]."""
+    # Coordinates are interleaved: even positions hold x, odd positions hold y
+    x_coords = box[0::2]
+    y_coords = box[1::2]
+    return min(x_coords), min(y_coords), max(x_coords), max(y_coords)
+def is_box_fully_inside(box: List[float], crop_x: int, crop_y: int, crop_size: int) -> bool:
+    """Return True when the bounding box of `box` lies entirely within the square crop (edges inclusive)."""
+    left, top, right, bottom = get_box_bounds(box)
+    starts_inside = left >= crop_x and top >= crop_y
+    ends_inside = right <= crop_x + crop_size and bottom <= crop_y + crop_size
+    return starts_inside and ends_inside
+def calc_box_overlap(box: List[float], crop_x: int, crop_y: int, crop_size: int) -> float:
+    """
+    Fraction of the text box's bounding-box area that falls inside the crop.
+    Returns:
+        A value in 0.0 - 1.0; 0.0 for a degenerate box or when there is no overlap.
+    """
+    bx1, by1, bx2, by2 = get_box_bounds(box)
+    area = (bx2 - bx1) * (by2 - by1)
+    if area <= 0:
+        return 0.0
+    # Width and height of the intersection with the crop rectangle
+    inter_w = min(bx2, crop_x + crop_size) - max(bx1, crop_x)
+    inter_h = min(by2, crop_y + crop_size) - max(by1, crop_y)
+    if inter_w <= 0 or inter_h <= 0:
+        return 0.0
+    return (inter_w * inter_h) / area
+def clip_polygon_to_crop(poly: List[float], crop_x: int, crop_y: int, crop_size: int) -> List[float]:
+    """
+    Convert a flat polygon to crop coordinates.
+    Each point is shifted by the crop origin and then clamped to
+    [0, crop_size]. For boxes fully inside the crop the clamp is a no-op;
+    for partially overlapping boxes it pulls outside points onto the border.
+    """
+    def _clamp(value):
+        return max(0, min(value, crop_size))
+    result = []
+    for idx in range(0, len(poly), 2):
+        result.extend((_clamp(poly[idx] - crop_x), _clamp(poly[idx + 1] - crop_y)))
+    return result
+def find_best_crop_position(
+    polygons: List[List[float]],
+    img_w: int,
+    img_h: int,
+    crop_size: int,
+    strategy: str = 'text_center',
+    min_overlap: float = 0.7
+) -> Tuple[int, int, List[int]]:
+    """
+    Choose the crop origin and report which text boxes survive the crop.
+    Args:
+        polygons: list of flat text-box polygons
+        img_w, img_h: size of the original image
+        crop_size: side length of the square crop
+        strategy: 'center', 'text_center' or 'random'; anything else behaves like 'center'
+        min_overlap: minimum fraction of a box that must lie inside the crop;
+            values >= 0.99 require the box to be fully inside
+    Returns:
+        (crop_x, crop_y, valid_box_indices)
+    """
+    def _centered_origin():
+        return max(0, (img_w - crop_size) // 2), max(0, (img_h - crop_size) // 2)
+    if not polygons:
+        crop_x, crop_y = _centered_origin()
+        return crop_x, crop_y, []
+    if strategy == 'text_center':
+        # Centre the crop on the mean of the text-box centres
+        bounds = [get_box_bounds(poly) for poly in polygons]
+        mean_cx = sum((b[0] + b[2]) / 2 for b in bounds) / len(bounds)
+        mean_cy = sum((b[1] + b[3]) / 2 for b in bounds) / len(bounds)
+        # Keep the crop within the image bounds
+        crop_x = max(0, min(int(mean_cx - crop_size / 2), img_w - crop_size))
+        crop_y = max(0, min(int(mean_cy - crop_size / 2), img_h - crop_size))
+    elif strategy == 'random':
+        limit_x = max(0, img_w - crop_size)
+        limit_y = max(0, img_h - crop_size)
+        crop_x = random.randint(0, limit_x) if limit_x > 0 else 0
+        crop_y = random.randint(0, limit_y) if limit_y > 0 else 0
+    else:
+        # 'center' and any unrecognised strategy
+        crop_x, crop_y = _centered_origin()
+    # Strict mode keeps only boxes fully inside the crop; otherwise keep
+    # boxes whose overlap fraction reaches min_overlap
+    strict = min_overlap >= 0.99
+    valid_indices = [
+        idx
+        for idx, poly in enumerate(polygons)
+        if (
+            is_box_fully_inside(poly, crop_x, crop_y, crop_size)
+            if strict
+            else calc_box_overlap(poly, crop_x, crop_y, crop_size) >= min_overlap
+        )
+    ]
+    return crop_x, crop_y, valid_indices
+def extract_frame_with_crop(
+    video_path: Path,
+    frame_idx: int,
+    output_dir: Path,
+    crop_x: int,
+    crop_y: int,
+    crop_size: int,
+) -> Tuple[str, bool]:
+    """
+    Read one frame from a video, crop a square region and save it as PNG.
+    The file is written to output_dir as '<video stem>-<frame_idx:07d>.png'.
+    Returns:
+        (saved_filename, success); ('', False) when the video cannot be
+        opened or the frame cannot be read
+    """
+    capture = cv2.VideoCapture(str(video_path))
+    if not capture.isOpened():
+        return '', False
+    # Seek to the requested frame and read it
+    capture.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
+    ok, bgr_frame = capture.read()
+    capture.release()
+    if not ok:
+        return '', False
+    # Convert BGR to RGB before handing the frame to PIL, then crop
+    crop_box = (crop_x, crop_y, crop_x + crop_size, crop_y + crop_size)
+    cropped = Image.fromarray(cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB)).crop(crop_box)
+    out_name = f"{video_path.stem}-{frame_idx:07d}.png"
+    cropped.save(output_dir / out_name)
+    return out_name, True
+def adjust_polygon_for_crop(poly: List[float], crop_x: int, crop_y: int) -> List[float]:
+    """Shift a flat polygon [x, y, x, y, ...] into the coordinate system of the crop."""
+    shifted = []
+    for idx in range(0, len(poly), 2):
+        shifted += [poly[idx] - crop_x, poly[idx + 1] - crop_y]
+    return shifted
+def merge_annotations(loc_ann: Dict, text_dict: Dict) -> Dict:
+    """
+    Merge one Localisation item with the Text_Transcription dictionary.
+    Args:
+        loc_ann: Localisation annotation item (reads 'videoName' and 'labels')
+        text_dict: Text_Transcription dict {video_name: {label_id: text}}
+    Returns:
+        Merged annotation: {
+            'polygons': [[x1,y1,x2,y1,x2,y2,x1,y2], ...],
+            'texts': ['text1', 'text2', ...],
+            'ignore': [False, False, ...]
+        }
+        Labels without a 'box2d' entry are skipped. A box is flagged as
+        ignored when its text is empty, '###', 'illegible' (any case), or
+        not a string at all (e.g. a JSON null).
+    """
+    video_name = loc_ann.get('videoName', '')
+    labels = loc_ann.get('labels') or []  # 'labels' may be present but None
+    # Transcriptions for this video, keyed by label id (as a string)
+    video_texts = text_dict.get(video_name, {})
+    polygons = []
+    texts = []
+    ignore = []
+    for label in labels:
+        box2d = label.get('box2d', {})
+        if not box2d:
+            continue
+        x1, y1 = box2d.get('x1', 0), box2d.get('y1', 0)
+        x2, y2 = box2d.get('x2', 0), box2d.get('y2', 0)
+        # Expand the axis-aligned box into a 4-point polygon
+        polygons.append([x1, y1, x2, y1, x2, y2, x1, y2])
+        text = video_texts.get(str(label.get('id', '')), '')
+        texts.append(text)
+        # A non-string transcription cannot be lower-cased; treat it as
+        # unreadable instead of crashing on text.lower()
+        ignore.append(
+            not isinstance(text, str)
+            or text in ('', '###')
+            or text.lower() == 'illegible'
+        )
+    return {
+        'polygons': polygons,
+        'texts': texts,
+        'ignore': ignore,
+    }
+def scale_polygon(polygon: List[float], scale_x: float, scale_y: float) -> List[float]:
+    """Scale a flat polygon [x, y, x, y, ...]: x values by scale_x, y values by scale_y."""
+    result = []
+    for idx in range(0, len(polygon), 2):
+        result += [polygon[idx] * scale_x, polygon[idx + 1] * scale_y]
+    return result
+def main():
+    """
+    Build the cropped RoadText1K evaluation set.
+    Steps:
+    1. Load the Localisation and Text_Transcription annotations.
+    2. Map every annotated frame to a video found under Videos/<split>.
+    3. If CONFIG['max_frames'] is set and exceeded, randomly sub-sample the frames (seeded).
+    4. For each selected frame pick a crop, keep the text boxes that pass the
+       overlap rule, save the crop to <output_dir>/gt and record the boxes in
+       crop coordinates.
+    5. Write <output_dir>/annotations.json and print a summary.
+    """
+    random.seed(CONFIG['seed'])
+    np.random.seed(CONFIG['seed'])
+    root_dir = Path(CONFIG['roadtext_root'])
+    output_dir = Path(CONFIG['output_dir'])
+    split = CONFIG['split']
+    crop_size = CONFIG['crop_size']
+    # Create the output directory
+    gt_dir = output_dir / 'gt'
+    gt_dir.mkdir(parents=True, exist_ok=True)
+    print(f"RoadText1K root: {root_dir}")
+    print(f"Split: {split}")
+    print(f"Output dir: {output_dir}")
+    print(f"Crop size: {crop_size}x{crop_size}")
+    print(f"Crop strategy: {CONFIG['crop_strategy']}")
+    print(f"Min text boxes per crop: {CONFIG['min_text_boxes']}")
+    print()
+    # Load the annotations
+    loc_annotations = load_localisation_annotations(root_dir)
+    text_transcriptions = load_text_transcriptions(root_dir)
+    # Locate the video directory
+    video_dir = root_dir / 'Videos' / split
+    if not video_dir.exists():
+        print(f"Error: Video directory not found: {video_dir}")
+        return
+    # Collect video files located directly in the split directory or in one
+    # of its immediate sub-directories
+    video_files = []
+    for subdir in sorted(video_dir.iterdir()):
+        if subdir.is_dir():
+            # Videos inside this sub-directory
+            video_files.extend(subdir.glob('*.mp4'))
+            video_files.extend(subdir.glob('*.avi'))
+        elif subdir.suffix in ['.mp4', '.avi']:
+            # Video file sitting directly in the split directory
+            video_files.append(subdir)
+    video_files = sorted(video_files)
+    total_videos = len(video_files)
+    print(f"找到 {total_videos} 个视频")
+    print()
+    # Process the videos
+    new_annotations = {}
+    processed_count = 0
+    total_frames = 0
+    print("Processing videos...")
+    print("Step 1: 找出每个视频中有标注的帧...")
+    # Step 1: find the annotated frame numbers of each video and remember the annotation key
+    video_annotated_frames = {}  # {video_name: [(frame_num, annotation_key), ...]}
+    video_name_to_path = {vp.stem: vp for vp in video_files}
+    for key in loc_annotations:
+        # The annotation name encodes the video name and the frame number,
+        # e.g. "200_frames/170-0000001.jpg" or "test_frames/701-0000001.jpg"
+        filename = key.split('/')[-1]  # "170-0000001.jpg"
+        if '-' not in filename:
+            continue
+        try:
+            frame_num = int(filename.split('-')[-1].split('.')[0])
+        except ValueError:
+            # The part after the last '-' is not a frame number: skip this
+            # annotation. Only ValueError is caught so that KeyboardInterrupt
+            # and SystemExit are no longer swallowed.
+            continue
+        video_name_candidate = filename.split('-')[0]
+        # Only keep frames that belong to a video we actually found
+        if video_name_candidate in video_name_to_path:
+            # Annotation keys are unique dict keys, so every (frame_num, key)
+            # pair is unique and no duplicate check is needed.
+            video_annotated_frames.setdefault(video_name_candidate, []).append((frame_num, key))
+    print(f"找到 {len(video_annotated_frames)} 个视频有标注")
+    total_annotated_frames = sum(len(frames) for frames in video_annotated_frames.values())
+    print(f"总共有 {total_annotated_frames} 帧有标注")
+    # If max_frames is set, randomly sub-sample the annotated frames
+    max_frames = CONFIG['max_frames']
+    if max_frames is not None and total_annotated_frames > max_frames:
+        print(f"\n限制最大帧数为 {max_frames}，随机选取中...")
+        # Flatten all frames into [(video_name, frame_num, annotation_key), ...]
+        all_frames = []
+        for video_name, frame_list in video_annotated_frames.items():
+            for frame_num, ann_key in frame_list:
+                all_frames.append((video_name, frame_num, ann_key))
+        # Random selection
+        selected_frames = random.sample(all_frames, max_frames)
+        # Regroup the selection back into the video_annotated_frames layout
+        video_annotated_frames = {}
+        for video_name, frame_num, ann_key in selected_frames:
+            video_annotated_frames.setdefault(video_name, []).append((frame_num, ann_key))
+        print(f"随机选取 {max_frames} 帧，涉及 {len(video_annotated_frames)} 个视频")
+    crop_strategy = CONFIG['crop_strategy']
+    min_box_overlap = CONFIG['min_box_overlap']
+    min_text_boxes = CONFIG['min_text_boxes']
+    print()
+    print("Step 2: 提取帧并 Crop（保留有效文本框）...")
+    print(f"  Crop 尺寸: {crop_size}x{crop_size}")
+    print(f"  Crop 策略: {crop_strategy}")
+    print(f"  最小重叠比例: {min_box_overlap:.0%}")
+    print(f"  最小文本框数: {min_text_boxes}")
+    print()
+    skipped_no_boxes = 0
+    for video_path in tqdm(video_files, desc="Videos"):
+        video_name = video_path.stem
+        # Annotated frames of this video: [(frame_num, ann_key), ...]
+        annotated_frame_info = video_annotated_frames.get(video_name, [])
+        if not annotated_frame_info:
+            continue
+        # Read the video resolution
+        cap = cv2.VideoCapture(str(video_path))
+        if not cap.isOpened():
+            continue
+        orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        cap.release()
+        if orig_w == 0 or orig_h == 0:
+            continue
+        # Process every selected frame of this video
+        for frame_num, ann_key in annotated_frame_info:
+            if ann_key not in loc_annotations:
+                continue
+            loc_ann = loc_annotations[ann_key]
+            if loc_ann is None:
+                continue
+            # Merge localisation boxes with their transcriptions
+            merged_ann = merge_annotations(loc_ann, text_transcriptions)
+            if not merged_ann['polygons']:
+                skipped_no_boxes += 1
+                continue
+            # Choose the crop position
+            crop_x, crop_y, valid_indices = find_best_crop_position(
+                merged_ann['polygons'],
+                orig_w,
+                orig_h,
+                crop_size,
+                crop_strategy,
+                min_box_overlap
+            )
+            # Require enough valid text boxes inside the crop
+            if len(valid_indices) < min_text_boxes:
+                skipped_no_boxes += 1
+                continue
+            # Extract the frame and save the crop
+            img_filename, success = extract_frame_with_crop(
+                video_path, frame_num, gt_dir,
+                crop_x, crop_y, crop_size
+            )
+            if not success:
+                continue
+            # Keep only the valid text boxes, converted to crop coordinates
+            cropped_polygons = []
+            cropped_texts = []
+            cropped_ignore = []
+            for i in valid_indices:
+                clipped_poly = clip_polygon_to_crop(
+                    merged_ann['polygons'][i], crop_x, crop_y, crop_size
+                )
+                cropped_polygons.append(clipped_poly)
+                cropped_texts.append(merged_ann['texts'][i])
+                cropped_ignore.append(merged_ann['ignore'][i])
+            new_annotations[img_filename] = {
+                'polygons': cropped_polygons,
+                'texts': cropped_texts,
+                'ignore': cropped_ignore,
+                'original_name': ann_key,
+                'crop_position': [crop_x, crop_y],
+            }
+            total_frames += 1
+        processed_count += 1
+    # Save the annotations
+    ann_output_path = output_dir / 'annotations.json'
+    with open(ann_output_path, 'w', encoding='utf-8') as f:
+        json.dump(new_annotations, f, indent=2, ensure_ascii=False)
+    # Count all text boxes and those not flagged as ignored
+    total_boxes = sum(len(ann['texts']) for ann in new_annotations.values())
+    valid_boxes = sum(
+        sum(1 for ig in ann['ignore'] if not ig)
+        for ann in new_annotations.values()
+    )
+    print()
+    print("="*60)
+    print("完成!")
+    print("="*60)
+    print(f"处理视频数: {processed_count}")
+    print(f"提取帧数: {total_frames}")
+    print(f"跳过帧数(文本框不足): {skipped_no_boxes}")
+    print(f"有标注的图片数: {len(new_annotations)}")
+    print(f"文本框总数: {total_boxes}")
+    print(f"有效文本框: {valid_boxes}")
+    print(f"每张图平均文本框: {total_boxes/max(1,len(new_annotations)):.1f}")
+    print()
+    print("输出文件:")
+    print(f"  GT images ({crop_size}x{crop_size}): {gt_dir}")
+    print(f"  Annotations: {ann_output_path}")
+    print()
+    print("下一步:")
+    print(f"  1. 用你的方式生成 LR images (如 128x128)")
+    print(f"  2. 超分得到 SR images ({crop_size}x{crop_size})")
+    print(f"  3. 将 SR images 保存到 {output_dir}/sr")
+    print(f"  4. 运行 test_roadtext.py 评估")
+if __name__ == '__main__':
+    main()

diffusion-dpo-ocr/results/roadtext_eval_results_BSRGAN.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "TP": 51,
+  "FP": 80,
+  "FN": 1920,
+  "Precision": 0.3893129770992366,
+  "Recall": 0.0258751902587519,
+  "F1-Score": 0.04852521408182683,
+  "OCR_detections": 131,
+  "GT_boxes": 1971,
+  "Detection_rate": 0.06646372399797057,
+  "Det_Precision": 0.6183206106870229,
+  "Det_Recall": 0.0410958904109589,
+  "Det_F1": 0.07706945765937202,
+  "Det_matched": 81,
+  "Text_matched": 51,
+  "eval_mode": "end2end"
+}

diffusion-dpo-ocr/results/roadtext_eval_results_DP2O-SR.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "TP": 314,
+  "FP": 1182,
+  "FN": 1657,
+  "Precision": 0.20989304812834225,
+  "Recall": 0.15930999492643327,
+  "F1-Score": 0.181136429189501,
+  "OCR_detections": 1496,
+  "GT_boxes": 1971,
+  "Detection_rate": 0.7590055809233891,
+  "Det_Precision": 0.5274064171122995,
+  "Det_Recall": 0.4003044140030441,
+  "Det_F1": 0.4551485434092875,
+  "Det_matched": 789,
+  "Text_matched": 314,
+  "eval_mode": "end2end"
+}

diffusion-dpo-ocr/results/roadtext_eval_results_DiT4SR.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "TP": 226,
+  "FP": 903,
+  "FN": 1745,
+  "Precision": 0.20017714791851196,
+  "Recall": 0.11466260781329274,
+  "F1-Score": 0.14580645161290323,
+  "OCR_detections": 1129,
+  "GT_boxes": 1971,
+  "Detection_rate": 0.5728056823947235,
+  "Det_Precision": 0.5234720992028343,
+  "Det_Recall": 0.2998477929984779,
+  "Det_F1": 0.3812903225806451,
+  "Det_matched": 591,
+  "Text_matched": 226,
+  "eval_mode": "end2end"
+}

diffusion-dpo-ocr/results/roadtext_eval_results_DiffBIR.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "TP": 172,
+  "FP": 687,
+  "FN": 1799,
+  "Precision": 0.20023282887077998,
+  "Recall": 0.08726534753932014,
+  "F1-Score": 0.1215547703180212,
+  "OCR_detections": 859,
+  "GT_boxes": 1971,
+  "Detection_rate": 0.43581938102486045,
+  "Det_Precision": 0.5448195576251456,
+  "Det_Recall": 0.2374429223744292,
+  "Det_F1": 0.33074204946996466,
+  "Det_matched": 468,
+  "Text_matched": 172,
+  "eval_mode": "end2end"
+}

diffusion-dpo-ocr/results/roadtext_eval_results_FaithDiff.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "TP": 210,
+  "FP": 1068,
+  "FN": 1761,
+  "Precision": 0.1643192488262911,
+  "Recall": 0.106544901065449,
+  "F1-Score": 0.12927054478301014,
+  "OCR_detections": 1278,
+  "GT_boxes": 1971,
+  "Detection_rate": 0.6484018264840182,
+  "Det_Precision": 0.4890453834115806,
+  "Det_Recall": 0.31709791983764585,
+  "Det_F1": 0.38473376423514927,
+  "Det_matched": 625,
+  "Text_matched": 210,
+  "eval_mode": "end2end"
+}

diffusion-dpo-ocr/results/roadtext_eval_results_Ours.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "TP": 233,
+  "FP": 1483,
+  "FN": 1738,
+  "Precision": 0.1357808857808858,
+  "Recall": 0.11821410451547437,
+  "F1-Score": 0.12639001898562516,
+  "OCR_detections": 1716,
+  "GT_boxes": 1971,
+  "Detection_rate": 0.8706240487062404,
+  "Det_Precision": 0.3275058275058275,
+  "Det_Recall": 0.28513444951801115,
+  "Det_F1": 0.3048548955790616,
+  "Det_matched": 562,
+  "Text_matched": 233,
+  "eval_mode": "end2end"
+}

diffusion-dpo-ocr/results/roadtext_eval_results_Real-ESRGAN.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "TP": 64,
+  "FP": 191,
+  "FN": 1907,
+  "Precision": 0.25098039215686274,
+  "Recall": 0.032470826991374935,
+  "F1-Score": 0.05750224618149146,
+  "OCR_detections": 255,
+  "GT_boxes": 1971,
+  "Detection_rate": 0.1293759512937595,
+  "Det_Precision": 0.5882352941176471,
+  "Det_Recall": 0.076103500761035,
+  "Det_F1": 0.1347708894878706,
+  "Det_matched": 150,
+  "Text_matched": 64,
+  "eval_mode": "end2end"
+}

diffusion-dpo-ocr/results/roadtext_eval_results_SUPSR.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "TP": 219,
+  "FP": 1267,
+  "FN": 1752,
+  "Precision": 0.14737550471063257,
+  "Recall": 0.1111111111111111,
+  "F1-Score": 0.126699450390512,
+  "OCR_detections": 1486,
+  "GT_boxes": 1971,
+  "Detection_rate": 0.7539320142059868,
+  "Det_Precision": 0.4878869448183042,
+  "Det_Recall": 0.3678335870116692,
+  "Det_F1": 0.41943881978594155,
+  "Det_matched": 725,
+  "Text_matched": 219,
+  "eval_mode": "end2end"
+}

diffusion-dpo-ocr/results/roadtext_eval_results_SeeSR.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "TP": 0,
+  "FP": 0,
+  "FN": 0,
+  "Precision": 0,
+  "Recall": 0,
+  "F1-Score": 0,
+  "OCR_detections": 0,
+  "GT_boxes": 0,
+  "Detection_rate": 0,
+  "Det_Precision": 0,
+  "Det_Recall": 0,
+  "Det_F1": 0,
+  "Det_matched": 0,
+  "Text_matched": 0,
+  "eval_mode": "end2end"
+}

diffusion-dpo-ocr/results/roadtext_eval_results_StableSR.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "TP": 141,
+  "FP": 672,
+  "FN": 1830,
+  "Precision": 0.17343173431734318,
+  "Recall": 0.0715372907153729,
+  "F1-Score": 0.10129310344827586,
+  "OCR_detections": 813,
+  "GT_boxes": 1971,
+  "Detection_rate": 0.4124809741248097,
+  "Det_Precision": 0.5276752767527675,
+  "Det_Recall": 0.2176560121765601,
+  "Det_F1": 0.3081896551724138,
+  "Det_matched": 429,
+  "Text_matched": 141,
+  "eval_mode": "end2end"
+}

diffusion-dpo-ocr/results/roadtext_eval_results_SwinIR.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "TP": 66,
+  "FP": 182,
+  "FN": 1905,
+  "Precision": 0.2661290322580645,
+  "Recall": 0.0334855403348554,
+  "F1-Score": 0.05948625506985128,
+  "OCR_detections": 248,
+  "GT_boxes": 1971,
+  "Detection_rate": 0.12582445459157787,
+  "Det_Precision": 0.5806451612903226,
+  "Det_Recall": 0.0730593607305936,
+  "Det_F1": 0.12978819287967552,
+  "Det_matched": 144,
+  "Text_matched": 66,
+  "eval_mode": "end2end"
+}

diffusion-dpo-ocr/results/roadtext_eval_results_gt.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "TP": 953,
+  "FP": 576,
+  "FN": 1018,
+  "Precision": 0.6232831916285154,
+  "Recall": 0.4835109081684424,
+  "F1-Score": 0.5445714285714285,
+  "OCR_detections": 1529,
+  "GT_boxes": 1971,
+  "Detection_rate": 0.7757483510908169,
+  "Det_Precision": 0.6487900588620014,
+  "Det_Recall": 0.5032978183663115,
+  "Det_F1": 0.5668571428571428,
+  "Det_matched": 992,
+  "Text_matched": 953,
+  "eval_mode": "end2end"
+}

diffusion-dpo-ocr/results/roadtext_eval_results_sample00.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "TP": 0,
+  "FP": 0,
+  "FN": 0,
+  "Precision": 0,
+  "Recall": 0,
+  "F1-Score": 0,
+  "OCR_detections": 0,
+  "GT_boxes": 0,
+  "Detection_rate": 0,
+  "Det_Precision": 0,
+  "Det_Recall": 0,
+  "Det_F1": 0,
+  "Det_matched": 0,
+  "Text_matched": 0,
+  "eval_mode": "end2end"
+}

diffusion-dpo-ocr/results/roadtext_eval_results_zoomlr.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "TP": 8,
+  "FP": 106,
+  "FN": 1963,
+  "Precision": 0.07017543859649122,
+  "Recall": 0.004058853373921867,
+  "F1-Score": 0.007673860911270983,
+  "OCR_detections": 114,
+  "GT_boxes": 1971,
+  "Detection_rate": 0.0578386605783866,
+  "Det_Precision": 0.07017543859649122,
+  "Det_Recall": 0.004058853373921867,
+  "Det_F1": 0.007673860911270983,
+  "Det_matched": 8,
+  "Text_matched": 8,
+  "eval_mode": "end2end"
+}

diffusion-dpo-ocr/roadtext_eval_results_output.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "TP": 167,
+  "FP": 707,
+  "FN": 1804,
+  "Precision": 0.19107551487414187,
+  "Recall": 0.08472856418061897,
+  "F1-Score": 0.11739894551845341,
+  "OCR_detections": 874,
+  "GT_boxes": 1971,
+  "Detection_rate": 0.44342973110096395,
+  "Det_Precision": 0.540045766590389,
+  "Det_Recall": 0.23947234906139014,
+  "Det_F1": 0.3318101933216168,
+  "Det_matched": 472,
+  "Text_matched": 167,
+  "eval_mode": "end2end"
+}

diffusion-dpo-ocr/test_roadtext.py ADDED Viewed

	@@ -0,0 +1,514 @@

+"""
+RoadText1K OCR 评估脚本
+评估超分图片在 OCR 任务上的 Precision 和 Recall
+Metrics:
+    - Precision: TP / (TP + FP)
+    - Recall: TP / (TP + FN)
+    - F1-Score: 2 * Precision * Recall / (Precision + Recall)
+"""
+import os
+import json
+from pathlib import Path
+from typing import List, Dict, Tuple
+import difflib
+import numpy as np
+from PIL import Image
+from tqdm import tqdm
+# ============================================================================
+# Configuration - edit the values below before running the script
+# ============================================================================
+CONFIG = {
+    # Directory holding the super-resolved (SR) images to evaluate
+    'sr_dir': '/home/wanghongbo06/baipurui/results/RoadText/DreamClear/results/output',
+    # 'sr_dir': '/home/wanghongbo06/baipurui/DATA/RoadText1k_patch_crop/SR-Eval/zoomlr',
+    # Annotation file (generated by prepare_roadtext.py)
+    'annotation_file': '/home/wanghongbo06/baipurui/DATA/RoadText1k_patch_crop/annotations.json',
+    # OCR engine: 'paddleocr' or 'easyocr' (load_ocr_model raises ValueError for anything else)
+    'ocr_engine': 'paddleocr',
+    # Matching parameters
+    'iou_threshold': 0.3,              # minimum IoU for a predicted box to match a GT box
+    'text_similarity_threshold': 0.3,  # minimum text similarity (0 = every text passes, i.e. detection only)
+    # Evaluation mode:
+    #   'detection_only' - score detection only (text content is ignored)
+    #   'end2end' - end-to-end scoring (detection + recognition)
+    #   'recognition_only' - score recognition only
+    #       NOTE(review): no cropping on GT boxes is visible in this script; in
+    #       match_detections this mode only stops text mismatches from counting as FP.
+    'eval_mode': 'end2end',
+    # OCR runtime settings
+    'device': 'gpu',                   # 'gpu' or 'cpu'
+    # Debug options
+    'debug_visualize': True,          # save GT-vs-OCR box overlays for the first 20 evaluated images
+    'debug_save_dir': './debug_vis',   # directory the overlays are written to
+    # Output
+    # NOTE: main() does not read this key; it derives the output file name from sr_dir.
+    'output': './roadtext_eval_results.json',
+}
+# ============================================================================
+def load_ocr_model(engine='paddleocr', device='gpu'):
+    """加载 OCR 模型"""
+    if engine == 'paddleocr':
+        try:
+            from paddleocr import PaddleOCR
+            print("Loading PaddleOCR...")
+            ocr = PaddleOCR(
+                lang='en',
+                device=device,
+            )
+            return ocr, 'paddleocr'
+        except ImportError:
+            print("PaddleOCR not found. Install: pip install paddleocr")
+            raise
+    elif engine == 'easyocr':
+        try:
+            import easyocr
+            print("Loading EasyOCR...")
+            use_gpu = (device == 'gpu')
+            reader = easyocr.Reader(['en'], gpu=use_gpu)
+            return reader, 'easyocr'
+        except ImportError:
+            print("EasyOCR not found. Install: pip install easyocr")
+            raise
+    else:
+        raise ValueError(f"Unsupported OCR engine: {engine}")
+def run_ocr(image_path: str, ocr_model, engine: str) -> List[Dict]:
+    """
+    运行 OCR
+    Returns:
+        List of dicts with keys: 'polygon', 'text', 'confidence'
+    """
+    results = []
+    if engine == 'paddleocr':
+        ocr_result = ocr_model.ocr(str(image_path))
+        if ocr_result and ocr_result[0]:
+            for line in ocr_result[0]:
+                polygon = np.array(line[0]).flatten().tolist()  # [[x1,y1],[x2,y2],...] -> [x1,y1,x2,y2,...]
+                text = line[1][0]
+                confidence = line[1][1]
+                results.append({
+                    'polygon': polygon,
+                    'text': text,
+                    'confidence': confidence,
+                })
+    elif engine == 'easyocr':
+        ocr_result = ocr_model.readtext(str(image_path))
+        for detection in ocr_result:
+            polygon = np.array(detection[0]).flatten().tolist()
+            text = detection[1]
+            confidence = detection[2]
+            results.append({
+                'polygon': polygon,
+                'text': text,
+                'confidence': confidence,
+            })
+    return results
+def polygon_iou(poly1: List[float], poly2: List[float]) -> float:
+    """
+    计算两个多边形的 IoU
+    poly: [x1,y1,x2,y2,x3,y3,x4,y4]
+    """
+    try:
+        from shapely.geometry import Polygon
+        # 转换为 Polygon 对象
+        p1 = Polygon([(poly1[i], poly1[i+1]) for i in range(0, len(poly1), 2)])
+        p2 = Polygon([(poly2[i], poly2[i+1]) for i in range(0, len(poly2), 2)])
+        if not p1.is_valid or not p2.is_valid:
+            return 0.0
+        # 计算 IoU
+        intersection = p1.intersection(p2).area
+        union = p1.union(p2).area
+        if union == 0:
+            return 0.0
+        return intersection / union
+    except ImportError:
+        # 使用 bbox IoU 近似
+        return bbox_iou_from_polygon(poly1, poly2)
+def bbox_iou_from_polygon(poly1: List[float], poly2: List[float]) -> float:
+    """使用 bbox 近似计算 IoU"""
+    # 获取 bbox
+    x1_min = min(poly1[0::2])
+    y1_min = min(poly1[1::2])
+    x1_max = max(poly1[0::2])
+    y1_max = max(poly1[1::2])
+    x2_min = min(poly2[0::2])
+    y2_min = min(poly2[1::2])
+    x2_max = max(poly2[0::2])
+    y2_max = max(poly2[1::2])
+    # 计算交集
+    inter_xmin = max(x1_min, x2_min)
+    inter_ymin = max(y1_min, y2_min)
+    inter_xmax = min(x1_max, x2_max)
+    inter_ymax = min(y1_max, y2_max)
+    if inter_xmax <= inter_xmin or inter_ymax <= inter_ymin:
+        return 0.0
+    inter_area = (inter_xmax - inter_xmin) * (inter_ymax - inter_ymin)
+    area1 = (x1_max - x1_min) * (y1_max - y1_min)
+    area2 = (x2_max - x2_min) * (y2_max - y2_min)
+    union_area = area1 + area2 - inter_area
+    return inter_area / union_area if union_area > 0 else 0.0
+def text_similarity(text1: str, text2: str) -> float:
+    """计算文本相似度"""
+    text1 = text1.lower().strip()
+    text2 = text2.lower().strip()
+    if text1 == text2:
+        return 1.0
+    # 使用编辑距离
+    return difflib.SequenceMatcher(None, text1, text2).ratio()
+def match_detections(
+    pred_results: List[Dict],
+    gt_annotations: Dict,
+    iou_thresh: float = 0.5,
+    text_sim_thresh: float = 0.5,
+    eval_mode: str = 'end2end',
+) -> Tuple[int, int, int, Dict]:
+    """
+    匹配预测和GT
+    Args:
+        eval_mode: 'detection_only', 'end2end', 'recognition_only'
+    Returns:
+        (TP, FP, FN, details)
+    """
+    gt_polygons = gt_annotations['polygons']
+    gt_texts = gt_annotations['texts']
+    gt_ignore = gt_annotations.get('ignore', [False] * len(gt_texts))
+    matched_gt = set()
+    tp = 0
+    fp = 0
+    # 详细统计
+    details = {
+        'det_matched': 0,      # 检测匹配数 (IoU 通过)
+        'text_matched': 0,     # 文本匹配数
+        'det_fp': 0,           # 检测误检
+    }
+    # 遍历预测结果
+    for pred in pred_results:
+        pred_poly = pred['polygon']
+        pred_text = pred['text']
+        best_iou = 0
+        best_gt_idx = -1
+        # 找到最佳匹配的 GT
+        for gt_idx, (gt_poly, gt_text, ignore) in enumerate(zip(gt_polygons, gt_texts, gt_ignore)):
+            if ignore or gt_idx in matched_gt:
+                continue
+            iou = polygon_iou(pred_poly, gt_poly)
+            if iou > best_iou:
+                best_iou = iou
+                best_gt_idx = gt_idx
+        # 判断是否匹配成功
+        if best_iou >= iou_thresh and best_gt_idx >= 0:
+            details['det_matched'] += 1
+            if eval_mode == 'detection_only':
+                # 只看检测，不看文本
+                tp += 1
+                matched_gt.add(best_gt_idx)
+            else:
+                # 检查文本相似度
+                gt_text = gt_texts[best_gt_idx]
+                sim = text_similarity(pred_text, gt_text)
+                if sim >= text_sim_thresh:
+                    tp += 1
+                    matched_gt.add(best_gt_idx)
+                    details['text_matched'] += 1
+                else:
+                    # 根据评估模式决定是否计为 FP
+                    if eval_mode == 'end2end':
+                        fp += 1  # 位置对但文字错，计为 FP
+        else:
+            fp += 1  # 误检
+            details['det_fp'] += 1
+    # 计算 FN (未检测到的GT)
+    fn = 0
+    for gt_idx, ignore in enumerate(gt_ignore):
+        if not ignore and gt_idx not in matched_gt:
+            fn += 1
+    return tp, fp, fn, details
+def visualize_detections(
+    img_path: Path,
+    gt_ann: Dict,
+    pred_results: List[Dict],
+    save_path: Path,
+):
+    """可视化 GT 和 OCR 检测框对比"""
+    import cv2
+    img = cv2.imread(str(img_path))
+    if img is None:
+        return
+    # 绘制 GT 框 (绿色)
+    for i, (poly, text, ignore) in enumerate(zip(
+        gt_ann['polygons'], gt_ann['texts'], gt_ann.get('ignore', [False] * len(gt_ann['texts']))
+    )):
+        if ignore:
+            continue
+        pts = np.array(poly).reshape(-1, 2).astype(np.int32)
+        cv2.polylines(img, [pts], True, (0, 255, 0), 2)
+        # 标注 GT 文本
+        x, y = int(pts[0][0]), int(pts[0][1]) - 5
+        cv2.putText(img, f"GT:{text[:10]}", (x, y), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 255, 0), 1)
+    # 绘制 OCR 检测框 (红色)
+    for pred in pred_results:
+        poly = pred['polygon']
+        text = pred['text']
+        pts = np.array(poly).reshape(-1, 2).astype(np.int32)
+        cv2.polylines(img, [pts], True, (0, 0, 255), 2)
+        # 标注 OCR 文本
+        x, y = int(pts[0][0]), int(pts[0][1]) + 15
+        cv2.putText(img, f"OCR:{text[:10]}", (x, y), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 0, 255), 1)
+    cv2.imwrite(str(save_path), img)
+def evaluate_dataset(
+    sr_dir: str,
+    annotation_file: str,
+    ocr_model,
+    engine: str,
+    debug: bool = False,
+) -> Dict:
+    """评估整个数据集"""
+    sr_dir = Path(sr_dir)
+    eval_mode = CONFIG.get('eval_mode', 'end2end')
+    debug_visualize = CONFIG.get('debug_visualize', False)
+    debug_save_dir = Path(CONFIG.get('debug_save_dir', './debug_vis'))
+    if debug_visualize:
+        debug_save_dir.mkdir(parents=True, exist_ok=True)
+    # 加载标注
+    with open(annotation_file, 'r', encoding='utf-8') as f:
+        annotations = json.load(f)
+    # 调试：统计标注信息
+    total_gt_boxes = 0
+    total_ignored = 0
+    for img_name, ann in annotations.items():
+        total_gt_boxes += len(ann['texts'])
+        total_ignored += sum(ann.get('ignore', [False] * len(ann['texts'])))
+    print(f"\n标注统计: 总共 {len(annotations)} 张图片, {total_gt_boxes} 个文本框, {total_ignored} 个被忽略")
+    print(f"评估模式: {eval_mode}")
+    total_tp = 0
+    total_fp = 0
+    total_fn = 0
+    # 汇总详细统计
+    total_details = {
+        'det_matched': 0,
+        'text_matched': 0,
+        'det_fp': 0,
+    }
+    print("Running OCR evaluation...")
+    total_ocr_detections = 0
+    debug_count = 0
+    vis_count = 0
+    for img_name, gt_ann in tqdm(annotations.items()):
+        img_path = sr_dir / img_name
+        if not img_path.exists():
+            # 尝试其他扩展名
+            img_path = sr_dir / (Path(img_name).stem + '.jpg')
+            if not img_path.exists():
+                continue
+        # 运行 OCR
+        pred_results = run_ocr(img_path, ocr_model, engine)
+        total_ocr_detections += len(pred_results)
+        # 调试：打印前3张图片的详细信息
+        if debug_count < 3:
+            print(f"\n[DEBUG] Image: {img_name}")
+            print(f"  GT boxes: {len(gt_ann['polygons'])}, ignored: {sum(gt_ann.get('ignore', []))}")
+            print(f"  OCR detections: {len(pred_results)}")
+            if gt_ann['polygons']:
+                print(f"  GT[0] polygon: {gt_ann['polygons'][0][:4]}... text: '{gt_ann['texts'][0]}'")
+            if pred_results:
+                print(f"  OCR[0] polygon: {pred_results[0]['polygon'][:4]}... text: '{pred_results[0]['text']}'")
+            debug_count += 1
+        # 可视化
+        if debug_visualize and vis_count < 20:
+            vis_path = debug_save_dir / f"vis_{img_name}"
+            visualize_detections(img_path, gt_ann, pred_results, vis_path)
+            vis_count += 1
+        # 匹配
+        tp, fp, fn, details = match_detections(
+            pred_results,
+            gt_ann,
+            iou_thresh=CONFIG['iou_threshold'],
+            text_sim_thresh=CONFIG['text_similarity_threshold'],
+            eval_mode=eval_mode,
+        )
+        total_tp += tp
+        total_fp += fp
+        total_fn += fn
+        for k, v in details.items():
+            total_details[k] += v
+    print(f"\nOCR 总共检测到 {total_ocr_detections} 个文本框")
+    # 计算指标
+    precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0
+    recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
+    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
+    # 统计 GT 信息 (不含 ignore)
+    total_gt_boxes_valid = total_tp + total_fn
+    # 计算纯检测指标 (不考虑文本内容)
+    det_precision = total_details['det_matched'] / total_ocr_detections if total_ocr_detections > 0 else 0
+    det_recall = total_details['det_matched'] / total_gt_boxes_valid if total_gt_boxes_valid > 0 else 0
+    det_f1 = 2 * det_precision * det_recall / (det_precision + det_recall) if (det_precision + det_recall) > 0 else 0
+    return {
+        'TP': total_tp,
+        'FP': total_fp,
+        'FN': total_fn,
+        'Precision': precision,
+        'Recall': recall,
+        'F1-Score': f1_score,
+        'OCR_detections': total_ocr_detections,
+        'GT_boxes': total_gt_boxes_valid,
+        'Detection_rate': total_ocr_detections / total_gt_boxes_valid if total_gt_boxes_valid > 0 else 0,
+        # 新增：纯检测指标
+        'Det_Precision': det_precision,
+        'Det_Recall': det_recall,
+        'Det_F1': det_f1,
+        'Det_matched': total_details['det_matched'],
+        'Text_matched': total_details['text_matched'],
+        'eval_mode': eval_mode,
+    }
+def main():
+    # 自动生成 output 路径：根据 sr_dir 最后一个目录名
+    sr_dir = Path(CONFIG['sr_dir'])
+    baseline_name = sr_dir.name  # 获取最后一个目录名，如 'sr', 'gt', 'bicubic' 等
+    output_path = Path(f"./roadtext_eval_results_{baseline_name}.json")
+    print("="*60)
+    print("RoadText1K OCR Evaluation")
+    print("="*60)
+    print(f"SR images: {CONFIG['sr_dir']}")
+    print(f"Annotations: {CONFIG['annotation_file']}")
+    print(f"OCR engine: {CONFIG['ocr_engine']}")
+    print(f"IoU threshold: {CONFIG['iou_threshold']}")
+    print(f"Text similarity threshold: {CONFIG['text_similarity_threshold']}")
+    print(f"Output will be saved to: {output_path}")
+    print()
+    # 加载 OCR 模型
+    ocr_model, engine = load_ocr_model(
+        CONFIG['ocr_engine'],
+        device=CONFIG['device']
+    )
+    # 评估
+    results = evaluate_dataset(
+        CONFIG['sr_dir'],
+        CONFIG['annotation_file'],
+        ocr_model,
+        engine,
+    )
+    # 打印结果
+    print("\n" + "="*60)
+    print("EVALUATION RESULTS")
+    print("="*60)
+    print(f"评估模式: {results.get('eval_mode', 'end2end')}")
+    print("\n[Detection Statistics]")
+    print(f"  GT text boxes (valid):  {results['GT_boxes']}")
+    print(f"  OCR detections:         {results['OCR_detections']}")
+    print(f"  Detection matched (IoU): {results.get('Det_matched', 'N/A')}")
+    print(f"  Text matched:           {results.get('Text_matched', 'N/A')}")
+    print("\n[Detection-Only Metrics] (只看框位置，不看文字)")
+    print(f"  Det Precision:     {results.get('Det_Precision', 0)*100:.2f}%")
+    print(f"  Det Recall:        {results.get('Det_Recall', 0)*100:.2f}%")
+    print(f"  Det F1-Score:      {results.get('Det_F1', 0)*100:.2f}%")
+    print("\n[End-to-End Metrics] (检测 + 识别)")
+    print(f"  True Positives:    {results['TP']}")
+    print(f"  False Positives:   {results['FP']}")
+    print(f"  False Negatives:   {results['FN']}")
+    print(f"  Precision:         {results['Precision']*100:.2f}%")
+    print(f"  Recall:            {results['Recall']*100:.2f}%")
+    print(f"  F1-Score:          {results['F1-Score']*100:.2f}%")
+    # 保存结果
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(output_path, 'w') as f:
+        json.dump(results, f, indent=2)
+    print(f"\nResults saved to {output_path}")
+if __name__ == '__main__':
+    main()

diffusion-dpo-ocr/verify_roadtext_annotations.py ADDED Viewed

	@@ -0,0 +1,223 @@

+"""
+验证 RoadText1K 预处理结果的脚本
+可视化 GT 标注框与 crop 后图像的对应关系，检查坐标是否正确
+"""
+import os
+import json
+import random
+from pathlib import Path
+from typing import Dict, List
+import cv2
+import numpy as np
+from PIL import Image
+# ============================================================================
+# Configuration
+# ============================================================================
+CONFIG = {
+    # Directory holding the GT images
+    'gt_dir': '/home/wanghongbo06/baipurui/DATA/RoadText1k_patch_crop/gt',
+    # Path of the annotation file
+    'annotation_file': '/home/wanghongbo06/baipurui/DATA/RoadText1k_patch_crop/annotations.json',
+    # Directory the visualisations are written to
+    'vis_output_dir': './verify_roadtext_vis',
+    # Number of images to visualise (capped at the number of annotated images)
+    'num_samples': 20,
+    # Random seed used when sampling the images to visualise
+    'seed': 42,
+}
+# ============================================================================
+def draw_annotations(img: np.ndarray, ann: Dict, color=(0, 255, 0), thickness=2) -> np.ndarray:
+    """在图像上绘制标注框"""
+    img_vis = img.copy()
+    polygons = ann.get('polygons', [])
+    texts = ann.get('texts', [])
+    ignores = ann.get('ignore', [False] * len(texts))
+    for i, (poly, text, ignore) in enumerate(zip(polygons, texts, ignores)):
+        if ignore:
+            box_color = (128, 128, 128)  # 灰色 = ignore
+        else:
+            box_color = color
+        # 绘制多边形
+        pts = np.array(poly).reshape(-1, 2).astype(np.int32)
+        cv2.polylines(img_vis, [pts], True, box_color, thickness)
+        # 标注文本
+        x, y = int(pts[0][0]), int(pts[0][1]) - 5
+        if y < 15:
+            y = int(pts[0][1]) + 15
+        # 缩短文本显示
+        display_text = text[:15] + "..." if len(text) > 15 else text
+        cv2.putText(img_vis, display_text, (x, y),
+                   cv2.FONT_HERSHEY_SIMPLEX, 0.4, box_color, 1)
+    return img_vis
+def verify_single_image(
+    img_path: Path,
+    ann: Dict,
+    save_path: Path,
+) -> Dict:
+    """验证单张图片的标注"""
+    img = cv2.imread(str(img_path))
+    if img is None:
+        return {'error': f'Cannot read image: {img_path}'}
+    h, w = img.shape[:2]
+    # 统计信息
+    stats = {
+        'img_size': (w, h),
+        'num_boxes': len(ann.get('polygons', [])),
+        'num_ignored': sum(ann.get('ignore', [])),
+        'boxes_in_bounds': 0,
+        'boxes_out_of_bounds': 0,
+    }
+    # 检查标注框是否在图像范围内
+    for poly in ann.get('polygons', []):
+        xs = [poly[i] for i in range(0, len(poly), 2)]
+        ys = [poly[i] for i in range(1, len(poly), 2)]
+        if min(xs) >= 0 and max(xs) <= w and min(ys) >= 0 and max(ys) <= h:
+            stats['boxes_in_bounds'] += 1
+        else:
+            stats['boxes_out_of_bounds'] += 1
+            print(f"  [WARNING] Box out of bounds: x=[{min(xs):.1f}, {max(xs):.1f}], y=[{min(ys):.1f}, {max(ys):.1f}], img={w}x{h}")
+    # 绘制标注
+    img_vis = draw_annotations(img, ann)
+    # 保存
+    cv2.imwrite(str(save_path), img_vis)
+    return stats
+def main():
+    random.seed(CONFIG['seed'])
+    gt_dir = Path(CONFIG['gt_dir'])
+    ann_file = Path(CONFIG['annotation_file'])
+    vis_dir = Path(CONFIG['vis_output_dir'])
+    vis_dir.mkdir(parents=True, exist_ok=True)
+    print("=" * 60)
+    print("RoadText1K 标注验证脚本")
+    print("=" * 60)
+    print(f"GT dir: {gt_dir}")
+    print(f"Annotations: {ann_file}")
+    print(f"Output: {vis_dir}")
+    print()
+    # 检查文件是否存在
+    if not ann_file.exists():
+        print(f"Error: Annotation file not found: {ann_file}")
+        return
+    if not gt_dir.exists():
+        print(f"Error: GT directory not found: {gt_dir}")
+        return
+    # 加载标注
+    with open(ann_file, 'r', encoding='utf-8') as f:
+        annotations = json.load(f)
+    print(f"总共 {len(annotations)} 张图片有标注")
+    # 全局统计
+    total_boxes = 0
+    total_ignored = 0
+    total_in_bounds = 0
+    total_out_of_bounds = 0
+    # 计算所有图片的统计
+    for img_name, ann in annotations.items():
+        total_boxes += len(ann.get('polygons', []))
+        total_ignored += sum(ann.get('ignore', []))
+        # 检查边界
+        for poly in ann.get('polygons', []):
+            xs = [poly[i] for i in range(0, len(poly), 2)]
+            ys = [poly[i] for i in range(1, len(poly), 2)]
+            if min(xs) >= 0 and max(xs) <= 512 and min(ys) >= 0 and max(ys) <= 512:
+                total_in_bounds += 1
+            else:
+                total_out_of_bounds += 1
+    print(f"\n全局统计:")
+    print(f"  总文本框: {total_boxes}")
+    print(f"  忽略框: {total_ignored}")
+    print(f"  有效框 (非忽略): {total_boxes - total_ignored}")
+    print(f"  框在图像范围内: {total_in_bounds}")
+    print(f"  框超出范围: {total_out_of_bounds}")
+    if total_out_of_bounds > 0:
+        print(f"\n⚠️  有 {total_out_of_bounds} 个框超出图像范围！这可能是坐标转换问题。")
+    # 随机选取一些图片进行可视化
+    img_names = list(annotations.keys())
+    num_samples = min(CONFIG['num_samples'], len(img_names))
+    selected = random.sample(img_names, num_samples)
+    print(f"\n随机选取 {num_samples} 张图片进行可视化...")
+    for img_name in selected:
+        ann = annotations[img_name]
+        img_path = gt_dir / img_name
+        if not img_path.exists():
+            # 尝试其他扩展名
+            img_path = gt_dir / (Path(img_name).stem + '.jpg')
+        if not img_path.exists():
+            print(f"  [SKIP] Image not found: {img_name}")
+            continue
+        save_path = vis_dir / f"verify_{img_name}"
+        stats = verify_single_image(img_path, ann, save_path)
+        if 'error' not in stats:
+            print(f"  [OK] {img_name}: {stats['num_boxes']} boxes, "
+                  f"{stats['boxes_out_of_bounds']} out of bounds")
+    print(f"\n可视化结果保存到: {vis_dir}")
+    # 额外检查：打印一些标注样例
+    print("\n" + "=" * 60)
+    print("标注样例 (前 3 张图):")
+    print("=" * 60)
+    for i, (img_name, ann) in enumerate(list(annotations.items())[:3]):
+        print(f"\n[{i+1}] {img_name}")
+        print(f"  crop_position: {ann.get('crop_position', 'N/A')}")
+        print(f"  num_boxes: {len(ann.get('polygons', []))}")
+        for j, (poly, text, ignore) in enumerate(zip(
+            ann.get('polygons', [])[:2],  # 只显示前2个
+            ann.get('texts', [])[:2],
+            ann.get('ignore', [])[:2]
+        )):
+            xs = [poly[k] for k in range(0, len(poly), 2)]
+            ys = [poly[k] for k in range(1, len(poly), 2)]
+            print(f"    Box {j}: x=[{min(xs):.1f}, {max(xs):.1f}], y=[{min(ys):.1f}, {max(ys):.1f}], "
+                  f"text='{text[:20]}', ignore={ignore}")
+if __name__ == '__main__':
+    main()

diffusion-dpo-test/DIAGNOSTIC_CHECKLIST.md ADDED Viewed

	@@ -0,0 +1,297 @@

+# 🔍 完整诊断检查清单
+## 问题描述
+训练了 15 和 60 个 epoch，测试结果**逐像素完全相同**（262144 个 RGB 值一模一样）
+---
+## ✅ 已确认正常的部分
+### 1. 训练动态正常
+- ✅ `acc` 和 `l_acc` 在变化
+- ✅ `loss` 在下降
+- ✅ `Max lora_B Check` 从 `1.04e-05` 增长到 `2.23e-03`（200+ 倍增长）
+### 2. 代码逻辑正常
+- ✅ `disable_adapter()` 使用了 `with` 语句（已修复）
+- ✅ VAE 编码在 `no_grad()` 中
+- ✅ `ref_pred` 正确 detach
+- ✅ LoRA 键名已清理（去除 `base_model.model.` 前缀）
+- ✅ 优化器在正确的时机创建（dtype 转换之后）
+### 3. x_embedder 为 0 是正常的
+- ✅ 输入层通常不需要针对 DPO 偏好优化
+- ✅ 其他深层（`single_transformer_blocks.*`）的权重在增长
+- ✅ LoRA 是加法，不是乘法，0 不会阻塞梯度流
+---
+## ❓ 需要检查的部分
+### 🎯 检查 1: Checkpoint 权重是否真的在变化？
+**这是最关键的检查！**
+```bash
+# 在训练机器上运行
+cd <训练脚本运行目录>  # 即 run.sh 的执行目录
+# 1. 确认文件存在
+ls -lh results_1202_4/checkpoint-15/lora_train_unet/adapter_model.safetensors
+ls -lh results_1202_4/checkpoint-60/lora_train_unet/adapter_model.safetensors
+# 2. 对比权重
+python compare_checkpoints.py \
+  results_1202_4/checkpoint-15/lora_train_unet/adapter_model.safetensors \
+  results_1202_4/checkpoint-60/lora_train_unet/adapter_model.safetensors
+```
+**预期结果**:
+- ✅ **正常**: "50%+ 的参数在变化"，最大差异 > 1e-4
+- ❌ **异常**: "完全相同" 或 "< 10% 参数变化"
+**如果异常，说明**:
+- 保存逻辑有问题
+- 或者训练根本没生效（但这与 loss 下降矛盾）
+---
+### 🎯 检查 2: 测试代码加载的是哪个 checkpoint？
+**问题**: 测试代码路径和训练输出路径不匹配
+训练输出:
+```
+results_1202_4/checkpoint-XX/lora_train_unet/adapter_model.safetensors
+```
+测试代码加载:
+```python
+# diffusion-dpo-test/test.py line 33
+"/home/wanghongbo06/diffusion-dpo-adv/results_1130/checkpoint-50/..."
+```
+注意 `results_1130` vs `results_1202_4`！
+**验证方法**:
+```bash
+# 在测试机器上
+# 1. 确认测试代码实际加载的文件
+ls -lh /home/wanghongbo06/diffusion-dpo-adv/results_1130/checkpoint-50/lora_train_unet/adapter_model.safetensors
+ls -lh /home/wanghongbo06/diffusion-dpo-adv/results_1130/checkpoint-500/lora_train_unet/adapter_model.safetensors
+# 2. 检查文件修改时间（是否是最新训练的）
+stat /home/wanghongbo06/diffusion-dpo-adv/results_1130/checkpoint-50/lora_train_unet/adapter_model.safetensors
+# 3. 对比这两个文件
+python compare_checkpoints.py \
+  /home/wanghongbo06/diffusion-dpo-adv/results_1130/checkpoint-50/lora_train_unet/adapter_model.safetensors \
+  /home/wanghongbo06/diffusion-dpo-adv/results_1130/checkpoint-500/lora_train_unet/adapter_model.safetensors
+```
+**如果这两个文件完全相同**:
+- 说明您测试的不是最新训练的模型
+- 需要更新测试代码的路径，或者将最新的 checkpoint 复制到测试机器
+---
+### 🎯 检查 3: 测试代码的 LoRA 融合逻辑
+测试代码使用了**两次 fuse_lora**:
+```python
+# 第一次：SR base LoRA
+pipe.load_lora_weights("...pytorch_lora_weights_v2.safetensors", adapter_name="sr")
+pipe.fuse_lora(lora_scale=1.0, adapter_names=["sr"])
+pipe.unload_lora_weights()
+# 第二次：DPO trained LoRA
+pipe.load_lora_weights("...adapter_model.safetensors", adapter_name="sr2")
+pipe.fuse_lora(lora_scale=1.0, adapter_names=["sr2"])
+pipe.unload_lora_weights()
+```
+**潜在问题**:
+1. `unload_lora_weights()` 可能清除了刚融合的权重
+2. 第二次 `fuse_lora` 可能没有正确叠加到第一次的结果上
+3. 如果两个 LoRA 的权重完全相同，输出自然也相同
+**验证方法 A: 分别单独加载 SR LoRA 和 DPO LoRA 进行对比**
+```python
+# 临时修改 test.py
+pipe = FluxPipeline.from_pretrained(...).to("cuda")
+# 只加载第一个 LoRA
+pipe.load_lora_weights(
+    "/home/wanghongbo06/baipurui/CKPTs/FLUX_SR/pytorch_lora_weights_v2.safetensors",
+    adapter_name="sr"
+)
+pipe.fuse_lora(lora_scale=1.0, adapter_names=["sr"])
+pipe.unload_lora_weights()
+# 生成图像 -> 保存为 result_sr_only.png
+# 然后只加载第二个 LoRA
+pipe2 = FluxPipeline.from_pretrained(...).to("cuda")
+pipe2.load_lora_weights(
+    "/home/wanghongbo06/diffusion-dpo-adv/results_1130/checkpoint-60/lora_train_unet/adapter_model.safetensors",
+    adapter_name="dpo"
+)
+pipe2.fuse_lora(lora_scale=1.0, adapter_names=["dpo"])
+# 生成图像 -> 保存为 result_dpo_only.png
+```
+如果 `result_dpo_only.png` 在不同 epoch 之间也完全相同，说明问题在 checkpoint 本身。
+**验证方法 B: 检查融合后的权重**
+```python
+# 在 test.py 中添加调试代码
+import torch
+# 加载第一个 LoRA 后
+pipe.load_lora_weights("...pytorch_lora_weights_v2.safetensors", adapter_name="sr")
+pipe.fuse_lora(lora_scale=1.0, adapter_names=["sr"])
+# 保存融合后的 transformer 权重快照
+transformer_weights_after_sr = {
+    k: v.clone() for k, v in pipe.transformer.state_dict().items()
+    if 'single_transformer_blocks.30' in k
+}
+pipe.unload_lora_weights()
+# 加载第二个 LoRA 后
+pipe.load_lora_weights("...adapter_model.safetensors", adapter_name="sr2")
+pipe.fuse_lora(lora_scale=1.0, adapter_names=["sr2"])
+transformer_weights_after_dpo = {
+    k: v.clone() for k, v in pipe.transformer.state_dict().items()
+    if 'single_transformer_blocks.30' in k
+}
+# 对比
+for k in transformer_weights_after_sr.keys():
+    diff = (transformer_weights_after_dpo[k] - transformer_weights_after_sr[k]).abs().max()
+    print(f"{k}: max_diff = {diff:.6e}")
+```
+如果所有 diff 都是 0，说明第二个 LoRA 没有被正确应用。
+---
+### 🎯 检查 4: Checkpoint 文件的完整性
+```bash
+# 在训练机器上
+python inspect_safetensor.py results_1202_4/checkpoint-60/lora_train_unet/adapter_model.safetensors
+```
+**预期结果**:
+- 应该有 100+ 个参数张量
+- `single_transformer_blocks.*` 的 lora_B 应该有非零值
+- 非零参数比例应该 > 50%
+**如果异常**:
+- 文件损坏
+- 或者保存逻辑有问题
+---
+## 🔧 可能的修复方案
+### 方案 1: 如果 checkpoint 权重没有变化
+**原因**: 保存逻辑有问题
+**修复**: 检查 `save_model_hook` 中的状态字典获取逻辑
+```python
+# train_single_lora.py line 672
+full_state_dict = accelerator.get_state_dict(model)
+```
+可能需要改为：
+```python
+from peft import get_peft_model_state_dict
+full_state_dict = get_peft_model_state_dict(model, adapter_name="train_unet")
+```
+---
+### 方案 2: 如果测试代码路径错误
+**原因**: 加载了旧的 checkpoint
+**修复**: 更新测试代码的路径，或者将最新 checkpoint 复制到测试机器
+```bash
+# 在训练机器上
+scp -r results_1202_4/checkpoint-60 user@test-machine:/home/wanghongbo06/diffusion-dpo-adv/
+```
+---
+### 方案 3: 如果 LoRA 融合有问题
+**原因**: `fuse_lora` 和 `unload_lora_weights` 的交互有问题
+**修复**: 使用 `set_adapters` 而不是 `fuse_lora`
+```python
+# 新的测试代码
+pipe = FluxPipeline.from_pretrained(...).to("cuda")
+# 加载两个 LoRA（不融合）
+pipe.load_lora_weights("...pytorch_lora_weights_v2.safetensors", adapter_name="sr")
+pipe.load_lora_weights("...adapter_model.safetensors", adapter_name="dpo")
+# 同时启用两个 adapter
+pipe.set_adapters(["sr", "dpo"], adapter_weights=[1.0, 1.0])
+# 生成图像
+result_img = generate(pipe, ...)
+```
+---
+## 📊 诊断流程图
+```
+开始
+  ↓
+检查 1: 对比 checkpoint 权重
+  ├─ 权重有变化 → 继续检查 2
+  └─ 权重无变化 → 【问题在训练/保存】→ 方案 1
+  ↓
+检查 2: 确认测试代码加载的路径
+  ├─ 路径正确 → 继续检查 3
+  └─ 路径错误 → 【问题在测试代码】→ 方案 2
+  ↓
+检查 3: 验证 LoRA 融合逻辑
+  ├─ 融合正确 → 继续检查 4
+  └─ 融合失败 → 【问题在测试代码】→ 方案 3
+  ↓
+检查 4: 检查 checkpoint 文件完整性
+  ├─ 文件完整 → 【未知问题，需要更深入调查】
+  └─ 文件损坏 → 【问题在保存】→ 方案 1
+```
+---
+## 🚀 立即执行
+**请在训练机器上运行以下命令**:
+```bash
+cd <run.sh 的执行目录>
+# 1. 对比 checkpoint（最重要！）
+python /data2/hongbo.wang/DPO-SR/compare_checkpoints.py \
+  results_1202_4/checkpoint-15/lora_train_unet/adapter_model.safetensors \
+  results_1202_4/checkpoint-60/lora_train_unet/adapter_model.safetensors
+# 2. 检查单个 checkpoint
+python /data2/hongbo.wang/DPO-SR/inspect_safetensor.py \
+  results_1202_4/checkpoint-60/lora_train_unet/adapter_model.safetensors
+```
+**请将输出结果发给我，我会根据结果进一步诊断！**

diffusion-dpo-test/DIV2K-val/sobolev-400/0000843-seed-0.png ADDED Viewed

diffusion-dpo-test/__pycache__/color_fix.cpython-310.pyc ADDED Viewed

Binary file (3.7 kB). View file

diffusion-dpo-test/analyze_lora_magnitude.py ADDED Viewed

	@@ -0,0 +1,179 @@

+#!/usr/bin/env python3
+"""
+分析 LoRA 权重的实际大小，理解为什么效果差异这么大
+"""
+import os
+os.environ["HF_HOME"] = "/home/wanghongbo06/.cache/huggingface"
+import torch
+from safetensors.torch import load_file
+import numpy as np
+def analyze_lora(path, name=""):
+    """Print magnitude statistics for the lora_A / lora_B tensors of one LoRA file.
+    Args:
+        path: Path of a safetensors file, read with ``load_file``.
+        name: Label shown in the report header.
+    Returns:
+        The loaded state dict (key -> tensor).
+    """
+    print(f"\n{'='*80}")
+    print(f"分析 LoRA: {name}")
+    print(f"路径: {path}")
+    print(f"{'='*80}")
+    state_dict = load_file(path)
+    # Split the keys by the low-rank factor they belong to.
+    lora_a_keys = [k for k in state_dict if "lora_A" in k]
+    lora_b_keys = [k for k in state_dict if "lora_B" in k]
+    print(f"\n总参数数: {len(state_dict)}")
+    print(f"  - lora_A: {len(lora_a_keys)} 个")
+    print(f"  - lora_B: {len(lora_b_keys)} 个")
+    # lora_A statistics
+    print(f"\n--- lora_A 统计 ---")
+    a_means, a_maxs, a_stds = [], [], []
+    for k in lora_a_keys:
+        v = state_dict[k].float()
+        a_means.append(v.mean().item())
+        a_maxs.append(v.abs().max().item())
+        a_stds.append(v.std().item())
+    if a_maxs:
+        print(f"  Mean of means: {np.mean(a_means):.6e}")
+        print(f"  Max of maxs:   {np.max(a_maxs):.6e}")
+        print(f"  Mean of stds:  {np.mean(a_stds):.6e}")
+    else:
+        # np.max([]) raises ValueError, so report the absence instead of crashing.
+        print("  (没有找到 lora_A 参数)")
+    # lora_B statistics
+    print(f"\n--- lora_B 统计 ---")
+    b_means, b_maxs, b_stds, b_nonzero_ratio = [], [], [], []
+    for k in lora_b_keys:
+        v = state_dict[k].float()
+        b_means.append(v.mean().item())
+        b_maxs.append(v.abs().max().item())
+        b_stds.append(v.std().item())
+        # Fraction of entries whose magnitude exceeds 1e-10.
+        b_nonzero_ratio.append((v.abs() > 1e-10).float().mean().item())
+    if b_maxs:
+        print(f"  Mean of means:     {np.mean(b_means):.6e}")
+        print(f"  Max of maxs:       {np.max(b_maxs):.6e}")
+        print(f"  Mean of stds:      {np.mean(b_stds):.6e}")
+        print(f"  Avg non-zero ratio: {np.mean(b_nonzero_ratio)*100:.2f}%")
+    else:
+        print("  (没有找到 lora_B 参数)")
+    # The weight update contributed by a LoRA pair is B @ A; the product of the
+    # largest |A| and |B| entries is printed as a crude scale indicator only.
+    print(f"\n--- LoRA 影响估算 ---")
+    print(f"  LoRA 输出 ≈ x @ A.T @ B.T")
+    if a_maxs and b_maxs:
+        print(f"  |A| * |B| ≈ {np.max(a_maxs) * np.max(b_maxs):.6e}")
+    else:
+        print("  (缺少 lora_A 或 lora_B，无法估算)")
+    # Rank lora_B tensors by their largest absolute entry, reusing b_maxs
+    # (same order as lora_b_keys) instead of recomputing it per tensor.
+    print(f"\n--- 最大的 5 个 lora_B 层 ---")
+    b_with_max = sorted(zip(lora_b_keys, b_maxs), key=lambda kv: -kv[1])
+    for k, v in b_with_max[:5]:
+        print(f"  {v:.6e}: {k}")
+    return state_dict
+def compare_loras(path1, path2, name1="LoRA 1", name2="LoRA 2"):
+    """Print how much the lora_B tensors differ between two LoRA files.
+    Only keys containing "lora_B" that exist in both files are compared.
+    Args:
+        path1: Path of the first safetensors file.
+        path2: Path of the second safetensors file.
+        name1: Label used for the first file in the report.
+        name2: Label used for the second file in the report.
+    """
+    print(f"\n{'='*80}")
+    print(f"对比 {name1} vs {name2}")
+    print(f"{'='*80}")
+    sd1 = load_file(path1)
+    sd2 = load_file(path2)
+    diffs = []
+    for k in sd1:
+        if "lora_B" not in k or k not in sd2:
+            continue
+        t1 = sd1[k].float()
+        t2 = sd2[k].float()
+        diff = (t2 - t1).abs()
+        diffs.append({
+            'key': k,
+            'max_diff': diff.max().item(),
+            'mean_diff': diff.mean().item(),
+            'val1_max': t1.abs().max().item(),
+            'val2_max': t2.abs().max().item(),
+        })
+    if not diffs:
+        # max([]) below would raise ValueError; nothing comparable was found.
+        print("\n(两个文件中没有共同的 lora_B 参数，无法对比)")
+        return
+    # Largest change first.
+    diffs.sort(key=lambda x: -x['max_diff'])
+    print(f"\n变化最大的 10 个 lora_B 层:")
+    print("-" * 100)
+    for d in diffs[:10]:
+        print(f"  max_diff={d['max_diff']:.6e}, {name1}_max={d['val1_max']:.6e}, {name2}_max={d['val2_max']:.6e}")
+        print(f"    {d['key']}")
+    # Overall statistics across all compared tensors.
+    all_max_diffs = [d['max_diff'] for d in diffs]
+    print(f"\n总体统计:")
+    print(f"  最大差异: {max(all_max_diffs):.6e}")
+    print(f"  平均差异: {np.mean(all_max_diffs):.6e}")
+    print(f"  差异 > 1e-4 的层数: {sum(1 for d in all_max_diffs if d > 1e-4)}")
+    print(f"  差异 > 1e-5 的层数: {sum(1 for d in all_max_diffs if d > 1e-5)}")
+def compare_with_sr_lora(sr_path, dpo_path):
+    """Compare the lora_B magnitude of an SR LoRA file with that of a DPO LoRA file.
+    Args:
+        sr_path: Path of the SR LoRA safetensors file.
+        dpo_path: Path of the DPO LoRA safetensors file.
+    """
+    print(f"\n{'='*80}")
+    print(f"对比 SR LoRA 和 DPO LoRA 的量级")
+    print(f"{'='*80}")
+    sr_sd = load_file(sr_path)
+    dpo_sd = load_file(dpo_path)
+    # In the down/up naming scheme "lora_up" is the factor that corresponds to
+    # lora_B ("lora_down" corresponds to lora_A), so match "lora_up" here to
+    # compare like with like.
+    sr_b_maxs = [
+        v.float().abs().max().item()
+        for k, v in sr_sd.items()
+        if "lora_B" in k or "lora_up" in k
+    ]
+    dpo_b_maxs = [
+        v.float().abs().max().item()
+        for k, v in dpo_sd.items()
+        if "lora_B" in k
+    ]
+    print(f"\nSR LoRA (lora_B/lora_up):")
+    if sr_b_maxs:
+        print(f"  Max: {max(sr_b_maxs):.6e}")
+        print(f"  Mean: {np.mean(sr_b_maxs):.6e}")
+    else:
+        print(f"  (没有找到 lora_B 或 lora_up)")
+        # Show a few keys so the naming scheme can be inspected by eye.
+        print(f"  SR LoRA keys 示例: {list(sr_sd.keys())[:5]}")
+    print(f"\nDPO LoRA (lora_B):")
+    if dpo_b_maxs:
+        print(f"  Max: {max(dpo_b_maxs):.6e}")
+        print(f"  Mean: {np.mean(dpo_b_maxs):.6e}")
+    else:
+        # max([]) raises ValueError, so report the absence instead.
+        print(f"  (没有找到 lora_B)")
+        print(f"  DPO LoRA keys 示例: {list(dpo_sd.keys())[:5]}")
+    if sr_b_maxs and dpo_b_maxs:
+        sr_max = max(sr_b_maxs)
+        dpo_max = max(dpo_b_maxs)
+        ratio = dpo_max / sr_max if sr_max > 0 else float('inf')
+        print(f"\n量级比较:")
+        print(f"  DPO / SR = {ratio:.4f}")
+        if ratio > 1:
+            print(f"  ⚠️ DPO LoRA 比 SR LoRA 大 {ratio:.1f} 倍！这可能导致效果变差")
+        elif ratio > 0:
+            print(f"  DPO LoRA 比 SR LoRA 小 {1/ratio:.1f} 倍")
+        else:
+            # ratio == 0 means every DPO lora_B entry is zero; 1/ratio would divide by zero.
+            print(f"  DPO LoRA 的 lora_B 全为 0")
+if __name__ == "__main__":
+    # Hard-coded inputs: two DPO checkpoints of one run, plus the SR LoRA.
+    dpo_checkpoints = [
+        ("/home/wanghongbo06/diffusion-dpo-adv/results_1202_4/checkpoint-15/lora_train_unet/adapter_model.safetensors", "DPO checkpoint-15"),
+        ("/home/wanghongbo06/diffusion-dpo-adv/results_1202_4/checkpoint-105/lora_train_unet/adapter_model.safetensors", "DPO checkpoint-105"),
+    ]
+    sr_lora_path = "/home/wanghongbo06/baipurui/CKPTs/FLUX_SR/pytorch_lora_weights_v2.safetensors"
+    early_ckpt = dpo_checkpoints[0][0]
+    late_ckpt = dpo_checkpoints[1][0]
+    # Step 1: per-checkpoint statistics.
+    for ckpt_path, label in dpo_checkpoints:
+        analyze_lora(ckpt_path, label)
+    # Step 2: change between the early and the late checkpoint.
+    compare_loras(early_ckpt, late_ckpt, "ckpt-15", "ckpt-105")
+    # Step 3: magnitude of the early DPO checkpoint relative to the SR LoRA.
+    compare_with_sr_lora(sr_lora_path, early_ckpt)

diffusion-dpo-test/check_lora_keys.py ADDED Viewed

	@@ -0,0 +1,76 @@

+#!/usr/bin/env python3
+"""检查 LoRA safetensor 文件的 key 格式"""
+from safetensors.torch import load_file
+import sys
+def check_keys(path):
+    """Print the key layout of a LoRA safetensors file and flag known naming problems.
+    The report lists the first 20 keys in sorted order, a set of substring and
+    prefix checks, one sample key with its shape, and a verdict against the
+    ``transformer.<module>.lora_A.weight`` layout printed as the expected one.
+    Args:
+        path: Path of the safetensors file, read with ``load_file``.
+    """
+    print(f"\n检查文件: {path}")
+    print("=" * 80)
+    state_dict = load_file(path)
+    keys = list(state_dict.keys())
+    print(f"总共 {len(keys)} 个 key\n")
+    if not keys:
+        # Indexing the first key below would raise IndexError on an empty file.
+        print("⚠️  文件中没有任何 key，无法分析")
+        return
+    print("前 20 个 key:")
+    print("-" * 80)
+    for key in sorted(keys)[:20]:
+        print(f"  {key}")
+    print("\n..." if len(keys) > 20 else "")
+    # Key format checks
+    print("\nKey 格式分析:")
+    print("-" * 80)
+    has_transformer_prefix = any(k.startswith("transformer.") for k in keys)
+    has_base_model_prefix = any(k.startswith("base_model.") for k in keys)
+    has_lora_A = any("lora_A" in k for k in keys)
+    has_lora_B = any("lora_B" in k for k in keys)
+    has_train_unet = any("train_unet" in k for k in keys)
+    print(f"  包含 'transformer.' 前缀: {has_transformer_prefix}")
+    print(f"  包含 'base_model.' 前缀: {has_base_model_prefix}")
+    print(f"  包含 'lora_A': {has_lora_A}")
+    print(f"  包含 'lora_B': {has_lora_B}")
+    print(f"  包含 'train_unet': {has_train_unet}")
+    # Show one complete key (first in file order) together with its shape.
+    sample_key = keys[0]
+    print(f"\n示例 key: {sample_key}")
+    print(f"示例 shape: {state_dict[sample_key].shape}")
+    # Layout that the report presents as the expected one.
+    print("\n" + "=" * 80)
+    print("Diffusers load_lora_weights 期望的 key 格式:")
+    print("-" * 80)
+    print("  transformer.single_transformer_blocks.0.attn.to_k.lora_A.weight")
+    print("  transformer.single_transformer_blocks.0.attn.to_k.lora_B.weight")
+    print("\n您的 key 格式:")
+    print(f"  {sample_key}")
+    # Verdict
+    print("\n" + "=" * 80)
+    if has_train_unet:
+        print("❌ 问题: 您的 key 包含 '.train_unet.' 后缀！")
+        print("   Diffusers 期望: xxx.lora_A.weight")
+        print("   您的格式:       xxx.lora_A.train_unet.weight")
+        print("\n   这就是 LoRA 无法加载的原因！")
+    elif not has_transformer_prefix:
+        print("⚠️  问题: 您的 key 缺少 'transformer.' 前缀！")
+        print("   Diffusers 期望: transformer.xxx.lora_A.weight")
+        print(f"   您的格式:       {sample_key}")
+    else:
+        print("✅ Key 格式看起来正确")
+if __name__ == "__main__":
+    # Checkpoints to inspect; a failure on one file is reported and the next one is still checked.
+    checkpoint_files = (
+        "/home/wanghongbo06/diffusion-dpo-adv/results_1202_4/checkpoint-15/lora_train_unet/adapter_model.safetensors",
+        "/home/wanghongbo06/diffusion-dpo-adv/results_1202_4/checkpoint-60/lora_train_unet/adapter_model.safetensors",
+    )
+    for checkpoint_file in checkpoint_files:
+        try:
+            check_keys(checkpoint_file)
+        except Exception as err:
+            print(f"❌ 无法读取 {checkpoint_file}: {err}")

diffusion-dpo-test/color_fix.py ADDED Viewed

	@@ -0,0 +1,119 @@

+'''
+# --------------------------------------------------------------------------------
+#   Color fixed script from Li Yi (https://github.com/pkuliyi2015/sd-webui-stablesr/blob/master/srmodule/colorfix.py)
+# --------------------------------------------------------------------------------
+'''
+import torch
+from PIL import Image
+from torch import Tensor
+from torch.nn import functional as F
+from torchvision.transforms import ToTensor, ToPILImage
+def adain_color_fix(target: Image, source: Image):
+    """Return ``target`` with its per-channel statistics matched to ``source``.
+    Both PIL images are converted to tensors with a leading batch dimension,
+    passed through ``adaptive_instance_normalization`` (target as content,
+    source as style), clamped to [0, 1] and converted back to a PIL image.
+    """
+    content = ToTensor()(target).unsqueeze(0)
+    style = ToTensor()(source).unsqueeze(0)
+    matched = adaptive_instance_normalization(content, style)
+    # Drop the batch dimension and clamp in place before building the image.
+    clamped = matched.squeeze(0).clamp_(0.0, 1.0)
+    return ToPILImage()(clamped)
+def wavelet_color_fix(target: Image, source: Image):
+    """Return ``target`` recombined with ``source`` via ``wavelet_reconstruction``.
+    Both PIL images are converted to tensors with a leading batch dimension,
+    passed to ``wavelet_reconstruction`` (target as content, source as style),
+    clamped to [0, 1] and converted back to a PIL image.
+    """
+    content = ToTensor()(target).unsqueeze(0)
+    style = ToTensor()(source).unsqueeze(0)
+    merged = wavelet_reconstruction(content, style)
+    # Drop the batch dimension and clamp in place before building the image.
+    clamped = merged.squeeze(0).clamp_(0.0, 1.0)
+    return ToPILImage()(clamped)
+def calc_mean_std(feat: Tensor, eps=1e-5):
+    """Return the per-sample, per-channel mean and standard deviation of ``feat``.
+    Args:
+        feat (Tensor): 4D tensor; statistics are taken over everything after
+            the first two dimensions.
+        eps (float): Added to the variance before the square root so the
+            standard deviation is never zero. Default: 1e-5.
+    Returns:
+        Tuple ``(mean, std)``, each reshaped to ``(b, c, 1, 1)``.
+    """
+    shape = feat.size()
+    assert len(shape) == 4, 'The input feature should be 4D tensor.'
+    batch, channels = shape[0], shape[1]
+    # Flatten the trailing dimensions so a single reduction covers them.
+    flat = feat.reshape(batch, channels, -1)
+    std = (flat.var(dim=2) + eps).sqrt().reshape(batch, channels, 1, 1)
+    mean = flat.mean(dim=2).reshape(batch, channels, 1, 1)
+    return mean, std
+def adaptive_instance_normalization(content_feat:Tensor, style_feat:Tensor):
+    """Re-scale ``content_feat`` so its channel statistics match ``style_feat``.
+    The content is normalized with its own per-channel mean/std (from
+    ``calc_mean_std``) and then scaled and shifted by the style's std and mean.
+    Args:
+        content_feat (Tensor): Feature whose statistics are replaced.
+        style_feat (Tensor): Feature that provides the target statistics.
+    """
+    target_shape = content_feat.size()
+    content_mean, content_std = calc_mean_std(content_feat)
+    style_mean, style_std = calc_mean_std(style_feat)
+    # Centre and whiten the content channel-wise.
+    centred = content_feat - content_mean.expand(target_shape)
+    whitened = centred / content_std.expand(target_shape)
+    # Apply the style statistics.
+    rescaled = whitened * style_std.expand(target_shape)
+    return rescaled + style_mean.expand(target_shape)
+def wavelet_blur(image: Tensor, radius: int):
+    """Blur ``image`` with a dilated 3x3 kernel applied to each channel separately.
+    The input is replicate-padded by ``radius`` on every side and convolved
+    with ``dilation=radius``, so the output keeps the input's spatial size.
+    Args:
+        image (Tensor): Input of shape (N, C, H, W). Any channel count is
+            accepted; the original implementation was fixed to C == 3.
+        radius (int): Padding width and dilation of the kernel.
+    Returns:
+        Tensor with the same shape as ``image``.
+    """
+    # 3x3 smoothing kernel whose weights sum to 1.
+    kernel_vals = [
+        [0.0625, 0.125, 0.0625],
+        [0.125, 0.25, 0.125],
+        [0.0625, 0.125, 0.0625],
+    ]
+    kernel = torch.tensor(kernel_vals, dtype=image.dtype, device=image.device)
+    # One (1, 3, 3) filter per input channel; with groups == channels each
+    # channel is filtered on its own.
+    channels = image.shape[1]
+    kernel = kernel[None, None].repeat(channels, 1, 1, 1)
+    image = F.pad(image, (radius, radius, radius, radius), mode='replicate')
+    output = F.conv2d(image, kernel, groups=channels, dilation=radius)
+    return output
+def wavelet_decomposition(image: Tensor, levels=5):
+    """Split ``image`` into accumulated high-frequency detail and a low-frequency residual.
+    At level ``i`` the current image is blurred with ``wavelet_blur`` at radius
+    ``2 ** i``; what the blur removed is added to the high-frequency sum and
+    the blurred result becomes the input of the next level.
+    Args:
+        image (Tensor): Input tensor accepted by ``wavelet_blur``.
+        levels (int): Number of blur levels. Default: 5.
+    Returns:
+        Tuple ``(high_freq, low_freq)``. With ``levels <= 0`` this is
+        ``(zeros_like(image), image)``; previously that case raised
+        UnboundLocalError because ``low_freq`` was never assigned.
+    """
+    high_freq = torch.zeros_like(image)
+    low_freq = image
+    for i in range(levels):
+        radius = 2 ** i
+        low_freq = wavelet_blur(image, radius)
+        high_freq += (image - low_freq)
+        image = low_freq
+    return high_freq, low_freq
+def wavelet_reconstruction(content_feat:Tensor, style_feat:Tensor):
+    """Combine the content's high-frequency part with the style's low-frequency part.
+    Both inputs are split with ``wavelet_decomposition``; the result is the
+    sum of the content's high-frequency component and the style's
+    low-frequency component.
+    """
+    # Keep only the detail (high-frequency) component of the content.
+    content_detail, _ = wavelet_decomposition(content_feat)
+    # Keep only the low-frequency component of the style.
+    _, style_base = wavelet_decomposition(style_feat)
+    return content_detail + style_base

diffusion-dpo-test/compare.py ADDED Viewed

	@@ -0,0 +1,73 @@

+from PIL import Image
+import numpy as np
+def compare_images(img_path1, img_path2, diff_output_path=None):
+    """Compare two images pixel by pixel after converting both to RGB.
+    Args:
+        img_path1: Path of the first image.
+        img_path2: Path of the second image.
+        diff_output_path: If not None, the absolute per-channel difference is
+            saved to this path as an RGB image.
+    Returns:
+        Dict with the keys ``total_pixels``, ``same_pixels``,
+        ``different_pixels``, ``different_ratio`` (0..1),
+        ``max_diff_per_pixel`` and ``mean_diff_per_pixel`` (both on the 0..255
+        scale, based on the per-pixel mean of the three channel differences).
+    Raises:
+        ValueError: If the two images have different sizes.
+    """
+    # Open inside a context manager so the file handles are closed; convert()
+    # returns an independent in-memory image, which also normalizes RGBA or
+    # grayscale inputs to RGB.
+    with Image.open(img_path1) as src1, Image.open(img_path2) as src2:
+        img1 = src1.convert("RGB")
+        img2 = src2.convert("RGB")
+    if img1.size != img2.size:
+        raise ValueError(f"两张图分辨率不同: {img1.size} vs {img2.size}")
+    # int16 avoids the wrap-around that uint8 subtraction would produce.
+    arr1 = np.array(img1, dtype=np.int16)
+    arr2 = np.array(img2, dtype=np.int16)
+    # Absolute difference per pixel and channel, shape (H, W, 3).
+    diff = np.abs(arr1 - arr2)
+    # Per-pixel difference strength: mean over the three channels, shape (H, W).
+    per_pixel_diff = diff.mean(axis=2)
+    total_pixels = per_pixel_diff.size
+    same_pixels = np.sum(per_pixel_diff == 0)
+    different_pixels = total_pixels - same_pixels
+    stats = {
+        "total_pixels": int(total_pixels),
+        "same_pixels": int(same_pixels),
+        "different_pixels": int(different_pixels),
+        "different_ratio": different_pixels / total_pixels,
+        "max_diff_per_pixel": float(per_pixel_diff.max()),
+        "mean_diff_per_pixel": float(per_pixel_diff.mean()),
+    }
+    if diff_output_path is not None:
+        # diff is already within 0..255 (absolute difference of 8-bit values),
+        # so a plain cast to uint8 is enough.
+        diff_image = Image.fromarray(diff.astype(np.uint8), mode="RGB")
+        diff_image.save(diff_output_path)
+    return stats
+if __name__ == "__main__":
+    # Compare the same sample rendered with DPO scale 0.0 and 1.0, and write the difference image.
+    report = compare_images(
+        "./results-test/dpo_scale_ablation/dpo_scale_0.0/0000010-seed-0.png",
+        "./results-test/dpo_scale_ablation/dpo_scale_1.0/0000010-seed-0.png",
+        diff_output_path="diff.png",
+    )
+    print("比较结果：")
+    for stat_name in report:
+        print(f"{stat_name}: {report[stat_name]}")

diffusion-dpo-test/compare_checkpoints.py ADDED Viewed

	@@ -0,0 +1,147 @@

+#!/usr/bin/env python3
+"""对比两个 checkpoint 的 safetensor 文件，检查权重是否真的在变化"""
+import sys
+from safetensors.torch import load_file
+import torch
+import os
+# ==============================
+# 在这里手动填写 checkpoint 路径
+# ==============================
+CHECKPOINT_1 = r"/home/wanghongbo06/diffusion-dpo-adv/results_1202_4/checkpoint-15/lora_train_unet/adapter_model.safetensors"
+CHECKPOINT_2 = r"/home/wanghongbo06/diffusion-dpo-adv/results_1202_4/checkpoint-60/lora_train_unet/adapter_model.safetensors"
+# ==============================
+def compare_safetensors(path1, path2):
+    """Compare two safetensors checkpoints tensor by tensor and print a change report.
+    The report covers how many tensors are identical or changed, the tensor
+    with the largest change, the lora_B change of a few hand-picked layers,
+    the ten lora_B tensors that changed most, and a final verdict.
+    If the key sets differ, or the files hold no tensors, only a warning is
+    printed. Any exception is caught, printed with its traceback, and not
+    re-raised.
+    Args:
+        path1: Path of the first (earlier) checkpoint file.
+        path2: Path of the second (later) checkpoint file.
+    """
+    print(f"\n{'='*80}")
+    print(f"对比两个 checkpoint:")
+    print(f"  Checkpoint 1: {path1}")
+    print(f"  Checkpoint 2: {path2}")
+    print(f"{'='*80}\n")
+    try:
+        state_dict1 = load_file(path1)
+        state_dict2 = load_file(path2)
+        # Both files must hold exactly the same keys.
+        keys1 = set(state_dict1.keys())
+        keys2 = set(state_dict2.keys())
+        if keys1 != keys2:
+            print("⚠️  警告: 两个 checkpoint 的键不一致!")
+            print(f"  只在 checkpoint1 中: {keys1 - keys2}")
+            print(f"  只在 checkpoint2 中: {keys2 - keys1}")
+            return
+        num_tensors = len(keys1)
+        if num_tensors == 0:
+            # The percentages below divide by the tensor count.
+            print("⚠️  警告: 两个 checkpoint 都不包含任何参数张量，无法对比")
+            return
+        print(f"✅ 两个 checkpoint 都有 {num_tensors} 个参数张量\n")
+        identical_count = 0
+        different_count = 0
+        max_diff_info = None
+        max_diff = 0
+        layer_diffs = {}
+        for key in sorted(keys1):
+            # Cast first so the subtraction runs in float32 rather than in the
+            # stored dtype of the checkpoint.
+            tensor1 = state_dict1[key].float()
+            tensor2 = state_dict2[key].float()
+            abs_diff = (tensor2 - tensor1).abs()
+            max_abs_diff = abs_diff.max().item()
+            mean_abs_diff = abs_diff.mean().item()
+            if max_abs_diff == 0:
+                identical_count += 1
+            else:
+                different_count += 1
+            if max_abs_diff > max_diff:
+                max_diff = max_abs_diff
+                max_diff_info = {
+                    'key': key,
+                    'max_diff': max_abs_diff,
+                    'mean_diff': mean_abs_diff,
+                    'tensor1_max': tensor1.abs().max().item(),
+                    'tensor2_max': tensor2.abs().max().item(),
+                }
+            if '.lora_B.' in key:
+                # Index lora_B changes by the module name in front of ".lora_B.";
+                # the first key seen for a module wins.
+                layer_name = key.split('.lora_B.')[0]
+                layer_diffs.setdefault(layer_name, {
+                    'max_diff': max_abs_diff,
+                    'mean_diff': mean_abs_diff,
+                    'key': key
+                })
+        print(f"差异统计:")
+        print(f"  完全相同的参数: {identical_count} / {num_tensors} ({identical_count/num_tensors*100:.2f}%)")
+        print(f"  有变化的参数:   {different_count} / {num_tensors} ({different_count/num_tensors*100:.2f}%)")
+        print()
+        if max_diff_info:
+            print(f"最大权重变化:")
+            print(f"  层: {max_diff_info['key']}")
+            print(f"  最大绝对差异: {max_diff_info['max_diff']:.6e}")
+            print(f"  平均绝对差异: {max_diff_info['mean_diff']:.6e}")
+            print(f"  Checkpoint1 最大值: {max_diff_info['tensor1_max']:.6e}")
+            print(f"  Checkpoint2 最大值: {max_diff_info['tensor2_max']:.6e}")
+            print()
+        # Hand-picked layer name fragments; at most two matches are shown per fragment.
+        key_layers = ['x_embedder', 'transformer_blocks.0', 'transformer_blocks.9',
+                      'single_transformer_blocks.30', 'proj_out']
+        print("关键层的 lora_B 权重变化:")
+        print("-" * 80)
+        for layer_prefix in key_layers:
+            matching = [k for k in layer_diffs if layer_prefix in k]
+            for layer_name in matching[:2]:
+                info = layer_diffs[layer_name]
+                print(f"\n层: {layer_name}")
+                print(f"  最大差异: {info['max_diff']:.6e}")
+                print(f"  平均差异: {info['mean_diff']:.6e}")
+                key = info['key']
+                t1 = state_dict1[key].float()
+                t2 = state_dict2[key].float()
+                print(f"  Checkpoint1: mean={t1.mean():.6e}, max={t1.abs().max():.6e}")
+                print(f"  Checkpoint2: mean={t2.mean():.6e}, max={t2.abs().max():.6e}")
+        print("\n" + "="*80)
+        print("lora_B 权重变化最大的前 10 个层:")
+        print("-" * 80)
+        sorted_layers = sorted(layer_diffs.items(), key=lambda x: x[1]['max_diff'], reverse=True)
+        for i, (layer_name, info) in enumerate(sorted_layers[:10], 1):
+            key = info['key']
+            t1 = state_dict1[key].float()
+            t2 = state_dict2[key].float()
+            print(f"\n{i}. {layer_name}")
+            print(f"   最大差异: {info['max_diff']:.6e}, 平均差异: {info['mean_diff']:.6e}")
+            print(f"   Ckpt1: mean={t1.mean():.6e}, max={t1.abs().max():.6e}")
+            print(f"   Ckpt2: mean={t2.mean():.6e}, max={t2.abs().max():.6e}")
+        # Verdict based on the share of tensors that changed at all.
+        print("\n" + "="*80)
+        if different_count == 0:
+            print("❌ 严重问题: 两个 checkpoint 完全相同，模型没有学习！")
+        elif different_count < num_tensors * 0.1:
+            print(f"⚠️  警告: 只有 {different_count/num_tensors*100:.2f}% 的参数在变化，可能存在梯度阻塞")
+        else:
+            print(f"✅ 正常: {different_count/num_tensors*100:.2f}% 的参数在变化")
+            if max_diff < 1e-6:
+                print(f"⚠️  但是: 最大变化只有 {max_diff:.6e}，变化幅度可能太小")
+        print("="*80)
+    except Exception as e:
+        # Diagnostic script: report the failure with its traceback instead of aborting.
+        print(f"❌ 错误: {e}")
+        import traceback
+        traceback.print_exc()
+if __name__ == "__main__":
+    # Two paths on the command line override the CHECKPOINT_1 / CHECKPOINT_2
+    # constants, so `python compare_checkpoints.py A B` really compares A and B.
+    cli_paths = sys.argv[1:]
+    if len(cli_paths) == 2:
+        compare_safetensors(cli_paths[0], cli_paths[1])
+    elif not cli_paths:
+        compare_safetensors(CHECKPOINT_1, CHECKPOINT_2)
+    else:
+        print(f"用法: python {sys.argv[0]} [<checkpoint1.safetensors> <checkpoint2.safetensors>]")
+        sys.exit(1)

diffusion-dpo-test/data_val/0000009-seed-0.png ADDED Viewed

diffusion-dpo-test/data_val/0000010-seed-0.png ADDED Viewed

diffusion-dpo-test/fix_lora_keys.py ADDED Viewed

	@@ -0,0 +1,132 @@

+#!/usr/bin/env python3
+"""
+修复已保存的 LoRA checkpoint 的 key 格式
+将 PEFT 格式转换为 Diffusers load_lora_weights 期望的格式
+PEFT 格式:     x_embedder.lora_A.train_unet.weight
+Diffusers 格式: transformer.x_embedder.lora_A.weight
+"""
+import os
+import sys
+from safetensors.torch import load_file, save_file
+def fix_lora_keys(input_path, output_path=None):
+    """Rewrite the keys of a LoRA safetensors file into ``transformer.<module>.lora_X.weight`` form.
+    For every key: a ``base_model.model.`` substring is removed, the
+    ``.train_unet.`` adapter-name segment is removed, and a ``transformer.``
+    prefix is added if missing.
+    Args:
+        input_path: Path of the safetensors file to fix.
+        output_path: Destination path. Defaults to ``input_path``; in that case
+            the original file is first renamed to ``<input_path>.backup``.
+    Returns:
+        The renamed state dict, or None when the file is empty or no key needs fixing.
+    Raises:
+        ValueError: If two original keys would map to the same fixed key.
+    """
+    if output_path is None:
+        output_path = input_path
+    print(f"\n{'='*80}")
+    print(f"修复 LoRA Key 格式")
+    print(f"  输入: {input_path}")
+    print(f"  输出: {output_path}")
+    print(f"{'='*80}\n")
+    state_dict = load_file(input_path)
+    print(f"加载了 {len(state_dict)} 个参数\n")
+    if not state_dict:
+        # Picking a sample key below would raise IndexError on an empty file.
+        print("⚠️  文件中没有任何参数，无需修复")
+        return
+    sample_key = next(iter(state_dict))
+    print(f"原始 key 格式示例: {sample_key}")
+    # Inspect every key, not just the first one, so a file whose first key
+    # happens to look fine is still fixed when later keys are not.
+    needs_fix = False
+    if any(".train_unet." in k for k in state_dict):
+        needs_fix = True
+        print("  ✓ 检测到 '.train_unet.' 后缀，需要移除")
+    if any(not k.startswith("transformer.") for k in state_dict):
+        needs_fix = True
+        print("  ✓ 缺少 'transformer.' 前缀，需要添加")
+    if not needs_fix:
+        print("\n✅ Key 格式已经正确，无需修复！")
+        return
+    print("\n开始修复...")
+    new_state_dict = {}
+    for k, v in state_dict.items():
+        # 1. Drop the base_model.model. wrapper prefix, if present.
+        new_k = k.replace("base_model.model.", "")
+        # 2. Drop the adapter-name segment.
+        new_k = new_k.replace(".train_unet.", ".")
+        # 3. Add the transformer. prefix.
+        if not new_k.startswith("transformer."):
+            new_k = "transformer." + new_k
+        if new_k in new_state_dict:
+            # Overwriting would silently drop a tensor; stop before touching any file.
+            raise ValueError(f"重命名后 key 冲突: {k} -> {new_k}")
+        new_state_dict[new_k] = v
+    new_sample_key = next(iter(new_state_dict))
+    print(f"修复后 key 格式示例: {new_sample_key}")
+    # Keep the original under a .backup name when writing in place.
+    if output_path == input_path:
+        backup_path = input_path + ".backup"
+        print(f"\n备份原文件到: {backup_path}")
+        os.rename(input_path, backup_path)
+    save_file(new_state_dict, output_path)
+    print(f"\n✅ 已保存修复后的文件: {output_path}")
+    # Re-read the written file and check every key.
+    print("\n验证修复结果...")
+    verify_dict = load_file(output_path)
+    bad_keys = [
+        k for k in verify_dict
+        if not k.startswith("transformer.") or ".train_unet." in k
+    ]
+    if not bad_keys:
+        print("✅ 验证通过！Key 格式正确")
+    else:
+        print(f"❌ 验证失败！Key 格式仍有问题: {bad_keys[0]}")
+    return new_state_dict
+def fix_checkpoint_dir(checkpoint_dir):
+    """Run ``fix_lora_keys`` on ``lora_train_unet/adapter_model.safetensors`` inside ``checkpoint_dir``.
+    Prints a message and does nothing else when that file does not exist.
+    """
+    adapter_file = os.path.join(checkpoint_dir, "lora_train_unet", "adapter_model.safetensors")
+    if not os.path.exists(adapter_file):
+        print(f"❌ 找不到文件: {adapter_file}")
+        return
+    fix_lora_keys(adapter_file)
+if __name__ == "__main__":
+    cli_args = sys.argv[1:]
+    if cli_args:
+        # Explicit target: a single .safetensors file or a checkpoint directory.
+        target = cli_args[0]
+        if target.endswith(".safetensors"):
+            fix_lora_keys(target)
+        elif os.path.isdir(target):
+            fix_checkpoint_dir(target)
+        else:
+            print(f"❌ 无效路径: {target}")
+    else:
+        # No argument: show the usage text, then fall back to the default results directory.
+        usage_lines = (
+            "用法:",
+            "  python fix_lora_keys.py <checkpoint_path>",
+            "  python fix_lora_keys.py <checkpoint_dir>",
+            "",
+            "示例:",
+            "  python fix_lora_keys.py results_1202_4/checkpoint-15/lora_train_unet/adapter_model.safetensors",
+            "  python fix_lora_keys.py results_1202_4/checkpoint-15",
+            "",
+        )
+        for usage_line in usage_lines:
+            print(usage_line)
+        default_root = "/home/wanghongbo06/diffusion-dpo-adv/results_1202_4"
+        if not os.path.exists(default_root):
+            print(f"默认目录 {default_root} 不存在")
+        else:
+            print(f"自动扫描 {default_root} 下的所有 checkpoint...")
+            # Every entry named checkpoint-* is fixed, in sorted name order.
+            checkpoint_names = [
+                entry for entry in sorted(os.listdir(default_root))
+                if entry.startswith("checkpoint-")
+            ]
+            for checkpoint_name in checkpoint_names:
+                fix_checkpoint_dir(os.path.join(default_root, checkpoint_name))

diffusion-dpo-test/inspect_safetensor.py ADDED Viewed

	@@ -0,0 +1,115 @@

+#!/usr/bin/env python3
+"""检查 safetensor 文件的内容"""
+from safetensors.torch import load_file
+import torch
+# ==========================================================
+# 在这里手动填写 safetensors 文件路径（可填写多个）
+# 示例：
+# SAFETENSOR_PATHS = [
+#     r"/path/to/adapter_model1.safetensors",
+#     r"/path/to/adapter_model2.safetensors",
+# ]
+# ==========================================================
+SAFETENSOR_PATHS = [
+    r"/home/wanghongbo06/diffusion-dpo-adv/results_1202_4/checkpoint-15/lora_train_unet/adapter_model.safetensors",
+]
+# ==========================================================
+def inspect_safetensor(path):
+    """Print a summary of the tensors stored in one safetensors file.
+    The report covers the tensor count, element-level zero / non-zero totals,
+    lora_A / lora_B statistics for a few hand-picked layers, and the five
+    lora_B tensors with the largest absolute entry.
+    Any exception is caught, printed with its traceback, and not re-raised.
+    Args:
+        path: Path of the safetensors file, read with ``load_file``.
+    """
+    print(f"\n{'='*80}")
+    print(f"检查文件: {path}")
+    print(f"{'='*80}\n")
+    try:
+        state_dict = load_file(path)
+        print(f"总共有 {len(state_dict)} 个参数张量\n")
+        total_params = 0
+        zero_params = 0
+        non_zero_params = 0
+        layer_stats = {}
+        for key, tensor in state_dict.items():
+            num_params = tensor.numel()
+            total_params += num_params
+            non_zero_count = (tensor != 0).sum().item()
+            zero_count = num_params - non_zero_count
+            # Count individual elements. The previous code added the whole
+            # tensor size to one bucket, so a tensor with a single non-zero
+            # entry was reported as entirely non-zero.
+            non_zero_params += non_zero_count
+            zero_params += zero_count
+            if '.lora_A.' in key or '.lora_B.' in key:
+                # Group both factors under the module name in front of ".lora_".
+                layer_name = key.split('.lora_')[0]
+                entry = layer_stats.setdefault(layer_name, {
+                    'lora_A': None,
+                    'lora_B': None
+                })
+                values = tensor.float()
+                stats_entry = {
+                    'mean': values.mean().item(),
+                    'std': values.std().item(),
+                    'max': values.max().item(),
+                    'min': values.min().item(),
+                    'abs_max': values.abs().max().item(),
+                    'non_zero_ratio': non_zero_count / num_params if num_params else 0.0
+                }
+                if '.lora_A.' in key:
+                    entry['lora_A'] = stats_entry
+                else:
+                    entry['lora_B'] = stats_entry
+        print(f"参数统计:")
+        print(f"  总参数数: {total_params:,}")
+        if total_params:
+            print(f"  非零参数: {non_zero_params:,} ({non_zero_params/total_params*100:.2f}%)")
+            print(f"  零参数:   {zero_params:,} ({zero_params/total_params*100:.2f}%)")
+        else:
+            # Empty file: the percentages would divide by zero.
+            print(f"  非零参数: 0")
+            print(f"  零参数:   0")
+        print()
+        # Hand-picked layer name fragments; at most three matches are shown per fragment.
+        key_layers = ['x_embedder', 'transformer_blocks.0', 'transformer_blocks.9',
+                      'single_transformer_blocks.30', 'proj_out']
+        print("关键层统计:")
+        print("-" * 80)
+        for layer_name in key_layers:
+            matching_layers = [k for k in layer_stats if layer_name in k]
+            for full_layer in matching_layers[:3]:
+                stats = layer_stats[full_layer]
+                print(f"\n层: {full_layer}")
+                if stats['lora_A']:
+                    print(f"  lora_A: mean={stats['lora_A']['mean']:.6e}, "
+                          f"max={stats['lora_A']['max']:.6e}, "
+                          f"非零比例={stats['lora_A']['non_zero_ratio']*100:.2f}%")
+                if stats['lora_B']:
+                    print(f"  lora_B: mean={stats['lora_B']['mean']:.6e}, "
+                          f"max={stats['lora_B']['max']:.6e}, "
+                          f"非零比例={stats['lora_B']['non_zero_ratio']*100:.2f}%")
+        print("\n" + "="*80)
+        print("lora_B 权重最大的前 5 个层:")
+        print("-" * 80)
+        lora_b_layers = [(k, v['lora_B']) for k, v in layer_stats.items() if v['lora_B'] is not None]
+        # Rank by the largest absolute entry so a large negative weight counts too;
+        # the signed max alone would miss it.
+        lora_b_layers.sort(key=lambda x: x[1]['abs_max'], reverse=True)
+        for i, (layer_name, stats) in enumerate(lora_b_layers[:5], 1):
+            print(f"{i}. {layer_name}")
+            print(f"   mean={stats['mean']:.6e}, max={stats['max']:.6e}, "
+                  f"非零比例={stats['non_zero_ratio']*100:.2f}%")
+    except Exception as e:
+        # Diagnostic script: report the failure with its traceback instead of aborting.
+        print(f"❌ 错误: {e}")
+        import traceback
+        traceback.print_exc()
+if __name__ == "__main__":
+    import sys
+    # Paths given on the command line take precedence, so
+    # `python inspect_safetensor.py <file>` inspects that file; with no
+    # arguments the hand-edited SAFETENSOR_PATHS list is used.
+    for path in (sys.argv[1:] or SAFETENSOR_PATHS):
+        inspect_safetensor(path)

diffusion-dpo-test/metrics.json ADDED Viewed

	@@ -0,0 +1,142 @@

+{
+  "summary": {
+    "avg_inference_time_sec": 23.599896386852894,
+    "std_inference_time_sec": 6.796912376816178,
+    "min_inference_time_sec": 11.098074495792389,
+    "max_inference_time_sec": 28.76287134224549,
+    "median_inference_time_sec": 26.921020144131035,
+    "p95_inference_time_sec": 28.102163634379394,
+    "p99_inference_time_sec": 28.270882021300498,
+    "throughput_single_gpu_per_sec": 0.042373067390121394,
+    "throughput_parallel_per_sec": 0.08250964031858578,
+    "peak_memory_mb": 33811.7392578125,
+    "peak_memory_gb": 33.01927661895752,
+    "total_images": 100,
+    "warmup_images": 8,
+    "measured_images": 92,
+    "model_load_time_sec": 52.43063974380493,
+    "inference_wall_time_sec": 1211.9795894622803,
+    "total_time_sec": 1264.4102292060852,
+    "num_gpus": 2
+  },
+  "per_gpu_metrics": {
+    "1": {
+      "inference_times": [
+        27.854881670325994,
+        27.943492013029754,
+        27.888656420167536,
+        27.804196048993617,
+        27.924615799915045,
+        27.952690774109215,
+        27.882505219895393,
+        27.896253335289657,
+        28.09671812877059,
+        27.85268287314102,
+        27.742382244672626,
+        28.76287134224549,
+        28.061799827963114,
+        28.054252550937235,
+        27.92581091215834,
+        27.954700701870024,
+        27.952801280654967,
+        27.932732206769288,
+        28.13525341823697,
+        27.90626529790461,
+        28.22222373681143,
+        27.82774563319981,
+        27.953424714971334,
+        28.111765013076365,
+        27.881029167678207,
+        27.89013520721346,
+        27.974640298169106,
+        28.015457140281796,
+        28.10881925234571,
+        27.839136187918484,
+        27.980245498009026,
+        27.941782257054,
+        27.858478610869497,
+        18.717311061918736,
+        11.598434458952397,
+        11.579710375983268,
+        11.584534626919776,
+        11.59768055099994,
+        11.199870459735394,
+        11.196961861103773,
+        11.191290821880102,
+        11.286846159026027,
+        11.2286821231246,
+        11.219772449228913,
+        11.226453838404268,
+        11.211608635727316
+      ],
+      "warmup_time": 115.76587492786348,
+      "peak_memory_mb": 33810.9892578125,
+      "allocated_memory_mb": 33196.46240234375,
+      "reserved_memory_mb": 34506.0,
+      "total_images": 50,
+      "avg_inference_time": 23.434121787122898,
+      "std_inference_time": 7.309697334208333,
+      "throughput": 0.04267281740208,
+      "memory_efficiency": 96.20489886496189
+    },
+    "0": {
+      "inference_times": [
+        26.802028878591955,
+        26.797191261779517,
+        26.82992110401392,
+        26.958114746958017,
+        26.761777761392295,
+        26.84792256169021,
+        26.746575728990138,
+        27.076817566063255,
+        26.943395509850234,
+        26.85071571683511,
+        26.841394792776555,
+        27.631777914240956,
+        26.833319212775677,
+        26.806214389856905,
+        26.8738283761777,
+        26.895515635609627,
+        26.940260547678918,
+        27.09848231682554,
+        26.858372538816184,
+        26.918541864026338,
+        26.94270030176267,
+        26.95117418281734,
+        26.760037765838206,
+        26.82909763790667,
+        26.831056244205683,
+        26.92349842423573,
+        26.80812106281519,
+        26.730416806880385,
+        27.080423870123923,
+        27.055579679086804,
+        27.27255716919899,
+        27.117452350910753,
+        26.751979127991945,
+        26.876946676988155,
+        26.70633071102202,
+        26.0279257488437,
+        25.012685468886048,
+        11.098074495792389,
+        11.139604906085879,
+        11.128950875252485,
+        11.235403429716825,
+        11.146971406880766,
+        11.146004664245993,
+        11.10717493435368,
+        11.114083136897534,
+        11.114445879124105
+      ],
+      "warmup_time": 111.6877530622296,
+      "peak_memory_mb": 33811.7392578125,
+      "allocated_memory_mb": 33197.21240234375,
+      "reserved_memory_mb": 34506.0,
+      "total_images": 50,
+      "avg_inference_time": 23.76567098658289,
+      "std_inference_time": 6.237739828068353,
+      "throughput": 0.042077499118983785,
+      "memory_efficiency": 96.20707239999928
+    }
+  }
+}

diffusion-dpo-test/results-test/DIV2K-val-epoch10/0000263-seed-0.png ADDED Viewed

diffusion-dpo-test/results-test/DIV2K-val-epoch10/0000463-seed-0.png ADDED Viewed

diffusion-dpo-test/results-test/DIV2K-val-epoch10/0000563-seed-0.png ADDED Viewed

diffusion-dpo-test/results-test/DIV2K-val-epoch10/0000763-seed-0.png ADDED Viewed

diffusion-dpo-test/results-test/DIV2K-val-epoch10/0000863-seed-0.png ADDED Viewed

diffusion-dpo-test/results-test/DrealSR/sony_160_x4.png ADDED Viewed

diffusion-dpo-test/results-test/DrealSR/sony_189_x4.png ADDED Viewed

diffusion-dpo-test/src/flux/__pycache__/block.cpython-310.pyc ADDED Viewed

Binary file (5.96 kB). View file

diffusion-dpo-test/src/flux/__pycache__/block.cpython-311.pyc ADDED Viewed

Binary file (14.6 kB). View file

diffusion-dpo-test/src/flux/__pycache__/condition.cpython-310.pyc ADDED Viewed

Binary file (3.32 kB). View file

diffusion-dpo-test/src/flux/__pycache__/condition.cpython-311.pyc ADDED Viewed

Binary file (5.83 kB). View file

diffusion-dpo-test/src/flux/__pycache__/generate.cpython-310.pyc ADDED Viewed

Binary file (5.79 kB). View file

diffusion-dpo-test/src/flux/__pycache__/generate.cpython-311.pyc ADDED Viewed

Binary file (11.1 kB). View file

diffusion-dpo-test/src/flux/__pycache__/lora_controller.cpython-310.pyc ADDED Viewed

Binary file (3 kB). View file

diffusion-dpo-test/src/flux/__pycache__/lora_controller.cpython-311.pyc ADDED Viewed

Binary file (5.13 kB). View file

diffusion-dpo-test/src/flux/__pycache__/pipeline_tools.cpython-310.pyc ADDED Viewed

Binary file (1.42 kB). View file

diffusion-dpo-test/src/flux/__pycache__/pipeline_tools.cpython-311.pyc ADDED Viewed

Binary file (2.56 kB). View file

diffusion-dpo-test/src/flux/__pycache__/transformer.cpython-310.pyc ADDED Viewed

Binary file (4.26 kB). View file