def get_video_resolution(video_path: Path) -> "dict | None":
    """Read basic stream metadata for one video file.

    Args:
        video_path: Path of the video file to probe.

    Returns:
        A dict with the keys ``width``, ``height``, ``fps`` and
        ``frame_count`` as reported by OpenCV, or ``None`` when the file
        cannot be opened. (The previous annotation and docstring claimed a
        ``(width, height)`` tuple, which never matched the code.)
    """
    cap = cv2.VideoCapture(str(video_path))
    try:
        if not cap.isOpened():
            return None

        return {
            'width': int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
            'height': int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
            'fps': cap.get(cv2.CAP_PROP_FPS),
            'frame_count': int(cap.get(cv2.CAP_PROP_FRAME_COUNT)),
        }
    finally:
        # Release the capture handle on every path, including the
        # "could not open" early return and any error raised by cap.get().
        cap.release()
resolution_stats = Counter() + mismatched_videos = [] + + total_videos = 0 + + for split_name in check_dirs: + split_dir = videos_dir / split_name + + if not split_dir.exists(): + print(f"Warning: Directory not found: {split_dir}") + continue + + print(f"\nChecking {split_name}...") + + # 查找所有子目录中的视频文件(支持嵌套结构) + video_files = [] + for subdir in sorted(split_dir.iterdir()): + if subdir.is_dir(): + # 递归查找子目录中的视频 + video_files.extend(subdir.glob('*.mp4')) + video_files.extend(subdir.glob('*.avi')) + elif subdir.suffix in ['.mp4', '.avi']: + # 视频文件直接在 split 目录下 + video_files.append(subdir) + + video_files = sorted(video_files) + + print(f"Found {len(video_files)} videos") + + for video_path in tqdm(video_files, desc=f"Processing {split_name}"): + info = get_video_resolution(video_path) + + if info is None: + print(f"Warning: Cannot read {video_path.name}") + continue + + width = info['width'] + height = info['height'] + resolution = (width, height) + + all_resolutions.append({ + 'file': video_path.name, + 'split': split_name, + 'width': width, + 'height': height, + 'fps': info['fps'], + 'frame_count': info['frame_count'], + }) + + resolution_stats[resolution] += 1 + + # 检查是否匹配目标分辨率 + if target_res: + if resolution != target_res: + mismatched_videos.append({ + 'file': video_path.name, + 'split': split_name, + 'resolution': f"{width}x{height}", + 'expected': f"{target_res[0]}x{target_res[1]}", + }) + + total_videos += 1 + + # 打印统计结果 + print("\n" + "="*60) + print("STATISTICS") + print("="*60) + print(f"Total videos checked: {total_videos}") + print(f"Unique resolutions: {len(resolution_stats)}") + print() + + print("Resolution distribution:") + print("-" * 60) + for (width, height), count in resolution_stats.most_common(): + percentage = count / total_videos * 100 + print(f" {width:4d} x {height:4d}: {count:4d} videos ({percentage:5.1f}%)") + + # 打印最常见的分辨率 + if resolution_stats: + most_common = resolution_stats.most_common(1)[0] + print(f"\nMost common resolution: 
{most_common[0][0]}x{most_common[0][1]} ({most_common[1]} videos)") + + # 如果有目标分辨率,显示不匹配的 + if target_res: + print("\n" + "="*60) + print("RESOLUTION MATCHING") + print("="*60) + if mismatched_videos: + print(f"Videos NOT matching target resolution: {len(mismatched_videos)}") + print("\nMismatched videos (first 20):") + for item in mismatched_videos[:20]: + print(f" {item['split']}/{item['file']}: {item['resolution']} (expected {item['expected']})") + if len(mismatched_videos) > 20: + print(f" ... and {len(mismatched_videos) - 20} more") + else: + print(f"✓ All videos match target resolution {target_res[0]}x{target_res[1]}") + + # 保存详细结果 + output_file = videos_dir.parent / 'video_resolution_report.json' + import json + report = { + 'total_videos': total_videos, + 'resolution_distribution': {f"{w}x{h}": count for (w, h), count in resolution_stats.items()}, + 'videos': all_resolutions, + } + + if target_res: + report['target_resolution'] = f"{target_res[0]}x{target_res[1]}" + report['mismatched_count'] = len(mismatched_videos) + report['mismatched_videos'] = mismatched_videos + + with open(output_file, 'w') as f: + json.dump(report, f, indent=2) + + print(f"\nDetailed report saved to: {output_file}") + + +if __name__ == '__main__': + main() + diff --git a/diffusion-dpo-ocr/prepare_roadtext.py b/diffusion-dpo-ocr/prepare_roadtext.py new file mode 100644 index 0000000000000000000000000000000000000000..8445a59b28eb1764b986168635936ca1de221d99 --- /dev/null +++ b/diffusion-dpo-ocr/prepare_roadtext.py @@ -0,0 +1,625 @@ +""" +RoadText1K 预处理脚本 +从视频中提取帧,resize 到 512x512,合并 Localisation 和 Text_Transcription 标注 + +使用流程: +1. 运行此脚本生成 GT images (512x512) 和合并后的标注 +2. 用你的方式生成 LR 和 SR images +3. 
def load_text_transcriptions(root_dir: str) -> Dict:
    """Load and merge every Text_Transcription annotation file.

    Reads all ``*.json`` files in ``<root_dir>/Ground_truths/Text_Transcription``.
    Each file is expected to hold ``{video_name: {label_id: text}}``; entries
    for the same video coming from different files are merged, later files
    (in sorted path order) overriding earlier ones on duplicate label ids.

    Args:
        root_dir: Dataset root directory.

    Returns:
        ``{video_name: {label_id: text}}`` merged over all readable files.
        A video whose value is not a dict still gets an (empty) entry.
        Files that cannot be read or parsed are reported on stdout and
        skipped.
    """
    text_dir = Path(root_dir) / 'Ground_truths' / 'Text_Transcription'
    all_texts = {}

    print("Loading Text_Transcription annotations...")
    # A single '*.json' pattern already matches double-extension names such
    # as 'foo.json.json'. Globbing '*.json.json' in addition listed those
    # files twice, which inflated the count below and parsed them twice.
    json_files = sorted(text_dir.glob('*.json'))
    print(f"Found {len(json_files)} JSON files")

    for json_file in json_files:
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error loading {json_file.name}: {e}")
            continue

        if not isinstance(data, dict):
            print(f"Warning: {json_file.name} is not a dict format")
            continue

        for video_name, texts in data.items():
            video_entry = all_texts.setdefault(video_name, {})
            if isinstance(texts, dict):
                video_entry.update(texts)

    print(f"Loaded texts for {len(all_texts)} videos")
    return all_texts
def clip_polygon_to_crop(poly: List[float], crop_x: int, crop_y: int, crop_size: int) -> List[float]:
    """Express a polygon in crop-local coordinates, clamped to the crop.

    Every vertex is shifted by the crop origin ``(crop_x, crop_y)`` and each
    resulting coordinate is then limited to the closed range
    ``[0, crop_size]``. For a polygon lying entirely inside the crop this is
    a pure translation; the clamp only matters for vertices outside it.

    Args:
        poly: Flat vertex list ``[x0, y0, x1, y1, ...]`` in original-image
            coordinates.
        crop_x: Left edge of the crop in the original image.
        crop_y: Top edge of the crop in the original image.
        crop_size: Side length of the square crop.

    Returns:
        A new flat vertex list in crop coordinates.
    """
    def _clamp(value):
        # Keep the coordinate within the crop square.
        return max(0, min(value, crop_size))

    result = []
    for idx in range(0, len(poly), 2):
        px, py = poly[idx], poly[idx + 1]
        result += [_clamp(px - crop_x), _clamp(py - crop_y)]
    return result
def merge_annotations(loc_ann: Dict, text_dict: Dict) -> Dict:
    """Combine one frame's localisation labels with their transcriptions.

    Args:
        loc_ann: Localisation entry for a single frame. Reads the keys
            ``videoName`` and ``labels``; each label contributes its ``id``
            and its ``box2d`` dict (``x1``, ``y1``, ``x2``, ``y2``).
        text_dict: Transcriptions as ``{video_name: {label_id: text}}``,
            where ``label_id`` is looked up as a string.

    Returns:
        A dict of three parallel lists::

            {
                'polygons': [[x1, y1, x2, y1, x2, y2, x1, y2], ...],
                'texts': ['text1', 'text2', ...],
                'ignore': [False, True, ...],
            }

        Labels without a ``box2d`` are dropped. A box is flagged ``ignore``
        when its text is empty, ``'###'`` or ``'illegible'`` (any case).
    """
    video_name = loc_ann.get('videoName', '')
    labels = loc_ann.get('labels') or []  # 'labels' may be present but null

    # The video may be missing from the transcriptions or mapped to null.
    video_texts = text_dict.get(video_name) or {}

    polygons = []
    texts = []
    ignore = []

    for label in labels:
        box2d = label.get('box2d')
        if not box2d:
            continue

        x1, y1 = box2d.get('x1', 0), box2d.get('y1', 0)
        x2, y2 = box2d.get('x2', 0), box2d.get('y2', 0)

        # Axis-aligned box expressed as a 4-point polygon, starting at
        # (x1, y1) and continuing to (x2, y1), (x2, y2), (x1, y2).
        polygons.append([x1, y1, x2, y1, x2, y2, x1, y2])

        raw_text = video_texts.get(str(label.get('id', '')), '')
        # Transcription values come straight from JSON and may be null or
        # numeric; normalise to str so the checks below cannot raise
        # AttributeError on .lower().
        if raw_text is None:
            text = ''
        elif isinstance(raw_text, str):
            text = raw_text
        else:
            text = str(raw_text)
        texts.append(text)

        ignore.append(text in ('', '###') or text.lower() == 'illegible')

    return {
        'polygons': polygons,
        'texts': texts,
        'ignore': ignore,
    }
not video_dir.exists(): + print(f"Error: Video directory not found: {video_dir}") + return + + # 获取所有视频文件(支持嵌套子目录结构) + video_files = [] + for subdir in sorted(video_dir.iterdir()): + if subdir.is_dir(): + # 子目录中的视频文件 + video_files.extend(subdir.glob('*.mp4')) + video_files.extend(subdir.glob('*.avi')) + elif subdir.suffix in ['.mp4', '.avi']: + # 视频文件直接在 split 目录下 + video_files.append(subdir) + + video_files = sorted(video_files) + total_videos = len(video_files) + print(f"找到 {total_videos} 个视频") + print() + + # 处理视频 + new_annotations = {} + processed_count = 0 + total_frames = 0 + + print("Processing videos...") + print("Step 1: 找出每个视频中有标注的帧...") + + # 第一步:找出每个视频中有标注的帧索引,同时记录标注 key + video_annotated_frames = {} # {video_name: [(frame_num, annotation_key), ...]} + video_name_to_path = {vp.stem: vp for vp in video_files} + + for key, ann in loc_annotations.items(): + # 从标注的 name 中提取视频名和帧号 + # 格式可能是: "200_frames/170-0000001.jpg" 或 "test_frames/701-0000001.jpg" + parts = key.split('/') + if len(parts) >= 2: + filename = parts[-1] # "170-0000001.jpg" + else: + filename = key # "170-0000001.jpg" + + # 提取帧号 + if '-' in filename: + try: + frame_str = filename.split('-')[-1].split('.')[0] + frame_num = int(frame_str) + + # 尝试提取视频名 + video_name_candidate = filename.split('-')[0] + + # 检查是否在我们的视频列表中 + if video_name_candidate in video_name_to_path: + if video_name_candidate not in video_annotated_frames: + video_annotated_frames[video_name_candidate] = [] + # 存储 (frame_num, annotation_key) 以便后续使用 + entry = (frame_num, key) + if entry not in video_annotated_frames[video_name_candidate]: + video_annotated_frames[video_name_candidate].append(entry) + except: + pass + + print(f"找到 {len(video_annotated_frames)} 个视频有标注") + total_annotated_frames = sum(len(frames) for frames in video_annotated_frames.values()) + print(f"总共有 {total_annotated_frames} 帧有标注") + + # 如果设置了 max_frames,随机选取 + max_frames = CONFIG['max_frames'] + if max_frames is not None and total_annotated_frames > 
max_frames: + print(f"\n限制最大帧数为 {max_frames},随机选取中...") + + # 将所有帧展开为列表 [(video_name, frame_num, annotation_key), ...] + all_frames = [] + for video_name, frame_list in video_annotated_frames.items(): + for frame_num, ann_key in frame_list: + all_frames.append((video_name, frame_num, ann_key)) + + # 随机选取 + selected_frames = random.sample(all_frames, max_frames) + + # 重新组织为 video_annotated_frames 格式 + video_annotated_frames = {} + for video_name, frame_num, ann_key in selected_frames: + if video_name not in video_annotated_frames: + video_annotated_frames[video_name] = [] + video_annotated_frames[video_name].append((frame_num, ann_key)) + + print(f"随机选取 {max_frames} 帧,涉及 {len(video_annotated_frames)} 个视频") + + crop_size = CONFIG['crop_size'] + crop_strategy = CONFIG['crop_strategy'] + min_box_overlap = CONFIG['min_box_overlap'] + min_text_boxes = CONFIG['min_text_boxes'] + + print() + print("Step 2: 提取帧并 Crop(保留有效文本框)...") + print(f" Crop 尺寸: {crop_size}x{crop_size}") + print(f" Crop 策略: {crop_strategy}") + print(f" 最小重叠比例: {min_box_overlap:.0%}") + print(f" 最小文本框数: {min_text_boxes}") + print() + + skipped_no_boxes = 0 + + for video_path in tqdm(video_files, desc="Videos"): + video_name = video_path.stem + + # 获取该视频有标注的帧信息 [(frame_num, ann_key), ...] 
+ annotated_frame_info = video_annotated_frames.get(video_name, []) + + if not annotated_frame_info: + continue + + # 获取视频分辨率 + cap = cv2.VideoCapture(str(video_path)) + if not cap.isOpened(): + continue + orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + cap.release() + + if orig_w == 0 or orig_h == 0: + continue + + # 处理每一帧 + for frame_num, ann_key in annotated_frame_info: + if ann_key not in loc_annotations: + continue + + loc_ann = loc_annotations[ann_key] + if loc_ann is None: + continue + + # 合并标注 + merged_ann = merge_annotations(loc_ann, text_transcriptions) + + if not merged_ann['polygons']: + skipped_no_boxes += 1 + continue + + # 找到最佳的 crop 位置 + crop_x, crop_y, valid_indices = find_best_crop_position( + merged_ann['polygons'], + orig_w, + orig_h, + crop_size, + crop_strategy, + min_box_overlap + ) + + # 检查有效文本框数量 + if len(valid_indices) < min_text_boxes: + skipped_no_boxes += 1 + continue + + # 提取并 crop 帧 + img_filename, success = extract_frame_with_crop( + video_path, frame_num, gt_dir, + crop_x, crop_y, crop_size + ) + + if not success: + continue + + # 只保留有效的文本框,裁剪并调整坐标 + cropped_polygons = [] + cropped_texts = [] + cropped_ignore = [] + + for i in valid_indices: + # 裁剪多边形到 crop 区域并转换坐标 + clipped_poly = clip_polygon_to_crop( + merged_ann['polygons'][i], crop_x, crop_y, crop_size + ) + cropped_polygons.append(clipped_poly) + cropped_texts.append(merged_ann['texts'][i]) + cropped_ignore.append(merged_ann['ignore'][i]) + + new_annotations[img_filename] = { + 'polygons': cropped_polygons, + 'texts': cropped_texts, + 'ignore': cropped_ignore, + 'original_name': ann_key, + 'crop_position': [crop_x, crop_y], + } + + total_frames += 1 + + processed_count += 1 + + # 保存标注 + ann_output_path = output_dir / 'annotations.json' + with open(ann_output_path, 'w', encoding='utf-8') as f: + json.dump(new_annotations, f, indent=2, ensure_ascii=False) + + # 统计有效文本框 + total_boxes = sum(len(ann['texts']) for ann in 
new_annotations.values()) + valid_boxes = sum( + sum(1 for ig in ann['ignore'] if not ig) + for ann in new_annotations.values() + ) + + print() + print("="*60) + print("完成!") + print("="*60) + print(f"处理视频数: {processed_count}") + print(f"提取帧数: {total_frames}") + print(f"跳过帧数(文本框不足): {skipped_no_boxes}") + print(f"有标注的图片数: {len(new_annotations)}") + print(f"文本框总数: {total_boxes}") + print(f"有效文本框: {valid_boxes}") + print(f"每张图平均文本框: {total_boxes/max(1,len(new_annotations)):.1f}") + print() + print("输出文件:") + print(f" GT images ({crop_size}x{crop_size}): {gt_dir}") + print(f" Annotations: {ann_output_path}") + print() + print("下一步:") + print(f" 1. 用你的方式生成 LR images (如 128x128)") + print(f" 2. 超分得到 SR images ({crop_size}x{crop_size})") + print(f" 3. 将 SR images 保存到 {output_dir}/sr") + print(f" 4. 运行 test_roadtext.py 评估") + + +if __name__ == '__main__': + main() + diff --git a/diffusion-dpo-ocr/results/roadtext_eval_results_BSRGAN.json b/diffusion-dpo-ocr/results/roadtext_eval_results_BSRGAN.json new file mode 100644 index 0000000000000000000000000000000000000000..db9fb938a382b7c4e7ddf659ad4deb1ebec35b1b --- /dev/null +++ b/diffusion-dpo-ocr/results/roadtext_eval_results_BSRGAN.json @@ -0,0 +1,17 @@ +{ + "TP": 51, + "FP": 80, + "FN": 1920, + "Precision": 0.3893129770992366, + "Recall": 0.0258751902587519, + "F1-Score": 0.04852521408182683, + "OCR_detections": 131, + "GT_boxes": 1971, + "Detection_rate": 0.06646372399797057, + "Det_Precision": 0.6183206106870229, + "Det_Recall": 0.0410958904109589, + "Det_F1": 0.07706945765937202, + "Det_matched": 81, + "Text_matched": 51, + "eval_mode": "end2end" +} \ No newline at end of file diff --git a/diffusion-dpo-ocr/results/roadtext_eval_results_DP2O-SR.json b/diffusion-dpo-ocr/results/roadtext_eval_results_DP2O-SR.json new file mode 100644 index 0000000000000000000000000000000000000000..96f467b8d0a2ed06cc34eb900c10df5510b061bb --- /dev/null +++ b/diffusion-dpo-ocr/results/roadtext_eval_results_DP2O-SR.json @@ -0,0 +1,17 @@ +{ + 
"TP": 314, + "FP": 1182, + "FN": 1657, + "Precision": 0.20989304812834225, + "Recall": 0.15930999492643327, + "F1-Score": 0.181136429189501, + "OCR_detections": 1496, + "GT_boxes": 1971, + "Detection_rate": 0.7590055809233891, + "Det_Precision": 0.5274064171122995, + "Det_Recall": 0.4003044140030441, + "Det_F1": 0.4551485434092875, + "Det_matched": 789, + "Text_matched": 314, + "eval_mode": "end2end" +} \ No newline at end of file diff --git a/diffusion-dpo-ocr/results/roadtext_eval_results_DiT4SR.json b/diffusion-dpo-ocr/results/roadtext_eval_results_DiT4SR.json new file mode 100644 index 0000000000000000000000000000000000000000..0770b317fcc05f322681ecba7b557ceca8390269 --- /dev/null +++ b/diffusion-dpo-ocr/results/roadtext_eval_results_DiT4SR.json @@ -0,0 +1,17 @@ +{ + "TP": 226, + "FP": 903, + "FN": 1745, + "Precision": 0.20017714791851196, + "Recall": 0.11466260781329274, + "F1-Score": 0.14580645161290323, + "OCR_detections": 1129, + "GT_boxes": 1971, + "Detection_rate": 0.5728056823947235, + "Det_Precision": 0.5234720992028343, + "Det_Recall": 0.2998477929984779, + "Det_F1": 0.3812903225806451, + "Det_matched": 591, + "Text_matched": 226, + "eval_mode": "end2end" +} \ No newline at end of file diff --git a/diffusion-dpo-ocr/results/roadtext_eval_results_DiffBIR.json b/diffusion-dpo-ocr/results/roadtext_eval_results_DiffBIR.json new file mode 100644 index 0000000000000000000000000000000000000000..0ac7bd9f8265ababed33203138aa89e45c7386c8 --- /dev/null +++ b/diffusion-dpo-ocr/results/roadtext_eval_results_DiffBIR.json @@ -0,0 +1,17 @@ +{ + "TP": 172, + "FP": 687, + "FN": 1799, + "Precision": 0.20023282887077998, + "Recall": 0.08726534753932014, + "F1-Score": 0.1215547703180212, + "OCR_detections": 859, + "GT_boxes": 1971, + "Detection_rate": 0.43581938102486045, + "Det_Precision": 0.5448195576251456, + "Det_Recall": 0.2374429223744292, + "Det_F1": 0.33074204946996466, + "Det_matched": 468, + "Text_matched": 172, + "eval_mode": "end2end" +} \ No newline at end of 
file diff --git a/diffusion-dpo-ocr/results/roadtext_eval_results_FaithDiff.json b/diffusion-dpo-ocr/results/roadtext_eval_results_FaithDiff.json new file mode 100644 index 0000000000000000000000000000000000000000..f890b6621abc353fbdc569cc38d9a6e980335c4d --- /dev/null +++ b/diffusion-dpo-ocr/results/roadtext_eval_results_FaithDiff.json @@ -0,0 +1,17 @@ +{ + "TP": 210, + "FP": 1068, + "FN": 1761, + "Precision": 0.1643192488262911, + "Recall": 0.106544901065449, + "F1-Score": 0.12927054478301014, + "OCR_detections": 1278, + "GT_boxes": 1971, + "Detection_rate": 0.6484018264840182, + "Det_Precision": 0.4890453834115806, + "Det_Recall": 0.31709791983764585, + "Det_F1": 0.38473376423514927, + "Det_matched": 625, + "Text_matched": 210, + "eval_mode": "end2end" +} \ No newline at end of file diff --git a/diffusion-dpo-ocr/results/roadtext_eval_results_Ours.json b/diffusion-dpo-ocr/results/roadtext_eval_results_Ours.json new file mode 100644 index 0000000000000000000000000000000000000000..0533867652304e5b3bfb7b1c2749b8457fd40abe --- /dev/null +++ b/diffusion-dpo-ocr/results/roadtext_eval_results_Ours.json @@ -0,0 +1,17 @@ +{ + "TP": 233, + "FP": 1483, + "FN": 1738, + "Precision": 0.1357808857808858, + "Recall": 0.11821410451547437, + "F1-Score": 0.12639001898562516, + "OCR_detections": 1716, + "GT_boxes": 1971, + "Detection_rate": 0.8706240487062404, + "Det_Precision": 0.3275058275058275, + "Det_Recall": 0.28513444951801115, + "Det_F1": 0.3048548955790616, + "Det_matched": 562, + "Text_matched": 233, + "eval_mode": "end2end" +} \ No newline at end of file diff --git a/diffusion-dpo-ocr/results/roadtext_eval_results_Real-ESRGAN.json b/diffusion-dpo-ocr/results/roadtext_eval_results_Real-ESRGAN.json new file mode 100644 index 0000000000000000000000000000000000000000..684219f7808b04a2d8d55850bfc5d93c84d0f636 --- /dev/null +++ b/diffusion-dpo-ocr/results/roadtext_eval_results_Real-ESRGAN.json @@ -0,0 +1,17 @@ +{ + "TP": 64, + "FP": 191, + "FN": 1907, + "Precision": 
0.25098039215686274, + "Recall": 0.032470826991374935, + "F1-Score": 0.05750224618149146, + "OCR_detections": 255, + "GT_boxes": 1971, + "Detection_rate": 0.1293759512937595, + "Det_Precision": 0.5882352941176471, + "Det_Recall": 0.076103500761035, + "Det_F1": 0.1347708894878706, + "Det_matched": 150, + "Text_matched": 64, + "eval_mode": "end2end" +} \ No newline at end of file diff --git a/diffusion-dpo-ocr/results/roadtext_eval_results_SUPSR.json b/diffusion-dpo-ocr/results/roadtext_eval_results_SUPSR.json new file mode 100644 index 0000000000000000000000000000000000000000..81dd0029919abc1b7b688efc335e5217cd5d1e9b --- /dev/null +++ b/diffusion-dpo-ocr/results/roadtext_eval_results_SUPSR.json @@ -0,0 +1,17 @@ +{ + "TP": 219, + "FP": 1267, + "FN": 1752, + "Precision": 0.14737550471063257, + "Recall": 0.1111111111111111, + "F1-Score": 0.126699450390512, + "OCR_detections": 1486, + "GT_boxes": 1971, + "Detection_rate": 0.7539320142059868, + "Det_Precision": 0.4878869448183042, + "Det_Recall": 0.3678335870116692, + "Det_F1": 0.41943881978594155, + "Det_matched": 725, + "Text_matched": 219, + "eval_mode": "end2end" +} \ No newline at end of file diff --git a/diffusion-dpo-ocr/results/roadtext_eval_results_SeeSR.json b/diffusion-dpo-ocr/results/roadtext_eval_results_SeeSR.json new file mode 100644 index 0000000000000000000000000000000000000000..9cda1686395e4b11e9b187236021f09555e8bff2 --- /dev/null +++ b/diffusion-dpo-ocr/results/roadtext_eval_results_SeeSR.json @@ -0,0 +1,17 @@ +{ + "TP": 0, + "FP": 0, + "FN": 0, + "Precision": 0, + "Recall": 0, + "F1-Score": 0, + "OCR_detections": 0, + "GT_boxes": 0, + "Detection_rate": 0, + "Det_Precision": 0, + "Det_Recall": 0, + "Det_F1": 0, + "Det_matched": 0, + "Text_matched": 0, + "eval_mode": "end2end" +} \ No newline at end of file diff --git a/diffusion-dpo-ocr/results/roadtext_eval_results_StableSR.json b/diffusion-dpo-ocr/results/roadtext_eval_results_StableSR.json new file mode 100644 index 
0000000000000000000000000000000000000000..9618580a43d27281373eb5e77037e6cf34d36547 --- /dev/null +++ b/diffusion-dpo-ocr/results/roadtext_eval_results_StableSR.json @@ -0,0 +1,17 @@ +{ + "TP": 141, + "FP": 672, + "FN": 1830, + "Precision": 0.17343173431734318, + "Recall": 0.0715372907153729, + "F1-Score": 0.10129310344827586, + "OCR_detections": 813, + "GT_boxes": 1971, + "Detection_rate": 0.4124809741248097, + "Det_Precision": 0.5276752767527675, + "Det_Recall": 0.2176560121765601, + "Det_F1": 0.3081896551724138, + "Det_matched": 429, + "Text_matched": 141, + "eval_mode": "end2end" +} \ No newline at end of file diff --git a/diffusion-dpo-ocr/results/roadtext_eval_results_SwinIR.json b/diffusion-dpo-ocr/results/roadtext_eval_results_SwinIR.json new file mode 100644 index 0000000000000000000000000000000000000000..24237bd9cdc88b261ab38ffa9856fcb3d37f26ee --- /dev/null +++ b/diffusion-dpo-ocr/results/roadtext_eval_results_SwinIR.json @@ -0,0 +1,17 @@ +{ + "TP": 66, + "FP": 182, + "FN": 1905, + "Precision": 0.2661290322580645, + "Recall": 0.0334855403348554, + "F1-Score": 0.05948625506985128, + "OCR_detections": 248, + "GT_boxes": 1971, + "Detection_rate": 0.12582445459157787, + "Det_Precision": 0.5806451612903226, + "Det_Recall": 0.0730593607305936, + "Det_F1": 0.12978819287967552, + "Det_matched": 144, + "Text_matched": 66, + "eval_mode": "end2end" +} \ No newline at end of file diff --git a/diffusion-dpo-ocr/results/roadtext_eval_results_gt.json b/diffusion-dpo-ocr/results/roadtext_eval_results_gt.json new file mode 100644 index 0000000000000000000000000000000000000000..25c4fc6574a4ebb445b96438592c3aa3adac9dd0 --- /dev/null +++ b/diffusion-dpo-ocr/results/roadtext_eval_results_gt.json @@ -0,0 +1,17 @@ +{ + "TP": 953, + "FP": 576, + "FN": 1018, + "Precision": 0.6232831916285154, + "Recall": 0.4835109081684424, + "F1-Score": 0.5445714285714285, + "OCR_detections": 1529, + "GT_boxes": 1971, + "Detection_rate": 0.7757483510908169, + "Det_Precision": 0.6487900588620014, 
+ "Det_Recall": 0.5032978183663115, + "Det_F1": 0.5668571428571428, + "Det_matched": 992, + "Text_matched": 953, + "eval_mode": "end2end" +} \ No newline at end of file diff --git a/diffusion-dpo-ocr/results/roadtext_eval_results_sample00.json b/diffusion-dpo-ocr/results/roadtext_eval_results_sample00.json new file mode 100644 index 0000000000000000000000000000000000000000..9cda1686395e4b11e9b187236021f09555e8bff2 --- /dev/null +++ b/diffusion-dpo-ocr/results/roadtext_eval_results_sample00.json @@ -0,0 +1,17 @@ +{ + "TP": 0, + "FP": 0, + "FN": 0, + "Precision": 0, + "Recall": 0, + "F1-Score": 0, + "OCR_detections": 0, + "GT_boxes": 0, + "Detection_rate": 0, + "Det_Precision": 0, + "Det_Recall": 0, + "Det_F1": 0, + "Det_matched": 0, + "Text_matched": 0, + "eval_mode": "end2end" +} \ No newline at end of file diff --git a/diffusion-dpo-ocr/results/roadtext_eval_results_zoomlr.json b/diffusion-dpo-ocr/results/roadtext_eval_results_zoomlr.json new file mode 100644 index 0000000000000000000000000000000000000000..cb79f586820eccb0b98d325c8800a5e925056add --- /dev/null +++ b/diffusion-dpo-ocr/results/roadtext_eval_results_zoomlr.json @@ -0,0 +1,17 @@ +{ + "TP": 8, + "FP": 106, + "FN": 1963, + "Precision": 0.07017543859649122, + "Recall": 0.004058853373921867, + "F1-Score": 0.007673860911270983, + "OCR_detections": 114, + "GT_boxes": 1971, + "Detection_rate": 0.0578386605783866, + "Det_Precision": 0.07017543859649122, + "Det_Recall": 0.004058853373921867, + "Det_F1": 0.007673860911270983, + "Det_matched": 8, + "Text_matched": 8, + "eval_mode": "end2end" +} \ No newline at end of file diff --git a/diffusion-dpo-ocr/roadtext_eval_results_output.json b/diffusion-dpo-ocr/roadtext_eval_results_output.json new file mode 100644 index 0000000000000000000000000000000000000000..1421bba9de6cccf996a8c3097bcba019e244084d --- /dev/null +++ b/diffusion-dpo-ocr/roadtext_eval_results_output.json @@ -0,0 +1,17 @@ +{ + "TP": 167, + "FP": 707, + "FN": 1804, + "Precision": 0.19107551487414187, 
+ "Recall": 0.08472856418061897, + "F1-Score": 0.11739894551845341, + "OCR_detections": 874, + "GT_boxes": 1971, + "Detection_rate": 0.44342973110096395, + "Det_Precision": 0.540045766590389, + "Det_Recall": 0.23947234906139014, + "Det_F1": 0.3318101933216168, + "Det_matched": 472, + "Text_matched": 167, + "eval_mode": "end2end" +} \ No newline at end of file diff --git a/diffusion-dpo-ocr/test_roadtext.py b/diffusion-dpo-ocr/test_roadtext.py new file mode 100644 index 0000000000000000000000000000000000000000..e2f199603dfbc5352891087ed6b27dadc2d89894 --- /dev/null +++ b/diffusion-dpo-ocr/test_roadtext.py @@ -0,0 +1,514 @@ +""" +RoadText1K OCR 评估脚本 +评估超分图片在 OCR 任务上的 Precision 和 Recall + +Metrics: + - Precision: TP / (TP + FP) + - Recall: TP / (TP + FN) + - F1-Score: 2 * Precision * Recall / (Precision + Recall) +""" + +import os +import json +from pathlib import Path +from typing import List, Dict, Tuple +import difflib + +import numpy as np +from PIL import Image +from tqdm import tqdm + + +# ============================================================================ +# 配置参数 - 请修改这里 +# ============================================================================ +CONFIG = { + # SR images 目录 + 'sr_dir': '/home/wanghongbo06/baipurui/results/RoadText/DreamClear/results/output', + # 'sr_dir': '/home/wanghongbo06/baipurui/DATA/RoadText1k_patch_crop/SR-Eval/zoomlr', + + # 标注文件 (prepare_roadtext.py 生成的) + 'annotation_file': '/home/wanghongbo06/baipurui/DATA/RoadText1k_patch_crop/annotations.json', + + # OCR 引擎选择: 'paddleocr', 'easyocr', 或 'tesseract' + 'ocr_engine': 'paddleocr', + + # 匹配参数 + 'iou_threshold': 0.3, # 检测框 IoU 阈值 + 'text_similarity_threshold': 0.3, # 文本相似度阈值 (0 = 只看检测,不看识别) + + # 评估模式: + # 'detection_only' - 只评估检测 (忽略文本内容) + # 'end2end' - 端到端评估 (检测 + 识别) + # 'recognition_only' - 只评估识别 (在GT框上裁剪后识别) + 'eval_mode': 'end2end', + + # OCR 配置 + 'device': 'gpu', # 'gpu' 或 'cpu' + + # 调试选项 + 'debug_visualize': True, # 是否可视化前几张图的检测框 + 'debug_save_dir': './debug_vis', # 
def load_ocr_model(engine='paddleocr', device='gpu'):
    """Create the OCR backend selected by ``engine``.

    Args:
        engine: ``'paddleocr'`` or ``'easyocr'``.
        device: ``'gpu'`` or ``'cpu'``. Passed through unchanged to PaddleOCR;
            for EasyOCR it is reduced to the boolean ``gpu`` flag.

    Returns:
        A ``(model, engine_name)`` tuple; ``engine_name`` echoes the backend
        that was actually built.

    Raises:
        ValueError: ``engine`` is not a supported backend name.
        ImportError: the backend package cannot be imported. An install hint
            is printed before the error is re-raised.
    """
    # Reject unknown engines up front so the two backend branches stay flat.
    if engine not in ('paddleocr', 'easyocr'):
        raise ValueError(f"Unsupported OCR engine: {engine}")

    if engine == 'easyocr':
        try:
            import easyocr
            print("Loading EasyOCR...")
            wants_gpu = device == 'gpu'
            return easyocr.Reader(['en'], gpu=wants_gpu), 'easyocr'
        except ImportError:
            print("EasyOCR not found. Install: pip install easyocr")
            raise

    # Remaining case: PaddleOCR. The backend is imported lazily so that only
    # the engine actually requested has to be installed.
    try:
        from paddleocr import PaddleOCR
        print("Loading PaddleOCR...")
        return PaddleOCR(lang='en', device=device), 'paddleocr'
    except ImportError:
        print("PaddleOCR not found. Install: pip install paddleocr")
        raise
def polygon_iou(poly1: List[float], poly2: List[float]) -> float:
    """Return the intersection-over-union of two polygons.

    Each polygon is a flat coordinate list ``[x1, y1, x2, y2, ...]``.
    Shapely is used when it can be imported; otherwise the result is
    approximated from the axis-aligned bounding boxes.

    Returns:
        IoU as a float. ``0.0`` is returned when either polygon is reported
        invalid by Shapely or when the union has zero area.
    """
    try:
        from shapely.geometry import Polygon

        def _to_shape(flat):
            # Pair consecutive values into (x, y) vertices.
            return Polygon([(flat[k], flat[k + 1]) for k in range(0, len(flat), 2)])

        shape_a = _to_shape(poly1)
        shape_b = _to_shape(poly2)

        # Invalid (e.g. self-intersecting) geometry is scored as no overlap.
        if not (shape_a.is_valid and shape_b.is_valid):
            return 0.0

        overlap_area = shape_a.intersection(shape_b).area
        combined_area = shape_a.union(shape_b).area

        return 0.0 if combined_area == 0 else overlap_area / combined_area

    except ImportError:
        # Shapely is optional: fall back to the bounding-box approximation.
        return bbox_iou_from_polygon(poly1, poly2)


def bbox_iou_from_polygon(poly1: List[float], poly2: List[float]) -> float:
    """Approximate polygon IoU by the IoU of the axis-aligned bounding boxes.

    Both arguments are flat ``[x1, y1, x2, y2, ...]`` coordinate lists.
    Boxes that merely touch (zero-width or zero-height overlap) score ``0.0``.
    """
    # Even positions hold x values, odd positions hold y values.
    a_xs, a_ys = poly1[0::2], poly1[1::2]
    b_xs, b_ys = poly2[0::2], poly2[1::2]

    a_left, a_top, a_right, a_bottom = min(a_xs), min(a_ys), max(a_xs), max(a_ys)
    b_left, b_top, b_right, b_bottom = min(b_xs), min(b_ys), max(b_xs), max(b_ys)

    # Corners of the overlap rectangle (may be inverted when disjoint).
    x_lo = max(a_left, b_left)
    y_lo = max(a_top, b_top)
    x_hi = min(a_right, b_right)
    y_hi = min(a_bottom, b_bottom)

    if x_hi <= x_lo or y_hi <= y_lo:
        return 0.0

    overlap = (x_hi - x_lo) * (y_hi - y_lo)
    size_a = (a_right - a_left) * (a_bottom - a_top)
    size_b = (b_right - b_left) * (b_bottom - b_top)
    total = size_a + size_b - overlap

    if total > 0:
        return overlap / total
    return 0.0
def match_detections(
    pred_results: List[Dict],
    gt_annotations: Dict,
    iou_thresh: float = 0.5,
    text_sim_thresh: float = 0.5,
    eval_mode: str = 'end2end',
) -> Tuple[int, int, int, Dict]:
    """Greedily match OCR predictions against ground-truth boxes.

    Predictions are processed in the order given. Each one is paired with the
    not-yet-consumed, non-ignored GT box of highest IoU; the pair counts as a
    detection hit when that IoU reaches ``iou_thresh``.

    Args:
        pred_results: dicts with at least ``'polygon'`` and ``'text'``.
        gt_annotations: dict with ``'polygons'`` and ``'texts'`` lists and an
            optional ``'ignore'`` list (defaults to all ``False``).
        iou_thresh: minimum IoU for a detection hit.
        text_sim_thresh: minimum text similarity for a recognition hit.
        eval_mode: ``'detection_only'`` (text is not checked), ``'end2end'``
            (a located box with wrong text is a false positive) or any other
            value such as ``'recognition_only'`` (a located box with wrong
            text is counted as neither TP nor FP).

    Returns:
        ``(TP, FP, FN, details)`` where ``details`` holds ``'det_matched'``
        (distinct GT boxes located by IoU), ``'text_matched'`` and
        ``'det_fp'`` (predictions that located no GT box).
    """
    gt_polygons = gt_annotations['polygons']
    gt_texts = gt_annotations['texts']
    gt_ignore = gt_annotations.get('ignore', [False] * len(gt_texts))

    matched_gt = set()   # GT boxes consumed by a true positive
    located_gt = set()   # GT boxes hit by at least one prediction (IoU only)
    tp = 0
    fp = 0

    details = {
        'det_matched': 0,
        'text_matched': 0,
        'det_fp': 0,
    }

    for pred in pred_results:
        pred_poly = pred['polygon']
        pred_text = pred['text']

        best_iou = 0
        best_gt_idx = -1

        # Pick the best still-available GT box for this prediction.
        for gt_idx, (gt_poly, _, ignore) in enumerate(zip(gt_polygons, gt_texts, gt_ignore)):
            if ignore or gt_idx in matched_gt:
                continue

            iou = polygon_iou(pred_poly, gt_poly)
            if iou > best_iou:
                best_iou = iou
                best_gt_idx = gt_idx

        # No GT box located: plain false detection.
        if best_gt_idx < 0 or best_iou < iou_thresh:
            fp += 1
            details['det_fp'] += 1
            continue

        # A GT box whose text check failed stays available for later
        # predictions, so several predictions may land on it. Count it only
        # once at detection level; otherwise det_matched can exceed the
        # number of GT boxes and detection recall can exceed 100%.
        if best_gt_idx not in located_gt:
            located_gt.add(best_gt_idx)
            details['det_matched'] += 1

        if eval_mode == 'detection_only':
            tp += 1
            matched_gt.add(best_gt_idx)
        elif text_similarity(pred_text, gt_texts[best_gt_idx]) >= text_sim_thresh:
            tp += 1
            matched_gt.add(best_gt_idx)
            details['text_matched'] += 1
        elif eval_mode == 'end2end':
            # Right place, wrong text: a false positive in end-to-end mode.
            fp += 1

    # Every non-ignored GT box that never became a true positive is a miss.
    fn = sum(
        1
        for gt_idx, ignore in enumerate(gt_ignore)
        if not ignore and gt_idx not in matched_gt
    )

    return tp, fp, fn, details
def evaluate_dataset(
    sr_dir: str,
    annotation_file: str,
    ocr_model,
    engine: str,
    debug: bool = False,
) -> Dict:
    """Run OCR over every annotated image and aggregate the metrics.

    For each image named in the annotation file, the image is looked up under
    ``sr_dir`` (first by its annotated name, then by the same stem with a
    ``.jpg`` suffix), OCR is run on it, and the predictions are matched
    against the ground truth. Images found under neither name are skipped and
    reported after the loop.

    The evaluation mode, the IoU / text-similarity thresholds and the
    visualisation switches are read from the module-level ``CONFIG``.

    Args:
        sr_dir: directory holding the images to evaluate.
        annotation_file: JSON file mapping image name to its annotation dict.
        ocr_model: model object passed to ``run_ocr``.
        engine: engine name passed to ``run_ocr``.
        debug: currently unused; the first three images are always logged.

    Returns:
        Dict with end-to-end counts and scores (``TP``, ``FP``, ``FN``,
        ``Precision``, ``Recall``, ``F1-Score``), detection-only scores
        (``Det_*``), raw totals and the ``eval_mode`` used.
    """
    sr_dir = Path(sr_dir)
    eval_mode = CONFIG.get('eval_mode', 'end2end')
    debug_visualize = CONFIG.get('debug_visualize', False)
    debug_save_dir = Path(CONFIG.get('debug_save_dir', './debug_vis'))

    if debug_visualize:
        debug_save_dir.mkdir(parents=True, exist_ok=True)

    with open(annotation_file, 'r', encoding='utf-8') as f:
        annotations = json.load(f)

    # Summarise the annotation file before doing any OCR work.
    total_gt_boxes = 0
    total_ignored = 0
    for ann in annotations.values():
        total_gt_boxes += len(ann['texts'])
        total_ignored += sum(ann.get('ignore', [False] * len(ann['texts'])))
    print(f"\n标注统计: 总共 {len(annotations)} 张图片, {total_gt_boxes} 个文本框, {total_ignored} 个被忽略")
    print(f"评估模式: {eval_mode}")

    total_tp = 0
    total_fp = 0
    total_fn = 0

    total_details = {
        'det_matched': 0,
        'text_matched': 0,
        'det_fp': 0,
    }

    print("Running OCR evaluation...")
    total_ocr_detections = 0
    debug_count = 0
    vis_count = 0
    missing_images = 0

    for img_name, gt_ann in tqdm(annotations.items()):
        img_path = sr_dir / img_name

        if not img_path.exists():
            # Retry with a .jpg suffix before giving up on this image.
            img_path = sr_dir / (Path(img_name).stem + '.jpg')
            if not img_path.exists():
                # Skipped images contribute nothing to any counter (not even
                # FN), so keep a tally: otherwise a wrong sr_dir produces an
                # all-zero result without any hint of the cause.
                missing_images += 1
                continue

        pred_results = run_ocr(img_path, ocr_model, engine)
        total_ocr_detections += len(pred_results)

        # Log details for the first three evaluated images.
        if debug_count < 3:
            print(f"\n[DEBUG] Image: {img_name}")
            print(f" GT boxes: {len(gt_ann['polygons'])}, ignored: {sum(gt_ann.get('ignore', []))}")
            print(f" OCR detections: {len(pred_results)}")
            if gt_ann['polygons']:
                print(f" GT[0] polygon: {gt_ann['polygons'][0][:4]}... text: '{gt_ann['texts'][0]}'")
            if pred_results:
                print(f" OCR[0] polygon: {pred_results[0]['polygon'][:4]}... text: '{pred_results[0]['text']}'")
            debug_count += 1

        # Dump overlays for at most the first 20 evaluated images.
        if debug_visualize and vis_count < 20:
            vis_path = debug_save_dir / f"vis_{img_name}"
            visualize_detections(img_path, gt_ann, pred_results, vis_path)
            vis_count += 1

        tp, fp, fn, details = match_detections(
            pred_results,
            gt_ann,
            iou_thresh=CONFIG['iou_threshold'],
            text_sim_thresh=CONFIG['text_similarity_threshold'],
            eval_mode=eval_mode,
        )

        total_tp += tp
        total_fp += fp
        total_fn += fn

        for k, v in details.items():
            total_details[k] += v

    if missing_images:
        print(
            f"\nWarning: {missing_images} of {len(annotations)} annotated images "
            f"were not found under {sr_dir} and were skipped"
        )

    print(f"\nOCR 总共检测到 {total_ocr_detections} 个文本框")

    # End-to-end scores; every ratio falls back to 0 on an empty denominator.
    precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0
    recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    # Non-ignored GT boxes of the images that were actually evaluated.
    total_gt_boxes_valid = total_tp + total_fn

    # Detection-only scores (box position only, text not considered).
    det_precision = total_details['det_matched'] / total_ocr_detections if total_ocr_detections > 0 else 0
    det_recall = total_details['det_matched'] / total_gt_boxes_valid if total_gt_boxes_valid > 0 else 0
    det_f1 = 2 * det_precision * det_recall / (det_precision + det_recall) if (det_precision + det_recall) > 0 else 0

    return {
        'TP': total_tp,
        'FP': total_fp,
        'FN': total_fn,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1_score,
        'OCR_detections': total_ocr_detections,
        'GT_boxes': total_gt_boxes_valid,
        'Detection_rate': total_ocr_detections / total_gt_boxes_valid if total_gt_boxes_valid > 0 else 0,
        # Detection-only metrics
        'Det_Precision': det_precision,
        'Det_Recall': det_recall,
        'Det_F1': det_f1,
        'Det_matched': total_details['det_matched'],
        'Text_matched': total_details['text_matched'],
        'eval_mode': eval_mode,
    }
Precision: {results.get('Det_Precision', 0)*100:.2f}%") + print(f" Det Recall: {results.get('Det_Recall', 0)*100:.2f}%") + print(f" Det F1-Score: {results.get('Det_F1', 0)*100:.2f}%") + + print("\n[End-to-End Metrics] (检测 + 识别)") + print(f" True Positives: {results['TP']}") + print(f" False Positives: {results['FP']}") + print(f" False Negatives: {results['FN']}") + print(f" Precision: {results['Precision']*100:.2f}%") + print(f" Recall: {results['Recall']*100:.2f}%") + print(f" F1-Score: {results['F1-Score']*100:.2f}%") + + # 保存结果 + output_path.parent.mkdir(parents=True, exist_ok=True) + with open(output_path, 'w') as f: + json.dump(results, f, indent=2) + print(f"\nResults saved to {output_path}") + + +if __name__ == '__main__': + main() + diff --git a/diffusion-dpo-ocr/verify_roadtext_annotations.py b/diffusion-dpo-ocr/verify_roadtext_annotations.py new file mode 100644 index 0000000000000000000000000000000000000000..fdaee5e1d7f739c51a54210228b0a6a4fe9872a5 --- /dev/null +++ b/diffusion-dpo-ocr/verify_roadtext_annotations.py @@ -0,0 +1,223 @@ +""" +验证 RoadText1K 预处理结果的脚本 +可视化 GT 标注框与 crop 后图像的对应关系,检查坐标是否正确 +""" + +import os +import json +import random +from pathlib import Path +from typing import Dict, List + +import cv2 +import numpy as np +from PIL import Image + + +# ============================================================================ +# 配置 +# ============================================================================ +CONFIG = { + # GT 图像目录 + 'gt_dir': '/home/wanghongbo06/baipurui/DATA/RoadText1k_patch_crop/gt', + + # 标注文件路径 + 'annotation_file': '/home/wanghongbo06/baipurui/DATA/RoadText1k_patch_crop/annotations.json', + + # 可视化输出目录 + 'vis_output_dir': './verify_roadtext_vis', + + # 可视化图片数量 + 'num_samples': 20, + + # 随机种子 + 'seed': 42, +} +# ============================================================================ + + +def draw_annotations(img: np.ndarray, ann: Dict, color=(0, 255, 0), thickness=2) -> np.ndarray: + """在图像上绘制标注框""" + img_vis = 
def main():
    """Sanity-check the pre-processed RoadText1K annotations.

    Steps, all driven by the module-level ``CONFIG``:

    1. Load the annotation JSON and print global box statistics, including
       how many boxes fall outside a ``crop_size`` x ``crop_size`` canvas
       (``CONFIG['crop_size']``, default 512).
    2. Draw the annotations of ``CONFIG['num_samples']`` randomly chosen
       images into ``CONFIG['vis_output_dir']``.
    3. Print the first two boxes of the first three annotated images.

    Returns early, after printing an error, when the annotation file or the
    GT directory does not exist.
    """
    random.seed(CONFIG['seed'])

    gt_dir = Path(CONFIG['gt_dir'])
    ann_file = Path(CONFIG['annotation_file'])
    vis_dir = Path(CONFIG['vis_output_dir'])
    vis_dir.mkdir(parents=True, exist_ok=True)

    # Side length assumed for the global bounds check. The per-image check in
    # verify_single_image uses the real image size instead.
    crop_size = CONFIG.get('crop_size', 512)

    print("=" * 60)
    print("RoadText1K 标注验证脚本")
    print("=" * 60)
    print(f"GT dir: {gt_dir}")
    print(f"Annotations: {ann_file}")
    print(f"Output: {vis_dir}")
    print()

    if not ann_file.exists():
        print(f"Error: Annotation file not found: {ann_file}")
        return

    if not gt_dir.exists():
        print(f"Error: GT directory not found: {gt_dir}")
        return

    with open(ann_file, 'r', encoding='utf-8') as f:
        annotations = json.load(f)

    print(f"总共 {len(annotations)} 张图片有标注")

    # Global statistics over every annotation.
    total_boxes = 0
    total_ignored = 0
    total_in_bounds = 0
    total_out_of_bounds = 0

    for ann in annotations.values():
        polygons = ann.get('polygons', [])
        total_boxes += len(polygons)
        total_ignored += sum(ann.get('ignore', []))

        for poly in polygons:
            xs = poly[0::2]
            ys = poly[1::2]
            if min(xs) >= 0 and max(xs) <= crop_size and min(ys) >= 0 and max(ys) <= crop_size:
                total_in_bounds += 1
            else:
                total_out_of_bounds += 1

    print(f"\n全局统计:")
    print(f" 总文本框: {total_boxes}")
    print(f" 忽略框: {total_ignored}")
    print(f" 有效框 (非忽略): {total_boxes - total_ignored}")
    print(f" 框在图像范围内: {total_in_bounds}")
    print(f" 框超出范围: {total_out_of_bounds}")

    if total_out_of_bounds > 0:
        print(f"\n⚠️ 有 {total_out_of_bounds} 个框超出图像范围!这可能是坐标转换问题。")

    # Visualise a random sample of images.
    img_names = list(annotations.keys())
    num_samples = min(CONFIG['num_samples'], len(img_names))
    selected = random.sample(img_names, num_samples)

    print(f"\n随机选取 {num_samples} 张图片进行可视化...")

    for img_name in selected:
        ann = annotations[img_name]
        img_path = gt_dir / img_name

        if not img_path.exists():
            # Retry with a .jpg suffix.
            img_path = gt_dir / (Path(img_name).stem + '.jpg')

        if not img_path.exists():
            print(f" [SKIP] Image not found: {img_name}")
            continue

        save_path = vis_dir / f"verify_{img_name}"
        stats = verify_single_image(img_path, ann, save_path)

        if 'error' in stats:
            # Report unreadable images instead of dropping them silently.
            print(f" [ERROR] {stats['error']}")
        else:
            print(f" [OK] {img_name}: {stats['num_boxes']} boxes, "
                  f"{stats['boxes_out_of_bounds']} out of bounds")

    print(f"\n可视化结果保存到: {vis_dir}")

    # Print a few raw annotation samples.
    print("\n" + "=" * 60)
    print("标注样例 (前 3 张图):")
    print("=" * 60)

    for i, (img_name, ann) in enumerate(list(annotations.items())[:3]):
        polygons = ann.get('polygons', [])
        texts = ann.get('texts', [])
        # Default to "not ignored" per text. Zipping against an empty list
        # would print no boxes at all when the 'ignore' key is absent.
        ignores = ann.get('ignore', [False] * len(texts))

        print(f"\n[{i+1}] {img_name}")
        print(f" crop_position: {ann.get('crop_position', 'N/A')}")
        print(f" num_boxes: {len(polygons)}")

        # Only the first two boxes are shown.
        for j, (poly, text, ignore) in enumerate(zip(polygons[:2], texts[:2], ignores[:2])):
            xs = poly[0::2]
            ys = poly[1::2]
            print(f" Box {j}: x=[{min(xs):.1f}, {max(xs):.1f}], y=[{min(ys):.1f}, {max(ys):.1f}], "
                  f"text='{text[:20]}', ignore={ignore}")
对比权重 +python compare_checkpoints.py \ + results_1202_4/checkpoint-15/lora_train_unet/adapter_model.safetensors \ + results_1202_4/checkpoint-60/lora_train_unet/adapter_model.safetensors +``` + +**预期结果**: +- ✅ **正常**: "50%+ 的参数在变化",最大差异 > 1e-4 +- ❌ **异常**: "完全相同" 或 "< 10% 参数变化" + +**如果异常,说明**: +- 保存逻辑有问题 +- 或者训练根本没生效(但这与 loss 下降矛盾) + +--- + +### 🎯 检查 2: 测试代码加载的是哪个 checkpoint? + +**问题**: 测试代码路径和训练输出路径不匹配 + +训练输出: +``` +results_1202_4/checkpoint-XX/lora_train_unet/adapter_model.safetensors +``` + +测试代码加载: +```python +# diffusion-dpo-test/test.py line 33 +"/home/wanghongbo06/diffusion-dpo-adv/results_1130/checkpoint-50/..." +``` + +注意 `results_1130` vs `results_1202_4`! + +**验证方法**: +```bash +# 在测试机器上 +# 1. 确认测试代码实际加载的文件 +ls -lh /home/wanghongbo06/diffusion-dpo-adv/results_1130/checkpoint-50/lora_train_unet/adapter_model.safetensors +ls -lh /home/wanghongbo06/diffusion-dpo-adv/results_1130/checkpoint-500/lora_train_unet/adapter_model.safetensors + +# 2. 检查文件修改时间(是否是最新训练的) +stat /home/wanghongbo06/diffusion-dpo-adv/results_1130/checkpoint-50/lora_train_unet/adapter_model.safetensors + +# 3. 对比这两个文件 +python compare_checkpoints.py \ + /home/wanghongbo06/diffusion-dpo-adv/results_1130/checkpoint-50/lora_train_unet/adapter_model.safetensors \ + /home/wanghongbo06/diffusion-dpo-adv/results_1130/checkpoint-500/lora_train_unet/adapter_model.safetensors +``` + +**如果这两个文件完全相同**: +- 说明您测试的不是最新训练的模型 +- 需要更新测试代码的路径,或者将最新的 checkpoint 复制到测试机器 + +--- + +### 🎯 检查 3: 测试代码的 LoRA 融合逻辑 + +测试代码使用了**两次 fuse_lora**: + +```python +# 第一次:SR base LoRA +pipe.load_lora_weights("...pytorch_lora_weights_v2.safetensors", adapter_name="sr") +pipe.fuse_lora(lora_scale=1.0, adapter_names=["sr"]) +pipe.unload_lora_weights() + +# 第二次:DPO trained LoRA +pipe.load_lora_weights("...adapter_model.safetensors", adapter_name="sr2") +pipe.fuse_lora(lora_scale=1.0, adapter_names=["sr2"]) +pipe.unload_lora_weights() +``` + +**潜在问题**: +1. `unload_lora_weights()` 可能清除了刚融合的权重 +2. 第二次 `fuse_lora` 可能没有正确叠加到第一次的结果上 +3. 
如果两个 LoRA 的权重完全相同,输出自然也相同 + +**验证方法 A: 只加载 DPO LoRA** +```python +# 临时修改 test.py +pipe = FluxPipeline.from_pretrained(...).to("cuda") + +# 只加载第一个 LoRA +pipe.load_lora_weights( + "/home/wanghongbo06/baipurui/CKPTs/FLUX_SR/pytorch_lora_weights_v2.safetensors", + adapter_name="sr" +) +pipe.fuse_lora(lora_scale=1.0, adapter_names=["sr"]) +pipe.unload_lora_weights() + +# 生成图像 -> 保存为 result_sr_only.png + +# 然后只加载第二个 LoRA +pipe2 = FluxPipeline.from_pretrained(...).to("cuda") +pipe2.load_lora_weights( + "/home/wanghongbo06/diffusion-dpo-adv/results_1130/checkpoint-60/lora_train_unet/adapter_model.safetensors", + adapter_name="dpo" +) +pipe2.fuse_lora(lora_scale=1.0, adapter_names=["dpo"]) + +# 生成图像 -> 保存为 result_dpo_only.png +``` + +如果 `result_dpo_only.png` 在不同 epoch 之间也完全相同,说明问题在 checkpoint 本身。 + +**验证方法 B: 检查融合后的权重** +```python +# 在 test.py 中添加调试代码 +import torch + +# 加载第一个 LoRA 后 +pipe.load_lora_weights("...pytorch_lora_weights_v2.safetensors", adapter_name="sr") +pipe.fuse_lora(lora_scale=1.0, adapter_names=["sr"]) + +# 保存融合后的 transformer 权重快照 +transformer_weights_after_sr = { + k: v.clone() for k, v in pipe.transformer.state_dict().items() + if 'single_transformer_blocks.30' in k +} + +pipe.unload_lora_weights() + +# 加载第二个 LoRA 后 +pipe.load_lora_weights("...adapter_model.safetensors", adapter_name="sr2") +pipe.fuse_lora(lora_scale=1.0, adapter_names=["sr2"]) + +transformer_weights_after_dpo = { + k: v.clone() for k, v in pipe.transformer.state_dict().items() + if 'single_transformer_blocks.30' in k +} + +# 对比 +for k in transformer_weights_after_sr.keys(): + diff = (transformer_weights_after_dpo[k] - transformer_weights_after_sr[k]).abs().max() + print(f"{k}: max_diff = {diff:.6e}") +``` + +如果所有 diff 都是 0,说明第二个 LoRA 没有被正确应用。 + +--- + +### 🎯 检查 4: Checkpoint 文件的完整性 + +```bash +# 在训练机器上 +python inspect_safetensor.py results_1202_4/checkpoint-60/lora_train_unet/adapter_model.safetensors +``` + +**预期结果**: +- 应该有 100+ 个参数张量 +- `single_transformer_blocks.*` 的 lora_B 应该有非零值 +- 
非零参数比例应该 > 50% + +**如果异常**: +- 文件损坏 +- 或者保存逻辑有问题 + +--- + +## 🔧 可能的修复方案 + +### 方案 1: 如果 checkpoint 权重没有变化 +**原因**: 保存逻辑有问题 + +**修复**: 检查 `save_model_hook` 中的状态字典获取逻辑 + +```python +# train_single_lora.py line 672 +full_state_dict = accelerator.get_state_dict(model) +``` + +可能需要改为: +```python +from peft import get_peft_model_state_dict +full_state_dict = get_peft_model_state_dict(model, adapter_name="train_unet") +``` + +--- + +### 方案 2: 如果测试代码路径错误 +**原因**: 加载了旧的 checkpoint + +**修复**: 更新测试代码的路径,或者将最新 checkpoint 复制到测试机器 + +```bash +# 在训练机器上 +scp -r results_1202_4/checkpoint-60 user@test-machine:/home/wanghongbo06/diffusion-dpo-adv/ +``` + +--- + +### 方案 3: 如果 LoRA 融合有问题 +**原因**: `fuse_lora` 和 `unload_lora_weights` 的交互有问题 + +**修复**: 使用 `set_adapters` 而不是 `fuse_lora` + +```python +# 新的测试代码 +pipe = FluxPipeline.from_pretrained(...).to("cuda") + +# 加载两个 LoRA(不融合) +pipe.load_lora_weights("...pytorch_lora_weights_v2.safetensors", adapter_name="sr") +pipe.load_lora_weights("...adapter_model.safetensors", adapter_name="dpo") + +# 同时启用两个 adapter +pipe.set_adapters(["sr", "dpo"], adapter_weights=[1.0, 1.0]) + +# 生成图像 +result_img = generate(pipe, ...) +``` + +--- + +## 📊 诊断流程图 + +``` +开始 + ↓ +检查 1: 对比 checkpoint 权重 + ├─ 权重有变化 → 继续检查 2 + └─ 权重无变化 → 【问题在训练/保存】→ 方案 1 + ↓ +检查 2: 确认测试代码加载的路径 + ├─ 路径正确 → 继续检查 3 + └─ 路径错误 → 【问题在测试代码】→ 方案 2 + ↓ +检查 3: 验证 LoRA 融合逻辑 + ├─ 融合正确 → 继续检查 4 + └─ 融合失败 → 【问题在测试代码】→ 方案 3 + ↓ +检查 4: 检查 checkpoint 文件完整性 + ├─ 文件完整 → 【未知问题,需要更深入调查】 + └─ 文件损坏 → 【问题在保存】→ 方案 1 +``` + +--- + +## 🚀 立即执行 + +**请在训练机器上运行以下命令**: + +```bash +cd + +# 1. 对比 checkpoint(最重要!) +python /data2/hongbo.wang/DPO-SR/compare_checkpoints.py \ + results_1202_4/checkpoint-15/lora_train_unet/adapter_model.safetensors \ + results_1202_4/checkpoint-60/lora_train_unet/adapter_model.safetensors + +# 2. 
def analyze_lora(path, name=""):
    """Print magnitude statistics for one LoRA safetensors file.

    Tensors are split by key substring into ``lora_A`` and ``lora_B`` groups.
    For each group the mean, absolute maximum and standard deviation of every
    tensor are aggregated; ``lora_B`` additionally gets the share of entries
    whose magnitude exceeds 1e-10 and a list of its five largest tensors.

    Args:
        path: path of the safetensors file.
        name: label shown in the report header.

    Returns:
        The loaded state dict.
    """
    rule = '=' * 80
    print(f"\n{rule}")
    print(f"分析 LoRA: {name}")
    print(f"路径: {path}")
    print(f"{rule}")

    state_dict = load_file(path)

    def _moments(tensor):
        # (mean, abs-max, std) of a float tensor as Python scalars.
        return tensor.mean().item(), tensor.abs().max().item(), tensor.std().item()

    a_names = [key for key in state_dict.keys() if "lora_A" in key]
    b_names = [key for key in state_dict.keys() if "lora_B" in key]

    print(f"\n总参数数: {len(state_dict)}")
    print(f" - lora_A: {len(a_names)} 个")
    print(f" - lora_B: {len(b_names)} 个")

    # lora_A statistics
    print(f"\n--- lora_A 统计 ---")
    a_rows = [_moments(state_dict[key].float()) for key in a_names]
    a_maxs = [row[1] for row in a_rows]

    print(f" Mean of means: {np.mean([row[0] for row in a_rows]):.6e}")
    print(f" Max of maxs: {np.max(a_maxs):.6e}")
    print(f" Mean of stds: {np.mean([row[2] for row in a_rows]):.6e}")

    # lora_B statistics
    print(f"\n--- lora_B 统计 ---")
    b_rows = []
    b_fill = []
    for key in b_names:
        weights = state_dict[key].float()
        b_rows.append(_moments(weights))
        # Fraction of entries that are not (numerically) zero.
        b_fill.append((weights.abs() > 1e-10).float().mean().item())
    b_maxs = [row[1] for row in b_rows]

    print(f" Mean of means: {np.mean([row[0] for row in b_rows]):.6e}")
    print(f" Max of maxs: {np.max(b_maxs):.6e}")
    print(f" Mean of stds: {np.mean([row[2] for row in b_rows]):.6e}")
    print(f" Avg non-zero ratio: {np.mean(b_fill)*100:.2f}%")

    # Rough size of the LoRA contribution: product of the two peak magnitudes.
    print(f"\n--- LoRA 影响估算 ---")
    print(f" LoRA 输出 ≈ x @ A.T @ B.T")
    print(f" |A| * |B| ≈ {np.max(a_maxs) * np.max(b_maxs):.6e}")

    # Five lora_B tensors with the largest absolute entry.
    print(f"\n--- 最大的 5 个 lora_B 层 ---")
    ranked = sorted(
        ((key, state_dict[key].float().abs().max().item()) for key in b_names),
        key=lambda pair: -pair[1],
    )
    for key, peak in ranked[:5]:
        print(f" {peak:.6e}: {key}")

    return state_dict
def compare_with_sr_lora(sr_path, dpo_path):
    """Compare the output-side weight magnitude of the SR and DPO LoRAs.

    For the SR file, tensors whose key contains ``lora_B`` or ``lora_up`` are
    used; for the DPO file, tensors whose key contains ``lora_B``. The largest
    absolute entry of each tensor is collected, and the maximum and mean of
    those per-tensor peaks are printed, followed by the DPO / SR ratio of the
    maxima.

    Args:
        sr_path: safetensors file of the SR (base) LoRA.
        dpo_path: safetensors file of the DPO-trained LoRA.
    """
    print(f"\n{'='*80}")
    print(f"对比 SR LoRA 和 DPO LoRA 的量级")
    print(f"{'='*80}")

    sr_sd = load_file(sr_path)
    dpo_sd = load_file(dpo_path)

    # In kohya-style naming the counterpart of PEFT's lora_B is lora_up
    # (lora_down corresponds to lora_A). Matching lora_down here would compare
    # SR's A matrices with DPO's B matrices.
    sr_b_maxs = [
        v.float().abs().max().item()
        for k, v in sr_sd.items()
        if "lora_B" in k or "lora_up" in k
    ]
    dpo_b_maxs = [
        v.float().abs().max().item()
        for k, v in dpo_sd.items()
        if "lora_B" in k
    ]

    print(f"\nSR LoRA (lora_B/lora_up):")
    if sr_b_maxs:
        print(f" Max: {max(sr_b_maxs):.6e}")
        print(f" Mean: {np.mean(sr_b_maxs):.6e}")
    else:
        print(f" (没有找到 lora_B 或 lora_up)")
        # Show a few keys so the naming scheme can be inspected.
        print(f" SR LoRA keys 示例: {list(sr_sd.keys())[:5]}")

    print(f"\nDPO LoRA (lora_B):")
    if dpo_b_maxs:
        print(f" Max: {max(dpo_b_maxs):.6e}")
        print(f" Mean: {np.mean(dpo_b_maxs):.6e}")
    else:
        # max() on an empty list would raise; report the situation instead.
        print(f" (没有找到 lora_B)")
        print(f" DPO LoRA keys 示例: {list(dpo_sd.keys())[:5]}")

    if sr_b_maxs and dpo_b_maxs:
        sr_peak = max(sr_b_maxs)
        dpo_peak = max(dpo_b_maxs)
        ratio = dpo_peak / sr_peak if sr_peak > 0 else float('inf')
        print(f"\n量级比较:")
        print(f" DPO / SR = {ratio:.4f}")
        if ratio > 1:
            print(f" ⚠️ DPO LoRA 比 SR LoRA 大 {ratio:.1f} 倍!这可能导致效果变差")
        elif ratio > 0:
            print(f" DPO LoRA 比 SR LoRA 小 {1/ratio:.1f} 倍")
        else:
            # ratio == 0: every DPO lora_B entry is zero, so 1/ratio would
            # divide by zero.
            print(f" DPO LoRA 的 lora_B 全为 0")
"/home/wanghongbo06/baipurui/CKPTs/FLUX_SR/pytorch_lora_weights_v2.safetensors" + + # 1. 分析各个 checkpoint + analyze_lora(ckpt_15, "DPO checkpoint-15") + analyze_lora(ckpt_105, "DPO checkpoint-105") + + # 2. 对比 15 vs 105 + compare_loras(ckpt_15, ckpt_105, "ckpt-15", "ckpt-105") + + # 3. 对比 SR LoRA 和 DPO LoRA 的量级 + compare_with_sr_lora(sr_lora, ckpt_15) + diff --git a/diffusion-dpo-test/check_lora_keys.py b/diffusion-dpo-test/check_lora_keys.py new file mode 100644 index 0000000000000000000000000000000000000000..7e0d195e0d6707abbcb061559d6949f998773596 --- /dev/null +++ b/diffusion-dpo-test/check_lora_keys.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 +"""检查 LoRA safetensor 文件的 key 格式""" +from safetensors.torch import load_file +import sys + +def check_keys(path): + print(f"\n检查文件: {path}") + print("=" * 80) + + state_dict = load_file(path) + + print(f"总共 {len(state_dict)} 个 key\n") + + print("前 20 个 key:") + print("-" * 80) + for i, key in enumerate(sorted(state_dict.keys())[:20]): + print(f" {key}") + + print("\n..." if len(state_dict) > 20 else "") + + # 检查 key 格式 + print("\nKey 格式分析:") + print("-" * 80) + + has_transformer_prefix = any(k.startswith("transformer.") for k in state_dict.keys()) + has_base_model_prefix = any(k.startswith("base_model.") for k in state_dict.keys()) + has_lora_A = any("lora_A" in k for k in state_dict.keys()) + has_lora_B = any("lora_B" in k for k in state_dict.keys()) + has_train_unet = any("train_unet" in k for k in state_dict.keys()) + + print(f" 包含 'transformer.' 前缀: {has_transformer_prefix}") + print(f" 包含 'base_model.' 
前缀: {has_base_model_prefix}") + print(f" 包含 'lora_A': {has_lora_A}") + print(f" 包含 'lora_B': {has_lora_B}") + print(f" 包含 'train_unet': {has_train_unet}") + + # 显示一个完整的 key 示例 + sample_key = list(state_dict.keys())[0] + print(f"\n示例 key: {sample_key}") + print(f"示例 shape: {state_dict[sample_key].shape}") + + # 检查 Diffusers 期望的格式 + print("\n" + "=" * 80) + print("Diffusers load_lora_weights 期望的 key 格式:") + print("-" * 80) + print(" transformer.single_transformer_blocks.0.attn.to_k.lora_A.weight") + print(" transformer.single_transformer_blocks.0.attn.to_k.lora_B.weight") + print("\n您的 key 格式:") + print(f" {sample_key}") + + # 判断是否兼容 + print("\n" + "=" * 80) + if has_train_unet: + print("❌ 问题: 您的 key 包含 '.train_unet.' 后缀!") + print(" Diffusers 期望: xxx.lora_A.weight") + print(" 您的格式: xxx.lora_A.train_unet.weight") + print("\n 这就是 LoRA 无法加载的原因!") + elif not has_transformer_prefix: + print("⚠️ 问题: 您的 key 缺少 'transformer.' 前缀!") + print(" Diffusers 期望: transformer.xxx.lora_A.weight") + print(f" 您的格式: {sample_key}") + else: + print("✅ Key 格式看起来正确") + +if __name__ == "__main__": + paths = [ + "/home/wanghongbo06/diffusion-dpo-adv/results_1202_4/checkpoint-15/lora_train_unet/adapter_model.safetensors", + "/home/wanghongbo06/diffusion-dpo-adv/results_1202_4/checkpoint-60/lora_train_unet/adapter_model.safetensors", + ] + + for path in paths: + try: + check_keys(path) + except Exception as e: + print(f"❌ 无法读取 {path}: {e}") + diff --git a/diffusion-dpo-test/color_fix.py b/diffusion-dpo-test/color_fix.py new file mode 100644 index 0000000000000000000000000000000000000000..b02c59054f8ba984b69789c6a861bb3bd49329fb --- /dev/null +++ b/diffusion-dpo-test/color_fix.py @@ -0,0 +1,119 @@ +''' +# -------------------------------------------------------------------------------- +# Color fixed script from Li Yi (https://github.com/pkuliyi2015/sd-webui-stablesr/blob/master/srmodule/colorfix.py) +# -------------------------------------------------------------------------------- +''' + 
+import torch +from PIL import Image +from torch import Tensor +from torch.nn import functional as F + +from torchvision.transforms import ToTensor, ToPILImage + +def adain_color_fix(target: Image, source: Image): + # Convert images to tensors + to_tensor = ToTensor() + target_tensor = to_tensor(target).unsqueeze(0) + source_tensor = to_tensor(source).unsqueeze(0) + + # Apply adaptive instance normalization + result_tensor = adaptive_instance_normalization(target_tensor, source_tensor) + + # Convert tensor back to image + to_image = ToPILImage() + result_image = to_image(result_tensor.squeeze(0).clamp_(0.0, 1.0)) + + return result_image + +def wavelet_color_fix(target: Image, source: Image): + # Convert images to tensors + to_tensor = ToTensor() + target_tensor = to_tensor(target).unsqueeze(0) + source_tensor = to_tensor(source).unsqueeze(0) + + # Apply wavelet reconstruction + result_tensor = wavelet_reconstruction(target_tensor, source_tensor) + + # Convert tensor back to image + to_image = ToPILImage() + result_image = to_image(result_tensor.squeeze(0).clamp_(0.0, 1.0)) + + return result_image + +def calc_mean_std(feat: Tensor, eps=1e-5): + """Calculate mean and std for adaptive_instance_normalization. + Args: + feat (Tensor): 4D tensor. + eps (float): A small value added to the variance to avoid + divide-by-zero. Default: 1e-5. + """ + size = feat.size() + assert len(size) == 4, 'The input feature should be 4D tensor.' + b, c = size[:2] + feat_var = feat.reshape(b, c, -1).var(dim=2) + eps + feat_std = feat_var.sqrt().reshape(b, c, 1, 1) + feat_mean = feat.reshape(b, c, -1).mean(dim=2).reshape(b, c, 1, 1) + return feat_mean, feat_std + +def adaptive_instance_normalization(content_feat:Tensor, style_feat:Tensor): + """Adaptive instance normalization. + Adjust the reference features to have the similar color and illuminations + as those in the degradate features. + Args: + content_feat (Tensor): The reference feature. + style_feat (Tensor): The degradate features. 
+ """ + size = content_feat.size() + style_mean, style_std = calc_mean_std(style_feat) + content_mean, content_std = calc_mean_std(content_feat) + normalized_feat = (content_feat - content_mean.expand(size)) / content_std.expand(size) + return normalized_feat * style_std.expand(size) + style_mean.expand(size) + +def wavelet_blur(image: Tensor, radius: int): + """ + Apply wavelet blur to the input tensor. + """ + # input shape: (1, 3, H, W) + # convolution kernel + kernel_vals = [ + [0.0625, 0.125, 0.0625], + [0.125, 0.25, 0.125], + [0.0625, 0.125, 0.0625], + ] + kernel = torch.tensor(kernel_vals, dtype=image.dtype, device=image.device) + # add channel dimensions to the kernel to make it a 4D tensor + kernel = kernel[None, None] + # repeat the kernel across all input channels + kernel = kernel.repeat(3, 1, 1, 1) + image = F.pad(image, (radius, radius, radius, radius), mode='replicate') + # apply convolution + output = F.conv2d(image, kernel, groups=3, dilation=radius) + return output + +def wavelet_decomposition(image: Tensor, levels=5): + """ + Apply wavelet decomposition to the input tensor. + This function only returns the low frequency & the high frequency. + """ + high_freq = torch.zeros_like(image) + for i in range(levels): + radius = 2 ** i + low_freq = wavelet_blur(image, radius) + high_freq += (image - low_freq) + image = low_freq + + return high_freq, low_freq + +def wavelet_reconstruction(content_feat:Tensor, style_feat:Tensor): + """ + Apply wavelet decomposition, so that the content will have the same color as the style. 
+ """ + # calculate the wavelet decomposition of the content feature + content_high_freq, content_low_freq = wavelet_decomposition(content_feat) + del content_low_freq + # calculate the wavelet decomposition of the style feature + style_high_freq, style_low_freq = wavelet_decomposition(style_feat) + del style_high_freq + # reconstruct the content feature with the style's high frequency + return content_high_freq + style_low_freq \ No newline at end of file diff --git a/diffusion-dpo-test/compare.py b/diffusion-dpo-test/compare.py new file mode 100644 index 0000000000000000000000000000000000000000..21f1829cba865db3fc5f3f75238e8ed9f69cd2c8 --- /dev/null +++ b/diffusion-dpo-test/compare.py @@ -0,0 +1,73 @@ +from PIL import Image +import numpy as np + + +def compare_images(img_path1, img_path2, diff_output_path=None): + """ + 逐像素比较两张 RGB 图像的差异。 + 要求两张图分辨率相同、都是 RGB 模式。 + + :param img_path1: 第一张图片路径 + :param img_path2: 第二张图片路径 + :param diff_output_path: 如果不为 None,则保存差异图到该路径 + :return: 一个字典,包含一些差异统计信息 + """ + # 打开图片并转换为 RGB(避免有的图片是 RGBA/灰度模式) + img1 = Image.open(img_path1).convert("RGB") + img2 = Image.open(img_path2).convert("RGB") + + # 检查分辨率是否一致 + if img1.size != img2.size: + raise ValueError(f"两张图分辨率不同: {img1.size} vs {img2.size}") + + # 转为 NumPy 数组,形状为 (H, W, 3) + arr1 = np.array(img1, dtype=np.int16) # 用 int16 以免减法溢出 + arr2 = np.array(img2, dtype=np.int16) + + # 逐像素逐通道做差,取绝对值 + diff = np.abs(arr1 - arr2) # (H, W, 3) + + # 每个像素的“差异强度”可以用 RGB 差分的和或均值来表示 + # 这里用每像素的 RGB 差的平均值 + per_pixel_diff = diff.mean(axis=2) # (H, W) + + # 一些统计信息 + total_pixels = per_pixel_diff.size + # 差异为0的像素 + same_pixels = np.sum(per_pixel_diff == 0) + different_pixels = total_pixels - same_pixels + + max_diff = float(per_pixel_diff.max()) # 单像素最大平均差值 + mean_diff = float(per_pixel_diff.mean()) # 所有像素平均差值 + diff_ratio = different_pixels / total_pixels # 有差异像素占比 + + stats = { + "total_pixels": int(total_pixels), + "same_pixels": int(same_pixels), + "different_pixels": int(different_pixels), + 
"different_ratio": diff_ratio, # 0~1 之间 + "max_diff_per_pixel": max_diff, # 0~255 + "mean_diff_per_pixel": mean_diff + } + + # 如果需要输出一张差异图 + if diff_output_path is not None: + # diff 目前是 0~255 范围内的 RGB 差值,可以直接保存成图像看 + diff_img = np.clip(diff, 0, 255).astype(np.uint8) + diff_image = Image.fromarray(diff_img, mode="RGB") + diff_image.save(diff_output_path) + # 也可以考虑把差异增强一下再保存(例如乘个系数) + + return stats + + +if __name__ == "__main__": + img1_path = "./results-test/dpo_scale_ablation/dpo_scale_0.0/0000010-seed-0.png" + img2_path = "./results-test/dpo_scale_ablation/dpo_scale_1.0/0000010-seed-0.png" + diff_img_path = "diff.png" + + stats = compare_images(img1_path, img2_path, diff_output_path=diff_img_path) + + print("比较结果:") + for k, v in stats.items(): + print(f"{k}: {v}") diff --git a/diffusion-dpo-test/compare_checkpoints.py b/diffusion-dpo-test/compare_checkpoints.py new file mode 100644 index 0000000000000000000000000000000000000000..14a83121e09795b44f73e0c2ace8c8562aa5f499 --- /dev/null +++ b/diffusion-dpo-test/compare_checkpoints.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 +"""对比两个 checkpoint 的 safetensor 文件,检查权重是否真的在变化""" +import sys +from safetensors.torch import load_file +import torch +import os + +# ============================== +# 在这里手动填写 checkpoint 路径 +# ============================== +CHECKPOINT_1 = r"/home/wanghongbo06/diffusion-dpo-adv/results_1202_4/checkpoint-15/lora_train_unet/adapter_model.safetensors" +CHECKPOINT_2 = r"/home/wanghongbo06/diffusion-dpo-adv/results_1202_4/checkpoint-60/lora_train_unet/adapter_model.safetensors" +# ============================== + + +def compare_safetensors(path1, path2): + print(f"\n{'='*80}") + print(f"对比两个 checkpoint:") + print(f" Checkpoint 1: {path1}") + print(f" Checkpoint 2: {path2}") + print(f"{'='*80}\n") + + try: + state_dict1 = load_file(path1) + state_dict2 = load_file(path2) + + # 检查键是否一致 + keys1 = set(state_dict1.keys()) + keys2 = set(state_dict2.keys()) + + if keys1 != keys2: + print("⚠️ 警告: 两个 
checkpoint 的键不一致!") + print(f" 只在 checkpoint1 中: {keys1 - keys2}") + print(f" 只在 checkpoint2 中: {keys2 - keys1}") + return + + print(f"✅ 两个 checkpoint 都有 {len(keys1)} 个参数张量\n") + + # 统计差异 + identical_count = 0 + different_count = 0 + max_diff_info = None + max_diff = 0 + + layer_diffs = {} + + for key in sorted(keys1): + tensor1 = state_dict1[key] + tensor2 = state_dict2[key] + + diff = (tensor2 - tensor1).float() + abs_diff = diff.abs() + + max_abs_diff = abs_diff.max().item() + mean_abs_diff = abs_diff.mean().item() + + if max_abs_diff == 0: + identical_count += 1 + else: + different_count += 1 + + if max_abs_diff > max_diff: + max_diff = max_abs_diff + max_diff_info = { + 'key': key, + 'max_diff': max_abs_diff, + 'mean_diff': mean_abs_diff, + 'tensor1_max': tensor1.float().abs().max().item(), + 'tensor2_max': tensor2.float().abs().max().item(), + } + + if '.lora_B.' in key: + layer_name = key.split('.lora_B.')[0] + if layer_name not in layer_diffs: + layer_diffs[layer_name] = { + 'max_diff': max_abs_diff, + 'mean_diff': mean_abs_diff, + 'key': key + } + + print(f"差异统计:") + print(f" 完全相同的参数: {identical_count} / {len(keys1)} ({identical_count/len(keys1)*100:.2f}%)") + print(f" 有变化的参数: {different_count} / {len(keys1)} ({different_count/len(keys1)*100:.2f}%)") + print() + + if max_diff_info: + print(f"最大权重变化:") + print(f" 层: {max_diff_info['key']}") + print(f" 最大绝对差异: {max_diff_info['max_diff']:.6e}") + print(f" 平均绝对差异: {max_diff_info['mean_diff']:.6e}") + print(f" Checkpoint1 最大值: {max_diff_info['tensor1_max']:.6e}") + print(f" Checkpoint2 最大值: {max_diff_info['tensor2_max']:.6e}") + print() + + key_layers = ['x_embedder', 'transformer_blocks.0', 'transformer_blocks.9', + 'single_transformer_blocks.30', 'proj_out'] + + print("关键层的 lora_B 权重变化:") + print("-" * 80) + for layer_prefix in key_layers: + matching = [k for k in layer_diffs.keys() if layer_prefix in k] + if matching: + for layer_name in matching[:2]: + info = layer_diffs[layer_name] + print(f"\n层: 
{layer_name}") + print(f" 最大差异: {info['max_diff']:.6e}") + print(f" 平均差异: {info['mean_diff']:.6e}") + + key = info['key'] + t1 = state_dict1[key].float() + t2 = state_dict2[key].float() + print(f" Checkpoint1: mean={t1.mean():.6e}, max={t1.abs().max():.6e}") + print(f" Checkpoint2: mean={t2.mean():.6e}, max={t2.abs().max():.6e}") + + print("\n" + "="*80) + print("lora_B 权重变化最大的前 10 个层:") + print("-" * 80) + sorted_layers = sorted(layer_diffs.items(), key=lambda x: x[1]['max_diff'], reverse=True) + + for i, (layer_name, info) in enumerate(sorted_layers[:10], 1): + key = info['key'] + t1 = state_dict1[key].float() + t2 = state_dict2[key].float() + print(f"\n{i}. {layer_name}") + print(f" 最大差异: {info['max_diff']:.6e}, 平均差异: {info['mean_diff']:.6e}") + print(f" Ckpt1: mean={t1.mean():.6e}, max={t1.abs().max():.6e}") + print(f" Ckpt2: mean={t2.mean():.6e}, max={t2.abs().max():.6e}") + + print("\n" + "="*80) + if different_count == 0: + print("❌ 严重问题: 两个 checkpoint 完全相同,模型没有学习!") + elif different_count < len(keys1) * 0.1: + print(f"⚠️ 警告: 只有 {different_count/len(keys1)*100:.2f}% 的参数在变化,可能存在梯度阻塞") + else: + print(f"✅ 正常: {different_count/len(keys1)*100:.2f}% 的参数在变化") + if max_diff < 1e-6: + print(f"⚠️ 但是: 最大变化只有 {max_diff:.6e},变化幅度可能太小") + print("="*80) + + except Exception as e: + print(f"❌ 错误: {e}") + import traceback + traceback.print_exc() + + +if __name__ == "__main__": + compare_safetensors(CHECKPOINT_1, CHECKPOINT_2) diff --git a/diffusion-dpo-test/data_val/0000009-seed-0.png b/diffusion-dpo-test/data_val/0000009-seed-0.png new file mode 100644 index 0000000000000000000000000000000000000000..7c12c62f555c88fb19e6e0de3bc383bc09d480f9 Binary files /dev/null and b/diffusion-dpo-test/data_val/0000009-seed-0.png differ diff --git a/diffusion-dpo-test/data_val/0000010-seed-0.png b/diffusion-dpo-test/data_val/0000010-seed-0.png new file mode 100644 index 0000000000000000000000000000000000000000..f01d36ff76660bfc570d58c34b42c3fb1ec8b73e Binary files /dev/null and 
b/diffusion-dpo-test/data_val/0000010-seed-0.png differ diff --git a/diffusion-dpo-test/fix_lora_keys.py b/diffusion-dpo-test/fix_lora_keys.py new file mode 100644 index 0000000000000000000000000000000000000000..86cbacaad2a1b2532a583e1ae547938d1060f004 --- /dev/null +++ b/diffusion-dpo-test/fix_lora_keys.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python3 +""" +修复已保存的 LoRA checkpoint 的 key 格式 +将 PEFT 格式转换为 Diffusers load_lora_weights 期望的格式 + +PEFT 格式: x_embedder.lora_A.train_unet.weight +Diffusers 格式: transformer.x_embedder.lora_A.weight +""" +import os +import sys +from safetensors.torch import load_file, save_file + +def fix_lora_keys(input_path, output_path=None): + """ + 修复 LoRA checkpoint 的 key 格式 + + Args: + input_path: 输入的 safetensor 文件路径 + output_path: 输出路径,默认覆盖原文件(会先备份) + """ + if output_path is None: + output_path = input_path + + print(f"\n{'='*80}") + print(f"修复 LoRA Key 格式") + print(f" 输入: {input_path}") + print(f" 输出: {output_path}") + print(f"{'='*80}\n") + + # 加载原始 state_dict + state_dict = load_file(input_path) + print(f"加载了 {len(state_dict)} 个参数\n") + + # 显示原始 key 格式 + sample_key = list(state_dict.keys())[0] + print(f"原始 key 格式示例: {sample_key}") + + # 检查是否需要修复 + needs_fix = False + if "train_unet" in sample_key: + needs_fix = True + print(" ✓ 检测到 '.train_unet.' 后缀,需要移除") + if not sample_key.startswith("transformer."): + needs_fix = True + print(" ✓ 缺少 'transformer.' 前缀,需要添加") + + if not needs_fix: + print("\n✅ Key 格式已经正确,无需修复!") + return + + # 修复 key 格式 + print("\n开始修复...") + new_state_dict = {} + for k, v in state_dict.items(): + new_k = k + # 1. 移除 base_model.model. 前缀(如果有) + new_k = new_k.replace("base_model.model.", "") + # 2. 移除 .train_unet 后缀 + new_k = new_k.replace(".train_unet.", ".") + # 3. 添加 transformer. 前缀 + if not new_k.startswith("transformer."): + new_k = "transformer." 
+ new_k + new_state_dict[new_k] = v + + # 显示修复后的 key 格式 + new_sample_key = list(new_state_dict.keys())[0] + print(f"修复后 key 格式示例: {new_sample_key}") + + # 备份原文件(如果覆盖) + if output_path == input_path: + backup_path = input_path + ".backup" + print(f"\n备份原文件到: {backup_path}") + os.rename(input_path, backup_path) + + # 保存修复后的文件 + save_file(new_state_dict, output_path) + print(f"\n✅ 已保存修复后的文件: {output_path}") + + # 验证 + print("\n验证修复结果...") + verify_dict = load_file(output_path) + verify_key = list(verify_dict.keys())[0] + + if verify_key.startswith("transformer.") and ".train_unet." not in verify_key: + print("✅ 验证通过!Key 格式正确") + else: + print(f"❌ 验证失败!Key 格式仍有问题: {verify_key}") + + return new_state_dict + + +def fix_checkpoint_dir(checkpoint_dir): + """修复整个 checkpoint 目录""" + lora_dir = os.path.join(checkpoint_dir, "lora_train_unet") + adapter_path = os.path.join(lora_dir, "adapter_model.safetensors") + + if os.path.exists(adapter_path): + fix_lora_keys(adapter_path) + else: + print(f"❌ 找不到文件: {adapter_path}") + + +if __name__ == "__main__": + if len(sys.argv) < 2: + print("用法:") + print(" python fix_lora_keys.py ") + print(" python fix_lora_keys.py ") + print() + print("示例:") + print(" python fix_lora_keys.py results_1202_4/checkpoint-15/lora_train_unet/adapter_model.safetensors") + print(" python fix_lora_keys.py results_1202_4/checkpoint-15") + print() + + # 默认修复 results_1202_4 下的所有 checkpoint + base_dir = "/home/wanghongbo06/diffusion-dpo-adv/results_1202_4" + if os.path.exists(base_dir): + print(f"自动扫描 {base_dir} 下的所有 checkpoint...") + for item in sorted(os.listdir(base_dir)): + if item.startswith("checkpoint-"): + ckpt_path = os.path.join(base_dir, item) + fix_checkpoint_dir(ckpt_path) + else: + print(f"默认目录 {base_dir} 不存在") + else: + path = sys.argv[1] + if path.endswith(".safetensors"): + fix_lora_keys(path) + elif os.path.isdir(path): + fix_checkpoint_dir(path) + else: + print(f"❌ 无效路径: {path}") + diff --git a/diffusion-dpo-test/inspect_safetensor.py 
b/diffusion-dpo-test/inspect_safetensor.py new file mode 100644 index 0000000000000000000000000000000000000000..69e8f21311568b698583bbf6da961f2428c8cadc --- /dev/null +++ b/diffusion-dpo-test/inspect_safetensor.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 +"""检查 safetensor 文件的内容""" +from safetensors.torch import load_file +import torch + +# ========================================================== +# 在这里手动填写 safetensors 文件路径(可填写多个) +# 示例: +# SAFETENSOR_PATHS = [ +# r"/path/to/adapter_model1.safetensors", +# r"/path/to/adapter_model2.safetensors", +# ] +# ========================================================== +SAFETENSOR_PATHS = [ + r"/home/wanghongbo06/diffusion-dpo-adv/results_1202_4/checkpoint-15/lora_train_unet/adapter_model.safetensors", +] +# ========================================================== + + +def inspect_safetensor(path): + print(f"\n{'='*80}") + print(f"检查文件: {path}") + print(f"{'='*80}\n") + + try: + state_dict = load_file(path) + + print(f"总共有 {len(state_dict)} 个参数张量\n") + + total_params = 0 + zero_params = 0 + non_zero_params = 0 + + layer_stats = {} + + for key, tensor in state_dict.items(): + num_params = tensor.numel() + total_params += num_params + + non_zero_count = (tensor != 0).sum().item() + zero_count = num_params - non_zero_count + + if non_zero_count == 0: + zero_params += num_params + else: + non_zero_params += num_params + + if '.lora_A.' in key or '.lora_B.' in key: + layer_name = key.split('.lora_')[0] + if layer_name not in layer_stats: + layer_stats[layer_name] = { + 'lora_A': None, + 'lora_B': None + } + + stats_entry = { + 'mean': tensor.float().mean().item(), + 'std': tensor.float().std().item(), + 'max': tensor.float().max().item(), + 'min': tensor.float().min().item(), + 'non_zero_ratio': non_zero_count / num_params + } + + if '.lora_A.' 
in key: + layer_stats[layer_name]['lora_A'] = stats_entry + else: + layer_stats[layer_name]['lora_B'] = stats_entry + + print(f"参数统计:") + print(f" 总参数数: {total_params:,}") + print(f" 非零参数: {non_zero_params:,} ({non_zero_params/total_params*100:.2f}%)") + print(f" 零参数: {zero_params:,} ({zero_params/total_params*100:.2f}%)") + print() + + key_layers = ['x_embedder', 'transformer_blocks.0', 'transformer_blocks.9', + 'single_transformer_blocks.30', 'proj_out'] + + print("关键层统计:") + print("-" * 80) + for layer_name in key_layers: + matching_layers = [k for k in layer_stats.keys() if layer_name in k] + if matching_layers: + for full_layer in matching_layers[:3]: + stats = layer_stats[full_layer] + print(f"\n层: {full_layer}") + if stats['lora_A']: + print(f" lora_A: mean={stats['lora_A']['mean']:.6e}, " + f"max={stats['lora_A']['max']:.6e}, " + f"非零比例={stats['lora_A']['non_zero_ratio']*100:.2f}%") + if stats['lora_B']: + print(f" lora_B: mean={stats['lora_B']['mean']:.6e}, " + f"max={stats['lora_B']['max']:.6e}, " + f"非零比例={stats['lora_B']['non_zero_ratio']*100:.2f}%") + + print("\n" + "="*80) + print("lora_B 权重最大的前 5 个层:") + print("-" * 80) + lora_b_layers = [(k, v['lora_B']) for k, v in layer_stats.items() if v['lora_B'] is not None] + lora_b_layers.sort(key=lambda x: abs(x[1]['max']), reverse=True) + + for i, (layer_name, stats) in enumerate(lora_b_layers[:5], 1): + print(f"{i}. 
{layer_name}") + print(f" mean={stats['mean']:.6e}, max={stats['max']:.6e}, " + f"非零比例={stats['non_zero_ratio']*100:.2f}%") + + except Exception as e: + print(f"❌ 错误: {e}") + import traceback + traceback.print_exc() + + +if __name__ == "__main__": + # 遍历手动填写的文件路径 + for path in SAFETENSOR_PATHS: + inspect_safetensor(path) diff --git a/diffusion-dpo-test/metrics.json b/diffusion-dpo-test/metrics.json new file mode 100644 index 0000000000000000000000000000000000000000..0c0104ed1ee1e19f9ebbde99e54919747fe4c9e4 --- /dev/null +++ b/diffusion-dpo-test/metrics.json @@ -0,0 +1,142 @@ +{ + "summary": { + "avg_inference_time_sec": 23.599896386852894, + "std_inference_time_sec": 6.796912376816178, + "min_inference_time_sec": 11.098074495792389, + "max_inference_time_sec": 28.76287134224549, + "median_inference_time_sec": 26.921020144131035, + "p95_inference_time_sec": 28.102163634379394, + "p99_inference_time_sec": 28.270882021300498, + "throughput_single_gpu_per_sec": 0.042373067390121394, + "throughput_parallel_per_sec": 0.08250964031858578, + "peak_memory_mb": 33811.7392578125, + "peak_memory_gb": 33.01927661895752, + "total_images": 100, + "warmup_images": 8, + "measured_images": 92, + "model_load_time_sec": 52.43063974380493, + "inference_wall_time_sec": 1211.9795894622803, + "total_time_sec": 1264.4102292060852, + "num_gpus": 2 + }, + "per_gpu_metrics": { + "1": { + "inference_times": [ + 27.854881670325994, + 27.943492013029754, + 27.888656420167536, + 27.804196048993617, + 27.924615799915045, + 27.952690774109215, + 27.882505219895393, + 27.896253335289657, + 28.09671812877059, + 27.85268287314102, + 27.742382244672626, + 28.76287134224549, + 28.061799827963114, + 28.054252550937235, + 27.92581091215834, + 27.954700701870024, + 27.952801280654967, + 27.932732206769288, + 28.13525341823697, + 27.90626529790461, + 28.22222373681143, + 27.82774563319981, + 27.953424714971334, + 28.111765013076365, + 27.881029167678207, + 27.89013520721346, + 27.974640298169106, + 
28.015457140281796, + 28.10881925234571, + 27.839136187918484, + 27.980245498009026, + 27.941782257054, + 27.858478610869497, + 18.717311061918736, + 11.598434458952397, + 11.579710375983268, + 11.584534626919776, + 11.59768055099994, + 11.199870459735394, + 11.196961861103773, + 11.191290821880102, + 11.286846159026027, + 11.2286821231246, + 11.219772449228913, + 11.226453838404268, + 11.211608635727316 + ], + "warmup_time": 115.76587492786348, + "peak_memory_mb": 33810.9892578125, + "allocated_memory_mb": 33196.46240234375, + "reserved_memory_mb": 34506.0, + "total_images": 50, + "avg_inference_time": 23.434121787122898, + "std_inference_time": 7.309697334208333, + "throughput": 0.04267281740208, + "memory_efficiency": 96.20489886496189 + }, + "0": { + "inference_times": [ + 26.802028878591955, + 26.797191261779517, + 26.82992110401392, + 26.958114746958017, + 26.761777761392295, + 26.84792256169021, + 26.746575728990138, + 27.076817566063255, + 26.943395509850234, + 26.85071571683511, + 26.841394792776555, + 27.631777914240956, + 26.833319212775677, + 26.806214389856905, + 26.8738283761777, + 26.895515635609627, + 26.940260547678918, + 27.09848231682554, + 26.858372538816184, + 26.918541864026338, + 26.94270030176267, + 26.95117418281734, + 26.760037765838206, + 26.82909763790667, + 26.831056244205683, + 26.92349842423573, + 26.80812106281519, + 26.730416806880385, + 27.080423870123923, + 27.055579679086804, + 27.27255716919899, + 27.117452350910753, + 26.751979127991945, + 26.876946676988155, + 26.70633071102202, + 26.0279257488437, + 25.012685468886048, + 11.098074495792389, + 11.139604906085879, + 11.128950875252485, + 11.235403429716825, + 11.146971406880766, + 11.146004664245993, + 11.10717493435368, + 11.114083136897534, + 11.114445879124105 + ], + "warmup_time": 111.6877530622296, + "peak_memory_mb": 33811.7392578125, + "allocated_memory_mb": 33197.21240234375, + "reserved_memory_mb": 34506.0, + "total_images": 50, + "avg_inference_time": 
23.76567098658289, + "std_inference_time": 6.237739828068353, + "throughput": 0.042077499118983785, + "memory_efficiency": 96.20707239999928 + } + } +} \ No newline at end of file diff --git a/diffusion-dpo-test/results-test/DIV2K-val-epoch10/0000263-seed-0.png b/diffusion-dpo-test/results-test/DIV2K-val-epoch10/0000263-seed-0.png new file mode 100644 index 0000000000000000000000000000000000000000..00fd241e68d8f05580a8759a9774ae41f550bcce Binary files /dev/null and b/diffusion-dpo-test/results-test/DIV2K-val-epoch10/0000263-seed-0.png differ diff --git a/diffusion-dpo-test/results-test/DIV2K-val-epoch10/0000463-seed-0.png b/diffusion-dpo-test/results-test/DIV2K-val-epoch10/0000463-seed-0.png new file mode 100644 index 0000000000000000000000000000000000000000..f3da207b5b10748545737726cc700276f365f565 Binary files /dev/null and b/diffusion-dpo-test/results-test/DIV2K-val-epoch10/0000463-seed-0.png differ diff --git a/diffusion-dpo-test/results-test/DIV2K-val-epoch10/0000563-seed-0.png b/diffusion-dpo-test/results-test/DIV2K-val-epoch10/0000563-seed-0.png new file mode 100644 index 0000000000000000000000000000000000000000..813851f82fc8120e7c9eabcb842ade99fdcd95ee Binary files /dev/null and b/diffusion-dpo-test/results-test/DIV2K-val-epoch10/0000563-seed-0.png differ diff --git a/diffusion-dpo-test/results-test/DIV2K-val-epoch10/0000763-seed-0.png b/diffusion-dpo-test/results-test/DIV2K-val-epoch10/0000763-seed-0.png new file mode 100644 index 0000000000000000000000000000000000000000..496ea95c8746e3b3848180a06e9bc6c256227e86 Binary files /dev/null and b/diffusion-dpo-test/results-test/DIV2K-val-epoch10/0000763-seed-0.png differ diff --git a/diffusion-dpo-test/results-test/DIV2K-val-epoch10/0000863-seed-0.png b/diffusion-dpo-test/results-test/DIV2K-val-epoch10/0000863-seed-0.png new file mode 100644 index 0000000000000000000000000000000000000000..f0c0574e7eee417d007a2a05e6124a6f2d1ea6b4 Binary files /dev/null and 
b/diffusion-dpo-test/results-test/DIV2K-val-epoch10/0000863-seed-0.png differ diff --git a/diffusion-dpo-test/results-test/DrealSR/sony_160_x4.png b/diffusion-dpo-test/results-test/DrealSR/sony_160_x4.png new file mode 100644 index 0000000000000000000000000000000000000000..e52c34fb42096edc66ee02bf19fae764fe6652f0 Binary files /dev/null and b/diffusion-dpo-test/results-test/DrealSR/sony_160_x4.png differ diff --git a/diffusion-dpo-test/results-test/DrealSR/sony_189_x4.png b/diffusion-dpo-test/results-test/DrealSR/sony_189_x4.png new file mode 100644 index 0000000000000000000000000000000000000000..b45b5f54e38a9428380e58cfe755dbff60d03842 Binary files /dev/null and b/diffusion-dpo-test/results-test/DrealSR/sony_189_x4.png differ diff --git a/diffusion-dpo-test/src/flux/__pycache__/block.cpython-310.pyc b/diffusion-dpo-test/src/flux/__pycache__/block.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..987bf13950ff4fd29bb69a904f24efa95798e7f1 Binary files /dev/null and b/diffusion-dpo-test/src/flux/__pycache__/block.cpython-310.pyc differ diff --git a/diffusion-dpo-test/src/flux/__pycache__/block.cpython-311.pyc b/diffusion-dpo-test/src/flux/__pycache__/block.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e38516267034724f213ffd3f79609e51b1201cc Binary files /dev/null and b/diffusion-dpo-test/src/flux/__pycache__/block.cpython-311.pyc differ diff --git a/diffusion-dpo-test/src/flux/__pycache__/condition.cpython-310.pyc b/diffusion-dpo-test/src/flux/__pycache__/condition.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..73e5447e202660152a8b5d6d457b8520ba96eafe Binary files /dev/null and b/diffusion-dpo-test/src/flux/__pycache__/condition.cpython-310.pyc differ diff --git a/diffusion-dpo-test/src/flux/__pycache__/condition.cpython-311.pyc b/diffusion-dpo-test/src/flux/__pycache__/condition.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..7099393dd3a89fe791ac8b77dd61257ac6d73415 Binary files /dev/null and b/diffusion-dpo-test/src/flux/__pycache__/condition.cpython-311.pyc differ diff --git a/diffusion-dpo-test/src/flux/__pycache__/generate.cpython-310.pyc b/diffusion-dpo-test/src/flux/__pycache__/generate.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c839c37010b83869542a936094dce1bd1b9381ce Binary files /dev/null and b/diffusion-dpo-test/src/flux/__pycache__/generate.cpython-310.pyc differ diff --git a/diffusion-dpo-test/src/flux/__pycache__/generate.cpython-311.pyc b/diffusion-dpo-test/src/flux/__pycache__/generate.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fef385667f87c3f476c532a261b0b949bf2714eb Binary files /dev/null and b/diffusion-dpo-test/src/flux/__pycache__/generate.cpython-311.pyc differ diff --git a/diffusion-dpo-test/src/flux/__pycache__/lora_controller.cpython-310.pyc b/diffusion-dpo-test/src/flux/__pycache__/lora_controller.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c7693b913caa9f8fc24d5251d7912440e9a26441 Binary files /dev/null and b/diffusion-dpo-test/src/flux/__pycache__/lora_controller.cpython-310.pyc differ diff --git a/diffusion-dpo-test/src/flux/__pycache__/lora_controller.cpython-311.pyc b/diffusion-dpo-test/src/flux/__pycache__/lora_controller.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2cf9936547f74c304dc0e2909980651d33f040c7 Binary files /dev/null and b/diffusion-dpo-test/src/flux/__pycache__/lora_controller.cpython-311.pyc differ diff --git a/diffusion-dpo-test/src/flux/__pycache__/pipeline_tools.cpython-310.pyc b/diffusion-dpo-test/src/flux/__pycache__/pipeline_tools.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..812015fe704e9f0721e2694d1c422bde3b10cd18 Binary files /dev/null and 
b/diffusion-dpo-test/src/flux/__pycache__/pipeline_tools.cpython-310.pyc differ diff --git a/diffusion-dpo-test/src/flux/__pycache__/pipeline_tools.cpython-311.pyc b/diffusion-dpo-test/src/flux/__pycache__/pipeline_tools.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..be59406291e72479e6a3c6bd0a13da41bddcaf96 Binary files /dev/null and b/diffusion-dpo-test/src/flux/__pycache__/pipeline_tools.cpython-311.pyc differ diff --git a/diffusion-dpo-test/src/flux/__pycache__/transformer.cpython-310.pyc b/diffusion-dpo-test/src/flux/__pycache__/transformer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b896c5a36395535d61dc8f440838ab2b5add8c96 Binary files /dev/null and b/diffusion-dpo-test/src/flux/__pycache__/transformer.cpython-310.pyc differ diff --git a/diffusion-dpo-test/src/flux/__pycache__/transformer.cpython-311.pyc b/diffusion-dpo-test/src/flux/__pycache__/transformer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..323307afda10b76f0d25f21f8a775bc675d29c3d Binary files /dev/null and b/diffusion-dpo-test/src/flux/__pycache__/transformer.cpython-311.pyc differ diff --git a/diffusion-dpo-test/src/flux/block.py b/diffusion-dpo-test/src/flux/block.py new file mode 100644 index 0000000000000000000000000000000000000000..1c9123abca755a4d66ac537260aaf24df2fd04e6 --- /dev/null +++ b/diffusion-dpo-test/src/flux/block.py @@ -0,0 +1,339 @@ +import torch +from typing import List, Union, Optional, Dict, Any, Callable +from diffusers.models.attention_processor import Attention, F +from .lora_controller import enable_lora + + +def attn_forward( + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: torch.FloatTensor = None, + condition_latents: torch.FloatTensor = None, + attention_mask: Optional[torch.FloatTensor] = None, + image_rotary_emb: Optional[torch.Tensor] = None, + cond_rotary_emb: Optional[torch.Tensor] = None, + model_config: 
Optional[Dict[str, Any]] = {}, +) -> torch.FloatTensor: + batch_size, _, _ = ( + hidden_states.shape + if encoder_hidden_states is None + else encoder_hidden_states.shape + ) + + with enable_lora( + (attn.to_q, attn.to_k, attn.to_v), model_config.get("latent_lora", False) + ): + # `sample` projections. + query = attn.to_q(hidden_states) + key = attn.to_k(hidden_states) + value = attn.to_v(hidden_states) + + inner_dim = key.shape[-1] + head_dim = inner_dim // attn.heads + + query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + if attn.norm_q is not None: + query = attn.norm_q(query) + if attn.norm_k is not None: + key = attn.norm_k(key) + + # the attention in FluxSingleTransformerBlock does not use `encoder_hidden_states` + if encoder_hidden_states is not None: + # `context` projections. + encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states) + encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states) + encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states) + + encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view( + batch_size, -1, attn.heads, head_dim + ).transpose(1, 2) + encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view( + batch_size, -1, attn.heads, head_dim + ).transpose(1, 2) + encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view( + batch_size, -1, attn.heads, head_dim + ).transpose(1, 2) + + if attn.norm_added_q is not None: + encoder_hidden_states_query_proj = attn.norm_added_q( + encoder_hidden_states_query_proj + ) + if attn.norm_added_k is not None: + encoder_hidden_states_key_proj = attn.norm_added_k( + encoder_hidden_states_key_proj + ) + + # attention + query = torch.cat([encoder_hidden_states_query_proj, query], dim=2) + key = torch.cat([encoder_hidden_states_key_proj, key], 
dim=2) + value = torch.cat([encoder_hidden_states_value_proj, value], dim=2) + + if image_rotary_emb is not None: + from diffusers.models.embeddings import apply_rotary_emb + + query = apply_rotary_emb(query, image_rotary_emb) + key = apply_rotary_emb(key, image_rotary_emb) + + if condition_latents is not None: + cond_query = attn.to_q(condition_latents) + cond_key = attn.to_k(condition_latents) + cond_value = attn.to_v(condition_latents) + + cond_query = cond_query.view(batch_size, -1, attn.heads, head_dim).transpose( + 1, 2 + ) + cond_key = cond_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + cond_value = cond_value.view(batch_size, -1, attn.heads, head_dim).transpose( + 1, 2 + ) + if attn.norm_q is not None: + cond_query = attn.norm_q(cond_query) + if attn.norm_k is not None: + cond_key = attn.norm_k(cond_key) + + if cond_rotary_emb is not None: + cond_query = apply_rotary_emb(cond_query, cond_rotary_emb) + cond_key = apply_rotary_emb(cond_key, cond_rotary_emb) + + if condition_latents is not None: + query = torch.cat([query, cond_query], dim=2) + key = torch.cat([key, cond_key], dim=2) + value = torch.cat([value, cond_value], dim=2) + + if not model_config.get("union_cond_attn", True): + # If we don't want to use the union condition attention, we need to mask the attention + # between the hidden states and the condition latents + attention_mask = torch.ones( + query.shape[2], key.shape[2], device=query.device, dtype=torch.bool + ) + condition_n = cond_query.shape[2] + attention_mask[-condition_n:, :-condition_n] = False + attention_mask[:-condition_n, -condition_n:] = False + elif model_config.get("independent_condition", False): + attention_mask = torch.ones( + query.shape[2], key.shape[2], device=query.device, dtype=torch.bool + ) + condition_n = cond_query.shape[2] + attention_mask[-condition_n:, :-condition_n] = False + if hasattr(attn, "c_factor"): + attention_mask = torch.zeros( + query.shape[2], key.shape[2], device=query.device, 
dtype=query.dtype + ) + condition_n = cond_query.shape[2] + bias = torch.log(attn.c_factor[0]) + attention_mask[-condition_n:, :-condition_n] = bias + attention_mask[:-condition_n, -condition_n:] = bias + hidden_states = F.scaled_dot_product_attention( + query, key, value, dropout_p=0.0, is_causal=False, attn_mask=attention_mask + ) + hidden_states = hidden_states.transpose(1, 2).reshape( + batch_size, -1, attn.heads * head_dim + ) + hidden_states = hidden_states.to(query.dtype) + + if encoder_hidden_states is not None: + if condition_latents is not None: + encoder_hidden_states, hidden_states, condition_latents = ( + hidden_states[:, : encoder_hidden_states.shape[1]], + hidden_states[ + :, encoder_hidden_states.shape[1] : -condition_latents.shape[1] + ], + hidden_states[:, -condition_latents.shape[1] :], + ) + else: + encoder_hidden_states, hidden_states = ( + hidden_states[:, : encoder_hidden_states.shape[1]], + hidden_states[:, encoder_hidden_states.shape[1] :], + ) + + with enable_lora((attn.to_out[0],), model_config.get("latent_lora", False)): + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + encoder_hidden_states = attn.to_add_out(encoder_hidden_states) + + if condition_latents is not None: + condition_latents = attn.to_out[0](condition_latents) + condition_latents = attn.to_out[1](condition_latents) + + return ( + (hidden_states, encoder_hidden_states, condition_latents) + if condition_latents is not None + else (hidden_states, encoder_hidden_states) + ) + elif condition_latents is not None: + # if there are condition_latents, we need to separate the hidden_states and the condition_latents + hidden_states, condition_latents = ( + hidden_states[:, : -condition_latents.shape[1]], + hidden_states[:, -condition_latents.shape[1] :], + ) + return hidden_states, condition_latents + else: + return hidden_states + + +def block_forward( + self, + hidden_states: torch.FloatTensor, + 
encoder_hidden_states: torch.FloatTensor, + condition_latents: torch.FloatTensor, + temb: torch.FloatTensor, + cond_temb: torch.FloatTensor, + cond_rotary_emb=None, + image_rotary_emb=None, + model_config: Optional[Dict[str, Any]] = {}, +): + use_cond = condition_latents is not None + with enable_lora((self.norm1.linear,), model_config.get("latent_lora", False)): + norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1( + hidden_states, emb=temb + ) + + norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = ( + self.norm1_context(encoder_hidden_states, emb=temb) + ) + + if use_cond: + ( + norm_condition_latents, + cond_gate_msa, + cond_shift_mlp, + cond_scale_mlp, + cond_gate_mlp, + ) = self.norm1(condition_latents, emb=cond_temb) + + # Attention. + result = attn_forward( + self.attn, + model_config=model_config, + hidden_states=norm_hidden_states, + encoder_hidden_states=norm_encoder_hidden_states, + condition_latents=norm_condition_latents if use_cond else None, + image_rotary_emb=image_rotary_emb, + cond_rotary_emb=cond_rotary_emb if use_cond else None, + ) + attn_output, context_attn_output = result[:2] + cond_attn_output = result[2] if use_cond else None + + # Process attention outputs for the `hidden_states`. + # 1. hidden_states + attn_output = gate_msa.unsqueeze(1) * attn_output + hidden_states = hidden_states + attn_output + # 2. encoder_hidden_states + context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output + encoder_hidden_states = encoder_hidden_states + context_attn_output + # 3. condition_latents + if use_cond: + cond_attn_output = cond_gate_msa.unsqueeze(1) * cond_attn_output + condition_latents = condition_latents + cond_attn_output + if model_config.get("add_cond_attn", False): + hidden_states += cond_attn_output + + # LayerNorm + MLP. + # 1. 
hidden_states + norm_hidden_states = self.norm2(hidden_states) + norm_hidden_states = ( + norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] + ) + # 2. encoder_hidden_states + norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states) + norm_encoder_hidden_states = ( + norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None] + ) + # 3. condition_latents + if use_cond: + norm_condition_latents = self.norm2(condition_latents) + norm_condition_latents = ( + norm_condition_latents * (1 + cond_scale_mlp[:, None]) + + cond_shift_mlp[:, None] + ) + + # Feed-forward. + with enable_lora((self.ff.net[2],), model_config.get("latent_lora", False)): + # 1. hidden_states + ff_output = self.ff(norm_hidden_states) + ff_output = gate_mlp.unsqueeze(1) * ff_output + # 2. encoder_hidden_states + context_ff_output = self.ff_context(norm_encoder_hidden_states) + context_ff_output = c_gate_mlp.unsqueeze(1) * context_ff_output + # 3. condition_latents + if use_cond: + cond_ff_output = self.ff(norm_condition_latents) + cond_ff_output = cond_gate_mlp.unsqueeze(1) * cond_ff_output + + # Process feed-forward outputs. + hidden_states = hidden_states + ff_output + encoder_hidden_states = encoder_hidden_states + context_ff_output + if use_cond: + condition_latents = condition_latents + cond_ff_output + + # Clip to avoid overflow. 
+ if encoder_hidden_states.dtype == torch.float16: + encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504) + + return encoder_hidden_states, hidden_states, condition_latents if use_cond else None + + +def single_block_forward( + self, + hidden_states: torch.FloatTensor, + temb: torch.FloatTensor, + image_rotary_emb=None, + condition_latents: torch.FloatTensor = None, + cond_temb: torch.FloatTensor = None, + cond_rotary_emb=None, + model_config: Optional[Dict[str, Any]] = {}, +): + + using_cond = condition_latents is not None + residual = hidden_states + with enable_lora( + ( + self.norm.linear, + self.proj_mlp, + ), + model_config.get("latent_lora", False), + ): + norm_hidden_states, gate = self.norm(hidden_states, emb=temb) + mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states)) + if using_cond: + residual_cond = condition_latents + norm_condition_latents, cond_gate = self.norm(condition_latents, emb=cond_temb) + mlp_cond_hidden_states = self.act_mlp(self.proj_mlp(norm_condition_latents)) + + attn_output = attn_forward( + self.attn, + model_config=model_config, + hidden_states=norm_hidden_states, + image_rotary_emb=image_rotary_emb, + **( + { + "condition_latents": norm_condition_latents, + "cond_rotary_emb": cond_rotary_emb if using_cond else None, + } + if using_cond + else {} + ), + ) + if using_cond: + attn_output, cond_attn_output = attn_output + + with enable_lora((self.proj_out,), model_config.get("latent_lora", False)): + hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2) + gate = gate.unsqueeze(1) + hidden_states = gate * self.proj_out(hidden_states) + hidden_states = residual + hidden_states + if using_cond: + condition_latents = torch.cat([cond_attn_output, mlp_cond_hidden_states], dim=2) + cond_gate = cond_gate.unsqueeze(1) + condition_latents = cond_gate * self.proj_out(condition_latents) + condition_latents = residual_cond + condition_latents + + if hidden_states.dtype == torch.float16: + hidden_states = 
hidden_states.clip(-65504, 65504) + + return hidden_states if not using_cond else (hidden_states, condition_latents) diff --git a/diffusion-dpo-test/src/flux/condition.py b/diffusion-dpo-test/src/flux/condition.py new file mode 100644 index 0000000000000000000000000000000000000000..782736020eeac897f41d8227851d83a7f2a93ca2 --- /dev/null +++ b/diffusion-dpo-test/src/flux/condition.py @@ -0,0 +1,131 @@ +import torch +from typing import Optional, Union, List, Tuple +from diffusers.pipelines import FluxPipeline +from PIL import Image, ImageFilter +import numpy as np +import cv2 + +from .pipeline_tools import encode_images + +condition_dict = { + "depth": 0, + "canny": 1, + "subject": 4, + "coloring": 6, + "deblurring": 7, + "depth_pred": 8, + "fill": 9, + "sr": 10, + "cartoon": 11, +} + + +class Condition(object): + def __init__( + self, + condition_type: str, + raw_img: Union[Image.Image, torch.Tensor] = None, + condition: Union[Image.Image, torch.Tensor] = None, + mask=None, + position_delta=None, + position_scale=1.0, + ) -> None: + self.condition_type = condition_type + assert raw_img is not None or condition is not None + if raw_img is not None: + self.condition = self.get_condition(condition_type, raw_img) + else: + self.condition = condition + self.position_delta = position_delta + self.position_scale = position_scale + # TODO: Add mask support + assert mask is None, "Mask not supported yet" + + def get_condition( + self, condition_type: str, raw_img: Union[Image.Image, torch.Tensor] + ) -> Union[Image.Image, torch.Tensor]: + """ + Returns the condition image. 
+ """ + if condition_type == "depth": + from transformers import pipeline + + depth_pipe = pipeline( + task="depth-estimation", + model="LiheYoung/depth-anything-small-hf", + device="cuda", + ) + source_image = raw_img.convert("RGB") + condition_img = depth_pipe(source_image)["depth"].convert("RGB") + return condition_img + elif condition_type == "canny": + img = np.array(raw_img) + edges = cv2.Canny(img, 100, 200) + edges = Image.fromarray(edges).convert("RGB") + return edges + elif condition_type == "subject": + return raw_img + elif condition_type == "coloring": + return raw_img.convert("L").convert("RGB") + elif condition_type == "deblurring": + condition_image = ( + raw_img.convert("RGB") + .filter(ImageFilter.GaussianBlur(10)) + .convert("RGB") + ) + return condition_image + elif condition_type == "fill": + return raw_img.convert("RGB") + elif condition_type == "cartoon": + return raw_img.convert("RGB") + elif condition_type == "sr": + return raw_img.convert("RGB") + return self.condition + + @property + def type_id(self) -> int: + """ + Returns the type id of the condition. + """ + return condition_dict[self.condition_type] + + @classmethod + def get_type_id(cls, condition_type: str) -> int: + """ + Returns the type id of the condition. + """ + return condition_dict[condition_type] + + def encode(self, pipe: FluxPipeline) -> Tuple[torch.Tensor, torch.Tensor, int]: + """ + Encodes the condition into tokens, ids and type_id. 
+ """ + if self.condition_type in [ + "depth", + "canny", + "subject", + "coloring", + "deblurring", + "depth_pred", + "fill", + "sr", + "cartoon", + ]: + tokens, ids = encode_images(pipe, self.condition) + else: + raise NotImplementedError( + f"Condition type {self.condition_type} not implemented" + ) + if self.position_delta is None and self.condition_type == "subject": + self.position_delta = [0, -self.condition.size[0] // 16] + if self.position_delta is not None: + ids[:, 1] += self.position_delta[0] + ids[:, 2] += self.position_delta[1] + if self.position_scale != 1.0: + scale_bias = (self.position_scale - 1.0) / 2 + ids[:, 1] *= self.position_scale + ids[:, 2] *= self.position_scale + ids[:, 1] += scale_bias + ids[:, 2] += scale_bias + type_id = torch.ones_like(ids[:, :1]) * self.type_id + return tokens, ids, type_id diff --git a/diffusion-dpo-test/src/flux/generate.py b/diffusion-dpo-test/src/flux/generate.py new file mode 100644 index 0000000000000000000000000000000000000000..96b65f1ec151d8c65e2946a91f6b3acc49da67f1 --- /dev/null +++ b/diffusion-dpo-test/src/flux/generate.py @@ -0,0 +1,296 @@ +import torch +import yaml, os +from diffusers.pipelines import FluxPipeline +from typing import List, Union, Optional, Dict, Any, Callable +from .transformer import tranformer_forward +from .condition import Condition + +from diffusers.pipelines.flux.pipeline_flux import ( + FluxPipelineOutput, + calculate_shift, + retrieve_timesteps, + np, +) + + +def get_config(config_path: str = None): + config_path = config_path or os.environ.get("XFL_CONFIG") + if not config_path: + return {} + with open(config_path, "r") as f: + config = yaml.safe_load(f) + return config + + +def prepare_params( + prompt: Union[str, List[str]] = None, + prompt_2: Optional[Union[str, List[str]]] = None, + height: Optional[int] = 512, + width: Optional[int] = 512, + num_inference_steps: int = 28, + timesteps: List[int] = None, + guidance_scale: float = 3.5, + num_images_per_prompt: Optional[int] = 
1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + joint_attention_kwargs: Optional[Dict[str, Any]] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + max_sequence_length: int = 512, + **kwargs: dict, +): + return ( + prompt, + prompt_2, + height, + width, + num_inference_steps, + timesteps, + guidance_scale, + num_images_per_prompt, + generator, + latents, + prompt_embeds, + pooled_prompt_embeds, + output_type, + return_dict, + joint_attention_kwargs, + callback_on_step_end, + callback_on_step_end_tensor_inputs, + max_sequence_length, + ) + + +def seed_everything(seed: int = 42): + torch.backends.cudnn.deterministic = True + torch.manual_seed(seed) + np.random.seed(seed) + + +@torch.no_grad() +def generate( + pipeline: FluxPipeline, + conditions: List[Condition] = None, + config_path: str = None, + model_config: Optional[Dict[str, Any]] = {}, + condition_scale: float = 1.0, + default_lora: bool = False, + **params: dict, +): + model_config = model_config or get_config(config_path).get("model", {}) + if condition_scale != 1: + for name, module in pipeline.transformer.named_modules(): + if not name.endswith(".attn"): + continue + module.c_factor = torch.ones(1, 1) * condition_scale + + self = pipeline + ( + prompt, + prompt_2, + height, + width, + num_inference_steps, + timesteps, + guidance_scale, + num_images_per_prompt, + generator, + latents, + prompt_embeds, + pooled_prompt_embeds, + output_type, + return_dict, + joint_attention_kwargs, + callback_on_step_end, + callback_on_step_end_tensor_inputs, + max_sequence_length, + ) = prepare_params(**params) + + height = height or self.default_sample_size * 
self.vae_scale_factor + width = width or self.default_sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + prompt_2, + height, + width, + prompt_embeds=prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, + max_sequence_length=max_sequence_length, + ) + + self._guidance_scale = guidance_scale + self._joint_attention_kwargs = joint_attention_kwargs + self._interrupt = False + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + lora_scale = ( + self.joint_attention_kwargs.get("scale", None) + if self.joint_attention_kwargs is not None + else None + ) + ( + prompt_embeds, + pooled_prompt_embeds, + text_ids, + ) = self.encode_prompt( + prompt=prompt, + prompt_2=prompt_2, + prompt_embeds=prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + device=device, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + lora_scale=lora_scale, + ) + + # 4. Prepare latent variables + num_channels_latents = self.transformer.config.in_channels // 4 + latents, latent_image_ids = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 4.1. Prepare conditions + condition_latents, condition_ids, condition_type_ids = ([] for _ in range(3)) + use_condition = conditions is not None or [] + if use_condition: + assert len(conditions) <= 1, "Only one condition is supported for now." 
+ if not default_lora: + pipeline.set_adapters(conditions[0].condition_type) + for condition in conditions: + tokens, ids, type_id = condition.encode(self) + condition_latents.append(tokens) # [batch_size, token_n, token_dim] + condition_ids.append(ids) # [token_n, id_dim(3)] + condition_type_ids.append(type_id) # [token_n, 1] + condition_latents = torch.cat(condition_latents, dim=1) + condition_ids = torch.cat(condition_ids, dim=0) + condition_type_ids = torch.cat(condition_type_ids, dim=0) + + # 5. Prepare timesteps + sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) + image_seq_len = latents.shape[1] + mu = calculate_shift( + image_seq_len, + self.scheduler.config.base_image_seq_len, + self.scheduler.config.max_image_seq_len, + self.scheduler.config.base_shift, + self.scheduler.config.max_shift, + ) + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, + num_inference_steps, + device, + timesteps, + sigmas, + mu=mu, + ) + num_warmup_steps = max( + len(timesteps) - num_inference_steps * self.scheduler.order, 0 + ) + self._num_timesteps = len(timesteps) + + # 6. 
Denoising loop + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timestep = t.expand(latents.shape[0]).to(latents.dtype) + + # handle guidance + if self.transformer.config.guidance_embeds: + guidance = torch.tensor([guidance_scale], device=device) + guidance = guidance.expand(latents.shape[0]) + else: + guidance = None + noise_pred = tranformer_forward( + self.transformer, + model_config=model_config, + # Inputs of the condition (new feature) + condition_latents=condition_latents if use_condition else None, + condition_ids=condition_ids if use_condition else None, + condition_type_ids=condition_type_ids if use_condition else None, + # Inputs to the original transformer + hidden_states=latents, + # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transforme rmodel (we should not keep it but I want to keep the inputs same for the model for testing) + timestep=timestep / 1000, + guidance=guidance, + pooled_projections=pooled_prompt_embeds, + encoder_hidden_states=prompt_embeds, + txt_ids=text_ids, + img_ids=latent_image_ids, + joint_attention_kwargs=self.joint_attention_kwargs, + return_dict=False, + )[0] + + # compute the previous noisy sample x_t -> x_t-1 + latents_dtype = latents.dtype + latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0] + + if latents.dtype != latents_dtype: + if torch.backends.mps.is_available(): + # some platforms (eg. 
apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272 + latents = latents.to(latents_dtype) + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ( + (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0 + ): + progress_bar.update() + + if output_type == "latent": + image = latents + + else: + latents = self._unpack_latents(latents, height, width, self.vae_scale_factor) + latents = ( + latents / self.vae.config.scaling_factor + ) + self.vae.config.shift_factor + image = self.vae.decode(latents, return_dict=False)[0] + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if condition_scale != 1: + for name, module in pipeline.transformer.named_modules(): + if not name.endswith(".attn"): + continue + del module.c_factor + + if not return_dict: + return (image,) + + return FluxPipelineOutput(images=image) diff --git a/diffusion-dpo-test/src/flux/lora_controller.py b/diffusion-dpo-test/src/flux/lora_controller.py new file mode 100644 index 0000000000000000000000000000000000000000..21b23eae2bdeaab171df4616e174dd6d96351620 --- /dev/null +++ b/diffusion-dpo-test/src/flux/lora_controller.py @@ -0,0 +1,75 @@ +from peft.tuners.tuners_utils import BaseTunerLayer +from typing import List, Any, Optional, Type + + +class enable_lora: + def __init__(self, lora_modules: List[BaseTunerLayer], activated: bool) -> None: + self.activated: bool = activated + if activated: + return + self.lora_modules: List[BaseTunerLayer] = [ + each for each in lora_modules if isinstance(each, BaseTunerLayer) + ] + 
self.scales = [ + { + active_adapter: lora_module.scaling[active_adapter] + for active_adapter in lora_module.active_adapters + } + for lora_module in self.lora_modules + ] + + def __enter__(self) -> None: + if self.activated: + return + + for lora_module in self.lora_modules: + if not isinstance(lora_module, BaseTunerLayer): + continue + lora_module.scale_layer(0) + + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc_val: Optional[BaseException], + exc_tb: Optional[Any], + ) -> None: + if self.activated: + return + for i, lora_module in enumerate(self.lora_modules): + if not isinstance(lora_module, BaseTunerLayer): + continue + for active_adapter in lora_module.active_adapters: + lora_module.scaling[active_adapter] = self.scales[i][active_adapter] + + +class set_lora_scale: + def __init__(self, lora_modules: List[BaseTunerLayer], scale: float) -> None: + self.lora_modules: List[BaseTunerLayer] = [ + each for each in lora_modules if isinstance(each, BaseTunerLayer) + ] + self.scales = [ + { + active_adapter: lora_module.scaling[active_adapter] + for active_adapter in lora_module.active_adapters + } + for lora_module in self.lora_modules + ] + self.scale = scale + + def __enter__(self) -> None: + for lora_module in self.lora_modules: + if not isinstance(lora_module, BaseTunerLayer): + continue + lora_module.scale_layer(self.scale) + + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc_val: Optional[BaseException], + exc_tb: Optional[Any], + ) -> None: + for i, lora_module in enumerate(self.lora_modules): + if not isinstance(lora_module, BaseTunerLayer): + continue + for active_adapter in lora_module.active_adapters: + lora_module.scaling[active_adapter] = self.scales[i][active_adapter] diff --git a/diffusion-dpo-test/src/flux/pipeline_tools.py b/diffusion-dpo-test/src/flux/pipeline_tools.py new file mode 100644 index 0000000000000000000000000000000000000000..36174e4fe27dd12dbe629e3b92ca160b3600d889 --- /dev/null +++ 
b/diffusion-dpo-test/src/flux/pipeline_tools.py @@ -0,0 +1,52 @@ +from diffusers.pipelines import FluxPipeline +from diffusers.utils import logging +from diffusers.pipelines.flux.pipeline_flux import logger +from torch import Tensor + + +def encode_images(pipeline: FluxPipeline, images: Tensor): + images = pipeline.image_processor.preprocess(images) + images = images.to(pipeline.device).to(pipeline.dtype) + images = pipeline.vae.encode(images).latent_dist.sample() + images = ( + images - pipeline.vae.config.shift_factor + ) * pipeline.vae.config.scaling_factor + images_tokens = pipeline._pack_latents(images, *images.shape) + images_ids = pipeline._prepare_latent_image_ids( + images.shape[0], + images.shape[2], + images.shape[3], + pipeline.device, + pipeline.dtype, + ) + if images_tokens.shape[1] != images_ids.shape[0]: + images_ids = pipeline._prepare_latent_image_ids( + images.shape[0], + images.shape[2] // 2, + images.shape[3] // 2, + pipeline.device, + pipeline.dtype, + ) + return images_tokens, images_ids + + +def prepare_text_input(pipeline: FluxPipeline, prompts, max_sequence_length=512): + # Turn off warnings (CLIP overflow) + logger.setLevel(logging.ERROR) + ( + prompt_embeds, + pooled_prompt_embeds, + text_ids, + ) = pipeline.encode_prompt( + prompt=prompts, + prompt_2=None, + prompt_embeds=None, + pooled_prompt_embeds=None, + device=pipeline.device, + num_images_per_prompt=1, + max_sequence_length=max_sequence_length, + lora_scale=None, + ) + # Turn on warnings + logger.setLevel(logging.WARNING) + return prompt_embeds, pooled_prompt_embeds, text_ids diff --git a/diffusion-dpo-test/src/flux/transformer.py b/diffusion-dpo-test/src/flux/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..4ffa539162e5445476a4ea7d32c4a8eb91e984b8 --- /dev/null +++ b/diffusion-dpo-test/src/flux/transformer.py @@ -0,0 +1,252 @@ +import torch +from diffusers.pipelines import FluxPipeline +from typing import List, Union, Optional, Dict, Any, 
Callable +from .block import block_forward, single_block_forward +from .lora_controller import enable_lora +from accelerate.utils import is_torch_version +from diffusers.models.transformers.transformer_flux import ( + FluxTransformer2DModel, + Transformer2DModelOutput, + USE_PEFT_BACKEND, + scale_lora_layers, + unscale_lora_layers, + logger, +) +import numpy as np + + +def prepare_params( + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor = None, + pooled_projections: torch.Tensor = None, + timestep: torch.LongTensor = None, + img_ids: torch.Tensor = None, + txt_ids: torch.Tensor = None, + guidance: torch.Tensor = None, + joint_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_block_samples=None, + controlnet_single_block_samples=None, + return_dict: bool = True, + **kwargs: dict, +): + return ( + hidden_states, + encoder_hidden_states, + pooled_projections, + timestep, + img_ids, + txt_ids, + guidance, + joint_attention_kwargs, + controlnet_block_samples, + controlnet_single_block_samples, + return_dict, + ) + + +def tranformer_forward( + transformer: FluxTransformer2DModel, + condition_latents: torch.Tensor, + condition_ids: torch.Tensor, + condition_type_ids: torch.Tensor, + model_config: Optional[Dict[str, Any]] = {}, + c_t=0, + **params: dict, +): + self = transformer + use_condition = condition_latents is not None + + ( + hidden_states, + encoder_hidden_states, + pooled_projections, + timestep, + img_ids, + txt_ids, + guidance, + joint_attention_kwargs, + controlnet_block_samples, + controlnet_single_block_samples, + return_dict, + ) = prepare_params(**params) + + if joint_attention_kwargs is not None: + joint_attention_kwargs = joint_attention_kwargs.copy() + lora_scale = joint_attention_kwargs.pop("scale", 1.0) + else: + lora_scale = 1.0 + + if USE_PEFT_BACKEND: + # weight the lora layers by setting `lora_scale` for each PEFT layer + scale_lora_layers(self, lora_scale) + else: + if ( + joint_attention_kwargs is not None + and 
joint_attention_kwargs.get("scale", None) is not None + ): + logger.warning( + "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective." + ) + + with enable_lora((self.x_embedder,), model_config.get("latent_lora", False)): + hidden_states = self.x_embedder(hidden_states) + condition_latents = self.x_embedder(condition_latents) if use_condition else None + + timestep = timestep.to(hidden_states.dtype) * 1000 + + if guidance is not None: + guidance = guidance.to(hidden_states.dtype) * 1000 + else: + guidance = None + + temb = ( + self.time_text_embed(timestep, pooled_projections) + if guidance is None + else self.time_text_embed(timestep, guidance, pooled_projections) + ) + + cond_temb = ( + self.time_text_embed(torch.ones_like(timestep) * c_t * 1000, pooled_projections) + if guidance is None + else self.time_text_embed( + torch.ones_like(timestep) * c_t * 1000, guidance, pooled_projections + ) + ) + encoder_hidden_states = self.context_embedder(encoder_hidden_states) + + if txt_ids.ndim == 3: + logger.warning( + "Passing `txt_ids` 3d torch.Tensor is deprecated." + "Please remove the batch dimension and pass it as a 2d torch Tensor" + ) + txt_ids = txt_ids[0] + if img_ids.ndim == 3: + logger.warning( + "Passing `img_ids` 3d torch.Tensor is deprecated." 
+ "Please remove the batch dimension and pass it as a 2d torch Tensor" + ) + img_ids = img_ids[0] + + ids = torch.cat((txt_ids, img_ids), dim=0) + image_rotary_emb = self.pos_embed(ids) + if use_condition: + # condition_ids[:, :1] = condition_type_ids + cond_rotary_emb = self.pos_embed(condition_ids) + + # hidden_states = torch.cat([hidden_states, condition_latents], dim=1) + + for index_block, block in enumerate(self.transformer_blocks): + if self.training and self.gradient_checkpointing: + ckpt_kwargs: Dict[str, Any] = ( + {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + ) + encoder_hidden_states, hidden_states, condition_latents = ( + torch.utils.checkpoint.checkpoint( + block_forward, + self=block, + model_config=model_config, + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + condition_latents=condition_latents if use_condition else None, + temb=temb, + cond_temb=cond_temb if use_condition else None, + cond_rotary_emb=cond_rotary_emb if use_condition else None, + image_rotary_emb=image_rotary_emb, + **ckpt_kwargs, + ) + ) + + else: + encoder_hidden_states, hidden_states, condition_latents = block_forward( + block, + model_config=model_config, + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + condition_latents=condition_latents if use_condition else None, + temb=temb, + cond_temb=cond_temb if use_condition else None, + cond_rotary_emb=cond_rotary_emb if use_condition else None, + image_rotary_emb=image_rotary_emb, + ) + + # controlnet residual + if controlnet_block_samples is not None: + interval_control = len(self.transformer_blocks) / len( + controlnet_block_samples + ) + interval_control = int(np.ceil(interval_control)) + hidden_states = ( + hidden_states + + controlnet_block_samples[index_block // interval_control] + ) + hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1) + + for index_block, block in enumerate(self.single_transformer_blocks): + if 
self.training and self.gradient_checkpointing: + ckpt_kwargs: Dict[str, Any] = ( + {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + ) + result = torch.utils.checkpoint.checkpoint( + single_block_forward, + self=block, + model_config=model_config, + hidden_states=hidden_states, + temb=temb, + image_rotary_emb=image_rotary_emb, + **( + { + "condition_latents": condition_latents, + "cond_temb": cond_temb, + "cond_rotary_emb": cond_rotary_emb, + } + if use_condition + else {} + ), + **ckpt_kwargs, + ) + + else: + result = single_block_forward( + block, + model_config=model_config, + hidden_states=hidden_states, + temb=temb, + image_rotary_emb=image_rotary_emb, + **( + { + "condition_latents": condition_latents, + "cond_temb": cond_temb, + "cond_rotary_emb": cond_rotary_emb, + } + if use_condition + else {} + ), + ) + if use_condition: + hidden_states, condition_latents = result + else: + hidden_states = result + + # controlnet residual + if controlnet_single_block_samples is not None: + interval_control = len(self.single_transformer_blocks) / len( + controlnet_single_block_samples + ) + interval_control = int(np.ceil(interval_control)) + hidden_states[:, encoder_hidden_states.shape[1] :, ...] = ( + hidden_states[:, encoder_hidden_states.shape[1] :, ...] + + controlnet_single_block_samples[index_block // interval_control] + ) + + hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...] 
def process_image_and_text(image, text):
    """Generate one subject-conditioned image for the Gradio demo.

    The input image is centre-cropped to a square, resized to 512x512 and
    wrapped in a ``"subject"`` condition. The module-level pipeline is
    initialised on first use, then ``generate`` is run for 8 inference steps.

    Args:
        image: Object exposing ``size``, ``crop`` and ``resize``; the Gradio
            input is declared with ``type="pil"``.
        text: Prompt string; surrounding whitespace is stripped.

    Returns:
        The first image of the generation result.
    """
    # Centre crop: the square side is the shorter edge of the input.
    width, height = image.size[0], image.size[1]
    side = min(image.size)
    left = (width - side) // 2
    top = (height - side) // 2
    right = (width + side) // 2
    bottom = (height + side) // 2
    square = image.crop((left, top, right, bottom)).resize((512, 512))

    condition = Condition("subject", square, position_delta=(0, 32))

    # Lazy initialisation so importing the module does not load the model.
    if pipe is None:
        init_pipeline()

    generated = generate(
        pipe,
        prompt=text.strip(),
        conditions=[condition],
        num_inference_steps=8,
        height=512,
        width=512,
    )
    return generated.images[0]
def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
    """Report gradient statistics and trigger periodic saving and sampling.

    Runs after each training batch: it measures the L2 norm of every
    available parameter gradient, bumps ``self.total_steps``, optionally logs
    to wandb, and prints, saves LoRA weights or generates a sample whenever
    the step counter hits the corresponding interval.

    Args:
        trainer: Provides ``current_epoch`` and ``accumulate_grad_batches``.
        pl_module: Provides ``named_parameters()``, ``log_loss``, ``last_t``
            and ``save_lora(path)``.
        outputs: Mapping whose ``"loss"`` entry supports ``.item()``; only
            read when wandb logging is enabled.
        batch: Mapping whose ``"condition_type"`` entry is indexable; its
            first element selects the sample's condition type.
        batch_idx: Index of the batch; used in the progress print.
    """
    # One norm (and one ``.item()`` call) per parameter that has a gradient.
    norms = [
        param.grad.norm(2).item()
        for _, param in pl_module.named_parameters()
        if param.grad is not None
    ]
    gradient_size = sum(norms) / len(norms) if norms else 0
    # Seeding with 0 keeps the result defined when no gradient is present.
    max_gradient_size = max([0, *norms])

    self.total_steps += 1

    # Log the metrics of this step to wandb when it is available.
    if self.use_wandb:
        # "steps" carries the global counter. The literal used to list the
        # key twice (first with batch_idx); that entry was always overwritten.
        report_dict = {
            "steps": self.total_steps,
            "epoch": trainer.current_epoch,
            "gradient_size": gradient_size,
            # Multiplied by the accumulation count before logging.
            "loss": outputs["loss"].item() * trainer.accumulate_grad_batches,
            "t": pl_module.last_t,
        }
        wandb.log(report_dict)

    # Print training progress every n steps.
    if self.total_steps % self.print_every_n_steps == 0:
        print(
            f"Epoch: {trainer.current_epoch}, Steps: {self.total_steps}, Batch: {batch_idx}, Loss: {pl_module.log_loss:.4f}, Gradient size: {gradient_size:.4f}, Max gradient size: {max_gradient_size:.4f}"
        )

    # Save LoRA weights at specified intervals.
    if self.total_steps % self.save_interval == 0:
        print(
            f"Epoch: {trainer.current_epoch}, Steps: {self.total_steps} - Saving LoRA weights"
        )
        pl_module.save_lora(
            f"{self.save_path}/{self.run_name}/ckpt/{self.total_steps}"
        )

    # Generate and save a sample image at specified intervals.
    if self.total_steps % self.sample_interval == 0:
        print(
            f"Epoch: {trainer.current_epoch}, Steps: {self.total_steps} - Generating a sample"
        )
        self.generate_a_sample(
            trainer,
            pl_module,
            f"{self.save_path}/{self.run_name}/output",
            f"lora_{self.total_steps}",
            # Use the condition type from the current batch.
            batch["condition_type"][0],
        )
+ if condition_type == "subject": + test_list.extend( + [ + ( + Image.open("assets/test_in.jpg"), + [0, -32], + "Resting on the picnic table at a lakeside campsite, it's caught in the golden glow of early morning, with mist rising from the water and tall pines casting long shadows behind the scene.", + ), + ( + Image.open("assets/test_out.jpg"), + [0, -32], + "In a bright room. It is placed on a table.", + ), + ] + ) + elif condition_type == "canny": + condition_img = Image.open("assets/vase_hq.jpg").resize( + (condition_size, condition_size) + ) + condition_img = np.array(condition_img) + condition_img = cv2.Canny(condition_img, 100, 200) + condition_img = Image.fromarray(condition_img).convert("RGB") + test_list.append( + ( + condition_img, + [0, 0], + "A beautiful vase on a table.", + {"position_scale": position_scale} if position_scale != 1.0 else {}, + ) + ) + elif condition_type == "coloring": + condition_img = ( + Image.open("assets/vase_hq.jpg") + .resize((condition_size, condition_size)) + .convert("L") + .convert("RGB") + ) + test_list.append((condition_img, [0, 0], "A beautiful vase on a table.")) + elif condition_type == "depth": + if not hasattr(self, "deepth_pipe"): + self.deepth_pipe = pipeline( + task="depth-estimation", + model="LiheYoung/depth-anything-small-hf", + device="cpu", + ) + condition_img = ( + Image.open("assets/vase_hq.jpg") + .resize((condition_size, condition_size)) + .convert("RGB") + ) + condition_img = self.deepth_pipe(condition_img)["depth"].convert("RGB") + test_list.append( + ( + condition_img, + [0, 0], + "A beautiful vase on a table.", + {"position_scale": position_scale} if position_scale != 1.0 else {}, + ) + ) + elif condition_type == "depth_pred": + condition_img = ( + Image.open("assets/vase_hq.jpg") + .resize((condition_size, condition_size)) + .convert("RGB") + ) + test_list.append((condition_img, [0, 0], "A beautiful vase on a table.")) + elif condition_type == "deblurring": + blur_radius = 5 + image = 
Image.open("./assets/vase_hq.jpg") + condition_img = ( + image.convert("RGB") + .resize((condition_size, condition_size)) + .filter(ImageFilter.GaussianBlur(blur_radius)) + .convert("RGB") + ) + test_list.append( + ( + condition_img, + [0, 0], + "A beautiful vase on a table.", + {"position_scale": position_scale} if position_scale != 1.0 else {}, + ) + ) + elif condition_type == "fill": + condition_img = ( + Image.open("./assets/vase_hq.jpg") + .resize((condition_size, condition_size)) + .convert("RGB") + ) + mask = Image.new("L", condition_img.size, 0) + draw = ImageDraw.Draw(mask) + a = condition_img.size[0] // 4 + b = a * 3 + draw.rectangle([a, a, b, b], fill=255) + condition_img = Image.composite( + condition_img, Image.new("RGB", condition_img.size, (0, 0, 0)), mask + ) + test_list.append((condition_img, [0, 0], "A beautiful vase on a table.")) + elif condition_type == "sr": + condition_img = ( + Image.open("/mnt/bn/yiren-bytenas2/yuang.ai/data/sr/flux_train/sr_bicubic/0000001.png") + .resize((condition_size, condition_size)) + .convert("RGB") + ) + test_list.append((condition_img, [0, 0], "")) + elif condition_type == "cartoon": + condition_img = ( + Image.open("assets/cartoon_boy.png") + .resize((condition_size, condition_size)) + .convert("RGB") + ) + test_list.append( + ( + condition_img, + [0, -16], + "A cartoon character in a white background. 
class Subject200KDataset(Dataset):
    """Two-view dataset built from side-by-side composite images.

    Each base item yields two samples: even indices use the left crop as the
    target and the right crop as the condition, odd indices swap the roles.
    Text and condition image are dropped at random with the configured
    probabilities.
    """

    def __init__(
        self,
        base_dataset,
        condition_size: int = 512,
        target_size: int = 512,
        image_size: int = 512,
        padding: int = 0,
        condition_type: str = "subject",
        drop_text_prob: float = 0.1,
        drop_image_prob: float = 0.1,
        return_pil_image: bool = False,
    ):
        self.base_dataset = base_dataset
        # Geometry of the composite image and of the produced samples.
        self.condition_size, self.target_size = condition_size, target_size
        self.image_size, self.padding = image_size, padding
        self.condition_type = condition_type
        # Augmentation probabilities.
        self.drop_text_prob, self.drop_image_prob = drop_text_prob, drop_image_prob
        self.return_pil_image = return_pil_image

        self.to_tensor = T.ToTensor()

    def __len__(self):
        # Every base item is served twice (once per target side).
        return 2 * len(self.base_dataset)

    def __getitem__(self, idx):
        """Return the sample dict for ``idx`` (base item ``idx // 2``)."""
        # ``target == 0`` means the left half is the target image.
        pair_index, target = divmod(idx, 2)
        item = self.base_dataset[pair_index]
        image = item["image"]

        # Crop boxes of the two halves inside the padded composite.
        pad, side = self.padding, self.image_size
        left_box = (pad, pad, side + pad, side + pad)
        right_box = (side + pad * 2, pad, side * 2 + pad * 2, side + pad)
        left_img = image.crop(left_box)
        right_img = image.crop(right_box)

        if target == 0:
            target_image, condition_img = left_img, right_img
            caption_key = "description_0"
        else:
            target_image, condition_img = right_img, left_img
            caption_key = "description_1"

        condition_dims = (self.condition_size, self.condition_size)
        target_dims = (self.target_size, self.target_size)
        condition_img = condition_img.resize(condition_dims).convert("RGB")
        target_image = target_image.resize(target_dims).convert("RGB")

        description = item["description"][caption_key]

        # Random dropping; the text draw always happens before the image draw.
        if random.random() < self.drop_text_prob:
            description = ""
        if random.random() < self.drop_image_prob:
            condition_img = Image.new("RGB", condition_dims, (0, 0, 0))

        sample = {
            "image": self.to_tensor(target_image),
            "condition": self.to_tensor(condition_img),
            "condition_type": self.condition_type,
            "description": description,
            # 16 is the downscale factor of the image
            "position_delta": np.array([0, -self.condition_size // 16]),
        }
        if self.return_pil_image:
            sample["pil_image"] = image
        return sample
def _get_canny_edge(self, img):
    """Return a Canny edge map of ``img`` as an RGB image.

    The image is first scaled so that its longer side equals
    ``self.condition_size`` (aspect ratio kept, dimensions truncated to
    integers), converted to grayscale, and passed through ``cv2.Canny`` with
    thresholds 100 and 200.
    """
    # Scale factor that maps the longer edge onto condition_size.
    scale = self.condition_size / max(img.size)
    scaled_width = int(img.size[0] * scale)
    scaled_height = int(img.size[1] * scale)
    resized = img.resize((scaled_width, scaled_height))

    gray = cv2.cvtColor(np.array(resized), cv2.COLOR_RGB2GRAY)
    edge_map = cv2.Canny(gray, 100, 200)
    return Image.fromarray(edge_map).convert("RGB")
class SRBaseDataset(Dataset):
    """Folder-backed dataset of low-resolution / ground-truth image pairs.

    File names are discovered in ``<root_dir>/<lr_dir>``; the matching
    ground-truth file is looked up in ``<root_dir>/<gt_dir>`` under the same
    stem and extension, with ``gt_suffix`` in place of ``lr_suffix``.

    Args:
        root_dir: Directory containing both image folders.
        lr_dir: Name of the low-resolution sub-folder.
        gt_dir: Name of the ground-truth sub-folder.
        lr_suffix: Suffix that low-resolution stems carry before the
            extension (e.g. ``"_x4"``); empty when LR and GT share names.
        gt_suffix: Suffix that ground-truth stems carry before the extension.
    """

    def __init__(self, root_dir, lr_dir='LR', gt_dir='GT', lr_suffix='', gt_suffix=''):
        self.lr_root = os.path.join(root_dir, lr_dir)
        self.gt_root = os.path.join(root_dir, gt_dir)

        # Regular files only; sorted so indices are stable across runs.
        self.filenames = sorted(
            f for f in os.listdir(self.lr_root)
            if os.path.isfile(os.path.join(self.lr_root, f))
        )

        self.lr_suffix = lr_suffix
        self.gt_suffix = gt_suffix

    def __len__(self):
        return len(self.filenames)

    def _resolve_paths(self, idx):
        """Return ``(lr_path, gt_path)`` for the file at position ``idx``."""
        base_name, ext = os.path.splitext(self.filenames[idx])

        # The names come from the LR folder, so a configured lr_suffix is
        # normally already part of the stem. Strip it before rebuilding the
        # two names; otherwise the suffix would be appended a second time.
        if self.lr_suffix and base_name.endswith(self.lr_suffix):
            stem = base_name[: -len(self.lr_suffix)]
        else:
            stem = base_name

        lr_path = os.path.join(self.lr_root, stem + self.lr_suffix + ext)
        gt_path = os.path.join(self.gt_root, stem + self.gt_suffix + ext)
        return lr_path, gt_path

    def __getitem__(self, idx):
        """Open and return the pair at ``idx`` as ``{'lr': ..., 'gt': ...}``."""
        lr_path, gt_path = self._resolve_paths(idx)

        lr = Image.open(lr_path)
        gt = Image.open(gt_path)

        return {'lr': lr, 'gt': gt}
# Marks "no description stored on the item" without excluding a stored None.
_NO_DESCRIPTION = object()


class CartoonDataset(Dataset):
    """Condition/target pairs of cartoon characters with a text prompt.

    Each base item supplies a ``"condition"`` image and a ``"target"`` image.
    The prompt is the item's own ``"description"`` when present; otherwise it
    is composed from the first tag and the ``"target_description"`` fields.
    Text and condition image are dropped at random with the configured
    probabilities.
    """

    # Wording inserted into the generated prompt for each known tag.
    _TAG_PHRASES = {
        "lion": "lion like animal",
        "bear": "bear like animal",
        "gorilla": "gorilla like animal",
        "dog": "dog like animal",
        "elephant": "elephant like animal",
        "eagle": "eagle like bird",
        "tiger": "tiger like animal",
        "owl": "owl like bird",
        "woman": "woman",
        "parrot": "parrot like bird",
        "mouse": "mouse like animal",
        "man": "man",
        "pigeon": "pigeon like bird",
        "girl": "girl",
        "panda": "panda like animal",
        "crocodile": "crocodile like animal",
        "rabbit": "rabbit like animal",
        "boy": "boy",
        "monkey": "monkey like animal",
        "cat": "cat like animal",
    }

    def __init__(
        self,
        base_dataset,
        condition_size: int = 1024,
        target_size: int = 1024,
        image_size: int = 1024,
        padding: int = 0,
        condition_type: str = "cartoon",
        drop_text_prob: float = 0.1,
        drop_image_prob: float = 0.1,
        return_pil_image: bool = False,
    ):
        self.base_dataset = base_dataset
        self.condition_size = condition_size
        self.target_size = target_size
        self.image_size = image_size
        self.padding = padding
        self.condition_type = condition_type
        self.drop_text_prob = drop_text_prob
        self.drop_image_prob = drop_image_prob
        self.return_pil_image = return_pil_image

        self.to_tensor = T.ToTensor()

    def __len__(self):
        return len(self.base_dataset)

    def _build_description(self, data):
        """Return the stored prompt, or compose one from tag and pose data."""
        stored = data.get("description", _NO_DESCRIPTION)
        if stored is not _NO_DESCRIPTION:
            return stored

        # Only reached when the item has no prompt of its own, so an unknown
        # tag or missing pose fields cannot break items that carry one.
        tag = data["tags"][0]
        target_description = data["target_description"]
        return f"Photo of a {self._TAG_PHRASES[tag]} cartoon character in a white background. Character is facing {target_description['facing_direction']}. Character pose is {target_description['pose']}."

    def __getitem__(self, idx):
        """Return the sample dict for base item ``idx``.

        Raises:
            KeyError: If the prompt has to be generated and the tag is not in
                the phrase table or a required field is missing.
        """
        data = self.base_dataset[idx]

        # Resize the images
        condition_img = data["condition"].resize(
            (self.condition_size, self.condition_size)
        ).convert("RGB")
        target_image = data["target"].resize(
            (self.target_size, self.target_size)
        ).convert("RGB")

        description = self._build_description(data)

        # Randomly drop text or image (text draw first, then image draw)
        drop_text = random.random() < self.drop_text_prob
        drop_image = random.random() < self.drop_image_prob
        if drop_text:
            description = ""
        if drop_image:
            condition_img = Image.new(
                "RGB", (self.condition_size, self.condition_size), (0, 0, 0)
            )

        return {
            "image": self.to_tensor(target_image),
            "condition": self.to_tensor(condition_img),
            "condition_type": self.condition_type,
            "description": description,
            # NOTE(review): fixed offset of -16, independent of condition_size
            # -- confirm this is intended.
            "position_delta": np.array([0, -16]),
        }
def configure_optimizers(self):
    """Freeze the transformer, re-enable the LoRA parameters, build the optimizer.

    The optimizer class is chosen by ``self.optimizer_config["type"]``
    (``"AdamW"``, ``"Prodigy"`` or ``"SGD"``) and receives
    ``self.optimizer_config["params"]`` as keyword arguments.

    Returns:
        The optimizer instance over ``self.trainable_params``.

    Raises:
        NotImplementedError: If the configured type is not one of the three
            supported names.
    """
    # Everything is frozen first; only the LoRA parameters are re-enabled.
    self.transformer.requires_grad_(False)
    self.trainable_params = self.lora_layers
    for param in self.trainable_params:
        param.requires_grad_(True)

    config = self.optimizer_config
    optimizer_type = config["type"]

    if optimizer_type == "AdamW":
        return torch.optim.AdamW(self.trainable_params, **config["params"])
    if optimizer_type == "Prodigy":
        return prodigyopt.Prodigy(self.trainable_params, **config["params"])
    if optimizer_type == "SGD":
        return torch.optim.SGD(self.trainable_params, **config["params"])
    raise NotImplementedError
def get_rank():
    """Return the local rank of this process.

    Reads the ``LOCAL_RANK`` environment variable and converts it to ``int``.

    Returns:
        The parsed rank, or ``0`` when the variable is unset or is not a
        valid integer.
    """
    try:
        return int(os.environ.get("LOCAL_RANK"))
    except (TypeError, ValueError):
        # TypeError: variable unset (int(None)); ValueError: not an integer string.
        return 0


def get_config():
    """Load the YAML configuration file named by ``XFL_CONFIG``.

    Returns:
        The object produced by ``yaml.safe_load`` for that file.

    Raises:
        AssertionError: If ``XFL_CONFIG`` is not set.
    """
    config_path = os.environ.get("XFL_CONFIG")
    if config_path is None:
        # Raised explicitly instead of via ``assert`` so the check also runs
        # under ``python -O``; the exception type is unchanged for callers.
        raise AssertionError("Please set the XFL_CONFIG environment variable")
    with open(config_path, "r") as f:
        return yaml.safe_load(f)


def init_wandb(wandb_config, run_name):
    """Start a Weights & Biases run; failures are reported, not raised.

    Args:
        wandb_config: Mapping that provides the ``"project"`` name.
        run_name: Name given to the run.
    """
    import wandb

    try:
        # Explicit check (not ``assert``) so it survives ``python -O`` and the
        # printed failure message says what is missing.
        if os.environ.get("WANDB_API_KEY") is None:
            raise RuntimeError("WANDB_API_KEY is not set")
        wandb.init(
            project=wandb_config["project"],
            name=run_name,
            config={},
        )
    except Exception as e:
        # Best effort: training continues without WandB logging.
        print("Failed to initialize WanDB:", e)


def main():
    """Build the dataset, model and trainer from the config and start training.

    The configuration comes from ``get_config()``; its ``"train"`` section
    selects the dataset type (``subject``, ``img``, ``sr`` or ``cartoon``) and
    provides the dataloader, optimizer and trainer settings.

    Raises:
        NotImplementedError: If the configured dataset type is not supported.
    """
    # Initialize
    rank = get_rank()
    is_main_process = rank == 0
    torch.cuda.set_device(rank)
    config = get_config()
    training_config = config["train"]
    run_name = time.strftime("%Y%m%d-%H%M%S")

    # Initialize WanDB (main process only)
    wandb_config = training_config.get("wandb", None)
    if wandb_config is not None and is_main_process:
        init_wandb(wandb_config, run_name)

    print("Rank:", rank)
    if is_main_process:
        print("Config:", config)

    # Initialize dataset and dataloader
    dataset_config = training_config["dataset"]
    dataset_type = dataset_config["type"]
    if dataset_type == "subject":
        dataset = load_dataset("Yuanshi/Subjects200K")

        # Keep only items whose three quality scores are all >= 5.
        def filter_func(item):
            if not item.get("quality_assessment"):
                return False
            return all(
                item["quality_assessment"].get(key, 0) >= 5
                for key in ["compositeStructure", "objectConsistency", "imageQuality"]
            )

        # Filter dataset. ``exist_ok=True`` avoids the check-then-create race
        # when several ranks reach this point at the same time.
        os.makedirs("./cache/dataset", exist_ok=True)
        data_valid = dataset["train"].filter(
            filter_func,
            num_proc=16,
            cache_file_name="./cache/dataset/data_valid.arrow",
        )
        dataset = Subject200KDataset(
            data_valid,
            condition_size=dataset_config["condition_size"],
            target_size=dataset_config["target_size"],
            image_size=dataset_config["image_size"],
            padding=dataset_config["padding"],
            condition_type=training_config["condition_type"],
            drop_text_prob=dataset_config["drop_text_prob"],
            drop_image_prob=dataset_config["drop_image_prob"],
        )
    elif dataset_type == "img":
        # Load dataset text-to-image-2M
        dataset = load_dataset(
            "webdataset",
            data_files={"train": dataset_config["urls"]},
            split="train",
            cache_dir="cache/t2i2m",
            num_proc=32,
        )
        dataset = ImageConditionDataset(
            dataset,
            condition_size=dataset_config["condition_size"],
            target_size=dataset_config["target_size"],
            condition_type=training_config["condition_type"],
            drop_text_prob=dataset_config["drop_text_prob"],
            drop_image_prob=dataset_config["drop_image_prob"],
            position_scale=dataset_config.get("position_scale", 1.0),
        )
    elif dataset_type == "sr":
        dataset = SRBaseDataset(
            root_dir=dataset_config["path"],
            lr_dir="sr_bicubic",
            gt_dir="gt",
        )
        dataset = SRDataset(
            dataset,
            condition_size=dataset_config["condition_size"],
            target_size=dataset_config["target_size"],
            condition_type=training_config["condition_type"],
            drop_text_prob=dataset_config["drop_text_prob"],
            drop_image_prob=dataset_config["drop_image_prob"],
        )
    elif dataset_type == "cartoon":
        dataset = load_dataset("saquiboye/oye-cartoon", split="train")
        dataset = CartoonDataset(
            dataset,
            condition_size=dataset_config["condition_size"],
            target_size=dataset_config["target_size"],
            image_size=dataset_config["image_size"],
            padding=dataset_config["padding"],
            condition_type=training_config["condition_type"],
            drop_text_prob=dataset_config["drop_text_prob"],
            drop_image_prob=dataset_config["drop_image_prob"],
        )
    else:
        raise NotImplementedError

    print("Dataset length:", len(dataset))
    train_loader = DataLoader(
        dataset,
        batch_size=training_config["batch_size"],
        shuffle=True,
        num_workers=training_config["dataloader_workers"],
    )

    # Initialize model
    trainable_model = OminiModel(
        flux_pipe_id=config["flux_path"],
        lora_config=training_config["lora_config"],
        device="cuda",
        dtype=getattr(torch, config["dtype"]),
        optimizer_config=training_config["optimizer"],
        model_config=config.get("model", {}),
        gradient_checkpointing=training_config.get("gradient_checkpointing", False),
    )

    # Callbacks for logging and saving checkpoints (main process only)
    training_callbacks = (
        [TrainingCallback(run_name, training_config=training_config)]
        if is_main_process
        else []
    )

    # Initialize trainer
    trainer = L.Trainer(
        accumulate_grad_batches=training_config["accumulate_grad_batches"],
        callbacks=training_callbacks,
        enable_checkpointing=False,
        enable_progress_bar=False,
        logger=False,
        max_steps=training_config.get("max_steps", -1),
        max_epochs=training_config.get("max_epochs", -1),
        gradient_clip_val=training_config.get("gradient_clip_val", 0.5),
    )

    # Expose the training config on the trainer object.
    setattr(trainer, "training_config", training_config)

    # Save config
    save_path = training_config.get("save_path", "./output")
    if is_main_process:
        os.makedirs(f"{save_path}/{run_name}")
        with open(f"{save_path}/{run_name}/config.yaml", "w") as f:
            yaml.dump(config, f)

    # Start training
    trainer.fit(trainable_model, train_loader)


if __name__ == "__main__":
    main()
def get_radial_profile(image):
    """Return the radially averaged power spectrum of a 2-D image.

    The centred 2-D power spectrum is binned by the integer-truncated distance
    of each frequency sample from the spectrum centre, and the power in each
    bin is averaged.

    Args:
        image: 2-D array.

    Returns:
        1-D array whose entry ``k`` is the mean power at integer radius ``k``.
    """
    spectrum = fftpack.fftshift(fftpack.fft2(image))
    power = np.abs(spectrum) ** 2
    rows, cols = power.shape
    yy, xx = np.indices((rows, cols))
    radius = np.sqrt((xx - cols / 2) ** 2 + (yy - rows / 2) ** 2).astype(int)
    flat_radius = radius.ravel()
    power_per_ring = np.bincount(flat_radius, weights=power.ravel())
    samples_per_ring = np.bincount(flat_radius)
    # The small epsilon guards against division by zero for an empty ring.
    return power_per_ring / (samples_per_ring + 1e-10)

def compute_slope(freq_range, profile):
    """Return the log-log slope of a power spectrum over selected frequencies.

    Args:
        freq_range: Integer frequencies (all > 0) used both as indices into
            ``profile`` and as the x values of the fit.
        profile: Radial power spectrum.

    Returns:
        Slope of the least-squares line through ``log10(power)`` versus
        ``log10(frequency)``.
    """
    log_power = np.log10(profile[freq_range] + 1e-10)
    slope, _intercept = np.polyfit(np.log10(freq_range), log_power, 1)
    return slope

def enhance_high_freq(profile, center_freq=80, width=40, strength=0.3):
    """Boost a power spectrum around one frequency with a Gaussian-shaped gain.

    Args:
        profile: Radial power spectrum.
        center_freq: Frequency at which the gain peaks.
        width: Standard deviation of the Gaussian, in frequency bins.
        strength: Extra gain at the peak (the peak gain is ``1 + strength``).

    Returns:
        The spectrum multiplied by ``1 + strength * gaussian``.
    """
    bins = np.arange(len(profile))
    bump = np.exp(-((bins - center_freq) ** 2) / (2 * width ** 2))
    return profile * (1 + strength * bump)

def normalize(img):
    """Return ``img`` as float64 with zero mean and (almost exactly) unit std."""
    as_float = img.astype(np.float64)
    centered = as_float - as_float.mean()
    # Epsilon keeps a constant image from dividing by zero.
    return centered / (as_float.std() + 1e-10)
def center_crop(img, target_size):
    """Return the central ``target_size`` x ``target_size`` window of an image.

    Images smaller than the target on either side are first enlarged (LANCZOS)
    so that both sides exceed the target by roughly 10%.

    Args:
        img: Array of shape (H, W) or (H, W, C).
        target_size: Side length of the square crop.

    Returns:
        The cropped array; the channel axis, if present, is kept.
    """
    h, w = img.shape[:2]
    if h < target_size or w < target_size:
        # Too small to crop: upscale first, with a 10% margin.
        scale = max(target_size / h, target_size / w) * 1.1
        new_h, new_w = int(h * scale), int(w * scale)
        img = np.array(Image.fromarray(img).resize((new_w, new_h), Image.LANCZOS))
        h, w = img.shape[:2]

    start_h = (h - target_size) // 2
    start_w = (w - target_size) // 2
    if len(img.shape) == 3:
        return img[start_h:start_h+target_size, start_w:start_w+target_size, :]
    return img[start_h:start_h+target_size, start_w:start_w+target_size]

def load_dataset_images(dataset_path, max_images=None, target_size=512):
    """Load every image under a folder as a centre-cropped grayscale array.

    Args:
        dataset_path: Folder searched recursively for ``IMAGE_EXTENSIONS``.
        max_images: If given and fewer than the number of files found, a
            fixed-seed random subset of this size is used.
        target_size: Side length passed to ``center_crop``.

    Returns:
        List of 2-D grayscale arrays; files that fail to load are skipped.

    Raises:
        ValueError: If no image file is found under ``dataset_path``.
    """
    # Collect all image paths. With recursive=True the '**' component also
    # matches zero directories, so files directly inside dataset_path are
    # found by the same pattern.
    found = set()
    for ext in IMAGE_EXTENSIONS:
        found.update(glob.glob(os.path.join(dataset_path, '**', ext), recursive=True))

    # De-duplicate AND sort: set iteration order depends on string hashing and
    # changes between interpreter runs, which made both the processing order
    # and the "seeded" subsample below non-reproducible.
    image_paths = sorted(found)

    if len(image_paths) == 0:
        raise ValueError(f"在 {dataset_path} 中没有找到图片文件!")

    print(f"找到 {len(image_paths)} 张图片")

    # Limit the number of images
    if max_images is not None and len(image_paths) > max_images:
        # Private generator with the same seed: the draw is reproducible
        # without reseeding NumPy's global random state.
        rng = np.random.RandomState(42)
        image_paths = rng.choice(image_paths, max_images, replace=False).tolist()
        print(f"随机选择 {max_images} 张图片")

    gray_images = []
    for path in tqdm(image_paths, desc="加载图片"):
        try:
            # The context manager closes the file handle once the pixels are read.
            with Image.open(path) as handle:
                img = np.array(handle)
            # Drop the alpha channel of RGBA images
            if len(img.shape) == 3 and img.shape[2] == 4:
                img = img[:, :, :3]
            # Centre crop
            img = center_crop(img, target_size)
            # Convert to grayscale
            if len(img.shape) == 3:
                gray = np.array(Image.fromarray(img).convert('L'))
            else:
                gray = img
            gray_images.append(gray)
        except Exception as e:
            # Deliberate best effort: report the file and keep going.
            print(f" 跳过 {path}: {e}")
            continue

    print(f"成功加载 {len(gray_images)} 张图片")
    return gray_images

def compute_average_spectrum(gray_images):
    """Return the geometric-mean radial power spectrum of several images.

    Args:
        gray_images: Non-empty list of 2-D grayscale arrays of identical size.

    Returns:
        1-D array: the exponential of the mean log power per radius.

    Raises:
        ValueError: If ``gray_images`` is empty.
    """
    if len(gray_images) == 0:
        # Without this check the average of nothing silently becomes NaN.
        raise ValueError("compute_average_spectrum() needs at least one image")

    profiles = []
    for img in tqdm(gray_images, desc="计算功率谱"):
        profile = get_radial_profile(normalize(img))
        profiles.append(profile)

    # Average in the log domain (i.e. a geometric mean of the spectra).
    profiles = np.array(profiles)
    avg_profile = np.exp(np.mean(np.log(profiles + 1e-10), axis=0))

    return avg_profile
# ============================================================
# L2 mock (imitates the spectral distortion caused by an L2 loss)
# ============================================================
def create_l2_mock(sr_image, high_cut=0.22, high_steep=3, low_boost=0.15):
    """Filter an image so that its spectrum resembles an L2-trained SR output.

    Args:
        sr_image: Array of shape (H, W) or (H, W, C); for 3-D input only the
            first three channels are filtered.
        high_cut: Cut-off (in normalised frequency) of the low-pass filter.
        high_steep: Steepness of the low-pass roll-off.
        low_boost: Strength of the attenuation applied around DC.

    Returns:
        For 3-D input: a ``uint8`` array clipped to [0, 255].
        For 2-D input: the unclipped ``float64`` result of
        ``create_l2_mock_channel``.
    """
    if len(sr_image.shape) == 3:
        result = np.zeros_like(sr_image, dtype=np.float64)
        for c in range(3):
            result[:,:,c] = create_l2_mock_channel(sr_image[:,:,c], high_cut, high_steep, low_boost)
        return np.clip(result, 0, 255).astype(np.uint8)
    return create_l2_mock_channel(sr_image, high_cut, high_steep, low_boost)

def create_l2_mock_channel(img, high_cut, high_steep, low_boost):
    """Apply the mock-L2 frequency filter to a single 2-D channel.

    The filter is the product of a low-pass term
    ``1 / (1 + (r / high_cut) ** (2 * high_steep))`` and a term that
    attenuates the lowest frequencies, ``1 - low_boost * exp(-(r / 0.03) ** 2)``,
    where ``r`` is the normalised distance from the spectrum centre.

    Returns:
        ``float64`` array with the same shape as ``img``.
    """
    img = img.astype(np.float64)
    h, w = img.shape
    y, x = np.indices((h, w))
    cy, cx = h/2, w/2
    # Radius in cycles per pixel, measured from the fftshift-ed centre.
    freq_r = np.sqrt(((x-cx)/w)**2 + ((y-cy)/h)**2)
    lowpass = 1 / (1 + (freq_r / high_cut) ** (2 * high_steep))
    low_freq_damage = 1 - low_boost * np.exp(-(freq_r / 0.03)**2)
    filter_combined = lowpass * low_freq_damage
    F = fftpack.fftshift(fftpack.fft2(img))
    F_filtered = F * filter_combined
    result = np.real(fftpack.ifft2(fftpack.ifftshift(F_filtered)))
    return result

# ============================================================
# Main program
# ============================================================
if __name__ == '__main__':

    print("="*70)
    print("Power Spectrum Analysis with Dataset Average")
    print("="*70)
    print(f"\n配置:")
    print(f" 数据集路径: {DATASET_PATH}")
    print(f" SR Ours路径: {SR_OURS_PATH}")
    print(f" SR L2路径: {SR_L2_PATH if SR_L2_PATH else '(自动生成mock)'}")
    print(f" 输出路径: {OUTPUT_PATH}")
    print(f" 目标尺寸: {TARGET_SIZE}")
    print(f" 最大图片数: {MAX_IMAGES if MAX_IMAGES else '全部'}")

    # ============================================================
    # Load the reference dataset and compute its average spectrum
    # ============================================================
    print(f"\n[1] 加载 Natural 数据集: {DATASET_PATH}")
    natural_images = load_dataset_images(DATASET_PATH, MAX_IMAGES, TARGET_SIZE)
    n_natural = len(natural_images)

    print(f"\n[2] 计算 Natural 数据集平均功率谱...")
    profile_natural = compute_average_spectrum(natural_images)

    # Optional high-frequency boost of the reference spectrum
    if ENHANCE_NATURAL_HIGH_FREQ:
        print(f" [*] 对 Natural 功率谱进行高频增强 (center={ENHANCE_CENTER_FREQ}, width={ENHANCE_WIDTH}, strength={ENHANCE_STRENGTH})")
        profile_natural = enhance_high_freq(profile_natural, ENHANCE_CENTER_FREQ, ENHANCE_WIDTH, ENHANCE_STRENGTH)

    # ============================================================
    # Load the "Ours" SR results and compute their average spectrum
    # ============================================================
    print(f"\n[3] 加载 SR Ours 数据集: {SR_OURS_PATH}")
    ours_images = load_dataset_images(SR_OURS_PATH, MAX_IMAGES, TARGET_SIZE)
    n_ours = len(ours_images)

    print(f"\n[4] 计算 SR Ours 平均功率谱...")
    profile_ours = compute_average_spectrum(ours_images)

    # ============================================================
    # Load the L2 SR results (or synthesise them) and compute their spectrum
    # ============================================================
    if SR_L2_PATH is not None:
        print(f"\n[5] 加载 SR L2 数据集: {SR_L2_PATH}")
        l2_images = load_dataset_images(SR_L2_PATH, MAX_IMAGES, TARGET_SIZE)
        n_l2 = len(l2_images)

        print(f"\n[6] 计算 SR L2 平均功率谱...")
        profile_l2 = compute_average_spectrum(l2_images)
    else:
        print(f"\n[5] 自动生成 L2 mock (基于 Ours 数据集)...")
        # ours_images are already grayscale, so create_l2_mock takes its 2-D
        # (single-channel, unclipped float) path for every image.
        l2_profiles = []
        for img in tqdm(ours_images, desc="生成L2 mock"):
            mock_gray = create_l2_mock(img, high_cut=0.18, high_steep=2.2, low_boost=0.03)
            profile = get_radial_profile(normalize(mock_gray))
            l2_profiles.append(profile)
        # Same log-domain (geometric) average as compute_average_spectrum.
        profile_l2 = np.exp(np.mean(np.log(np.array(l2_profiles) + 1e-10), axis=0))
        n_l2 = len(ours_images)

    # ============================================================
    # Spectral slopes
    # ============================================================
    freq = np.arange(len(profile_natural))
    fit_range = np.arange(10, 150)   # frequencies used for the slope fit
    freq_min, freq_max = 4, 180      # frequencies shown in the plot

    slope_natural = compute_slope(fit_range, profile_natural)
    slope_ours = compute_slope(fit_range, profile_ours)
    slope_l2 = compute_slope(fit_range, profile_l2)

    dist_l2 = abs(slope_natural - slope_l2)
    dist_ours = abs(slope_natural - slope_ours)
    ratio = dist_l2 / dist_ours

    print(f"\n[7] 斜率分析:")
    print(f" Natural Dataset (n={n_natural}): slope={slope_natural:.2f}")
    print(f" L² Constraint (n={n_l2}): slope={slope_l2:.2f}")
    print(f" Sobolev (Ours) (n={n_ours}): slope={slope_ours:.2f}")
    print(f" Sobolev is {ratio:.1f}× closer to natural statistics!")

    # ============================================================
    # Plot
    # ============================================================
    print(f"\n[8] 生成图表...")

    fig, ax = plt.subplots(figsize=(4.5, 5.0))
    ax.set_facecolor('white')

    f_range = freq[freq_min:freq_max]
    p_natural = profile_natural[freq_min:freq_max]
    p_l2 = profile_l2[freq_min:freq_max]
    p_ours = profile_ours[freq_min:freq_max]

    # Power transform of the x axis (1.0 leaves it linear)
    power_exp = 1.0
    f_transformed = f_range ** power_exp

    # Spectrum curves
    ax.semilogy(f_transformed, p_natural, color=COLOR_GT, linewidth=1.8,
                label='Natural Dataset', zorder=10)
    ax.semilogy(f_transformed, p_ours, color=COLOR_OURS, linewidth=1.8,
                label='Sobolev (Ours)', zorder=9)
    ax.semilogy(f_transformed, p_l2, color=COLOR_L2, linewidth=1.5, linestyle='--',
                label='L² Constraint', zorder=8)

    # Shade the gap between each method and the reference
    ax.fill_between(f_transformed, p_natural, p_ours, alpha=0.20, color=COLOR_OURS,
                    label='Sobolev Deviation', zorder=2)
    ax.fill_between(f_transformed, p_natural, p_l2, alpha=0.18, color=COLOR_L2,
                    label='L² Deviation', zorder=1)

    # 1/f^2 reference line, anchored to the reference spectrum at f = 30
    ref_line = profile_natural[30] * (30**2) / (f_range**2)
    ax.semilogy(f_transformed, ref_line, color='#555555', linewidth=1.5, linestyle='--',
                alpha=0.9, label=r'Power-Law $\propto 1/f^2$', zorder=5)

    # Highlight the high-frequency region
    high_freq_start = 90 ** power_exp
    high_freq_end = 180 ** power_exp
    ax.axvspan(high_freq_start, high_freq_end, alpha=0.06, color='#888888', zorder=0)

    # Custom x-axis ticks
    tick_freqs = [10, 30, 60, 100, 150]
    tick_positions = [f ** power_exp for f in tick_freqs]
    ax.set_xticks(tick_positions)
    ax.set_xticklabels([str(f) for f in tick_freqs], fontsize=11)
    ax.tick_params(axis='y', labelsize=11)

    ax.set_xlabel(f'Spatial Frequency $f$', fontsize=13)
    ax.set_ylabel('Power Spectral Density $P(f)$', fontsize=13)
    ax.legend(loc='upper right', fontsize=10, framealpha=0.95, edgecolor='#CCCCCC')
    ax.grid(True, which='major', alpha=0.25, linestyle='-', linewidth=0.5)
    ax.grid(True, which='minor', alpha=0.1, linestyle='-', linewidth=0.3)
    ax.set_xlim(f_range[0] ** power_exp, f_range[-1] ** power_exp)

    # Inset bar chart with the slope deviations
    ax_inset = inset_axes(ax, width="26%", height="18%", loc='lower left',
                          bbox_to_anchor=(0.1, 0.05, 1, 1), bbox_transform=ax.transAxes)

    methods = ['L²', 'Ours']
    deviations = [dist_l2, dist_ours]
    colors_bar = [COLOR_L2, COLOR_OURS]

    bars = ax_inset.barh(methods, deviations, color=colors_bar, height=0.5, edgecolor='none')
    ax_inset.set_title('Slope Deviation', fontsize=10, pad=3)
    ax_inset.tick_params(axis='y', labelsize=10, pad=1)
    ax_inset.tick_params(axis='x', labelsize=9)
    ax_inset.set_xlim(0, max(deviations) * 1.18)

    for bar, dev in zip(bars, deviations):
        ax_inset.text(bar.get_width() + 0.02, bar.get_y() + bar.get_height()/2,
                      f'{dev:.2f}', ha='left', va='center', fontsize=10)

    ax_inset.spines['top'].set_visible(False)
    ax_inset.spines['right'].set_visible(False)
    ax_inset.set_facecolor('white')

    plt.tight_layout()
    # savefig does not create missing folders, so make sure the folder of
    # OUTPUT_PATH exists before writing the figure.
    output_dir = os.path.dirname(OUTPUT_PATH)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    plt.savefig(OUTPUT_PATH, dpi=300, bbox_inches='tight', facecolor='white')
    plt.show()
    print(f"Saved: {OUTPUT_PATH}")

    # ============================================================
    # Summary
    # ============================================================
    print("\n" + "="*70)
    print("FIGURE GENERATED")
    print("="*70)
    print(f"""
Quantitative Results:
 Natural images: {n_natural}
 SR Ours images: {n_ours}
 SR L2 images: {n_l2}

 Natural Slope: {slope_natural:.2f}
 L² Slope: {slope_l2:.2f} (delta={dist_l2:.2f})
 Sobolev Slope: {slope_ours:.2f} (delta={dist_ours:.2f})

 Sobolev is {ratio:.1f}x closer to natural image statistics!

Output:
 {OUTPUT_PATH}
""")
    print("="*70)
# Source images and the folder that receives the generated results.
input_folder = "/home/wanghongbo06/diffusion-dpo-test/data_val"
output_folder = "./results-test/dpo-only/lora_60"
os.makedirs(output_folder, exist_ok=True)

# File suffixes treated as images, and the side of the square fed to the model.
_IMAGE_SUFFIXES = (".png", ".jpg", ".jpeg", ".bmp", ".webp")
_SIDE = 512


def _center_square(picture, side):
    """Crop the largest centred square out of *picture* and resize it to side x side."""
    width, height = picture.size
    edge = min(width, height)
    box = (
        (width - edge) // 2,
        (height - edge) // 2,
        (width + edge) // 2,
        (height + edge) // 2,
    )
    return picture.crop(box).resize((side, side), Image.BICUBIC)


# Base FLUX pipeline in bfloat16, moved to the GPU.
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    torch_dtype=torch.bfloat16,
)
pipe = pipe.to("cuda")

# A first (SR) LoRA load/fuse step is disabled in this script; only the
# adapter below is loaded and fused into the base weights.
pipe.load_lora_weights(
    "/home/wanghongbo06/diffusion-dpo-adv/results_1202_4/checkpoint-60/lora_train_unet/adapter_model.safetensors",
    adapter_name="sr2",
)
pipe.fuse_lora(lora_scale=1.0, adapter_names=["sr2"])

prompt = ""

# Walk the input folder in sorted order; entries that are not images are skipped.
for filename in tqdm(sorted(os.listdir(input_folder))):
    if filename.lower().endswith(_IMAGE_SUFFIXES):
        source = Image.open(os.path.join(input_folder, filename)).convert("RGB")
        image = _center_square(source, _SIDE)

        # Build the "sr" condition and reseed before every generation.
        condition = Condition("sr", image)
        seed_everything()
        generated = generate(
            pipe,
            prompt=prompt,
            conditions=[condition],
            default_lora=True,
        )

        # Colour-correct the output against the input, then save it under the
        # same file name.
        result_img = adain_color_fix(generated.images[0], image)
        result_img.save(os.path.join(output_folder, filename))
# Source images and the folder that receives the generated results.
input_folder = "/home/wanghongbo06/diffusion-dpo-test/data_val"
output_folder = "./results-test/dual_lora/checkpoint-105"
os.makedirs(output_folder, exist_ok=True)

# LoRA checkpoint paths
SR_LORA_PATH = "/home/wanghongbo06/baipurui/CKPTs/FLUX_SR/pytorch_lora_weights_v2.safetensors"
# NOTE(review): DPO_LORA_PATH is defined but never loaded below, although the
# output folder is named "dual_lora" -- confirm whether the DPO adapter was
# meant to be applied as well.
DPO_LORA_PATH = "/home/wanghongbo06/diffusion-dpo-adv/results_1202_4/checkpoint-105/lora_train_unet/adapter_model.safetensors"

# Load the base model
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    torch_dtype=torch.bfloat16
).to("cuda")

# Load the SR LoRA, fuse it into the base weights, then drop the adapter.
pipe.load_lora_weights(
    SR_LORA_PATH,
    adapter_name="sr"
)
# Fixed keyword: this was spelled ``adapter_namse``, which is not the
# ``adapter_names`` parameter, so the adapter selection was never passed on.
pipe.fuse_lora(lora_scale=1.0, adapter_names=["sr"])
pipe.unload_lora_weights()
prompt = ""

# Walk the input folder in sorted order
for filename in tqdm(sorted(os.listdir(input_folder))):
    if not filename.lower().endswith((".png", ".jpg", ".jpeg", ".bmp", ".webp")):
        continue

    image_path = os.path.join(input_folder, filename)
    image = Image.open(image_path).convert("RGB")

    # Centre-crop to a square, then resize to 512x512
    w, h = image.size
    min_dim = min(w, h)
    image = image.crop(
        ((w - min_dim) // 2, (h - min_dim) // 2, (w + min_dim) // 2, (h + min_dim) // 2)
    ).resize((512, 512), Image.BICUBIC)

    # Build the condition and generate; reseeding per image keeps runs comparable
    condition = Condition("sr", image)
    seed_everything()

    result_img = generate(
        pipe,
        prompt=prompt,
        conditions=[condition],
        default_lora=True,
    ).images[0]

    # Colour correction against the input image
    result_img = adain_color_fix(result_img, image)

    # Save under the same file name
    result_img.save(os.path.join(output_folder, filename))
"/home/wanghongbo06/diffusion-dpo-test/data_val" +base_output_folder = "./results-test/dpo_scale_ablation" + +# LoRA 路径 +SR_LORA_PATH = "/home/wanghongbo06/baipurui/CKPTs/FLUX_SR/pytorch_lora_weights_v2.safetensors" +DPO_LORA_PATH = "/home/wanghongbo06/diffusion-dpo-adv/results_1202_4/checkpoint-15/lora_train_unet/adapter_model.safetensors" + +# 要测试的 DPO scale 值 +DPO_SCALES = [0.0, 0.01, 0.05, 0.1, 0.2, 0.5, 1.0] + +# 只测试前 N 张图 +MAX_IMAGES = 3 + +# ==================== 加载模型 ==================== +print("=" * 60) +print("加载基础 Flux 模型...") +pipe = FluxPipeline.from_pretrained( + "black-forest-labs/FLUX.1-dev", + torch_dtype=torch.bfloat16 +).to("cuda") + +# 加载两个 LoRA(不融合,方便动态调整权重) +print(f"加载 SR LoRA: {SR_LORA_PATH}") +pipe.load_lora_weights(SR_LORA_PATH, adapter_name="sr") + +print(f"加载 DPO LoRA: {DPO_LORA_PATH}") +pipe.load_lora_weights(DPO_LORA_PATH, adapter_name="dpo") + +print("=" * 60) + +# ==================== 测试不同 scale ==================== +# 获取测试图片列表 +test_images = [] +for filename in sorted(os.listdir(input_folder)): + if filename.lower().endswith((".png", ".jpg", ".jpeg")): + test_images.append(filename) + if len(test_images) >= MAX_IMAGES: + break + +print(f"\n测试 {len(test_images)} 张图片,{len(DPO_SCALES)} 种 DPO scale") +print(f"DPO scales: {DPO_SCALES}") +print("=" * 60) + +for dpo_scale in DPO_SCALES: + output_folder = os.path.join(base_output_folder, f"dpo_scale_{dpo_scale}") + os.makedirs(output_folder, exist_ok=True) + + # 设置权重 + pipe.set_adapters(["sr", "dpo"], adapter_weights=[1.0, dpo_scale]) + print(f"\n>>> DPO scale = {dpo_scale}") + + for filename in tqdm(test_images, desc=f"scale={dpo_scale}"): + image_path = os.path.join(input_folder, filename) + image = Image.open(image_path).convert("RGB") + + # 居中裁剪 + resize 到 512x512 + w, h = image.size + min_dim = min(w, h) + image = image.crop( + ((w - min_dim) // 2, (h - min_dim) // 2, (w + min_dim) // 2, (h + min_dim) // 2) + ).resize((512, 512), Image.BICUBIC) + + condition = Condition("sr", image) + 
# Source images and the folder that receives the generated results.
input_folder = "/home/wanghongbo06/baipurui/DATA/DIV2K-val/gt"
output_folder = "/home/wanghongbo06/diffusion-dpo-test/DIV2K-val/sobolev-400"
os.makedirs(output_folder, exist_ok=True)

# LoRA checkpoints. The DPO adapter comes from a run that saves under
# "lora_dpo" (not "lora_train_unet").
SR_LORA_PATH = "/home/wanghongbo06/baipurui/CKPTs/FLUX_SR/pytorch_lora_weights_v2.safetensors"
DPO_LORA_PATH = "/home/wanghongbo06/diffusion-dpo-adv/results_sobolev_1212_2/checkpoint-400/lora_dpo/adapter_model.safetensors"

# Per-adapter weights passed to set_adapters (e.g. 0.1, 0.5, 1.0, 2.0 for DPO).
SR_LORA_SCALE = 1.0
DPO_LORA_SCALE = 1.0

_RULE = "=" * 60
_IMAGE_SUFFIXES = (".png", ".jpg", ".jpeg", ".bmp", ".webp")


def _center_square(picture, side):
    """Crop the largest centred square out of *picture* and resize it to side x side."""
    width, height = picture.size
    edge = min(width, height)
    box = (
        (width - edge) // 2,
        (height - edge) // 2,
        (width + edge) // 2,
        (height + edge) // 2,
    )
    return picture.crop(box).resize((side, side), Image.BICUBIC)


print(_RULE)
print("加载基础 Flux 模型...")
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    torch_dtype=torch.bfloat16,
)
pipe = pipe.to("cuda")

print(_RULE)
print(f"加载初始超分 LoRA: {SR_LORA_PATH}")
pipe.load_lora_weights(SR_LORA_PATH, adapter_name="sr")
print(" ✅ SR LoRA 已加载")

print(f"加载 DPO LoRA: {DPO_LORA_PATH}")
pipe.load_lora_weights(DPO_LORA_PATH, adapter_name="dpo")
print(" ✅ DPO LoRA 已加载")

# Activate both adapters together with their weights (nothing is fused).
pipe.set_adapters(["sr", "dpo"], adapter_weights=[SR_LORA_SCALE, DPO_LORA_SCALE])
print(f" ✅ 已激活 adapters: sr (scale={SR_LORA_SCALE}), dpo (scale={DPO_LORA_SCALE})")

print(_RULE)
print("模型加载完成!")
print(" - 使用方式: set_adapters (动态加载,不融合)")
print(f" - SR LoRA scale: {SR_LORA_SCALE}")
print(f" - DPO LoRA scale: {DPO_LORA_SCALE}")
print(_RULE)

prompt = ""

# Walk the input folder in sorted order; entries that are not images are skipped.
for filename in tqdm(sorted(os.listdir(input_folder)), desc="生成图像"):
    if filename.lower().endswith(_IMAGE_SUFFIXES):
        source = Image.open(os.path.join(input_folder, filename)).convert("RGB")
        image = _center_square(source, 512)

        # Build the "sr" condition; reseeding per image keeps runs comparable.
        condition = Condition("sr", image)
        seed_everything()
        generated = generate(
            pipe,
            prompt=prompt,
            conditions=[condition],
            default_lora=True,
        )

        # Colour-correct against the input and save under the same file name.
        result_img = adain_color_fix(generated.images[0], image)
        result_img.save(os.path.join(output_folder, filename))

print(f"\n✅ 完成!结果保存在: {output_folder}")
wavelet_color_fix, adain_color_fix +from PIL import Image +from tqdm import tqdm +import time + + +# ============== 配置 ============== +input_folder = "/home/wanghongbo06/baipurui/DATA/Case/0825" +output_folder = "/home/wanghongbo06/baipurui/DATA/Case/0825_r" + +# LoRA 路径 +SR_LORA_PATH = "/home/wanghongbo06/baipurui/OminiControl/runs/20260105-171922/ckpt/800/pytorch_lora_weights.safetensors" + +# 可视化 Adversarial Sample +ADV_LORA_PATH = '/home/wanghongbo06/diffusion-dpo/results/final_lora/adapter_model.safetensors' +DPO_LORA_PATH = ADV_LORA_PATH + + + +# LoRA scale(可以调整权重强度) +SR_LORA_SCALE = 1.0 +DPO_LORA_SCALE = 1.0 + +# 多卡配置 +NUM_GPUS = 4 +# 同时加载模型的最大进程数(设为1表示串行加载,避免I/O瓶颈) +MAX_CONCURRENT_LOAD = 1 +# ================================ + + +def load_pipeline(gpu_id, load_semaphore=None): + """在指定 GPU 上加载 pipeline,使用信号量控制并发加载""" + device = f"cuda:{gpu_id}" + + # 使用信号量控制同时加载模型的进程数 + if load_semaphore is not None: + load_semaphore.acquire() + + try: + print(f"[GPU {gpu_id}] 开始加载模型...") + load_start = time.time() + + pipe = FluxPipeline.from_pretrained( + "black-forest-labs/FLUX.1-dev", + torch_dtype=torch.bfloat16, + local_files_only=True, # 只使用本地缓存,避免网络检查 + ).to(device) + + # 加载 LoRA + pipe.load_lora_weights(SR_LORA_PATH, adapter_name="sr") + pipe.load_lora_weights(DPO_LORA_PATH, adapter_name="dpo") + # 这里不直接固定 LoRA 组合,在采样阶段分别切换: + # - winner 分支:仅使用 SR LoRA + # - adv 分支:使用 SR + DPO(Adversary)LoRA + + load_time = time.time() - load_start + print(f"[GPU {gpu_id}] 模型加载完成,耗时 {load_time:.1f}s") + + finally: + if load_semaphore is not None: + load_semaphore.release() + + return pipe + + +def process_images(gpu_id, image_list, output_folder, load_semaphore, ready_event, start_barrier): + """ + 单个 GPU 上的处理函数 + Args: + gpu_id: GPU 编号 + image_list: 该 GPU 需要处理的图片文件名列表 + output_folder: 输出目录 + load_semaphore: 控制模型加载并发的信号量 + ready_event: 通知主进程模型已加载完成 + start_barrier: 同步所有进程开始推理 + """ + if len(image_list) == 0: + ready_event.set() + start_barrier.wait() + return + + # 加载模型到指定 
GPU(通过信号量控制并发) + pipe = load_pipeline(gpu_id, load_semaphore) + + # 通知主进程该GPU模型已加载完成 + ready_event.set() + + # 等待所有GPU都加载完成后再开始推理 + start_barrier.wait() + + print(f"[GPU {gpu_id}] 开始处理 {len(image_list)} 张图片") + + prompt = "" + + # 只在 GPU 0 上显示主进度条 + pbar = tqdm( + image_list, + desc=f"GPU {gpu_id}", + position=gpu_id, + leave=True, + ncols=100, + bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]' + ) + + for filename in pbar: + image_path = os.path.join(input_folder, filename) + image = Image.open(image_path).convert("RGB") + + # 居中裁剪 + resize 到 512x512 + w, h = image.size + min_dim = min(w, h) + image = image.crop( + ((w - min_dim) // 2, (h - min_dim) // 2, (w + min_dim) // 2, (h + min_dim) // 2) + ).resize((512, 512), Image.BICUBIC) + + # 构造条件(SR 条件) + condition = Condition("sr", image) + + # ======== Coupled Sampling:可视化 Adv 对 sample 的影响 ======== + # 我们用相同的噪声 x0,在两条路径上采样: + # - winner 样本 x1^w:仅使用 SR LoRA + # - adversarial 样本 x1^a:使用 SR + DPO (Adversary) LoRA + # 虽然底层 FLUX 采样是黑盒的,但通过“相同噪声 + 不同 LoRA 组合” + # 可以近似对应你文中描述的: + # x_t^w --> 通过 v_φ 得到 \hat{x}_1^a --> 使用同一 x0 反投影得到 x_t^a + device = pipe._execution_device if hasattr(pipe, "_execution_device") else pipe.device + base_seed = 1 # 如需每张图不同的噪声,可以改为 hash(filename) 等 + + # ---------- winner 分支:仅 SR LoRA ---------- + pipe.set_adapters(["sr"], adapter_weights=[SR_LORA_SCALE]) + gen_winner = torch.Generator(device=device).manual_seed(base_seed) + winner_img = generate( + pipe, + prompt=prompt, + conditions=[condition], + default_lora=True, + generator=gen_winner, + ).images[0] + winner_img = adain_color_fix(winner_img, image) + + # ---------- adversarial 分支:SR + DPO LoRA(Adv) ---------- + pipe.set_adapters(["sr", "dpo"], adapter_weights=[SR_LORA_SCALE, DPO_LORA_SCALE]) + gen_adv = torch.Generator(device=device).manual_seed(base_seed) + adv_img = generate( + pipe, + prompt=prompt, + conditions=[condition], + default_lora=True, + generator=gen_adv, + ).images[0] + adv_img = 
adain_color_fix(adv_img, image) + + # ---------- 可视化拼图:input | winner | adv ---------- + vis_w, vis_h = image.size + # 保证三张都是同一尺寸(安全起见再 resize 一次) + image_vis = image.resize((vis_w, vis_h), Image.BICUBIC) + winner_vis = winner_img.resize((vis_w, vis_h), Image.BICUBIC) + adv_vis = adv_img.resize((vis_w, vis_h), Image.BICUBIC) + + concat = Image.new("RGB", (vis_w * 3, vis_h)) + concat.paste(image_vis, (0, 0)) + concat.paste(winner_vis, (vis_w, 0)) + concat.paste(adv_vis, (vis_w * 2, 0)) + + # 输出文件:一张图中从左到右分别是:Input | Winner (SR) | Adv (SR + DPO) + concat.save(os.path.join(output_folder, filename)) + + print(f"[GPU {gpu_id}] ✅ 完成!") + + +def main(): + os.makedirs(output_folder, exist_ok=True) + + # 获取所有待处理的图片 + all_images = sorted([ + f for f in os.listdir(input_folder) + if f.lower().endswith((".png", ".jpg", ".jpeg", ".bmp", ".webp")) + ]) + + total_images = len(all_images) + print("=" * 60) + print(f"📁 输入目录: {input_folder}") + print(f"📁 输出目录: {output_folder}") + print(f"🖼️ 总图片数: {total_images}") + print(f"🎮 GPU 数量: {NUM_GPUS}") + print(f"📦 每 GPU 处理: ~{total_images // NUM_GPUS} 张") + print(f"⚙️ 模型加载并发数: {MAX_CONCURRENT_LOAD}") + print("=" * 60) + + # 将图片列表平均分配给各个 GPU + image_chunks = [[] for _ in range(NUM_GPUS)] + for i, img in enumerate(all_images): + image_chunks[i % NUM_GPUS].append(img) + + # 记录开始时间 + start_time = time.time() + + # 使用多进程并行处理 + mp.set_start_method('spawn', force=True) + + # 创建信号量来限制同时加载模型的进程数(避免I/O瓶颈) + load_semaphore = mp.Semaphore(MAX_CONCURRENT_LOAD) + + # 创建事件来追踪每个进程的模型加载状态 + ready_events = [mp.Event() for _ in range(NUM_GPUS)] + + # 创建屏障来同步所有进程在加载完成后开始推理 + start_barrier = mp.Barrier(NUM_GPUS) + + processes = [] + + print(f"\n⏳ 开始加载模型(最多 {MAX_CONCURRENT_LOAD} 个并发,避免I/O瓶颈)...") + + for gpu_id in range(NUM_GPUS): + p = mp.Process( + target=process_images, + args=(gpu_id, image_chunks[gpu_id], output_folder, + load_semaphore, ready_events[gpu_id], start_barrier) + ) + p.start() + processes.append(p) + + # 等待所有模型加载完成 + loaded_count = 0 + for i, 
event in enumerate(ready_events): + event.wait() + loaded_count += 1 + print(f" ✅ GPU {i} 就绪 ({loaded_count}/{NUM_GPUS})") + + load_time = time.time() - start_time + print(f"\n⏱️ 模型加载总耗时: {load_time:.1f}s ({load_time/60:.1f} 分钟)") + print("🚀 所有模型加载完成,开始并行推理...\n") + + # 等待所有进程完成 + for p in processes: + p.join() + + # 计算总耗时 + total_time = time.time() - start_time + inference_time = total_time - load_time + avg_time = inference_time / total_images if total_images > 0 else 0 + + print("\n" + "=" * 60) + print(f"🎉 全部完成!") + print(f"📊 处理统计:") + print(f" - 总图片数: {total_images}") + print(f" - 模型加载: {load_time:.1f} 秒 ({load_time/60:.1f} 分钟)") + print(f" - 推理耗时: {inference_time:.1f} 秒 ({inference_time/60:.1f} 分钟)") + print(f" - 总耗时: {total_time:.1f} 秒 ({total_time/60:.1f} 分钟)") + print(f" - 平均每张: {avg_time:.2f} 秒") + print(f" - 吞吐量: {total_images/inference_time*60:.1f} 张/分钟" if inference_time > 0 else "") + print(f"📁 结果保存在: {output_folder}") + print("=" * 60) + + +if __name__ == "__main__": + main() + + + + +# pyiqa psnr ssim lpips musiq clipiqa+ --target /home/wanghongbo06/diffusion-dpo-test/DIV2K-val/sobolev-400 --r /home/wanghongbo06/baipurui/DATA/DIV2K-val/gt \ No newline at end of file diff --git a/diffusion-dpo-test/test_dual_lora_dynamic.py b/diffusion-dpo-test/test_dual_lora_dynamic.py new file mode 100644 index 0000000000000000000000000000000000000000..e23a4ec3205e2db6a03ffd21579a0bc82a522b15 --- /dev/null +++ b/diffusion-dpo-test/test_dual_lora_dynamic.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 +""" +测试脚本:同时加载初始超分 LoRA + DPO 训练的 LoRA(动态权重版) + +这个版本不融合 LoRA,而是使用 set_adapters 动态设置权重 +优点:可以在推理时动态调整两个 LoRA 的权重比例 +""" +import os +os.environ["HF_HOME"] = "/home/wanghongbo06/.cache/huggingface" + +import torch +from diffusers.pipelines import FluxPipeline +from src.flux.condition import Condition +from src.flux.generate import generate, seed_everything +from color_fix import wavelet_color_fix, adain_color_fix +from PIL import Image +from tqdm import tqdm + +# 
==================== 配置 ==================== +input_folder = "/home/wanghongbo06/diffusion-dpo-test/data_val" +output_folder = "./results-test/dual_lora_dynamic" +os.makedirs(output_folder, exist_ok=True) + +# LoRA 路径 +SR_LORA_PATH = "/home/wanghongbo06/baipurui/CKPTs/FLUX_SR/pytorch_lora_weights_v2.safetensors" +DPO_LORA_PATH = "/home/wanghongbo06/diffusion-dpo-adv/results_1202_4/checkpoint-60/lora_train_unet/adapter_model.safetensors" + +# LoRA 权重(可以动态调整!) +SR_WEIGHT = 1.0 +DPO_WEIGHT = 1.0 # 可以尝试 0.5, 1.0, 2.0 等不同值 + +# ==================== 加载模型 ==================== +print("=" * 60) +print("加载基础 Flux 模型...") +pipe = FluxPipeline.from_pretrained( + "black-forest-labs/FLUX.1-dev", + torch_dtype=torch.bfloat16 +).to("cuda") + +# ==================== 方法 2:动态加载(不融合) ==================== +# 同时加载两个 LoRA,使用 set_adapters 设置权重 + +print("=" * 60) +print(f"加载初始超分 LoRA: {SR_LORA_PATH}") +pipe.load_lora_weights(SR_LORA_PATH, adapter_name="sr") +print(" ✅ SR LoRA 已加载") + +print(f"加载 DPO LoRA: {DPO_LORA_PATH}") +pipe.load_lora_weights(DPO_LORA_PATH, adapter_name="dpo") +print(" ✅ DPO LoRA 已加载") + +# 设置两个 adapter 同时生效,并指定各自的权重 +pipe.set_adapters(["sr", "dpo"], adapter_weights=[SR_WEIGHT, DPO_WEIGHT]) +print(f"\n已设置 adapter 权重: sr={SR_WEIGHT}, dpo={DPO_WEIGHT}") +print("=" * 60) + +# ==================== 测试推理 ==================== +prompt = "" + +for filename in tqdm(sorted(os.listdir(input_folder)), desc="生成图像"): + if not filename.lower().endswith((".png", ".jpg", ".jpeg", ".bmp", ".webp")): + continue + + image_path = os.path.join(input_folder, filename) + image = Image.open(image_path).convert("RGB") + + # 居中裁剪 + resize 到 512x512 + w, h = image.size + min_dim = min(w, h) + image = image.crop( + ((w - min_dim) // 2, (h - min_dim) // 2, (w + min_dim) // 2, (h + min_dim) // 2) + ).resize((512, 512), Image.BICUBIC) + + # 构造条件并生成图像 + condition = Condition("sr", image) + seed_everything() + + result_img = generate( + pipe, + prompt=prompt, + conditions=[condition], + 
default_lora=True, + ).images[0] + + # 色彩修复 + result_img = adain_color_fix(result_img, image) + + # 保存结果图像 + result_img.save(os.path.join(output_folder, filename)) + +print(f"\n✅ 完成!结果保存在: {output_folder}") + + +# ==================== 可选:测试不同权重组合 ==================== +def test_different_weights(): + """ + 测试不同的 DPO 权重,看效果差异 + """ + weight_combinations = [ + (1.0, 0.0), # 只有 SR,没有 DPO + (1.0, 0.5), # SR + 0.5x DPO + (1.0, 1.0), # SR + 1x DPO + (1.0, 2.0), # SR + 2x DPO(更强的 DPO 效果) + ] + + for sr_w, dpo_w in weight_combinations: + output_dir = f"./results-test/weight_ablation/sr{sr_w}_dpo{dpo_w}" + os.makedirs(output_dir, exist_ok=True) + + # 动态调整权重 + pipe.set_adapters(["sr", "dpo"], adapter_weights=[sr_w, dpo_w]) + print(f"\n测试权重组合: sr={sr_w}, dpo={dpo_w}") + + # 只测试第一张图 + for filename in sorted(os.listdir(input_folder))[:1]: + if not filename.lower().endswith((".png", ".jpg", ".jpeg")): + continue + + image_path = os.path.join(input_folder, filename) + image = Image.open(image_path).convert("RGB") + + w, h = image.size + min_dim = min(w, h) + image = image.crop( + ((w - min_dim) // 2, (h - min_dim) // 2, (w + min_dim) // 2, (h + min_dim) // 2) + ).resize((512, 512), Image.BICUBIC) + + condition = Condition("sr", image) + seed_everything() + + result_img = generate( + pipe, + prompt="", + conditions=[condition], + default_lora=True, + ).images[0] + + result_img = adain_color_fix(result_img, image) + result_img.save(os.path.join(output_dir, filename)) + print(f" 保存到: {output_dir}/{filename}") + +# 取消注释下面这行来运行权重消融实验 +# test_different_weights() + diff --git a/diffusion-dpo-test/test_dual_lora_multi_gpus.py b/diffusion-dpo-test/test_dual_lora_multi_gpus.py new file mode 100644 index 0000000000000000000000000000000000000000..7c1a57f4cb45b5a96671adfaef09308578705ee0 --- /dev/null +++ b/diffusion-dpo-test/test_dual_lora_multi_gpus.py @@ -0,0 +1,234 @@ +import os +os.environ["HF_HOME"] = "/home/wanghongbo06/.cache/huggingface" + +import torch +import torch.multiprocessing 
as mp +from diffusers.pipelines import FluxPipeline +from src.flux.condition import Condition +from src.flux.generate import generate, seed_everything +from color_fix import wavelet_color_fix, adain_color_fix +from PIL import Image +from tqdm import tqdm +import time + + +# ============== 配置 ============== +input_folder = "/home/wanghongbo06/baipurui/DATA/LSDIR/65/lr" +output_folder = "/home/wanghongbo06/baipurui/DATA/LSDIR/65/result" + +# LoRA 路径 +# SR_LORA_PATH = "/home/wanghongbo06/baipurui/CKPTs/FLUX_SR/pytorch_lora_weights_v2.safetensors" +SR_LORA_PATH = "/home/wanghongbo06/baipurui/OminiControl/runs/20260105-171922/ckpt/800/pytorch_lora_weights.safetensors" +# DPO_LORA_PATH = "/home/wanghongbo06/diffusion-dpo-adv/results/results_sobolev_20251212_2/checkpoint-400/lora_dpo/adapter_model.safetensors" +DPO_LORA_PATH = "/home/wanghongbo06/diffusion-dpo-adv/results/results_sobolev_20260107_1356/checkpoint-500/lora_dpo/adapter_model.safetensors" + +# 可视化 Adversarial Sample +# ADV_LORA_PATH = '/home/wanghongbo06/diffusion-dpo/results/final_lora/adapter_model.safetensors' +# DPO_LORA_PATH = ADV_LORA_PATH + + + +# LoRA scale(可以调整权重强度) +SR_LORA_SCALE = 1.0 +DPO_LORA_SCALE = 1.0 + +# 多卡配置 +NUM_GPUS = 1 +# 同时加载模型的最大进程数(设为1表示串行加载,避免I/O瓶颈) +MAX_CONCURRENT_LOAD = 1 +# ================================ + + +def load_pipeline(gpu_id, load_semaphore=None): + """在指定 GPU 上加载 pipeline,使用信号量控制并发加载""" + device = f"cuda:{gpu_id}" + + # 使用信号量控制同时加载模型的进程数 + if load_semaphore is not None: + load_semaphore.acquire() + + try: + print(f"[GPU {gpu_id}] 开始加载模型...") + load_start = time.time() + pipe = FluxPipeline.from_pretrained( + '/home/wanghongbo06/baipurui/.cache/huggingface/hub/models--black-forest-labs--FLUX.1-dev/snapshots/3de623fc3c33e44ffbe2bad470d0f45bccf2eb21', + torch_dtype=torch.bfloat16, + token="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", + local_files_on=True, + catch_dir=".cache/flux-sr" + ).to(device) + + # 加载 LoRA + pipe.load_lora_weights(SR_LORA_PATH, adapter_name="sr") + 
pipe.load_lora_weights(DPO_LORA_PATH, adapter_name="dpo") + pipe.set_adapters(["sr", "dpo"], adapter_weights=[SR_LORA_SCALE, DPO_LORA_SCALE]) + + load_time = time.time() - load_start + print(f"[GPU {gpu_id}] 模型加载完成,耗时 {load_time:.1f}s") + + finally: + if load_semaphore is not None: + load_semaphore.release() + + return pipe + + +def process_images(gpu_id, image_list, output_folder, load_semaphore, ready_event, start_barrier): + """ + 单个 GPU 上的处理函数 + Args: + gpu_id: GPU 编号 + image_list: 该 GPU 需要处理的图片文件名列表 + output_folder: 输出目录 + load_semaphore: 控制模型加载并发的信号量 + ready_event: 通知主进程模型已加载完成 + start_barrier: 同步所有进程开始推理 + """ + if len(image_list) == 0: + ready_event.set() + start_barrier.wait() + return + + # 加载模型到指定 GPU(通过信号量控制并发) + pipe = load_pipeline(gpu_id, load_semaphore) + + # 通知主进程该GPU模型已加载完成 + ready_event.set() + + # 等待所有GPU都加载完成后再开始推理 + start_barrier.wait() + + print(f"[GPU {gpu_id}] 开始处理 {len(image_list)} 张图片") + + prompt = "" + + # 只在 GPU 0 上显示主进度条 + pbar = tqdm( + image_list, + desc=f"GPU {gpu_id}", + position=gpu_id, + leave=True, + ncols=100, + bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]' + ) + + for filename in pbar: + image_path = os.path.join(input_folder, filename) + image = Image.open(image_path).convert("RGB") + + # 居中裁剪 + resize 到 512x512 + w, h = image.size + min_dim = min(w, h) + image = image.crop( + ((w - min_dim) // 2, (h - min_dim) // 2, (w + min_dim) // 2, (h + min_dim) // 2) + ).resize((512, 512), Image.BICUBIC) + + # 构造条件并生成图像 + condition = Condition("sr", image) + seed_everything(1) + + result_img = generate( + pipe, + prompt=prompt, + conditions=[condition], + default_lora=True, + ).images[0] + + result_img = adain_color_fix(result_img, image) + result_img.save(os.path.join(output_folder, filename)) + + print(f"[GPU {gpu_id}] ✅ 完成!") + + +def main(): + os.makedirs(output_folder, exist_ok=True) + + # 获取所有待处理的图片 + all_images = sorted([ + f for f in os.listdir(input_folder) + if f.lower().endswith((".png", 
".jpg", ".jpeg", ".bmp", ".webp")) + ]) + + total_images = len(all_images) + print("=" * 60) + print(f"📁 输入目录: {input_folder}") + print(f"📁 输出目录: {output_folder}") + print(f"🖼️ 总图片数: {total_images}") + print(f"🎮 GPU 数量: {NUM_GPUS}") + print(f"📦 每 GPU 处理: ~{total_images // NUM_GPUS} 张") + print(f"⚙️ 模型加载并发数: {MAX_CONCURRENT_LOAD}") + print("=" * 60) + + # 将图片列表平均分配给各个 GPU + image_chunks = [[] for _ in range(NUM_GPUS)] + for i, img in enumerate(all_images): + image_chunks[i % NUM_GPUS].append(img) + + # 记录开始时间 + start_time = time.time() + + # 使用多进程并行处理 + mp.set_start_method('spawn', force=True) + + # 创建信号量来限制同时加载模型的进程数(避免I/O瓶颈) + load_semaphore = mp.Semaphore(MAX_CONCURRENT_LOAD) + + # 创建事件来追踪每个进程的模型加载状态 + ready_events = [mp.Event() for _ in range(NUM_GPUS)] + + # 创建屏障来同步所有进程在加载完成后开始推理 + start_barrier = mp.Barrier(NUM_GPUS) + + processes = [] + + print(f"\n⏳ 开始加载模型(最多 {MAX_CONCURRENT_LOAD} 个并发,避免I/O瓶颈)...") + + for gpu_id in range(NUM_GPUS): + p = mp.Process( + target=process_images, + args=(gpu_id, image_chunks[gpu_id], output_folder, + load_semaphore, ready_events[gpu_id], start_barrier) + ) + p.start() + processes.append(p) + + # 等待所有模型加载完成 + loaded_count = 0 + for i, event in enumerate(ready_events): + event.wait() + loaded_count += 1 + print(f" ✅ GPU {i} 就绪 ({loaded_count}/{NUM_GPUS})") + + load_time = time.time() - start_time + print(f"\n⏱️ 模型加载总耗时: {load_time:.1f}s ({load_time/60:.1f} 分钟)") + print("🚀 所有模型加载完成,开始并行推理...\n") + + # 等待所有进程完成 + for p in processes: + p.join() + + # 计算总耗时 + total_time = time.time() - start_time + inference_time = total_time - load_time + avg_time = inference_time / total_images if total_images > 0 else 0 + + print("\n" + "=" * 60) + print(f"🎉 全部完成!") + print(f"📊 处理统计:") + print(f" - 总图片数: {total_images}") + print(f" - 模型加载: {load_time:.1f} 秒 ({load_time/60:.1f} 分钟)") + print(f" - 推理耗时: {inference_time:.1f} 秒 ({inference_time/60:.1f} 分钟)") + print(f" - 总耗时: {total_time:.1f} 秒 ({total_time/60:.1f} 分钟)") + print(f" - 平均每张: 
{avg_time:.2f} 秒") + print(f" - 吞吐量: {total_images/inference_time*60:.1f} 张/分钟" if inference_time > 0 else "") + print(f"📁 结果保存在: {output_folder}") + print("=" * 60) + + +if __name__ == "__main__": + main() + + + + +# pyiqa psnr ssim lpips musiq clipiqa+ --target /home/wanghongbo06/diffusion-dpo-test/DIV2K-val/sobolev-400 --r /home/wanghongbo06/baipurui/DATA/DIV2K-val/gt \ No newline at end of file diff --git a/diffusion-dpo-test/test_flops.py b/diffusion-dpo-test/test_flops.py new file mode 100644 index 0000000000000000000000000000000000000000..639c5a0ce05546795a002196bb9c055546efb7b0 --- /dev/null +++ b/diffusion-dpo-test/test_flops.py @@ -0,0 +1,756 @@ +import os +os.environ["HF_HOME"] = "/home/wanghongbo06/.cache/huggingface" + +import torch +import torch.multiprocessing as mp +from diffusers.pipelines import FluxPipeline +from src.flux.condition import Condition +from src.flux.generate import generate, seed_everything +from color_fix import wavelet_color_fix, adain_color_fix +from PIL import Image +from tqdm import tqdm +import time +import numpy as np +from dataclasses import dataclass +from typing import List, Dict, Any + + +# ============== 配置 ============== +input_folder = "/home/wanghongbo06/baipudui/DATA/DIV2K/DIV2K-val-epoch1/lr" +output_folder = "/home/wanghongbo06/baipurui/results/flops" + +# LoRA 路径 +# SR_LORA_PATH = "/home/wanghongbo06/baipurui/CKPTs/FLUX_SR/pytorch_lora_weights_v2.safetensors" +SR_LORA_PATH = "/home/wanghongbo06/baipurui/OminiControl/runs/20260105-171922/ckpt/800/pytorch_lora_weights.safetensors" +# DPO_LORA_PATH = "/home/wanghongbo06/diffusion-dpo-adv/results/results_sobolev_20251212_2/checkpoint-400/lora_dpo/adapter_model.safetensors" +DPO_LORA_PATH = "/home/wanghongbo06/diffusion-dpo-adv/results/results_sobolev_20260107_1356/checkpoint-500/lora_dpo/adapter_model.safetensors" + +# 可视化 Adversarial Sample +# ADV_LORA_PATH = '/home/wanghongbo06/diffusion-dpo/results/final_lora/adapter_model.safetensors' +# DPO_LORA_PATH = 
ADV_LORA_PATH + + + +# LoRA scale(可以调整权重强度) +SR_LORA_SCALE = 1.0 +DPO_LORA_SCALE = 1.0 + +# 多卡配置 +NUM_GPUS = 1 +# 同时加载模型的最大进程数(设为1表示串行加载,避免I/O瓶颈) +MAX_CONCURRENT_LOAD = 1 + +# 性能测试配置 +WARMUP_IMAGES = 10 # 预热图片数(不计入统计) +ENABLE_PROFILING = True # 是否启用详细性能分析 +# ================================ + + +@dataclass +class PerformanceMetrics: + """性能指标数据类""" + gpu_id: int + inference_times: List[float] # 每张图的推理时间(不含预热) + warmup_time: float # 预热时间 + peak_memory_mb: float # 显存峰值 (MB) + allocated_memory_mb: float # 实际分配显存 (MB) + reserved_memory_mb: float # 保留显存 (MB) + total_images: int # 处理的总图片数 + + @property + def avg_inference_time(self) -> float: + """平均推理时间(不含预热)""" + if len(self.inference_times) == 0: + return 0.0 + return np.mean(self.inference_times) + + @property + def std_inference_time(self) -> float: + """推理时间标准差""" + if len(self.inference_times) < 2: + return 0.0 + return np.std(self.inference_times) + + @property + def throughput(self) -> float: + """吞吐量(图片/秒)""" + if len(self.inference_times) == 0: + return 0.0 + total_time = sum(self.inference_times) + return len(self.inference_times) / total_time if total_time > 0 else 0.0 + + @property + def memory_efficiency(self) -> float: + """显存效率 = 实际分配 / 保留显存""" + if self.reserved_memory_mb == 0: + return 0.0 + return self.allocated_memory_mb / self.reserved_memory_mb * 100 + + +def estimate_model_flops(pipe, height=512, width=512, num_inference_steps=28): + """ + 修正后的 FLOPs 估算函数 (针对 Flux 架构优化) + """ + try: + from fvcore.nn import FlopCountAnalysis, flop_count_str + + # 1. 获取正确的维度信息 + transformer = pipe.transformer + config = transformer.config + + # Flux 特定的维度参数 + num_heads = config.num_attention_heads + head_dim = config.attention_head_dim + hidden_size = num_heads * head_dim # 通常是 3072 + + # 2. 计算 Latent 空间的分辨率 + # Flux 使用的 VAE 通常由 8x 下采样,patch size 为 1 或 2 + # 这里假设 input 是 latent,Sequence length = (H/16) * (W/16) * Time_ids ? + # Flux 处理 patch 后的 latent。Standard latent is H/8, W/8. + # Then patched to 2x2? 
Let's assume standard packed sequence length. + # 对于 512x512 图片 -> Latent 64x64 = 4096 tokens. + packed_seq_len = (height // 8) * (width // 8) // 4 # Flux patch_size=2 implies /2 on each dim? + # 更安全的做法:直接取 4096 (针对 512x512) 或根据实际 latent 形状 + # Flux 的 latent 是 H/8, W/8. Flatten 后是 4096. + seq_len = (height // 8) * (width // 8) + + print(f"DEBUG: Estimating with Hidden Size: {hidden_size}, Seq Len: {seq_len}") + + device = next(transformer.parameters()).device + dtype = next(transformer.parameters()).dtype + + # 3. 构造正确维度的 Dummy Inputs + # 注意:Flux forward 需要正确的 img_ids 和 txt_ids 才能跑通, + # 为了避免构造复杂的 IDs 导致报错,我们这里只针对主要的 Linear 层进行 Hook, + # 或者尝试构造尽可能真实的输入。 + + dummy_hidden_states = torch.randn(1, seq_len, hidden_size, device=device, dtype=dtype) + + # Encoder hidden states (T5/CLIP text embeddings) + # Flux text context length is usually 512 + dummy_encoder_hidden_states = torch.randn(1, 512, hidden_size, device=device, dtype=dtype) + + # Pooled projections + dummy_pooled = torch.randn(1, 768, device=device, dtype=dtype) + + # Timestep + dummy_timestep = torch.tensor([500], device=device, dtype=dtype) # half precision + + # Flux 需要 img_ids 来计算 RoPE,如果传 None 可能会报错或跳过计算 + # 这里尝试只传必要的 args。如果 fvcore 报错,可能需要手动计算 Linear 层的 flops + + inputs = ( + dummy_hidden_states, + dummy_encoder_hidden_states, + dummy_pooled, + dummy_timestep, + # img_ids, txt_ids, guidance 通常可以为 None 或跳过,取决于具体实现 + # 如果报错,需要补全这些参数 + ) + + # 4. 运行分析 + # 忽略未调用的参数警告 + flops_analysis = FlopCountAnalysis(transformer, inputs) + + # 强制忽略未使用的算子警告 + flops_analysis.unsupported_ops_warnings(False) + + single_forward_flops = flops_analysis.total() + + # 5. 
加上 VAE 的估算 (粗略估算,通常 VAE 约占总量的 5-10% 或更少,但在 SR 中不能完全忽略) + # 这里为了保守,只算 Transformer,但在报告中注明 "Transformer Only" + + total_flops = single_forward_flops * num_inference_steps + + print(f"DEBUG: Single step FLOPs: {single_forward_flops/1e12:.4f} TFLOPs") + return total_flops, "fvcore (Transformer Only)" + + except Exception as e: + print(f"fvcore FLOPs 估算失败: {e}") + # 回退到理论计算 (Theoretical Calculation for Transformer) + # Kaplan Scaling Laws approx: 6 * N * D_model^2 * Seq_len ??? + # 这里的备用方案应该更科学一点 + + # 简单的 Transformer FLOPs 理论公式: + # FLOPs per token ≈ 72 * (d_model ^ 2) (包含 attention 和 FFN) ? + # 更准确的近似: + # FLOPs = 24 * B * S * H^2 + 4 * B * S^2 * H (Attention + FFN) + + try: + config = pipe.transformer.config + H = config.num_attention_heads * config.attention_head_dim + L = config.num_layers + S = (height // 8) * (width // 8) + + # 这是一个非常粗略的 Transformer 理论计算 + # 1. Linear Layers (Q,K,V, Out, MLP up, MLP down) + # 每一层通常有 4个投影 (Attn) + 3个投影 (MLP)? Flux 是 MMDiT 结构更复杂 + # 保守估计:每层参数量 P_layer. FLOPs ≈ 2 * P_layer * S + + total_params = sum(p.numel() for p in pipe.transformer.parameters()) + # Transformer FLOPs ≈ 2 * Params * Sequence_Length + theoretical_flops = 2 * total_params * S * num_inference_steps + + return theoretical_flops, "Theoretical (2*Params*SeqLen)" + except: + return 0, "failed" + +def profile_single_inference(pipe, image, prompt, condition, device): + """ + 对单次推理进行详细的性能分析 + """ + # 确保在正确的设备上操作 + device_id = int(device.split(':')[1]) if isinstance(device, str) else device + + torch.cuda.reset_peak_memory_stats(device) + + # 使用 with torch.cuda.device 确保事件在正确的设备上创建 + with torch.cuda.device(device_id): + torch.cuda.synchronize() + + # 使用 time.perf_counter 作为更可靠的计时方式(多GPU兼容) + start_time = time.perf_counter() + + result_img = generate( + pipe, + prompt=prompt, + conditions=[condition], + default_lora=True, + ).images[0] + + torch.cuda.synchronize() + end_time = time.perf_counter() + + inference_time = end_time - start_time + + # 获取显存信息 + peak_memory = 
torch.cuda.max_memory_allocated(device) / (1024 ** 2) # MB + allocated_memory = torch.cuda.memory_allocated(device) / (1024 ** 2) # MB + reserved_memory = torch.cuda.memory_reserved(device) / (1024 ** 2) # MB + + return result_img, inference_time, peak_memory, allocated_memory, reserved_memory + + +def load_pipeline(gpu_id, load_semaphore=None): + """在指定 GPU 上加载 pipeline,使用信号量控制并发加载""" + device = f"cuda:{gpu_id}" + + # 显式设置当前进程使用的 GPU + torch.cuda.set_device(gpu_id) + + # 使用信号量控制同时加载模型的进程数 + if load_semaphore is not None: + load_semaphore.acquire() + + try: + print(f"[GPU {gpu_id}] 开始加载模型...") + load_start = time.time() + pipe = FluxPipeline.from_pretrained( + '/home/wanghongbo06/baipurui/.cache/huggingface/hub/models--black-forest-labs--FLUX.1-dev/snapshots/3de623fc3c33e44ffbe2bad470d0f45bccf2eb21', + torch_dtype=torch.bfloat16, + token="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", + local_files_on=True, + catch_dir=".cache/flux-sr" + ).to(device) + + # 加载 LoRA + pipe.load_lora_weights(SR_LORA_PATH, adapter_name="sr") + pipe.load_lora_weights(DPO_LORA_PATH, adapter_name="dpo") + pipe.set_adapters(["sr", "dpo"], adapter_weights=[SR_LORA_SCALE, DPO_LORA_SCALE]) + + load_time = time.time() - load_start + print(f"[GPU {gpu_id}] 模型加载完成,耗时 {load_time:.1f}s") + + finally: + if load_semaphore is not None: + load_semaphore.release() + + return pipe + + +def process_images(gpu_id, image_list, output_folder, load_semaphore, ready_event, start_barrier, metrics_dict=None): + """ + 单个 GPU 上的处理函数 + Args: + gpu_id: GPU 编号 + image_list: 该 GPU 需要处理的图片文件名列表 + output_folder: 输出目录 + load_semaphore: 控制模型加载并发的信号量 + ready_event: 通知主进程模型已加载完成 + start_barrier: 同步所有进程开始推理 + metrics_dict: 用于存储性能指标的共享字典 + """ + try: + if len(image_list) == 0: + ready_event.set() + start_barrier.wait() + return + + device = f"cuda:{gpu_id}" + + # 显式设置当前进程使用的 GPU(在子进程开始时设置) + torch.cuda.set_device(gpu_id) + + # 加载模型到指定 GPU(通过信号量控制并发) + pipe = load_pipeline(gpu_id, load_semaphore) + + # 通知主进程该GPU模型已加载完成 + 
ready_event.set() + + # 等待所有GPU都加载完成后再开始推理 + start_barrier.wait() + + print(f"[GPU {gpu_id}] 开始处理 {len(image_list)} 张图片") + + prompt = "" + + # 性能统计变量 + inference_times = [] + warmup_time = 0.0 + peak_memory_mb = 0.0 + allocated_memory_mb = 0.0 + reserved_memory_mb = 0.0 + + # 重置显存统计 + torch.cuda.reset_peak_memory_stats(device) + + # 只在 GPU 0 上显示主进度条 + pbar = tqdm( + enumerate(image_list), + total=len(image_list), + desc=f"GPU {gpu_id}", + position=gpu_id, + leave=True, + ncols=120, + bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]' + ) + + for idx, filename in pbar: + image_path = os.path.join(input_folder, filename) + image = Image.open(image_path).convert("RGB") + + # 居中裁剪 + resize 到 512x512 + w, h = image.size + min_dim = min(w, h) + image = image.crop( + ((w - min_dim) // 2, (h - min_dim) // 2, (w + min_dim) // 2, (h + min_dim) // 2) + ).resize((512, 512), Image.BICUBIC) + + # 构造条件并生成图像 + condition = Condition("sr", image) + seed_everything(1) + + # 使用精确计时进行推理 + result_img, inf_time, peak_mem, alloc_mem, reserved_mem = profile_single_inference( + pipe, image, prompt, condition, device + ) + + # 更新显存峰值 + peak_memory_mb = max(peak_memory_mb, peak_mem) + allocated_memory_mb = alloc_mem + reserved_memory_mb = reserved_mem + + # 区分预热和正式推理 + if idx < WARMUP_IMAGES: + warmup_time += inf_time + pbar.set_postfix({ + 'warmup': f'{inf_time:.2f}s', + 'mem': f'{peak_mem:.0f}MB' + }) + else: + inference_times.append(inf_time) + avg_time = np.mean(inference_times) + pbar.set_postfix({ + 'time': f'{inf_time:.2f}s', + 'avg': f'{avg_time:.2f}s', + 'mem': f'{peak_mem:.0f}MB' + }) + + result_img = adain_color_fix(result_img, image) + result_img.save(os.path.join(output_folder, filename)) + + # 获取最终显存统计 + final_peak_memory = torch.cuda.max_memory_allocated(device) / (1024 ** 2) + final_allocated = torch.cuda.memory_allocated(device) / (1024 ** 2) + final_reserved = torch.cuda.memory_reserved(device) / (1024 ** 2) + + # 创建性能指标对象 + metrics = 
PerformanceMetrics( + gpu_id=gpu_id, + inference_times=inference_times, + warmup_time=warmup_time, + peak_memory_mb=max(peak_memory_mb, final_peak_memory), + allocated_memory_mb=final_allocated, + reserved_memory_mb=final_reserved, + total_images=len(image_list) + ) + + # 存储到共享字典 + if metrics_dict is not None: + metrics_dict[gpu_id] = { + 'inference_times': inference_times, + 'warmup_time': warmup_time, + 'peak_memory_mb': metrics.peak_memory_mb, + 'allocated_memory_mb': metrics.allocated_memory_mb, + 'reserved_memory_mb': metrics.reserved_memory_mb, + 'total_images': len(image_list), + 'avg_inference_time': metrics.avg_inference_time, + 'std_inference_time': metrics.std_inference_time, + 'throughput': metrics.throughput, + 'memory_efficiency': metrics.memory_efficiency + } + + # 打印单 GPU 性能摘要 + print(f"\n[GPU {gpu_id}] ✅ 完成!") + print(f" 📊 性能摘要:") + print(f" - 处理图片数: {len(image_list)} (预热: {WARMUP_IMAGES}, 统计: {len(inference_times)})") + print(f" - 预热时间: {warmup_time:.2f}s") + print(f" - 平均推理时间: {metrics.avg_inference_time:.3f}s ± {metrics.std_inference_time:.3f}s") + print(f" - 吞吐量: {metrics.throughput:.2f} 图/秒") + print(f" - 显存峰值: {metrics.peak_memory_mb:.1f} MB") + print(f" - 显存效率: {metrics.memory_efficiency:.1f}%") + + except Exception as e: + print(f"\n[GPU {gpu_id}] ❌ 错误: {e}") + import traceback + traceback.print_exc() + # 确保事件被设置,避免死锁 + ready_event.set() + raise + + +def print_performance_report(metrics_dict: Dict[int, Dict], load_time: float, total_time: float, total_images: int): + """ + 打印详细的性能报告 + """ + print("\n" + "=" * 70) + print(" 📊 详细性能报告") + print("=" * 70) + + # 汇总所有 GPU 的数据 + all_inference_times = [] + total_warmup_time = 0.0 + max_peak_memory = 0.0 + total_allocated_memory = 0.0 + total_reserved_memory = 0.0 + + for gpu_id, metrics in sorted(metrics_dict.items()): + all_inference_times.extend(metrics['inference_times']) + total_warmup_time += metrics['warmup_time'] + max_peak_memory = max(max_peak_memory, metrics['peak_memory_mb']) + 
total_allocated_memory += metrics['allocated_memory_mb'] + total_reserved_memory += metrics['reserved_memory_mb'] + + # ============== 1. 推理时间统计 ============== + print("\n🕐 推理时间统计:") + print("-" * 50) + + if len(all_inference_times) > 0: + avg_time = np.mean(all_inference_times) + std_time = np.std(all_inference_times) + min_time = np.min(all_inference_times) + max_time = np.max(all_inference_times) + median_time = np.median(all_inference_times) + p95_time = np.percentile(all_inference_times, 95) + p99_time = np.percentile(all_inference_times, 99) + + print(f" 统计图片数: {len(all_inference_times)} (排除预热 {WARMUP_IMAGES * len(metrics_dict)} 张)") + print(f" 平均推理时间: {avg_time:.4f} 秒/张") + print(f" 标准差: {std_time:.4f} 秒") + print(f" 最小值: {min_time:.4f} 秒") + print(f" 最大值: {max_time:.4f} 秒") + print(f" 中位数: {median_time:.4f} 秒") + print(f" P95: {p95_time:.4f} 秒") + print(f" P99: {p99_time:.4f} 秒") + print(f" 预热总时间: {total_warmup_time:.2f} 秒") + else: + print(" ⚠️ 没有有效的推理时间数据") + + # ============== 2. 吞吐量统计 ============== + print("\n⚡ 吞吐量 (Throughput):") + print("-" * 50) + + if len(all_inference_times) > 0: + total_inference_time = sum(all_inference_times) + throughput_per_sec = len(all_inference_times) / total_inference_time if total_inference_time > 0 else 0 + throughput_per_min = throughput_per_sec * 60 + throughput_per_hour = throughput_per_sec * 3600 + + # 多卡并行吞吐量(wall-clock time) + inference_wall_time = total_time - load_time + parallel_throughput_sec = total_images / inference_wall_time if inference_wall_time > 0 else 0 + parallel_throughput_min = parallel_throughput_sec * 60 + + print(f" 单 GPU 吞吐量:") + print(f" - {throughput_per_sec:.3f} 图/秒") + print(f" - {throughput_per_min:.1f} 图/分钟") + print(f" - {throughput_per_hour:.0f} 图/小时") + print(f" {len(metrics_dict)} GPU 并行吞吐量 (wall-clock):") + print(f" - {parallel_throughput_sec:.3f} 图/秒") + print(f" - {parallel_throughput_min:.1f} 图/分钟") + + # ============== 3. 
显存统计 ============== + print("\n💾 显存 (GPU Memory):") + print("-" * 50) + + for gpu_id, metrics in sorted(metrics_dict.items()): + print(f" GPU {gpu_id}:") + print(f" - 显存峰值: {metrics['peak_memory_mb']:.1f} MB ({metrics['peak_memory_mb']/1024:.2f} GB)") + print(f" - 实际分配: {metrics['allocated_memory_mb']:.1f} MB") + print(f" - 保留显存: {metrics['reserved_memory_mb']:.1f} MB") + print(f" - 显存效率: {metrics['memory_efficiency']:.1f}%") + + if len(metrics_dict) > 1: + print(f" 汇总:") + print(f" - 最大显存峰值: {max_peak_memory:.1f} MB ({max_peak_memory/1024:.2f} GB)") + print(f" - 总分配显存: {total_allocated_memory:.1f} MB") + + # ============== 4. FLOPs 估算 ============== + print("\n🔢 计算量 (FLOPs) - 估算:") + print("-" * 50) + print(" ⚠️ FLOPs 估算需要在单 GPU 模式下单独运行") + print(" 💡 提示: 设置 NUM_GPUS=1 并运行 estimate_flops_standalone() 获取准确值") + + # ============== 5. 时间分解 ============== + print("\n⏱️ 时间分解:") + print("-" * 50) + inference_time = total_time - load_time + print(f" 模型加载时间: {load_time:.1f} 秒 ({load_time/total_time*100:.1f}%)") + print(f" 推理时间: {inference_time:.1f} 秒 ({inference_time/total_time*100:.1f}%)") + print(f" 总时间: {total_time:.1f} 秒") + + # ============== 6. 
def estimate_flops_standalone():
    """
    Estimate the model's FLOPs in a standalone run.

    Loads the FLUX pipeline from the local snapshot directory onto ``cuda:0``,
    attaches the SR and DPO LoRA adapters with their configured scales, and
    passes the pipeline to ``estimate_model_flops``.

    The Hugging Face token is read from the ``HF_TOKEN`` environment variable
    instead of being embedded in the source; it is ``None`` when unset.

    Returns:
        The FLOPs value returned by ``estimate_model_flops``. A value that is
        not greater than zero is reported as a failed estimation.
    """
    # Local import keeps this function self-contained.
    import os

    print("=" * 60)
    print("🔢 正在估算模型 FLOPs...")
    print("=" * 60)

    device = "cuda:0"

    # Load the base model.
    print("加载模型中...")
    pipe = FluxPipeline.from_pretrained(
        '/home/wanghongbo06/baipurui/.cache/huggingface/hub/models--black-forest-labs--FLUX.1-dev/snapshots/3de623fc3c33e44ffbe2bad470d0f45bccf2eb21',
        torch_dtype=torch.bfloat16,
        # Never commit credentials: take the token from the environment.
        token=os.environ.get("HF_TOKEN"),
        # The original spelled these `local_files_on` / `catch_dir`, so the
        # intended options were never applied.
        local_files_only=True,
        cache_dir=".cache/flux-sr",
    ).to(device)

    # Attach both LoRA adapters and activate them with their configured weights.
    pipe.load_lora_weights(SR_LORA_PATH, adapter_name="sr")
    pipe.load_lora_weights(DPO_LORA_PATH, adapter_name="dpo")
    pipe.set_adapters(["sr", "dpo"], adapter_weights=[SR_LORA_SCALE, DPO_LORA_SCALE])

    # Estimate FLOPs.
    flops, method = estimate_model_flops(pipe)

    if flops > 0:
        print(f"\n📊 FLOPs 估算结果 (方法: {method}):")
        print(f" - 每次推理 FLOPs: {flops:.2e}")
        print(f" - 每次推理 TFLOPs: {flops / 1e12:.2f}")

        # With a measured inference time, throughput in FLOPS could be derived
        # as FLOPs / inference_time.
    else:
        print("❌ FLOPs 估算失败")

    return flops
def main(save_metrics_path: str = None):
    """
    Run the multi-GPU benchmark.

    Lists the images in ``input_folder``, distributes them round-robin over
    ``NUM_GPUS`` worker processes running ``process_images``, waits for every
    worker to signal that its model is loaded, waits for all workers to finish,
    then prints the performance report and optionally writes it to JSON.

    Args:
        save_metrics_path: Optional path of a JSON file to which the collected
            performance metrics are written.

    Raises:
        ValueError: If ``NUM_GPUS`` is smaller than 1.
    """
    # Fail early: the banner below divides by NUM_GPUS.
    if NUM_GPUS < 1:
        raise ValueError(f"NUM_GPUS must be >= 1, got {NUM_GPUS}")

    os.makedirs(output_folder, exist_ok=True)

    # Collect every image file to process, in a deterministic order.
    all_images = sorted(
        f for f in os.listdir(input_folder)
        if f.lower().endswith((".png", ".jpg", ".jpeg", ".bmp", ".webp"))
    )

    total_images = len(all_images)
    print("=" * 70)
    print(" 🚀 Diffusion 超分性能测试")
    print("=" * 70)
    print(f"📁 输入目录: {input_folder}")
    print(f"📁 输出目录: {output_folder}")
    print(f"🖼️ 总图片数: {total_images}")
    print(f"🎮 GPU 数量: {NUM_GPUS}")
    print(f"📦 每 GPU 处理: ~{total_images // NUM_GPUS} 张")
    print(f"⚙️ 模型加载并发数: {MAX_CONCURRENT_LOAD}")
    print(f"🔥 预热图片数: {WARMUP_IMAGES} (每个GPU)")
    print(f"📊 性能分析: {'开启' if ENABLE_PROFILING else '关闭'}")
    print("=" * 70)

    # Round-robin split: image i goes to GPU (i % NUM_GPUS).
    image_chunks = [all_images[i::NUM_GPUS] for i in range(NUM_GPUS)]

    # Wall-clock reference for both the load time and the total time.
    start_time = time.time()

    mp.set_start_method('spawn', force=True)

    # Semaphore limiting how many workers load a model at the same time.
    load_semaphore = mp.Semaphore(MAX_CONCURRENT_LOAD)

    # One event per worker, set once that worker's model is loaded.
    ready_events = [mp.Event() for _ in range(NUM_GPUS)]

    # Barrier so that all workers start inference together after loading.
    # NOTE(review): a worker that fails while loading never reaches this
    # barrier, so the remaining workers would wait on it indefinitely.
    start_barrier = mp.Barrier(NUM_GPUS)

    # The manager runs a helper process; the context manager shuts it down
    # once the metrics have been copied out.
    with mp.Manager() as manager:
        metrics_dict = manager.dict()

        processes = []

        print(f"\n⏳ 开始加载模型(最多 {MAX_CONCURRENT_LOAD} 个并发,避免I/O瓶颈)...")

        for gpu_id in range(NUM_GPUS):
            p = mp.Process(
                target=process_images,
                args=(gpu_id, image_chunks[gpu_id], output_folder,
                      load_semaphore, ready_events[gpu_id], start_barrier, metrics_dict)
            )
            p.start()
            processes.append(p)

        # Wait until every worker reports that its model is loaded.
        for loaded_count, event in enumerate(ready_events, start=1):
            event.wait()
            print(f" ✅ GPU {loaded_count - 1} 就绪 ({loaded_count}/{NUM_GPUS})")

        load_time = time.time() - start_time
        print(f"\n⏱️ 模型加载总耗时: {load_time:.1f}s ({load_time/60:.1f} 分钟)")
        print("🚀 所有模型加载完成,开始并行推理...\n")

        # Wait for all workers to finish.
        for p in processes:
            p.join()

        total_time = time.time() - start_time

        # Copy the proxy into a plain dict while the manager is still alive.
        metrics_dict_normal = dict(metrics_dict)

    # Surface crashed workers instead of silently reporting partial metrics.
    for gpu_id, p in enumerate(processes):
        if p.exitcode != 0:
            print(f"⚠️ GPU {gpu_id} 进程异常退出 (exit code {p.exitcode}),其性能指标可能缺失")

    # Print the detailed performance report.
    print_performance_report(metrics_dict_normal, load_time, total_time, total_images)

    # Optionally persist the metrics.
    if save_metrics_path:
        save_metrics_to_json(metrics_dict_normal, save_metrics_path, load_time, total_time, total_images)

    print(f"\n📁 结果保存在: {output_folder}")
    print("=" * 70)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Diffusion 超分性能测试')
    parser.add_argument('--mode', type=str, default='benchmark', choices=['benchmark', 'flops'],
                        help='运行模式: benchmark (默认) 或 flops (仅估算FLOPs)')
    parser.add_argument('--save-metrics', type=str, default=None,
                        help='保存性能指标到 JSON 文件的路径')
    parser.add_argument('--num-gpus', type=int, default=None,
                        help='使用的 GPU 数量 (覆盖默认值)')
    parser.add_argument('--warmup', type=int, default=None,
                        help='预热图片数量 (覆盖默认值)')

    args = parser.parse_args()

    # Override the module-level configuration. This code runs at module scope,
    # so plain assignment rebinds the globals in this (parent) process.
    # NOTE(review): workers are started with the 'spawn' method, which
    # re-imports this module without executing this guard, so the --warmup
    # override is not visible inside process_images; only the parent's report
    # uses the overridden value. Pass the value to the workers explicitly if
    # the override must take effect there.
    if args.num_gpus is not None:
        NUM_GPUS = args.num_gpus
    if args.warmup is not None:
        WARMUP_IMAGES = args.warmup

    if args.mode == 'flops':
        # Only estimate FLOPs.
        estimate_flops_standalone()
    else:
        # Run the full benchmark.
        main(save_metrics_path=args.save_metrics)


# pyiqa psnr ssim lpips musiq clipiqa+ --target /home/wanghongbo06/diffusion-dpo-test/DIV2K-val/sobolev-400 --r /home/wanghongbo06/baipurui/DATA/DIV2K-val/gt
记录融合前的权重...") + sample_key = None + for name, param in pipe.transformer.named_parameters(): + if 'single_transformer_blocks.30' in name and 'weight' in name and 'norm' not in name: + sample_key = name + break + + if sample_key is None: + print("❌ 找不到采样层") + return + + # 获取融合前的权重 + weight_before = None + for name, param in pipe.transformer.named_parameters(): + if name == sample_key: + weight_before = param.clone().float() + print(f" 采样层: {name}") + print(f" 融合前: mean={weight_before.mean():.6e}, max={weight_before.abs().max():.6e}") + break + + # 3. 加载 LoRA + print(f"\n3. 加载 LoRA: {checkpoint_path}") + pipe.load_lora_weights(checkpoint_path, adapter_name="dpo") + + # 检查 LoRA 是否加载 + print(" 检查 LoRA 是否加载...") + lora_loaded = False + for name, module in pipe.transformer.named_modules(): + if hasattr(module, 'lora_A'): + lora_loaded = True + # 打印 LoRA 权重 + if 'single_transformer_blocks.30' in name: + if hasattr(module, 'lora_A') and module.lora_A is not None: + for adapter_name, lora_a in module.lora_A.items(): + print(f" {name}.lora_A[{adapter_name}]: mean={lora_a.weight.float().mean():.6e}") + if hasattr(module, 'lora_B') and module.lora_B is not None: + for adapter_name, lora_b in module.lora_B.items(): + print(f" {name}.lora_B[{adapter_name}]: mean={lora_b.weight.float().mean():.6e}") + break + + if lora_loaded: + print(" ✅ LoRA 已加载") + else: + print(" ❌ LoRA 未加载!") + + # 4. 融合 LoRA + print("\n4. 
融合 LoRA (fuse_lora)...") + pipe.fuse_lora(lora_scale=1.0, adapter_names=["dpo"]) + + # 检查融合后的权重 + weight_after_fuse = None + for name, param in pipe.transformer.named_parameters(): + if name == sample_key: + weight_after_fuse = param.clone().float() + print(f" 融合后: mean={weight_after_fuse.mean():.6e}, max={weight_after_fuse.abs().max():.6e}") + break + + # 计算融合带来的变化 + if weight_before is not None and weight_after_fuse is not None: + diff_fuse = (weight_after_fuse - weight_before).abs() + print(f" 融合变化: max_diff={diff_fuse.max():.6e}, mean_diff={diff_fuse.mean():.6e}") + + if diff_fuse.max() < 1e-10: + print(" ⚠️ 警告: 融合没有改变权重!LoRA 可能没有生效!") + else: + print(" ✅ 融合成功,权重已改变") + + # 5. 卸载 LoRA + print("\n5. 卸载 LoRA (unload_lora_weights)...") + pipe.unload_lora_weights() + + # 检查卸载后的权重 + weight_after_unload = None + for name, param in pipe.transformer.named_parameters(): + if name == sample_key: + weight_after_unload = param.clone().float() + print(f" 卸载后: mean={weight_after_unload.mean():.6e}, max={weight_after_unload.abs().max():.6e}") + break + + # 计算卸载带来的变化 + if weight_after_fuse is not None and weight_after_unload is not None: + diff_unload = (weight_after_unload - weight_after_fuse).abs() + print(f" 卸载变化: max_diff={diff_unload.max():.6e}, mean_diff={diff_unload.mean():.6e}") + + if diff_unload.max() < 1e-10: + print(" ✅ 卸载后权重保持不变(融合的权重被保留)") + else: + print(" ❌ 卸载后权重改变了!融合的权重可能被清除了!") + + # 6. 最终对比:融合后 vs 原始 + if weight_before is not None and weight_after_unload is not None: + final_diff = (weight_after_unload - weight_before).abs() + print(f"\n6. 
def _load_fused_pipeline(ckpt_path):
    """Load FLUX.1-dev on CUDA, fuse the LoRA at ``ckpt_path`` into it, then unload the adapter."""
    pipe = FluxPipeline.from_pretrained(
        "black-forest-labs/FLUX.1-dev",
        torch_dtype=torch.bfloat16
    ).to("cuda")
    pipe.load_lora_weights(ckpt_path, adapter_name="dpo")
    pipe.fuse_lora(lora_scale=1.0, adapter_names=["dpo"])
    pipe.unload_lora_weights()
    return pipe


def compare_two_checkpoints(ckpt1_path, ckpt2_path):
    """
    Compare the transformer weights obtained by fusing two LoRA checkpoints.

    Each checkpoint is loaded into a fresh FLUX.1-dev pipeline, fused and
    unloaded. The transformer parameters are then compared layer by layer, and
    the per-layer differences above 1e-6 plus an overall summary are printed.

    Only one pipeline is resident on the GPU at a time: the parameters of the
    first pipeline are copied to CPU and that pipeline is released before the
    second one is loaded.

    Args:
        ckpt1_path: Path of the first LoRA checkpoint.
        ckpt2_path: Path of the second LoRA checkpoint.
    """
    print(f"\n{'='*80}")
    print(f"对比两个 checkpoint 融合后的权重")
    print(f" Checkpoint 1: {ckpt1_path}")
    print(f" Checkpoint 2: {ckpt2_path}")
    print(f"{'='*80}\n")

    print("加载并融合 Checkpoint 1...")
    pipe1 = _load_fused_pipeline(ckpt1_path)
    # Keep an ordered CPU snapshot so the first pipeline can be freed before
    # the second one is loaded.
    reference = [
        (name, param.detach().cpu())
        for name, param in pipe1.transformer.named_parameters()
    ]
    del pipe1
    torch.cuda.empty_cache()

    print("加载并融合 Checkpoint 2...")
    pipe2 = _load_fused_pipeline(ckpt2_path)

    # Compare the weights.
    print("\n对比融合后的权重差异:")
    print("-" * 80)

    total_diff = 0
    total_params = 0
    max_diff = 0
    max_diff_layer = ""

    # Pure inspection: no autograd graph is needed for the differences.
    with torch.no_grad():
        for (name1, param1), (name2, param2) in zip(
            reference,
            pipe2.transformer.named_parameters()
        ):
            if name1 != name2:
                print(f"❌ 层名不匹配: {name1} vs {name2}")
                continue

            # Move one reference layer at a time next to its counterpart.
            diff = (param1.to(param2.device).float() - param2.float()).abs()
            layer_max_diff = diff.max().item()
            layer_mean_diff = diff.mean().item()

            total_diff += diff.sum().item()
            total_params += param2.numel()

            if layer_max_diff > max_diff:
                max_diff = layer_max_diff
                max_diff_layer = name1

            # Only print layers with a noticeable difference.
            if layer_max_diff > 1e-6:
                print(f"{name1}: max_diff={layer_max_diff:.6e}, mean_diff={layer_mean_diff:.6e}")

    # Guard the average in case no layer could be compared.
    mean_diff = total_diff / total_params if total_params else 0.0

    print("-" * 80)
    print(f"\n总结:")
    print(f" 总参数数: {total_params:,}")
    print(f" 平均差异: {mean_diff:.6e}")
    print(f" 最大差异: {max_diff:.6e}")
    print(f" 最大差异层: {max_diff_layer}")

    if max_diff < 1e-10:
        print("\n❌ 两个 checkpoint 融合后的权重完全相同!")
        print(" 可能原因:")
        print(" 1. fuse_lora 没有正确融合")
        print(" 2. unload_lora_weights 清除了融合的权重")
        print(" 3. 两个 checkpoint 本身就相同")
    elif max_diff < 1e-6:
        print(f"\n⚠️ 两个 checkpoint 融合后的权重差异非常小 ({max_diff:.6e})")
        print(" 这可能不足以产生可见的视觉差异")
    else:
        print(f"\n✅ 两个 checkpoint 融合后的权重有差异 ({max_diff:.6e})")

    # Release the remaining pipeline and the snapshot.
    del pipe2, reference
    torch.cuda.empty_cache()
sgm.modules.diffusionmodules.model.Encoder + params: + attn_type: none + double_z: True + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4] + num_res_blocks: 4 + attn_resolutions: [] + dropout: 0.0 + + decoder_config: + target: sgm.modules.diffusionmodules.model.Decoder + params: ${model.params.encoder_config.params} + +data: + target: sgm.data.dataset.StableDataModuleFromConfig + params: + train: + datapipeline: + urls: + - DATA-PATH + pipeline_config: + shardshuffle: 10000 + sample_shuffle: 10000 + + decoders: + - pil + + postprocessors: + - target: sdata.mappers.TorchVisionImageTransforms + params: + key: jpg + transforms: + - target: torchvision.transforms.Resize + params: + size: 256 + interpolation: 3 + - target: torchvision.transforms.ToTensor + - target: sdata.mappers.Rescaler + - target: sdata.mappers.AddOriginalImageSizeAsTupleAndCropToSquare + params: + h_key: height + w_key: width + + loader: + batch_size: 8 + num_workers: 4 + + +lightning: + strategy: + target: pytorch_lightning.strategies.DDPStrategy + params: + find_unused_parameters: True + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + metrics_over_trainsteps_checkpoint: + params: + every_n_train_steps: 50000 + + image_logger: + target: main.ImageLogger + params: + enable_autocast: False + batch_frequency: 1000 + max_images: 8 + increase_log_steps: True + + trainer: + devices: 0, + limit_val_batches: 50 + benchmark: True + accumulate_grad_batches: 1 + val_check_interval: 10000 \ No newline at end of file diff --git a/generative-models/configs/example_training/autoencoder/kl-f4/imagenet-kl_f8_8chn.yaml b/generative-models/configs/example_training/autoencoder/kl-f4/imagenet-kl_f8_8chn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..39c7c9df5da1c657d2ce72ac8b6269ae86185e91 --- /dev/null +++ b/generative-models/configs/example_training/autoencoder/kl-f4/imagenet-kl_f8_8chn.yaml @@ -0,0 +1,105 @@ +model: + 
base_learning_rate: 4.5e-6 + target: sgm.models.autoencoder.AutoencodingEngine + params: + input_key: jpg + monitor: val/loss/rec + disc_start_iter: 0 + + encoder_config: + target: sgm.modules.diffusionmodules.model.Encoder + params: + attn_type: vanilla-xformers + double_z: true + z_channels: 8 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + + decoder_config: + target: sgm.modules.diffusionmodules.model.Decoder + params: ${model.params.encoder_config.params} + + regularizer_config: + target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer + + loss_config: + target: sgm.modules.autoencoding.losses.GeneralLPIPSWithDiscriminator + params: + perceptual_weight: 0.25 + disc_start: 20001 + disc_weight: 0.5 + learn_logvar: True + + regularization_weights: + kl_loss: 1.0 + +data: + target: sgm.data.dataset.StableDataModuleFromConfig + params: + train: + datapipeline: + urls: + - DATA-PATH + pipeline_config: + shardshuffle: 10000 + sample_shuffle: 10000 + + decoders: + - pil + + postprocessors: + - target: sdata.mappers.TorchVisionImageTransforms + params: + key: jpg + transforms: + - target: torchvision.transforms.Resize + params: + size: 256 + interpolation: 3 + - target: torchvision.transforms.ToTensor + - target: sdata.mappers.Rescaler + - target: sdata.mappers.AddOriginalImageSizeAsTupleAndCropToSquare + params: + h_key: height + w_key: width + + loader: + batch_size: 8 + num_workers: 4 + + +lightning: + strategy: + target: pytorch_lightning.strategies.DDPStrategy + params: + find_unused_parameters: True + + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + metrics_over_trainsteps_checkpoint: + params: + every_n_train_steps: 50000 + + image_logger: + target: main.ImageLogger + params: + enable_autocast: False + batch_frequency: 1000 + max_images: 8 + increase_log_steps: True + + trainer: + devices: 0, + limit_val_batches: 50 + benchmark: 
True + accumulate_grad_batches: 1 + val_check_interval: 10000 diff --git a/generative-models/configs/example_training/imagenet-f8_cond.yaml b/generative-models/configs/example_training/imagenet-f8_cond.yaml new file mode 100644 index 0000000000000000000000000000000000000000..23cded00a72e2883df1a4bf2b639a49cda763a8e --- /dev/null +++ b/generative-models/configs/example_training/imagenet-f8_cond.yaml @@ -0,0 +1,185 @@ +model: + base_learning_rate: 1.0e-4 + target: sgm.models.diffusion.DiffusionEngine + params: + scale_factor: 0.13025 + disable_first_stage_autocast: True + log_keys: + - cls + + scheduler_config: + target: sgm.lr_scheduler.LambdaLinearScheduler + params: + warm_up_steps: [10000] + cycle_lengths: [10000000000000] + f_start: [1.e-6] + f_max: [1.] + f_min: [1.] + + denoiser_config: + target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser + params: + num_idx: 1000 + + scaling_config: + target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling + discretization_config: + target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization + + network_config: + target: sgm.modules.diffusionmodules.openaimodel.UNetModel + params: + use_checkpoint: True + in_channels: 4 + out_channels: 4 + model_channels: 256 + attention_resolutions: [1, 2, 4] + num_res_blocks: 2 + channel_mult: [1, 2, 4] + num_head_channels: 64 + num_classes: sequential + adm_in_channels: 1024 + transformer_depth: 1 + context_dim: 1024 + spatial_transformer_attn_type: softmax-xformers + + conditioner_config: + target: sgm.modules.GeneralConditioner + params: + emb_models: + - is_trainable: True + input_key: cls + ucg_rate: 0.2 + target: sgm.modules.encoders.modules.ClassEmbedder + params: + add_sequence_dim: True + embed_dim: 1024 + n_classes: 1000 + + - is_trainable: False + ucg_rate: 0.2 + input_key: original_size_as_tuple + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 + + - is_trainable: False + input_key: crop_coords_top_left + 
ucg_rate: 0.2 + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 + + first_stage_config: + target: sgm.models.autoencoder.AutoencoderKL + params: + ckpt_path: CKPT_PATH + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + attn_type: vanilla-xformers + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + loss_fn_config: + target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss + params: + loss_weighting_config: + target: sgm.modules.diffusionmodules.loss_weighting.EpsWeighting + sigma_sampler_config: + target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling + params: + num_idx: 1000 + + discretization_config: + target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization + + sampler_config: + target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler + params: + num_steps: 50 + + discretization_config: + target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization + + guider_config: + target: sgm.modules.diffusionmodules.guiders.VanillaCFG + params: + scale: 5.0 + +data: + target: sgm.data.dataset.StableDataModuleFromConfig + params: + train: + datapipeline: + urls: + # USER: adapt this path the root of your custom dataset + - DATA_PATH + pipeline_config: + shardshuffle: 10000 + sample_shuffle: 10000 # USER: you might wanna adapt depending on your available RAM + + decoders: + - pil + + postprocessors: + - target: sdata.mappers.TorchVisionImageTransforms + params: + key: jpg # USER: you might wanna adapt this for your custom dataset + transforms: + - target: torchvision.transforms.Resize + params: + size: 256 + interpolation: 3 + - target: torchvision.transforms.ToTensor + - target: sdata.mappers.Rescaler + + - target: sdata.mappers.AddOriginalImageSizeAsTupleAndCropToSquare + params: + h_key: height # USER: you might 
wanna adapt this for your custom dataset + w_key: width # USER: you might wanna adapt this for your custom dataset + + loader: + batch_size: 64 + num_workers: 6 + +lightning: + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + metrics_over_trainsteps_checkpoint: + params: + every_n_train_steps: 25000 + + image_logger: + target: main.ImageLogger + params: + disabled: False + enable_autocast: False + batch_frequency: 1000 + max_images: 8 + increase_log_steps: True + log_first_step: False + log_images_kwargs: + use_ema_scope: False + N: 8 + n_rows: 2 + + trainer: + devices: 0, + benchmark: True + num_sanity_val_steps: 0 + accumulate_grad_batches: 1 + max_epochs: 1000 \ No newline at end of file diff --git a/generative-models/configs/example_training/toy/cifar10_cond.yaml b/generative-models/configs/example_training/toy/cifar10_cond.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fca9958464488a66ed2a54d57c59228215690606 --- /dev/null +++ b/generative-models/configs/example_training/toy/cifar10_cond.yaml @@ -0,0 +1,98 @@ +model: + base_learning_rate: 1.0e-4 + target: sgm.models.diffusion.DiffusionEngine + params: + denoiser_config: + target: sgm.modules.diffusionmodules.denoiser.Denoiser + params: + scaling_config: + target: sgm.modules.diffusionmodules.denoiser_scaling.EDMScaling + params: + sigma_data: 1.0 + + network_config: + target: sgm.modules.diffusionmodules.openaimodel.UNetModel + params: + in_channels: 3 + out_channels: 3 + model_channels: 32 + attention_resolutions: [] + num_res_blocks: 4 + channel_mult: [1, 2, 2] + num_head_channels: 32 + num_classes: sequential + adm_in_channels: 128 + + conditioner_config: + target: sgm.modules.GeneralConditioner + params: + emb_models: + - is_trainable: True + input_key: cls + ucg_rate: 0.2 + target: sgm.modules.encoders.modules.ClassEmbedder + params: + embed_dim: 128 + n_classes: 10 + + first_stage_config: + target: sgm.models.autoencoder.IdentityFirstStage + + 
loss_fn_config: + target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss + params: + loss_weighting_config: + target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting + params: + sigma_data: 1.0 + sigma_sampler_config: + target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling + + sampler_config: + target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler + params: + num_steps: 50 + + discretization_config: + target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization + + guider_config: + target: sgm.modules.diffusionmodules.guiders.VanillaCFG + params: + scale: 3.0 + +data: + target: sgm.data.cifar10.CIFAR10Loader + params: + batch_size: 512 + num_workers: 1 + +lightning: + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + metrics_over_trainsteps_checkpoint: + params: + every_n_train_steps: 25000 + + image_logger: + target: main.ImageLogger + params: + disabled: False + batch_frequency: 1000 + max_images: 64 + increase_log_steps: True + log_first_step: False + log_images_kwargs: + use_ema_scope: False + N: 64 + n_rows: 8 + + trainer: + devices: 0, + benchmark: True + num_sanity_val_steps: 0 + accumulate_grad_batches: 1 + max_epochs: 20 \ No newline at end of file diff --git a/generative-models/configs/example_training/toy/mnist.yaml b/generative-models/configs/example_training/toy/mnist.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a86d05ca1efa537b57646c3923c1f54ac0d6ccf4 --- /dev/null +++ b/generative-models/configs/example_training/toy/mnist.yaml @@ -0,0 +1,79 @@ +model: + base_learning_rate: 1.0e-4 + target: sgm.models.diffusion.DiffusionEngine + params: + denoiser_config: + target: sgm.modules.diffusionmodules.denoiser.Denoiser + params: + scaling_config: + target: sgm.modules.diffusionmodules.denoiser_scaling.EDMScaling + params: + sigma_data: 1.0 + + network_config: + target: sgm.modules.diffusionmodules.openaimodel.UNetModel + params: + in_channels: 1 + out_channels: 1 + 
model_channels: 32 + attention_resolutions: [] + num_res_blocks: 4 + channel_mult: [1, 2, 2] + num_head_channels: 32 + + first_stage_config: + target: sgm.models.autoencoder.IdentityFirstStage + + loss_fn_config: + target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss + params: + loss_weighting_config: + target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting + params: + sigma_data: 1.0 + sigma_sampler_config: + target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling + + sampler_config: + target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler + params: + num_steps: 50 + + discretization_config: + target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization + +data: + target: sgm.data.mnist.MNISTLoader + params: + batch_size: 512 + num_workers: 1 + +lightning: + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + metrics_over_trainsteps_checkpoint: + params: + every_n_train_steps: 25000 + + image_logger: + target: main.ImageLogger + params: + disabled: False + batch_frequency: 1000 + max_images: 64 + increase_log_steps: False + log_first_step: False + log_images_kwargs: + use_ema_scope: False + N: 64 + n_rows: 8 + + trainer: + devices: 0, + benchmark: True + num_sanity_val_steps: 0 + accumulate_grad_batches: 1 + max_epochs: 10 \ No newline at end of file diff --git a/generative-models/configs/example_training/toy/mnist_cond.yaml b/generative-models/configs/example_training/toy/mnist_cond.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8378acd7acd4c23039a659789b6e6ff5de1a1058 --- /dev/null +++ b/generative-models/configs/example_training/toy/mnist_cond.yaml @@ -0,0 +1,98 @@ +model: + base_learning_rate: 1.0e-4 + target: sgm.models.diffusion.DiffusionEngine + params: + denoiser_config: + target: sgm.modules.diffusionmodules.denoiser.Denoiser + params: + scaling_config: + target: sgm.modules.diffusionmodules.denoiser_scaling.EDMScaling + params: + sigma_data: 1.0 + + network_config: + 
target: sgm.modules.diffusionmodules.openaimodel.UNetModel + params: + in_channels: 1 + out_channels: 1 + model_channels: 32 + attention_resolutions: [] + num_res_blocks: 4 + channel_mult: [1, 2, 2] + num_head_channels: 32 + num_classes: sequential + adm_in_channels: 128 + + conditioner_config: + target: sgm.modules.GeneralConditioner + params: + emb_models: + - is_trainable: True + input_key: cls + ucg_rate: 0.2 + target: sgm.modules.encoders.modules.ClassEmbedder + params: + embed_dim: 128 + n_classes: 10 + + first_stage_config: + target: sgm.models.autoencoder.IdentityFirstStage + + loss_fn_config: + target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss + params: + loss_weighting_config: + target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting + params: + sigma_data: 1.0 + sigma_sampler_config: + target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling + + sampler_config: + target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler + params: + num_steps: 50 + + discretization_config: + target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization + + guider_config: + target: sgm.modules.diffusionmodules.guiders.VanillaCFG + params: + scale: 3.0 + +data: + target: sgm.data.mnist.MNISTLoader + params: + batch_size: 512 + num_workers: 1 + +lightning: + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + metrics_over_trainsteps_checkpoint: + params: + every_n_train_steps: 25000 + + image_logger: + target: main.ImageLogger + params: + disabled: False + batch_frequency: 1000 + max_images: 16 + increase_log_steps: True + log_first_step: False + log_images_kwargs: + use_ema_scope: False + N: 16 + n_rows: 4 + + trainer: + devices: 0, + benchmark: True + num_sanity_val_steps: 0 + accumulate_grad_batches: 1 + max_epochs: 20 \ No newline at end of file diff --git a/generative-models/configs/example_training/toy/mnist_cond_discrete_eps.yaml b/generative-models/configs/example_training/toy/mnist_cond_discrete_eps.yaml new 
file mode 100644 index 0000000000000000000000000000000000000000..e58aae58dd108887d8d2ac06933a31f84ea61509 --- /dev/null +++ b/generative-models/configs/example_training/toy/mnist_cond_discrete_eps.yaml @@ -0,0 +1,103 @@ +model: + base_learning_rate: 1.0e-4 + target: sgm.models.diffusion.DiffusionEngine + params: + denoiser_config: + target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser + params: + num_idx: 1000 + + scaling_config: + target: sgm.modules.diffusionmodules.denoiser_scaling.EDMScaling + discretization_config: + target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization + + network_config: + target: sgm.modules.diffusionmodules.openaimodel.UNetModel + params: + in_channels: 1 + out_channels: 1 + model_channels: 32 + attention_resolutions: [] + num_res_blocks: 4 + channel_mult: [1, 2, 2] + num_head_channels: 32 + num_classes: sequential + adm_in_channels: 128 + + conditioner_config: + target: sgm.modules.GeneralConditioner + params: + emb_models: + - is_trainable: True + input_key: cls + ucg_rate: 0.2 + target: sgm.modules.encoders.modules.ClassEmbedder + params: + embed_dim: 128 + n_classes: 10 + + first_stage_config: + target: sgm.models.autoencoder.IdentityFirstStage + + loss_fn_config: + target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss + params: + loss_weighting_config: + target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting + sigma_sampler_config: + target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling + params: + num_idx: 1000 + + discretization_config: + target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization + + sampler_config: + target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler + params: + num_steps: 50 + + discretization_config: + target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization + + guider_config: + target: sgm.modules.diffusionmodules.guiders.VanillaCFG + params: + scale: 5.0 + +data: + target: sgm.data.mnist.MNISTLoader + 
params: + batch_size: 512 + num_workers: 1 + +lightning: + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + metrics_over_trainsteps_checkpoint: + params: + every_n_train_steps: 25000 + + image_logger: + target: main.ImageLogger + params: + disabled: False + batch_frequency: 1000 + max_images: 16 + increase_log_steps: True + log_first_step: False + log_images_kwargs: + use_ema_scope: False + N: 16 + n_rows: 4 + + trainer: + devices: 0, + benchmark: True + num_sanity_val_steps: 0 + accumulate_grad_batches: 1 + max_epochs: 20 \ No newline at end of file diff --git a/generative-models/configs/example_training/toy/mnist_cond_l1_loss.yaml b/generative-models/configs/example_training/toy/mnist_cond_l1_loss.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ee2f780358b7fe100efa226ae20f6ac58b441632 --- /dev/null +++ b/generative-models/configs/example_training/toy/mnist_cond_l1_loss.yaml @@ -0,0 +1,99 @@ +model: + base_learning_rate: 1.0e-4 + target: sgm.models.diffusion.DiffusionEngine + params: + denoiser_config: + target: sgm.modules.diffusionmodules.denoiser.Denoiser + params: + scaling_config: + target: sgm.modules.diffusionmodules.denoiser_scaling.EDMScaling + params: + sigma_data: 1.0 + + network_config: + target: sgm.modules.diffusionmodules.openaimodel.UNetModel + params: + in_channels: 1 + out_channels: 1 + model_channels: 32 + attention_resolutions: [] + num_res_blocks: 4 + channel_mult: [1, 2, 2] + num_head_channels: 32 + num_classes: sequential + adm_in_channels: 128 + + conditioner_config: + target: sgm.modules.GeneralConditioner + params: + emb_models: + - is_trainable: True + input_key: cls + ucg_rate: 0.2 + target: sgm.modules.encoders.modules.ClassEmbedder + params: + embed_dim: 128 + n_classes: 10 + + first_stage_config: + target: sgm.models.autoencoder.IdentityFirstStage + + loss_fn_config: + target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss + params: + loss_type: l1 + loss_weighting_config: + 
target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting + params: + sigma_data: 1.0 + sigma_sampler_config: + target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling + + sampler_config: + target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler + params: + num_steps: 50 + + discretization_config: + target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization + + guider_config: + target: sgm.modules.diffusionmodules.guiders.VanillaCFG + params: + scale: 3.0 + +data: + target: sgm.data.mnist.MNISTLoader + params: + batch_size: 512 + num_workers: 1 + +lightning: + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + metrics_over_trainsteps_checkpoint: + params: + every_n_train_steps: 25000 + + image_logger: + target: main.ImageLogger + params: + disabled: False + batch_frequency: 1000 + max_images: 64 + increase_log_steps: True + log_first_step: False + log_images_kwargs: + use_ema_scope: False + N: 64 + n_rows: 8 + + trainer: + devices: 0, + benchmark: True + num_sanity_val_steps: 0 + accumulate_grad_batches: 1 + max_epochs: 20 \ No newline at end of file diff --git a/generative-models/configs/example_training/toy/mnist_cond_with_ema.yaml b/generative-models/configs/example_training/toy/mnist_cond_with_ema.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c666e7143b7cb0a920d384f3f6294231b8bb1726 --- /dev/null +++ b/generative-models/configs/example_training/toy/mnist_cond_with_ema.yaml @@ -0,0 +1,100 @@ +model: + base_learning_rate: 1.0e-4 + target: sgm.models.diffusion.DiffusionEngine + params: + use_ema: True + + denoiser_config: + target: sgm.modules.diffusionmodules.denoiser.Denoiser + params: + scaling_config: + target: sgm.modules.diffusionmodules.denoiser_scaling.EDMScaling + params: + sigma_data: 1.0 + + network_config: + target: sgm.modules.diffusionmodules.openaimodel.UNetModel + params: + in_channels: 1 + out_channels: 1 + model_channels: 32 + attention_resolutions: [] + num_res_blocks: 
4 + channel_mult: [1, 2, 2] + num_head_channels: 32 + num_classes: sequential + adm_in_channels: 128 + + conditioner_config: + target: sgm.modules.GeneralConditioner + params: + emb_models: + - is_trainable: True + input_key: cls + ucg_rate: 0.2 + target: sgm.modules.encoders.modules.ClassEmbedder + params: + embed_dim: 128 + n_classes: 10 + + first_stage_config: + target: sgm.models.autoencoder.IdentityFirstStage + + loss_fn_config: + target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss + params: + loss_weighting_config: + target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting + params: + sigma_data: 1.0 + sigma_sampler_config: + target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling + + sampler_config: + target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler + params: + num_steps: 50 + + discretization_config: + target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization + + guider_config: + target: sgm.modules.diffusionmodules.guiders.VanillaCFG + params: + scale: 3.0 + +data: + target: sgm.data.mnist.MNISTLoader + params: + batch_size: 512 + num_workers: 1 + +lightning: + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + metrics_over_trainsteps_checkpoint: + params: + every_n_train_steps: 25000 + + image_logger: + target: main.ImageLogger + params: + disabled: False + batch_frequency: 1000 + max_images: 64 + increase_log_steps: True + log_first_step: False + log_images_kwargs: + use_ema_scope: False + N: 64 + n_rows: 8 + + trainer: + devices: 0, + benchmark: True + num_sanity_val_steps: 0 + accumulate_grad_batches: 1 + max_epochs: 20 \ No newline at end of file diff --git a/generative-models/configs/example_training/txt2img-clipl-legacy-ucg-training.yaml b/generative-models/configs/example_training/txt2img-clipl-legacy-ucg-training.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0f268c3295bd57888de3efc736d307903ee80a8f --- /dev/null +++ 
b/generative-models/configs/example_training/txt2img-clipl-legacy-ucg-training.yaml @@ -0,0 +1,182 @@ +model: + base_learning_rate: 1.0e-4 + target: sgm.models.diffusion.DiffusionEngine + params: + scale_factor: 0.13025 + disable_first_stage_autocast: True + log_keys: + - txt + + scheduler_config: + target: sgm.lr_scheduler.LambdaLinearScheduler + params: + warm_up_steps: [10000] + cycle_lengths: [10000000000000] + f_start: [1.e-6] + f_max: [1.] + f_min: [1.] + + denoiser_config: + target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser + params: + num_idx: 1000 + + scaling_config: + target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling + discretization_config: + target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization + + network_config: + target: sgm.modules.diffusionmodules.openaimodel.UNetModel + params: + use_checkpoint: True + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [1, 2, 4] + num_res_blocks: 2 + channel_mult: [1, 2, 4, 4] + num_head_channels: 64 + num_classes: sequential + adm_in_channels: 1792 + num_heads: 1 + transformer_depth: 1 + context_dim: 768 + spatial_transformer_attn_type: softmax-xformers + + conditioner_config: + target: sgm.modules.GeneralConditioner + params: + emb_models: + - is_trainable: True + input_key: txt + ucg_rate: 0.1 + legacy_ucg_value: "" + target: sgm.modules.encoders.modules.FrozenCLIPEmbedder + params: + always_return_pooled: True + + - is_trainable: False + ucg_rate: 0.1 + input_key: original_size_as_tuple + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 + + - is_trainable: False + input_key: crop_coords_top_left + ucg_rate: 0.1 + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 + + first_stage_config: + target: sgm.models.autoencoder.AutoencoderKL + params: + ckpt_path: CKPT_PATH + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + attn_type: vanilla-xformers + double_z: true + 
z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [ 1, 2, 4, 4 ] + num_res_blocks: 2 + attn_resolutions: [ ] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + loss_fn_config: + target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss + params: + loss_weighting_config: + target: sgm.modules.diffusionmodules.loss_weighting.EpsWeighting + sigma_sampler_config: + target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling + params: + num_idx: 1000 + + discretization_config: + target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization + + sampler_config: + target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler + params: + num_steps: 50 + + discretization_config: + target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization + + guider_config: + target: sgm.modules.diffusionmodules.guiders.VanillaCFG + params: + scale: 7.5 + +data: + target: sgm.data.dataset.StableDataModuleFromConfig + params: + train: + datapipeline: + urls: + # USER: adapt this path to the root of your custom dataset + - DATA_PATH + pipeline_config: + shardshuffle: 10000 + sample_shuffle: 10000 # USER: you might wanna adapt depending on your available RAM + + decoders: + - pil + + postprocessors: + - target: sdata.mappers.TorchVisionImageTransforms + params: + key: jpg # USER: you might wanna adapt this for your custom dataset + transforms: + - target: torchvision.transforms.Resize + params: + size: 256 + interpolation: 3 + - target: torchvision.transforms.ToTensor + - target: sdata.mappers.Rescaler + - target: sdata.mappers.AddOriginalImageSizeAsTupleAndCropToSquare + # USER: you might wanna use non-default parameters due to your custom dataset + + loader: + batch_size: 64 + num_workers: 6 + +lightning: + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + metrics_over_trainsteps_checkpoint: + params: + every_n_train_steps: 25000 + + image_logger: + target: main.ImageLogger + params: + 
disabled: False + enable_autocast: False + batch_frequency: 1000 + max_images: 8 + increase_log_steps: True + log_first_step: False + log_images_kwargs: + use_ema_scope: False + N: 8 + n_rows: 2 + + trainer: + devices: 0, + benchmark: True + num_sanity_val_steps: 0 + accumulate_grad_batches: 1 + max_epochs: 1000 \ No newline at end of file diff --git a/generative-models/configs/example_training/txt2img-clipl.yaml b/generative-models/configs/example_training/txt2img-clipl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cb66ede901b1aa1acb18d162b88912a2e6eab0ce --- /dev/null +++ b/generative-models/configs/example_training/txt2img-clipl.yaml @@ -0,0 +1,184 @@ +model: + base_learning_rate: 1.0e-4 + target: sgm.models.diffusion.DiffusionEngine + params: + scale_factor: 0.13025 + disable_first_stage_autocast: True + log_keys: + - txt + + scheduler_config: + target: sgm.lr_scheduler.LambdaLinearScheduler + params: + warm_up_steps: [10000] + cycle_lengths: [10000000000000] + f_start: [1.e-6] + f_max: [1.] + f_min: [1.] 
+ + denoiser_config: + target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser + params: + num_idx: 1000 + + scaling_config: + target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling + discretization_config: + target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization + + network_config: + target: sgm.modules.diffusionmodules.openaimodel.UNetModel + params: + use_checkpoint: True + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [1, 2, 4] + num_res_blocks: 2 + channel_mult: [1, 2, 4, 4] + num_head_channels: 64 + num_classes: sequential + adm_in_channels: 1792 + num_heads: 1 + transformer_depth: 1 + context_dim: 768 + spatial_transformer_attn_type: softmax-xformers + + conditioner_config: + target: sgm.modules.GeneralConditioner + params: + emb_models: + - is_trainable: True + input_key: txt + ucg_rate: 0.1 + legacy_ucg_value: "" + target: sgm.modules.encoders.modules.FrozenCLIPEmbedder + params: + always_return_pooled: True + + - is_trainable: False + ucg_rate: 0.1 + input_key: original_size_as_tuple + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 + + - is_trainable: False + input_key: crop_coords_top_left + ucg_rate: 0.1 + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 + + first_stage_config: + target: sgm.models.autoencoder.AutoencoderKL + params: + ckpt_path: CKPT_PATH + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + attn_type: vanilla-xformers + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + loss_fn_config: + target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss + params: + loss_weighting_config: + target: sgm.modules.diffusionmodules.loss_weighting.EpsWeighting + sigma_sampler_config: + target: 
sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling + params: + num_idx: 1000 + + discretization_config: + target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization + + sampler_config: + target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler + params: + num_steps: 50 + + discretization_config: + target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization + + guider_config: + target: sgm.modules.diffusionmodules.guiders.VanillaCFG + params: + scale: 7.5 + +data: + target: sgm.data.dataset.StableDataModuleFromConfig + params: + train: + datapipeline: + urls: + # USER: adapt this path to the root of your custom dataset + - DATA_PATH + pipeline_config: + shardshuffle: 10000 + sample_shuffle: 10000 + + + decoders: + - pil + + postprocessors: + - target: sdata.mappers.TorchVisionImageTransforms + params: + key: jpg # USER: you might wanna adapt this for your custom dataset + transforms: + - target: torchvision.transforms.Resize + params: + size: 256 + interpolation: 3 + - target: torchvision.transforms.ToTensor + - target: sdata.mappers.Rescaler + # USER: you might wanna use non-default parameters due to your custom dataset + - target: sdata.mappers.AddOriginalImageSizeAsTupleAndCropToSquare + # USER: you might wanna use non-default parameters due to your custom dataset + + loader: + batch_size: 64 + num_workers: 6 + +lightning: + modelcheckpoint: + params: + every_n_train_steps: 5000 + + callbacks: + metrics_over_trainsteps_checkpoint: + params: + every_n_train_steps: 25000 + + image_logger: + target: main.ImageLogger + params: + disabled: False + enable_autocast: False + batch_frequency: 1000 + max_images: 8 + increase_log_steps: True + log_first_step: False + log_images_kwargs: + use_ema_scope: False + N: 8 + n_rows: 2 + + trainer: + devices: 0, + benchmark: True + num_sanity_val_steps: 0 + accumulate_grad_batches: 1 + max_epochs: 1000 \ No newline at end of file diff --git 
a/generative-models/configs/inference/sd_xl_base.yaml b/generative-models/configs/inference/sd_xl_base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6047379753a05224bb5b3f6746130fb7fb9f40aa --- /dev/null +++ b/generative-models/configs/inference/sd_xl_base.yaml @@ -0,0 +1,93 @@ +model: + target: sgm.models.diffusion.DiffusionEngine + params: + scale_factor: 0.13025 + disable_first_stage_autocast: True + + denoiser_config: + target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser + params: + num_idx: 1000 + + scaling_config: + target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling + discretization_config: + target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization + + network_config: + target: sgm.modules.diffusionmodules.openaimodel.UNetModel + params: + adm_in_channels: 2816 + num_classes: sequential + use_checkpoint: True + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [4, 2] + num_res_blocks: 2 + channel_mult: [1, 2, 4] + num_head_channels: 64 + use_linear_in_transformer: True + transformer_depth: [1, 2, 10] + context_dim: 2048 + spatial_transformer_attn_type: softmax-xformers + + conditioner_config: + target: sgm.modules.GeneralConditioner + params: + emb_models: + - is_trainable: False + input_key: txt + target: sgm.modules.encoders.modules.FrozenCLIPEmbedder + params: + layer: hidden + layer_idx: 11 + + - is_trainable: False + input_key: txt + target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2 + params: + arch: ViT-bigG-14 + version: laion2b_s39b_b160k + freeze: True + layer: penultimate + always_return_pooled: True + legacy: False + + - is_trainable: False + input_key: original_size_as_tuple + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 + + - is_trainable: False + input_key: crop_coords_top_left + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 + + - is_trainable: False + 
input_key: target_size_as_tuple + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 + + first_stage_config: + target: sgm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + attn_type: vanilla-xformers + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity diff --git a/generative-models/configs/inference/sd_xl_refiner.yaml b/generative-models/configs/inference/sd_xl_refiner.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2d5ab44e748c55f5f2e34ae5aefdb78a921a8d3f --- /dev/null +++ b/generative-models/configs/inference/sd_xl_refiner.yaml @@ -0,0 +1,86 @@ +model: + target: sgm.models.diffusion.DiffusionEngine + params: + scale_factor: 0.13025 + disable_first_stage_autocast: True + + denoiser_config: + target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser + params: + num_idx: 1000 + + scaling_config: + target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling + discretization_config: + target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization + + network_config: + target: sgm.modules.diffusionmodules.openaimodel.UNetModel + params: + adm_in_channels: 2560 + num_classes: sequential + use_checkpoint: True + in_channels: 4 + out_channels: 4 + model_channels: 384 + attention_resolutions: [4, 2] + num_res_blocks: 2 + channel_mult: [1, 2, 4, 4] + num_head_channels: 64 + use_linear_in_transformer: True + transformer_depth: 4 + context_dim: [1280, 1280, 1280, 1280] + spatial_transformer_attn_type: softmax-xformers + + conditioner_config: + target: sgm.modules.GeneralConditioner + params: + emb_models: + - is_trainable: False + input_key: txt + target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2 + params: + arch: ViT-bigG-14 + version: laion2b_s39b_b160k + legacy: False + 
freeze: True + layer: penultimate + always_return_pooled: True + + - is_trainable: False + input_key: original_size_as_tuple + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 + + - is_trainable: False + input_key: crop_coords_top_left + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 + + - is_trainable: False + input_key: aesthetic_score + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 + + first_stage_config: + target: sgm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + attn_type: vanilla-xformers + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity diff --git a/generative-models/configs/inference/sv3d_p.yaml b/generative-models/configs/inference/sv3d_p.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d3781fe592813e4d1b55e9ce09488457fd84df3b --- /dev/null +++ b/generative-models/configs/inference/sv3d_p.yaml @@ -0,0 +1,118 @@ +model: + target: sgm.models.diffusion.DiffusionEngine + params: + scale_factor: 0.18215 + disable_first_stage_autocast: True + + denoiser_config: + target: sgm.modules.diffusionmodules.denoiser.Denoiser + params: + scaling_config: + target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise + + network_config: + target: sgm.modules.diffusionmodules.video_model.VideoUNet + params: + adm_in_channels: 1280 + num_classes: sequential + use_checkpoint: True + in_channels: 8 + out_channels: 4 + model_channels: 320 + attention_resolutions: [4, 2, 1] + num_res_blocks: 2 + channel_mult: [1, 2, 4, 4] + num_head_channels: 64 + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + spatial_transformer_attn_type: softmax-xformers + extra_ff_mix_layer: True + 
use_spatial_context: True + merge_strategy: learned_with_images + video_kernel_size: [3, 1, 1] + + conditioner_config: + target: sgm.modules.GeneralConditioner + params: + emb_models: + - input_key: cond_frames_without_noise + is_trainable: False + target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder + params: + n_cond_frames: 1 + n_copies: 1 + open_clip_embedding_config: + target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder + params: + freeze: True + + - input_key: cond_frames + is_trainable: False + target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder + params: + disable_encoder_autocast: True + n_cond_frames: 1 + n_copies: 1 + is_ae: True + encoder_config: + target: sgm.models.autoencoder.AutoencoderKLModeOnly + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + attn_type: vanilla-xformers + double_z: True + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + - input_key: cond_aug + is_trainable: False + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 + + - input_key: polars_rad + is_trainable: False + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 512 + + - input_key: azimuths_rad + is_trainable: False + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 512 + + first_stage_config: + target: sgm.models.autoencoder.AutoencodingEngine + params: + loss_config: + target: torch.nn.Identity + regularizer_config: + target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer + encoder_config: + target: torch.nn.Identity + decoder_config: + target: sgm.modules.diffusionmodules.model.Decoder + params: + attn_type: vanilla-xformers + double_z: True + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [ 1, 2, 4, 4 ] + 
num_res_blocks: 2 + attn_resolutions: [ ] + dropout: 0.0 \ No newline at end of file diff --git a/generative-models/configs/inference/sv3d_u.yaml b/generative-models/configs/inference/sv3d_u.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5c48a5ff960262168c59a63f5321a81a06d83bea --- /dev/null +++ b/generative-models/configs/inference/sv3d_u.yaml @@ -0,0 +1,106 @@ +model: + target: sgm.models.diffusion.DiffusionEngine + params: + scale_factor: 0.18215 + disable_first_stage_autocast: True + + denoiser_config: + target: sgm.modules.diffusionmodules.denoiser.Denoiser + params: + scaling_config: + target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise + + network_config: + target: sgm.modules.diffusionmodules.video_model.VideoUNet + params: + adm_in_channels: 256 + num_classes: sequential + use_checkpoint: True + in_channels: 8 + out_channels: 4 + model_channels: 320 + attention_resolutions: [4, 2, 1] + num_res_blocks: 2 + channel_mult: [1, 2, 4, 4] + num_head_channels: 64 + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + spatial_transformer_attn_type: softmax-xformers + extra_ff_mix_layer: True + use_spatial_context: True + merge_strategy: learned_with_images + video_kernel_size: [3, 1, 1] + + conditioner_config: + target: sgm.modules.GeneralConditioner + params: + emb_models: + - input_key: cond_frames_without_noise + is_trainable: False + target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder + params: + n_cond_frames: 1 + n_copies: 1 + open_clip_embedding_config: + target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder + params: + freeze: True + + - input_key: cond_frames + is_trainable: False + target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder + params: + disable_encoder_autocast: True + n_cond_frames: 1 + n_copies: 1 + is_ae: True + encoder_config: + target: sgm.models.autoencoder.AutoencoderKLModeOnly + params: + embed_dim: 4 + monitor: 
val/rec_loss + ddconfig: + attn_type: vanilla-xformers + double_z: True + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + - input_key: cond_aug + is_trainable: False + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 + + first_stage_config: + target: sgm.models.autoencoder.AutoencodingEngine + params: + loss_config: + target: torch.nn.Identity + regularizer_config: + target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer + encoder_config: + target: torch.nn.Identity + decoder_config: + target: sgm.modules.diffusionmodules.model.Decoder + params: + attn_type: vanilla-xformers + double_z: True + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [ 1, 2, 4, 4 ] + num_res_blocks: 2 + attn_resolutions: [ ] + dropout: 0.0 \ No newline at end of file diff --git a/generative-models/configs/inference/svd.yaml b/generative-models/configs/inference/svd.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2a0819ea77f1ed95dfedb2ab6ccdded4e6414e43 --- /dev/null +++ b/generative-models/configs/inference/svd.yaml @@ -0,0 +1,131 @@ +model: + target: sgm.models.diffusion.DiffusionEngine + params: + scale_factor: 0.18215 + disable_first_stage_autocast: True + + denoiser_config: + target: sgm.modules.diffusionmodules.denoiser.Denoiser + params: + scaling_config: + target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise + + network_config: + target: sgm.modules.diffusionmodules.video_model.VideoUNet + params: + adm_in_channels: 768 + num_classes: sequential + use_checkpoint: True + in_channels: 8 + out_channels: 4 + model_channels: 320 + attention_resolutions: [4, 2, 1] + num_res_blocks: 2 + channel_mult: [1, 2, 4, 4] + num_head_channels: 64 + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 
1024 + spatial_transformer_attn_type: softmax-xformers + extra_ff_mix_layer: True + use_spatial_context: True + merge_strategy: learned_with_images + video_kernel_size: [3, 1, 1] + + conditioner_config: + target: sgm.modules.GeneralConditioner + params: + emb_models: + - is_trainable: False + input_key: cond_frames_without_noise + target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder + params: + n_cond_frames: 1 + n_copies: 1 + open_clip_embedding_config: + target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder + params: + freeze: True + + - input_key: fps_id + is_trainable: False + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 + + - input_key: motion_bucket_id + is_trainable: False + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 + + - input_key: cond_frames + is_trainable: False + target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder + params: + disable_encoder_autocast: True + n_cond_frames: 1 + n_copies: 1 + is_ae: True + encoder_config: + target: sgm.models.autoencoder.AutoencoderKLModeOnly + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + attn_type: vanilla-xformers + double_z: True + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + - input_key: cond_aug + is_trainable: False + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 + + first_stage_config: + target: sgm.models.autoencoder.AutoencodingEngine + params: + loss_config: + target: torch.nn.Identity + regularizer_config: + target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer + encoder_config: + target: sgm.modules.diffusionmodules.model.Encoder + params: + attn_type: vanilla + double_z: True + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 
+ ch_mult: [1, 2, 4, 4] + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + decoder_config: + target: sgm.modules.autoencoding.temporal_ae.VideoDecoder + params: + attn_type: vanilla + double_z: True + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + video_kernel_size: [3, 1, 1] \ No newline at end of file diff --git a/generative-models/configs/inference/svd_image_decoder.yaml b/generative-models/configs/inference/svd_image_decoder.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bb09177ad77a8154c20fcbb2e2fdfc0ac9b6c491 --- /dev/null +++ b/generative-models/configs/inference/svd_image_decoder.yaml @@ -0,0 +1,114 @@ +model: + target: sgm.models.diffusion.DiffusionEngine + params: + scale_factor: 0.18215 + disable_first_stage_autocast: True + + denoiser_config: + target: sgm.modules.diffusionmodules.denoiser.Denoiser + params: + scaling_config: + target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise + + network_config: + target: sgm.modules.diffusionmodules.video_model.VideoUNet + params: + adm_in_channels: 768 + num_classes: sequential + use_checkpoint: True + in_channels: 8 + out_channels: 4 + model_channels: 320 + attention_resolutions: [4, 2, 1] + num_res_blocks: 2 + channel_mult: [1, 2, 4, 4] + num_head_channels: 64 + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + spatial_transformer_attn_type: softmax-xformers + extra_ff_mix_layer: True + use_spatial_context: True + merge_strategy: learned_with_images + video_kernel_size: [3, 1, 1] + + conditioner_config: + target: sgm.modules.GeneralConditioner + params: + emb_models: + - is_trainable: False + input_key: cond_frames_without_noise + target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder + params: + n_cond_frames: 1 + n_copies: 1 + open_clip_embedding_config: + target: 
sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder + params: + freeze: True + + - input_key: fps_id + is_trainable: False + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 + + - input_key: motion_bucket_id + is_trainable: False + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 + + - input_key: cond_frames + is_trainable: False + target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder + params: + disable_encoder_autocast: True + n_cond_frames: 1 + n_copies: 1 + is_ae: True + encoder_config: + target: sgm.models.autoencoder.AutoencoderKLModeOnly + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + attn_type: vanilla-xformers + double_z: True + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + - input_key: cond_aug + is_trainable: False + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 + + first_stage_config: + target: sgm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + attn_type: vanilla-xformers + double_z: True + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity \ No newline at end of file diff --git a/generative-models/model_licenses/LICENSE-SDXL-Turbo b/generative-models/model_licenses/LICENSE-SDXL-Turbo new file mode 100644 index 0000000000000000000000000000000000000000..36b087b2c24ff6d398be6fe60efe026033be2793 --- /dev/null +++ b/generative-models/model_licenses/LICENSE-SDXL-Turbo @@ -0,0 +1,58 @@ +STABILITY AI NON-COMMERCIAL RESEARCH COMMUNITY LICENSE AGREEMENT +Dated: November 28, 2023 + + +By using or distributing any portion or element of the Models, Software, Software Products 
or Derivative Works, you agree to be bound by this Agreement. + + +"Agreement" means this Stable Non-Commercial Research Community License Agreement. + + +“AUP” means the Stability AI Acceptable Use Policy available at https://stability.ai/use-policy, as may be updated from time to time. + + +"Derivative Work(s)” means (a) any derivative work of the Software Products as recognized by U.S. copyright laws and (b) any modifications to a Model, and any other model created which is based on or derived from the Model or the Model’s output. For clarity, Derivative Works do not include the output of any Model. + + +“Documentation” means any specifications, manuals, documentation, and other written information provided by Stability AI related to the Software. + + +"Licensee" or "you" means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity's behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf. + + +“Model(s)" means, collectively, Stability AI’s proprietary models and algorithms, including machine-learning models, trained model weights and other elements of the foregoing, made available under this Agreement. + + +“Non-Commercial Uses” means exercising any of the rights granted herein for the purpose of research or non-commercial purposes. Non-Commercial Uses does not include any production use of the Software Products or any Derivative Works. + + +"Stability AI" or "we" means Stability AI Ltd. and its affiliates. + +"Software" means Stability AI’s proprietary software made available under this Agreement. + + +“Software Products” means the Models, Software and Documentation, individually or in any combination. + + + +1. License Rights and Redistribution. + +a. 
Subject to your compliance with this Agreement, the AUP (which is hereby incorporated herein by reference), and the Documentation, Stability AI grants you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty free and limited license under Stability AI’s intellectual property or other rights owned or controlled by Stability AI embodied in the Software Products to reproduce the Software Products and produce, reproduce, distribute, and create Derivative Works of the Software Products for Non-Commercial Uses only, respectively. + +b. You may not use the Software Products or Derivative Works to enable third parties to use the Software Products or Derivative Works as part of your hosted service or via your APIs, whether you are adding substantial additional functionality thereto or not. Merely distributing the Software Products or Derivative Works for download online without offering any related service (ex. by distributing the Models on HuggingFace) is not a violation of this subsection. If you wish to use the Software Products or any Derivative Works for commercial or production use or you wish to make the Software Products or any Derivative Works available to third parties via your hosted service or your APIs, contact Stability AI at https://stability.ai/contact. + +c. If you distribute or make the Software Products, or any Derivative Works thereof, available to a third party, the Software Products, Derivative Works, or any portion thereof, respectively, will remain subject to this Agreement and you must (i) provide a copy of this Agreement to such third party, and (ii) retain the following attribution notice within a "Notice" text file distributed as a part of such copies: "This Stability AI Model is licensed under the Stability AI Non-Commercial Research Community License, Copyright (c) Stability AI Ltd. 
All Rights Reserved.” If you create a Derivative Work of a Software Product, you may add your own attribution notices to the Notice file included with the Software Product, provided that you clearly indicate which attributions apply to the Software Product and you must state in the NOTICE file that you changed the Software Product and how it was modified. + +2. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE SOFTWARE PRODUCTS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE SOFTWARE PRODUCTS, DERIVATIVE WORKS OR ANY OUTPUT OR RESULTS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE SOFTWARE PRODUCTS, DERIVATIVE WORKS AND ANY OUTPUT AND RESULTS. + +3. Limitation of Liability. IN NO EVENT WILL STABILITY AI OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY DIRECT, INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF STABILITY AI OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING. + +4. Intellectual Property. + +a. No trademark licenses are granted under this Agreement, and in connection with the Software Products or Derivative Works, neither Stability AI nor Licensee may use any name or mark owned by or associated with the other or any of its affiliates, except as required for reasonable and customary use in describing and redistributing the Software Products or Derivative Works. + +b. 
Subject to Stability AI’s ownership of the Software Products and Derivative Works made by or for Stability AI, with respect to any Derivative Works that are made by you, as between you and Stability AI, you are and will be the owner of such Derivative Works. + +c. If you institute litigation or other proceedings against Stability AI (including a cross-claim or counterclaim in a lawsuit) alleging that the Software Products, Derivative Works or associated outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Stability AI from and against any claim by any third party arising out of or related to your use or distribution of the Software Products or Derivative Works in violation of this Agreement. + +5. Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the Software Products and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Stability AI may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of any Software Products or Derivative Works. Sections 2-4 shall survive the termination of this Agreement. diff --git a/generative-models/model_licenses/LICENSE-SDXL0.9 b/generative-models/model_licenses/LICENSE-SDXL0.9 new file mode 100644 index 0000000000000000000000000000000000000000..b01c5a6838f0c01a347aaf83e2fde3b59a36006d --- /dev/null +++ b/generative-models/model_licenses/LICENSE-SDXL0.9 @@ -0,0 +1,75 @@ +SDXL 0.9 RESEARCH LICENSE AGREEMENT +Copyright (c) Stability AI Ltd.
+This License Agreement (as may be amended in accordance with this License Agreement, “License”), between you, or your employer or other entity (if you are entering into this agreement on behalf of your employer or other entity) (“Licensee” or “you”) and Stability AI Ltd. (“Stability AI” or “we”) applies to your use of any computer program, algorithm, source code, object code, or software that is made available by Stability AI under this License (“Software”) and any specifications, manuals, documentation, and other written information provided by Stability AI related to the Software (“Documentation”). +By clicking “I Accept” below or by using the Software, you agree to the terms of this License. If you do not agree to this License, then you do not have any rights to use the Software or Documentation (collectively, the “Software Products”), and you must immediately cease using the Software Products. If you are agreeing to be bound by the terms of this License on behalf of your employer or other entity, you represent and warrant to Stability AI that you have full legal authority to bind your employer or such entity to this License. If you do not have the requisite authority, you may not accept the License or access the Software Products on behalf of your employer or other entity. +1. LICENSE GRANT + +a. Subject to your compliance with the Documentation and Sections 2, 3, and 5, Stability AI grants you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty free and limited license under Stability AI’s copyright interests to reproduce, distribute, and create derivative works of the Software solely for your non-commercial research purposes. The foregoing license is personal to you, and you may not assign or sublicense this License or any other rights or obligations under this License without Stability AI’s prior written consent; any such assignment or sublicense will be void and will automatically and immediately terminate this License. 
+ +b. You may make a reasonable number of copies of the Documentation solely for use in connection with the license to the Software granted above. + +c. The grant of rights expressly set forth in this Section 1 (License Grant) are the complete grant of rights to you in the Software Products, and no other licenses are granted, whether by waiver, estoppel, implication, equity or otherwise. Stability AI and its licensors reserve all rights not expressly granted by this License. + + +2. RESTRICTIONS + +You will not, and will not permit, assist or cause any third party to: + +a. use, modify, copy, reproduce, create derivative works of, or distribute the Software Products (or any derivative works thereof, works incorporating the Software Products, or any data produced by the Software), in whole or in part, for (i) any commercial or production purposes, (ii) military purposes or in the service of nuclear technology, (iii) purposes of surveillance, including any research or development relating to surveillance, (iv) biometric processing, (v) in any manner that infringes, misappropriates, or otherwise violates any third-party rights, or (vi) in any manner that violates any applicable law and violating any privacy or security laws, rules, regulations, directives, or governmental requirements (including the General Data Privacy Regulation (Regulation (EU) 2016/679), the California Consumer Privacy Act, and any and all laws governing the processing of biometric information), as well as all amendments and successor laws to any of the foregoing; + +b. alter or remove copyright and other proprietary notices which appear on or in the Software Products; + +c. utilize any equipment, device, software, or other means to circumvent or remove any security or protection used by Stability AI in connection with the Software, or to circumvent or remove any usage restrictions, or to enable functionality disabled by Stability AI; or + +d. 
offer or impose any terms on the Software Products that alter, restrict, or are inconsistent with the terms of this License. + +e. 1) violate any applicable U.S. and non-U.S. export control and trade sanctions laws (“Export Laws”); 2) directly or indirectly export, re-export, provide, or otherwise transfer Software Products: (a) to any individual, entity, or country prohibited by Export Laws; (b) to anyone on U.S. or non-U.S. government restricted parties lists; or (c) for any purpose prohibited by Export Laws, including nuclear, chemical or biological weapons, or missile technology applications; 3) use or download Software Products if you or they are: (a) located in a comprehensively sanctioned jurisdiction, (b) currently listed on any U.S. or non-U.S. restricted parties list, or (c) for any purpose prohibited by Export Laws; and (4) will not disguise your location through IP proxying or other methods. + + +3. ATTRIBUTION + +Together with any copies of the Software Products (as well as derivative works thereof or works incorporating the Software Products) that you distribute, you must provide (i) a copy of this License, and (ii) the following attribution notice: “SDXL 0.9 is licensed under the SDXL Research License, Copyright (c) Stability AI Ltd. All Rights Reserved.” + + +4. DISCLAIMERS + +THE SOFTWARE PRODUCTS ARE PROVIDED “AS IS” AND “WITH ALL FAULTS” WITH NO WARRANTY OF ANY KIND, EXPRESS OR IMPLIED. STABILITY AI EXPRESSLY DISCLAIMS ALL REPRESENTATIONS AND WARRANTIES, EXPRESS OR IMPLIED, WHETHER BY STATUTE, CUSTOM, USAGE OR OTHERWISE AS TO ANY MATTERS RELATED TO THE SOFTWARE PRODUCTS, INCLUDING BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE, SATISFACTORY QUALITY, OR NON-INFRINGEMENT. STABILITY AI MAKES NO WARRANTIES OR REPRESENTATIONS THAT THE SOFTWARE PRODUCTS WILL BE ERROR FREE OR FREE OF VIRUSES OR OTHER HARMFUL COMPONENTS, OR PRODUCE ANY PARTICULAR RESULTS. + + +5.
LIMITATION OF LIABILITY + +TO THE FULLEST EXTENT PERMITTED BY LAW, IN NO EVENT WILL STABILITY AI BE LIABLE TO YOU (A) UNDER ANY THEORY OF LIABILITY, WHETHER BASED IN CONTRACT, TORT, NEGLIGENCE, STRICT LIABILITY, WARRANTY, OR OTHERWISE UNDER THIS LICENSE, OR (B) FOR ANY INDIRECT, CONSEQUENTIAL, EXEMPLARY, INCIDENTAL, PUNITIVE OR SPECIAL DAMAGES OR LOST PROFITS, EVEN IF STABILITY AI HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. THE SOFTWARE PRODUCTS, THEIR CONSTITUENT COMPONENTS, AND ANY OUTPUT (COLLECTIVELY, “SOFTWARE MATERIALS”) ARE NOT DESIGNED OR INTENDED FOR USE IN ANY APPLICATION OR SITUATION WHERE FAILURE OR FAULT OF THE SOFTWARE MATERIALS COULD REASONABLY BE ANTICIPATED TO LEAD TO SERIOUS INJURY OF ANY PERSON, INCLUDING POTENTIAL DISCRIMINATION OR VIOLATION OF AN INDIVIDUAL’S PRIVACY RIGHTS, OR TO SEVERE PHYSICAL, PROPERTY, OR ENVIRONMENTAL DAMAGE (EACH, A “HIGH-RISK USE”). IF YOU ELECT TO USE ANY OF THE SOFTWARE MATERIALS FOR A HIGH-RISK USE, YOU DO SO AT YOUR OWN RISK. YOU AGREE TO DESIGN AND IMPLEMENT APPROPRIATE DECISION-MAKING AND RISK-MITIGATION PROCEDURES AND POLICIES IN CONNECTION WITH A HIGH-RISK USE SUCH THAT EVEN IF THERE IS A FAILURE OR FAULT IN ANY OF THE SOFTWARE MATERIALS, THE SAFETY OF PERSONS OR PROPERTY AFFECTED BY THE ACTIVITY STAYS AT A LEVEL THAT IS REASONABLE, APPROPRIATE, AND LAWFUL FOR THE FIELD OF THE HIGH-RISK USE. + + +6. 
INDEMNIFICATION + +You will indemnify, defend and hold harmless Stability AI and our subsidiaries and affiliates, and each of our respective shareholders, directors, officers, employees, agents, successors, and assigns (collectively, the “Stability AI Parties”) from and against any losses, liabilities, damages, fines, penalties, and expenses (including reasonable attorneys’ fees) incurred by any Stability AI Party in connection with any claim, demand, allegation, lawsuit, proceeding, or investigation (collectively, “Claims”) arising out of or related to: (a) your access to or use of the Software Products (as well as any results or data generated from such access or use), including any High-Risk Use (defined above); (b) your violation of this License; or (c) your violation, misappropriation or infringement of any rights of another (including intellectual property or other proprietary rights and privacy rights). You will promptly notify the Stability AI Parties of any such Claims, and cooperate with Stability AI Parties in defending such Claims. You will also grant the Stability AI Parties sole control of the defense or settlement, at Stability AI’s sole option, of any Claims. This indemnity is in addition to, and not in lieu of, any other indemnities or remedies set forth in a written agreement between you and Stability AI or the other Stability AI Parties. + + +7. TERMINATION; SURVIVAL + +a. This License will automatically terminate upon any breach by you of the terms of this License. + +b. We may terminate this License, in whole or in part, at any time upon notice (including electronic) to you. + +c. The following sections survive termination of this License: 2 (Restrictions), 3 (Attribution), 4 (Disclaimers), 5 (Limitation of Liability), 6 (Indemnification), 7 (Termination; Survival), 8 (Third Party Materials), 9 (Trademarks), 10 (Applicable Law; Dispute Resolution), and 11 (Miscellaneous). + + +8.
THIRD PARTY MATERIALS + +The Software Products may contain third-party software or other components (including free and open source software) (all of the foregoing, “Third Party Materials”), which are subject to the license terms of the respective third-party licensors. Your dealings or correspondence with third parties and your use of or interaction with any Third Party Materials are solely between you and the third party. Stability AI does not control or endorse, and makes no representations or warranties regarding, any Third Party Materials, and your access to and use of such Third Party Materials are at your own risk. + + +9. TRADEMARKS + +Licensee has not been granted any trademark license as part of this License and may not use any name or mark associated with Stability AI without the prior written permission of Stability AI, except to the extent necessary to make the reference required by the “ATTRIBUTION” section of this Agreement. + + +10. APPLICABLE LAW; DISPUTE RESOLUTION + +This License will be governed and construed under the laws of the State of California without regard to conflicts of law provisions. Any suit or proceeding arising out of or relating to this License will be brought in the federal or state courts, as applicable, in San Mateo County, California, and each party irrevocably submits to the jurisdiction and venue of such courts. + + +11. MISCELLANEOUS + +If any provision or part of a provision of this License is unlawful, void or unenforceable, that provision or part of the provision is deemed severed from this License, and will not affect the validity and enforceability of any remaining provisions. The failure of Stability AI to exercise or enforce any right or provision of this License will not operate as a waiver of such right or provision. This License does not confer any third-party beneficiary rights upon any other person or entity. 
This License, together with the Documentation, contains the entire understanding between you and Stability AI regarding the subject matter of this License, and supersedes all other written or oral agreements and understandings between you and Stability AI regarding such subject matter. No change or addition to any provision of this License will be binding unless it is in writing and signed by an authorized representative of both you and Stability AI. \ No newline at end of file diff --git a/generative-models/model_licenses/LICENSE-SDXL1.0 b/generative-models/model_licenses/LICENSE-SDXL1.0 new file mode 100644 index 0000000000000000000000000000000000000000..741e30e1d609b6c864c5d86f8f827e7a4e82ac82 --- /dev/null +++ b/generative-models/model_licenses/LICENSE-SDXL1.0 @@ -0,0 +1,175 @@ +Copyright (c) 2023 Stability AI CreativeML Open RAIL++-M License dated July 26, 2023 + +Section I: PREAMBLE Multimodal generative models are being widely adopted and used, and +have the potential to transform the way artists, among other individuals, conceive and +benefit from AI or ML technologies as a tool for content creation. Notwithstanding the +current and potential benefits that these artifacts can bring to society at large, there +are also concerns about potential misuses of them, either due to their technical +limitations or ethical considerations. In short, this license strives for both the open +and responsible downstream use of the accompanying model. When it comes to the open +character, we took inspiration from open source permissive licenses regarding the grant +of IP rights. Referring to the downstream responsible use, we added use-based +restrictions not permitting the use of the model in very specific scenarios, in order +for the licensor to be able to enforce the license in case potential misuses of the +Model may occur. At the same time, we strive to promote open and responsible research on +generative models for art and content generation. 
Even though downstream derivative +versions of the model could be released under different licensing terms, the latter will +always have to include - at minimum - the same use-based restrictions as the ones in the +original license (this license). We believe in the intersection between open and +responsible AI development; thus, this agreement aims to strike a balance between both +in order to enable responsible open-science in the field of AI. This CreativeML Open +RAIL++-M License governs the use of the model (and its derivatives) and is informed by +the model card associated with the model. NOW THEREFORE, You and Licensor agree as +follows: Definitions "License" means the terms and conditions for use, reproduction, and +Distribution as defined in this document. "Data" means a collection of information +and/or content extracted from the dataset used with the Model, including to train, +pretrain, or otherwise evaluate the Model. The Data is not licensed under this License. +"Output" means the results of operating a Model as embodied in informational content +resulting therefrom. "Model" means any accompanying machine-learning based assemblies +(including checkpoints), consisting of learnt weights, parameters (including optimizer +states), corresponding to the model architecture as embodied in the Complementary +Material, that have been trained or tuned, in whole or in part on the Data, using the +Complementary Material. "Derivatives of the Model" means all modifications to the Model, +works based on the Model, or any other model which is created or initialized by transfer +of patterns of the weights, parameters, activations or output of the Model, to the other +model, in order to cause the other model to perform similarly to the Model, including - +but not limited to - distillation methods entailing the use of intermediate data +representations or methods based on the generation of synthetic data by the Model for +training the other model. 
"Complementary Material" means the accompanying source code +and scripts used to define, run, load, benchmark or evaluate the Model, and used to +prepare data for training or evaluation, if any. This includes any accompanying +documentation, tutorials, examples, etc, if any. "Distribution" means any transmission, +reproduction, publication or other sharing of the Model or Derivatives of the Model to a +third party, including providing the Model as a hosted service made available by +electronic or other remote means - e.g. API-based or web access. "Licensor" means the +copyright owner or entity authorized by the copyright owner that is granting the +License, including the persons or entities that may have rights in the Model and/or +distributing the Model. "You" (or "Your") means an individual or Legal Entity exercising +permissions granted by this License and/or making use of the Model for whichever purpose +and in any field of use, including usage of the Model in an end-use application - e.g. +chatbot, translator, image generator. "Third Parties" means individuals or legal +entities that are not under common control with Licensor or You. "Contribution" means +any work of authorship, including the original version of the Model and any +modifications or additions to that Model or Derivatives of the Model thereof, that is +intentionally submitted to Licensor for inclusion in the Model by the copyright owner or +by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
+For the purposes of this definition, "submitted" means any form of electronic, verbal, +or written communication sent to the Licensor or its representatives, including but not +limited to communication on electronic mailing lists, source code control systems, and +issue tracking systems that are managed by, or on behalf of, the Licensor for the +purpose of discussing and improving the Model, but excluding communication that is +conspicuously marked or otherwise designated in writing by the copyright owner as "Not a +Contribution." "Contributor" means Licensor and any individual or Legal Entity on behalf +of whom a Contribution has been received by Licensor and subsequently incorporated +within the Model. + +Section II: INTELLECTUAL PROPERTY RIGHTS Both copyright and patent grants apply to the +Model, Derivatives of the Model and Complementary Material. The Model and Derivatives of +the Model are subject to additional terms as described in + +Section III. Grant of Copyright License. Subject to the terms and conditions of this +License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, +no-charge, royalty-free, irrevocable copyright license to reproduce, prepare, publicly +display, publicly perform, sublicense, and distribute the Complementary Material, the +Model, and Derivatives of the Model. Grant of Patent License. Subject to the terms and +conditions of this License and where and as applicable, each Contributor hereby grants +to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable +(except as stated in this paragraph) patent license to make, have made, use, offer to +sell, sell, import, and otherwise transfer the Model and the Complementary Material, +where such license applies only to those patent claims licensable by such Contributor +that are necessarily infringed by their Contribution(s) alone or by combination of their +Contribution(s) with the Model to which such Contribution(s) was submitted. 
If You +institute patent litigation against any entity (including a cross-claim or counterclaim +in a lawsuit) alleging that the Model and/or Complementary Material or a Contribution +incorporated within the Model and/or Complementary Material constitutes direct or +contributory patent infringement, then any patent licenses granted to You under this +License for the Model and/or Work shall terminate as of the date such litigation is +asserted or filed. Section III: CONDITIONS OF USAGE, DISTRIBUTION AND REDISTRIBUTION +Distribution and Redistribution. You may host for Third Party remote access purposes +(e.g. software-as-a-service), reproduce and distribute copies of the Model or +Derivatives of the Model thereof in any medium, with or without modifications, provided +that You meet the following conditions: Use-based restrictions as referenced in +paragraph 5 MUST be included as an enforceable provision by You in any type of legal +agreement (e.g. a license) governing the use and/or distribution of the Model or +Derivatives of the Model, and You shall give notice to subsequent users You Distribute +to, that the Model or Derivatives of the Model are subject to paragraph 5. This +provision does not apply to the use of Complementary Material. You must give any Third +Party recipients of the Model or Derivatives of the Model a copy of this License; You +must cause any modified files to carry prominent notices stating that You changed the +files; You must retain all copyright, patent, trademark, and attribution notices +excluding those notices that do not pertain to any part of the Model, Derivatives of the +Model. You may add Your own copyright statement to Your modifications and may provide +additional or different license terms and conditions - respecting paragraph 4.a. 
- for +use, reproduction, or Distribution of Your modifications, or for any such Derivatives of +the Model as a whole, provided Your use, reproduction, and Distribution of the Model +otherwise complies with the conditions stated in this License. Use-based restrictions. +The restrictions set forth in Attachment A are considered Use-based restrictions. +Therefore You cannot use the Model and the Derivatives of the Model for the specified +restricted uses. You may use the Model subject to this License, including only for +lawful purposes and in accordance with the License. Use may include creating any content +with, finetuning, updating, running, training, evaluating and/or reparametrizing the +Model. You shall require all of Your users who use the Model or a Derivative of the +Model to comply with the terms of this paragraph (paragraph 5). The Output You Generate. +Except as set forth herein, Licensor claims no rights in the Output You generate using +the Model. You are accountable for the Output you generate and its subsequent uses. No +use of the output can contravene any provision as stated in the License. + +Section IV: OTHER PROVISIONS Updates and Runtime Restrictions. To the maximum extent +permitted by law, Licensor reserves the right to restrict (remotely or otherwise) usage +of the Model in violation of this License. Trademarks and related. Nothing in this +License permits You to make use of Licensors’ trademarks, trade names, logos or to +otherwise suggest endorsement or misrepresent the relationship between the parties; and +any rights not expressly granted herein are reserved by the Licensors. Disclaimer of +Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides +the Model and the Complementary Material (and each Contributor provides its +Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +express or implied, including, without limitation, any warranties or conditions of +TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are +solely responsible for determining the appropriateness of using or redistributing the +Model, Derivatives of the Model, and the Complementary Material and assume any risks +associated with Your exercise of permissions under this License. Limitation of +Liability. In no event and under no legal theory, whether in tort (including +negligence), contract, or otherwise, unless required by applicable law (such as +deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, incidental, or +consequential damages of any character arising as a result of this License or out of the +use or inability to use the Model and the Complementary Material (including but not +limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, +or any and all other commercial damages or losses), even if such Contributor has been +advised of the possibility of such damages. Accepting Warranty or Additional Liability. +While redistributing the Model, Derivatives of the Model and the Complementary Material +thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, +indemnity, or other liability obligations and/or rights consistent with this License. 
+However, in accepting such obligations, You may act only on Your own behalf and on Your +sole responsibility, not on behalf of any other Contributor, and only if You agree to +indemnify, defend, and hold each Contributor harmless for any liability incurred by, or +claims asserted against, such Contributor by reason of your accepting any such warranty +or additional liability. If any provision of this License is held to be invalid, illegal +or unenforceable, the remaining provisions shall be unaffected thereby and remain valid +as if such provision had not been set forth herein. + +END OF TERMS AND CONDITIONS + +Attachment A Use Restrictions +You agree not to use the Model or Derivatives of the Model: +In any way that violates any applicable national, federal, state, local or +international law or regulation; For the purpose of exploiting, harming or attempting to +exploit or harm minors in any way; To generate or disseminate verifiably false +information and/or content with the purpose of harming others; To generate or +disseminate personal identifiable information that can be used to harm an individual; To +defame, disparage or otherwise harass others; For fully automated decision making that +adversely impacts an individual’s legal rights or otherwise creates or modifies a +binding, enforceable obligation; For any use intended to or which has the effect of +discriminating against or harming individuals or groups based on online or offline +social behavior or known or predicted personal or personality characteristics; To +exploit any of the vulnerabilities of a specific group of persons based on their age, +social, physical or mental characteristics, in order to materially distort the behavior +of a person pertaining to that group in a manner that causes or is likely to cause that +person or another person physical or psychological harm; For any use intended to or +which has the effect of discriminating against individuals or groups based on legally +protected 
characteristics or categories; To provide medical advice and medical results +interpretation; To generate or disseminate information for the purpose to be used for +administration of justice, law enforcement, immigration or asylum processes, such as +predicting an individual will commit fraud/crime commitment (e.g. by text profiling, +drawing causal relationships between assertions made in documents, indiscriminate and +arbitrarily-targeted use). diff --git a/generative-models/model_licenses/LICENSE-SV3D b/generative-models/model_licenses/LICENSE-SV3D new file mode 100644 index 0000000000000000000000000000000000000000..2a9ddf37f9f47d2c06f6fff41e887f3f9a34d469 --- /dev/null +++ b/generative-models/model_licenses/LICENSE-SV3D @@ -0,0 +1,41 @@ +STABILITY AI NON-COMMERCIAL COMMUNITY LICENSE AGREEMENT +Dated: March 18, 2024 + +"Agreement" means this Stable Non-Commercial Research Community License Agreement. + +“AUP” means the Stability AI Acceptable Use Policy available at https://stability.ai/use-policy, as may be updated from time to time. + +"Derivative Work(s)” means (a) any derivative work of the Software Products as recognized by U.S. copyright laws, (b) any modifications to a Model, and (c) any other model created which is based on or derived from the Model or the Model’s output. For clarity, Derivative Works do not include the output of any Model. + +“Documentation” means any specifications, manuals, documentation, and other written information provided by Stability AI related to the Software. + +"Licensee" or "you" means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity's behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf. 
+ +“Model(s)" means, collectively, Stability AI’s proprietary models and algorithms, including machine-learning models, trained model weights and other elements of the foregoing, made available under this Agreement. + +“Non-Commercial Uses” means exercising any of the rights granted herein for the purpose of research or non-commercial purposes. Non-Commercial Uses does not include any production use of the Software Products or any Derivative Works. + +"Stability AI" or "we" means Stability AI Ltd and its affiliates. + + +"Software" means Stability AI’s proprietary software made available under this Agreement. + +“Software Products” means the Models, Software and Documentation, individually or in any combination. + + + +1. License Rights and Redistribution. +a. Subject to your compliance with this Agreement, the AUP (which is hereby incorporated herein by reference), and the Documentation, Stability AI grants you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty free and limited license under Stability AI’s intellectual property or other rights owned or controlled by Stability AI embodied in the Software Products to use, reproduce, distribute, and create Derivative Works of, the Software Products, in each case for Non-Commercial Uses only. +b. You may not use the Software Products or Derivative Works to enable third parties to use the Software Products or Derivative Works as part of your hosted service or via your APIs, whether you are adding substantial additional functionality thereto or not. Merely distributing the Software Products or Derivative Works for download online without offering any related service (ex. by distributing the Models on HuggingFace) is not a violation of this subsection. 
If you wish to use the Software Products or any Derivative Works for commercial or production use or you wish to make the Software Products or any Derivative Works available to third parties via your hosted service or your APIs, contact Stability AI at https://stability.ai/contact. +c. If you distribute or make the Software Products, or any Derivative Works thereof, available to a third party, the Software Products, Derivative Works, or any portion thereof, respectively, will remain subject to this Agreement and you must (i) provide a copy of this Agreement to such third party, and (ii) retain the following attribution notice within a "Notice" text file distributed as a part of such copies: "This Stability AI Model is licensed under the Stability AI Non-Commercial Research Community License, Copyright (c) Stability AI Ltd. All Rights Reserved.” If you create a Derivative Work of a Software Product, you may add your own attribution notices to the Notice file included with the Software Product, provided that you clearly indicate which attributions apply to the Software Product and you must state in the NOTICE file that you changed the Software Product and how it was modified. +2. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE SOFTWARE PRODUCTS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE SOFTWARE PRODUCTS, DERIVATIVE WORKS OR ANY OUTPUT OR RESULTS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE SOFTWARE PRODUCTS, DERIVATIVE WORKS AND ANY OUTPUT AND RESULTS. +3. Limitation of Liability. 
IN NO EVENT WILL STABILITY AI OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY DIRECT, INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF STABILITY AI OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING. +4. Intellectual Property. +a. No trademark licenses are granted under this Agreement, and in connection with the Software Products or Derivative Works, neither Stability AI nor Licensee may use any name or mark owned by or associated with the other or any of its affiliates, except as required for reasonable and customary use in describing and redistributing the Software Products or Derivative Works. +b. Subject to Stability AI’s ownership of the Software Products and Derivative Works made by or for Stability AI, with respect to any Derivative Works that are made by you, as between you and Stability AI, you are and will be the owner of such Derivative Works. +c. If you institute litigation or other proceedings against Stability AI (including a cross-claim or counterclaim in a lawsuit) alleging that the Software Products, Derivative Works or associated outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Stability AI from and against any claim by any third party arising out of or related to your use or distribution of the Software Products or Derivative Works in violation of this Agreement. +5. Term and Termination. 
The term of this Agreement will commence upon your acceptance of this Agreement or access to the Software Products and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Stability AI may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of any Software Products or Derivative Works. Sections 2-4 shall survive the termination of this Agreement. + +6. Governing Law. This Agreement will be governed by and construed in accordance with the laws of the United States and the State of California without regard to choice of law +principles. + diff --git a/generative-models/model_licenses/LICENSE-SVD b/generative-models/model_licenses/LICENSE-SVD new file mode 100644 index 0000000000000000000000000000000000000000..01e57cf8310b2fdd6676fe203820a19ab86c966b --- /dev/null +++ b/generative-models/model_licenses/LICENSE-SVD @@ -0,0 +1,31 @@ +STABLE VIDEO DIFFUSION NON-COMMERCIAL COMMUNITY LICENSE AGREEMENT +Dated: November 21, 2023 + +“AUP” means the Stability AI Acceptable Use Policy available at https://stability.ai/use-policy, as may be updated from time to time. + +"Agreement" means the terms and conditions for use, reproduction, distribution and modification of the Software Products set forth herein. +"Derivative Work(s)” means (a) any derivative work of the Software Products as recognized by U.S. copyright laws and (b) any modifications to a Model, and any other model created which is based on or derived from the Model or the Model’s output. For clarity, Derivative Works do not include the output of any Model. +“Documentation” means any specifications, manuals, documentation, and other written information provided by Stability AI related to the Software. 
+ +"Licensee" or "you" means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity's behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf. + +"Stability AI" or "we" means Stability AI Ltd. + +"Software" means, collectively, Stability AI’s proprietary models and algorithms, including machine-learning models, trained model weights and other elements of the foregoing, made available under this Agreement. + +“Software Products” means Software and Documentation. + +By using or distributing any portion or element of the Software Products, you agree to be bound by this Agreement. + + + +1. License Rights and Redistribution. +a. Subject to your compliance with this Agreement, the AUP (which is hereby incorporated herein by reference), and the Documentation, Stability AI grants you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty free and limited license under Stability AI’s intellectual property or other rights owned by Stability AI embodied in the Software Products to reproduce, distribute, and create Derivative Works of the Software Products for purposes other than commercial or production use. +b. If you distribute or make the Software Products, or any Derivative Works thereof, available to a third party, the Software Products, Derivative Works, or any portion thereof, respectively, will remain subject to this Agreement and you must (i) provide a copy of this Agreement to such third party, and (ii) retain the following attribution notice within a "Notice" text file distributed as a part of such copies: "Stable Video Diffusion is licensed under the Stable Video Diffusion Research License, Copyright (c) Stability AI Ltd. 
All Rights Reserved.” If you create a Derivative Work of a Software Product, you may add your own attribution notices to the Notice file included with the Software Product, provided that you clearly indicate which attributions apply to the Software Product and you must state in the NOTICE file that you changed the Software Product and how it was modified. +2. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE SOFTWARE PRODUCTS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE SOFTWARE PRODUCTS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE SOFTWARE PRODUCTS AND ANY OUTPUT AND RESULTS. +3. Limitation of Liability. IN NO EVENT WILL STABILITY AI OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF STABILITY AI OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING. +4. Intellectual Property. +a. No trademark licenses are granted under this Agreement, and in connection with the Software Products, neither Stability AI nor Licensee may use any name or mark owned by or associated with the other or any of its affiliates, except as required for reasonable and customary use in describing and redistributing the Software Products. +b. Subject to Stability AI’s ownership of the Software Products and Derivative Works made by or for Stability AI, with respect to any Derivative Works that are made by you, as between you and Stability AI, you are and will be the owner of such Derivative Works. 
+c. If you institute litigation or other proceedings against Stability AI (including a cross-claim or counterclaim in a lawsuit) alleging that the Software Products or associated outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Stability AI from and against any claim by any third party arising out of or related to your use or distribution of the Software Products in violation of this Agreement. +5. Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the Software Products and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Stability AI may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of the Software Products. Sections 2-4 shall survive the termination of this Agreement. 
diff --git a/generative-models/outputs/gallery/gallery_1/0025_A_minimalist_chameleon_resting_on_a_simple_branch.mp4 b/generative-models/outputs/gallery/gallery_1/0025_A_minimalist_chameleon_resting_on_a_simple_branch.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..fbeec9439409bd6ccc76a81f433c4c82618ef6c3 Binary files /dev/null and b/generative-models/outputs/gallery/gallery_1/0025_A_minimalist_chameleon_resting_on_a_simple_branch.mp4 differ diff --git a/generative-models/outputs/gallery/gallery_1/0048_A_softly_rounded_giraffe_calf_standing_carefully.mp4 b/generative-models/outputs/gallery/gallery_1/0048_A_softly_rounded_giraffe_calf_standing_carefully.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..cfde44f4f50d52f974463a422b66901a9fc9429e Binary files /dev/null and b/generative-models/outputs/gallery/gallery_1/0048_A_softly_rounded_giraffe_calf_standing_carefully.mp4 differ diff --git a/generative-models/outputs/gallery/gallery_1/0052_A_softly_textured_alpaca_with_a_clean,_smooth_fleece.mp4 b/generative-models/outputs/gallery/gallery_1/0052_A_softly_textured_alpaca_with_a_clean,_smooth_fleece.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..4fea694279f51c97e315a7b0f2f150a0cd084a77 Binary files /dev/null and b/generative-models/outputs/gallery/gallery_1/0052_A_softly_textured_alpaca_with_a_clean,_smooth_fleece.mp4 differ diff --git a/generative-models/outputs/gallery/gallery_1/0069_A_simple_stylized_dolphin_leaping_in_a_smooth_arc.mp4 b/generative-models/outputs/gallery/gallery_1/0069_A_simple_stylized_dolphin_leaping_in_a_smooth_arc.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..448c28bb13c66629f430ae4d8c70f03cef3f1112 Binary files /dev/null and b/generative-models/outputs/gallery/gallery_1/0069_A_simple_stylized_dolphin_leaping_in_a_smooth_arc.mp4 differ diff --git a/generative-models/outputs/gallery/gallery_1/0075_A_minimalist_chameleon_perched_quietly.mp4 
b/generative-models/outputs/gallery/gallery_1/0075_A_minimalist_chameleon_perched_quietly.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..cdd2f6a8e915e2d6815865f0f6d2aea933519cc2 Binary files /dev/null and b/generative-models/outputs/gallery/gallery_1/0075_A_minimalist_chameleon_perched_quietly.mp4 differ diff --git a/generative-models/outputs/gallery/gallery_1/0096_A_smoothly_rounded_forest_spirit_standing_quietly.mp4 b/generative-models/outputs/gallery/gallery_1/0096_A_smoothly_rounded_forest_spirit_standing_quietly.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..0f6d9a47dff049aedba279b359cf90210db96c79 Binary files /dev/null and b/generative-models/outputs/gallery/gallery_1/0096_A_smoothly_rounded_forest_spirit_standing_quietly.mp4 differ diff --git a/generative-models/outputs/gallery/gallery_1/0125_A_small_minimalist_toy_rocket_standing_straight.mp4 b/generative-models/outputs/gallery/gallery_1/0125_A_small_minimalist_toy_rocket_standing_straight.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..24afd66ecaa4385588998bf482599d96294f9fdd Binary files /dev/null and b/generative-models/outputs/gallery/gallery_1/0125_A_small_minimalist_toy_rocket_standing_straight.mp4 differ diff --git a/generative-models/outputs/gallery/gallery_2/0013_A_sleeping_little_fox,_simple_3D_stylized,_white_background,_minimalism.mp4 b/generative-models/outputs/gallery/gallery_2/0013_A_sleeping_little_fox,_simple_3D_stylized,_white_background,_minimalism.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..c1f1acd9c031c6bc9415e6a1a3b05d6932a8ad29 Binary files /dev/null and b/generative-models/outputs/gallery/gallery_2/0013_A_sleeping_little_fox,_simple_3D_stylized,_white_background,_minimalism.mp4 differ diff --git a/generative-models/outputs/gallery/gallery_2/0016_A_shy_fawn_with_white_spots,_simple_3D_stylized,_white_background,_minimalism..mp4 
b/generative-models/outputs/gallery/gallery_2/0016_A_shy_fawn_with_white_spots,_simple_3D_stylized,_white_background,_minimalism..mp4 new file mode 100644 index 0000000000000000000000000000000000000000..761722a1fbefda1a3b32e641a7fca1f7a9ea3dbe Binary files /dev/null and b/generative-models/outputs/gallery/gallery_2/0016_A_shy_fawn_with_white_spots,_simple_3D_stylized,_white_background,_minimalism..mp4 differ diff --git a/generative-models/outputs/gallery/gallery_2/0025_A_minimalist_chameleon_resting_on_a_simple_branch.mp4 b/generative-models/outputs/gallery/gallery_2/0025_A_minimalist_chameleon_resting_on_a_simple_branch.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..fbeec9439409bd6ccc76a81f433c4c82618ef6c3 Binary files /dev/null and b/generative-models/outputs/gallery/gallery_2/0025_A_minimalist_chameleon_resting_on_a_simple_branch.mp4 differ diff --git a/generative-models/outputs/gallery/gallery_2/0035_A_chubby_sunny-side-up_egg,_simple_3D_stylized,_white_background,_minimalism..mp4 b/generative-models/outputs/gallery/gallery_2/0035_A_chubby_sunny-side-up_egg,_simple_3D_stylized,_white_background,_minimalism..mp4 new file mode 100644 index 0000000000000000000000000000000000000000..b9ee29e0894161d1fcfba226f82e436e7b586f2e Binary files /dev/null and b/generative-models/outputs/gallery/gallery_2/0035_A_chubby_sunny-side-up_egg,_simple_3D_stylized,_white_background,_minimalism..mp4 differ diff --git a/generative-models/outputs/gallery/gallery_2/0048_A_softly_rounded_giraffe_calf_standing_carefully.mp4 b/generative-models/outputs/gallery/gallery_2/0048_A_softly_rounded_giraffe_calf_standing_carefully.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..cfde44f4f50d52f974463a422b66901a9fc9429e Binary files /dev/null and b/generative-models/outputs/gallery/gallery_2/0048_A_softly_rounded_giraffe_calf_standing_carefully.mp4 differ diff --git 
a/generative-models/outputs/gallery/gallery_2/0050_A_small_bubbling_teapot,_simple_3D_stylized,_white_background,_minimalism..mp4 b/generative-models/outputs/gallery/gallery_2/0050_A_small_bubbling_teapot,_simple_3D_stylized,_white_background,_minimalism..mp4 new file mode 100644 index 0000000000000000000000000000000000000000..ac5ee5071bde277d27dcbb23ab898e70c8f6f1e6 Binary files /dev/null and b/generative-models/outputs/gallery/gallery_2/0050_A_small_bubbling_teapot,_simple_3D_stylized,_white_background,_minimalism..mp4 differ diff --git a/generative-models/outputs/gallery/gallery_2/0052_A_softly_textured_alpaca_with_a_clean,_smooth_fleece.mp4 b/generative-models/outputs/gallery/gallery_2/0052_A_softly_textured_alpaca_with_a_clean,_smooth_fleece.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..4fea694279f51c97e315a7b0f2f150a0cd084a77 Binary files /dev/null and b/generative-models/outputs/gallery/gallery_2/0052_A_softly_textured_alpaca_with_a_clean,_smooth_fleece.mp4 differ diff --git a/generative-models/outputs/gallery/gallery_2/0064_A_tiny_rainbow_from_a_cloud,_simple_3D_stylized,_white_background,_minimalism..mp4 b/generative-models/outputs/gallery/gallery_2/0064_A_tiny_rainbow_from_a_cloud,_simple_3D_stylized,_white_background,_minimalism..mp4 new file mode 100644 index 0000000000000000000000000000000000000000..bfd6329daba10df5674a7b2bba196250d4df7a4e Binary files /dev/null and b/generative-models/outputs/gallery/gallery_2/0064_A_tiny_rainbow_from_a_cloud,_simple_3D_stylized,_white_background,_minimalism..mp4 differ diff --git a/generative-models/outputs/gallery/gallery_2/0069_A_simple_stylized_dolphin_leaping_in_a_smooth_arc.mp4 b/generative-models/outputs/gallery/gallery_2/0069_A_simple_stylized_dolphin_leaping_in_a_smooth_arc.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..448c28bb13c66629f430ae4d8c70f03cef3f1112 Binary files /dev/null and 
b/generative-models/outputs/gallery/gallery_2/0069_A_simple_stylized_dolphin_leaping_in_a_smooth_arc.mp4 differ diff --git a/generative-models/outputs/gallery/gallery_2/0075_A_minimalist_chameleon_perched_quietly.mp4 b/generative-models/outputs/gallery/gallery_2/0075_A_minimalist_chameleon_perched_quietly.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..cdd2f6a8e915e2d6815865f0f6d2aea933519cc2 Binary files /dev/null and b/generative-models/outputs/gallery/gallery_2/0075_A_minimalist_chameleon_perched_quietly.mp4 differ diff --git a/generative-models/outputs/gallery/gallery_2/0096_A_smoothly_rounded_forest_spirit_standing_quietly.mp4 b/generative-models/outputs/gallery/gallery_2/0096_A_smoothly_rounded_forest_spirit_standing_quietly.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..0f6d9a47dff049aedba279b359cf90210db96c79 Binary files /dev/null and b/generative-models/outputs/gallery/gallery_2/0096_A_smoothly_rounded_forest_spirit_standing_quietly.mp4 differ diff --git a/generative-models/outputs/gallery/gallery_2/0125_A_small_minimalist_toy_rocket_standing_straight.mp4 b/generative-models/outputs/gallery/gallery_2/0125_A_small_minimalist_toy_rocket_standing_straight.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..24afd66ecaa4385588998bf482599d96294f9fdd Binary files /dev/null and b/generative-models/outputs/gallery/gallery_2/0125_A_small_minimalist_toy_rocket_standing_straight.mp4 differ diff --git a/generative-models/outputs/gallery/gallery_3/0054_A_cute_minimalist_bat,_single_object,_simple_smooth_design,_pure_white_background.mp4 b/generative-models/outputs/gallery/gallery_3/0054_A_cute_minimalist_bat,_single_object,_simple_smooth_design,_pure_white_background.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..7416b9c2bc81ba1a740e6d06a29a4265506c54d0 Binary files /dev/null and 
b/generative-models/outputs/gallery/gallery_3/0054_A_cute_minimalist_bat,_single_object,_simple_smooth_design,_pure_white_background.mp4 differ diff --git a/generative-models/outputs/gallery/gallery_3/0057_A_cute_minimalist_chameleon,_single_object,_simple_smooth_design,_pure_white_background.mp4 b/generative-models/outputs/gallery/gallery_3/0057_A_cute_minimalist_chameleon,_single_object,_simple_smooth_design,_pure_white_background.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..a706c1f1bf350c9a728e1ee8d3de3593afddf6c0 Binary files /dev/null and b/generative-models/outputs/gallery/gallery_3/0057_A_cute_minimalist_chameleon,_single_object,_simple_smooth_design,_pure_white_background.mp4 differ diff --git a/generative-models/outputs/gallery/gallery_3/0058_A_cute_minimalist_gecko_lizard,_single_object,_simple_smooth_design,_pure_white_background.mp4 b/generative-models/outputs/gallery/gallery_3/0058_A_cute_minimalist_gecko_lizard,_single_object,_simple_smooth_design,_pure_white_background.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..881ec8607954cb719d1ce73ba535e248151ddc7d Binary files /dev/null and b/generative-models/outputs/gallery/gallery_3/0058_A_cute_minimalist_gecko_lizard,_single_object,_simple_smooth_design,_pure_white_background.mp4 differ diff --git a/generative-models/outputs/gallery/gallery_3/0099_A_cute_minimalist_snowman_character,_single_object,_simple_smooth_design,_pure_white_background.mp4 b/generative-models/outputs/gallery/gallery_3/0099_A_cute_minimalist_snowman_character,_single_object,_simple_smooth_design,_pure_white_background.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..3bfa56b4e45e0d9d07d5319328bab4d0d9a05faa Binary files /dev/null and b/generative-models/outputs/gallery/gallery_3/0099_A_cute_minimalist_snowman_character,_single_object,_simple_smooth_design,_pure_white_background.mp4 differ diff --git 
a/generative-models/outputs/gallery/gallery_4/0002_A_cute_simple_little_bunny,_sitting_gently,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 b/generative-models/outputs/gallery/gallery_4/0002_A_cute_simple_little_bunny,_sitting_gently,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..386ef3d4e1e9368cea452508001c34afc451fbe4 Binary files /dev/null and b/generative-models/outputs/gallery/gallery_4/0002_A_cute_simple_little_bunny,_sitting_gently,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 differ diff --git a/generative-models/outputs/gallery/gallery_4/0009_A_cute_simple_gentle_turtle,_standing_still,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 b/generative-models/outputs/gallery/gallery_4/0009_A_cute_simple_gentle_turtle,_standing_still,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..8cfba600ff17b5d4a24c7a8254133de94c16b4ba Binary files /dev/null and b/generative-models/outputs/gallery/gallery_4/0009_A_cute_simple_gentle_turtle,_standing_still,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 differ diff --git a/generative-models/outputs/gallery/gallery_4/0015_A_cute_simple_curious_monkey,_hanging_by_one_arm,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 
b/generative-models/outputs/gallery/gallery_4/0015_A_cute_simple_curious_monkey,_hanging_by_one_arm,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..7951e276b40123e3e24e3236046042a7f25e2b9e Binary files /dev/null and b/generative-models/outputs/gallery/gallery_4/0015_A_cute_simple_curious_monkey,_hanging_by_one_arm,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 differ diff --git a/generative-models/outputs/gallery/gallery_4/0019_A_cute_simple_gentle_giraffe_calf,_standing_still,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 b/generative-models/outputs/gallery/gallery_4/0019_A_cute_simple_gentle_giraffe_calf,_standing_still,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..bc1890c7f5a45a4cca4a4e1abb1527ed6d917251 Binary files /dev/null and b/generative-models/outputs/gallery/gallery_4/0019_A_cute_simple_gentle_giraffe_calf,_standing_still,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 differ diff --git a/generative-models/outputs/gallery/gallery_4/0024_A_cute_simple_little_dolphin,_jumping_out_of_water,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 
b/generative-models/outputs/gallery/gallery_4/0024_A_cute_simple_little_dolphin,_jumping_out_of_water,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..303d972c3f2ee93650a8d558fdff9a32ba7ac3d0 Binary files /dev/null and b/generative-models/outputs/gallery/gallery_4/0024_A_cute_simple_little_dolphin,_jumping_out_of_water,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 differ diff --git a/generative-models/outputs/gallery/gallery_4/0034_A_cute_simple_wind-up_toy_mouse,_moving_forward,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 b/generative-models/outputs/gallery/gallery_4/0034_A_cute_simple_wind-up_toy_mouse,_moving_forward,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..0a4c07443d52013f572ac1cd5bacd7a5705fc6bc Binary files /dev/null and b/generative-models/outputs/gallery/gallery_4/0034_A_cute_simple_wind-up_toy_mouse,_moving_forward,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 differ diff --git a/generative-models/outputs/gallery/gallery_4/0040_A_cute_simple_tiny_bat,_flying,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 b/generative-models/outputs/gallery/gallery_4/0040_A_cute_simple_tiny_bat,_flying,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 new file mode 100644 index 
0000000000000000000000000000000000000000..f71cd89b3ed001c12e946f02d4b354859fd728f7 Binary files /dev/null and b/generative-models/outputs/gallery/gallery_4/0040_A_cute_simple_tiny_bat,_flying,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 differ diff --git a/generative-models/outputs/gallery/gallery_4/0049_A_cute_simple_sleepy_sloth,_hanging_slowly,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 b/generative-models/outputs/gallery/gallery_4/0049_A_cute_simple_sleepy_sloth,_hanging_slowly,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..04ad43ab9e2ecd18f2990543e9715f7af13875c7 Binary files /dev/null and b/generative-models/outputs/gallery/gallery_4/0049_A_cute_simple_sleepy_sloth,_hanging_slowly,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 differ diff --git a/generative-models/outputs/gallery/gallery_4/0051_A_cute_simple_fluffy_alpaca,_tilting_head,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 b/generative-models/outputs/gallery/gallery_4/0051_A_cute_simple_fluffy_alpaca,_tilting_head,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..bc1aa6ee784c2ee06b485f34c5e35db704fc5343 Binary files /dev/null and 
b/generative-models/outputs/gallery/gallery_4/0051_A_cute_simple_fluffy_alpaca,_tilting_head,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 differ diff --git a/generative-models/outputs/gallery/gallery_4/0055_A_cute_simple_tiny_platypus,_swimming_gesture,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 b/generative-models/outputs/gallery/gallery_4/0055_A_cute_simple_tiny_platypus,_swimming_gesture,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..5d98e357d801fd37da6d7e6ab4044a42b373737e Binary files /dev/null and b/generative-models/outputs/gallery/gallery_4/0055_A_cute_simple_tiny_platypus,_swimming_gesture,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 differ diff --git a/generative-models/outputs/gallery/gallery_4/0075_A_cute_simple_curious_meerkat,_standing_up_straight,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 b/generative-models/outputs/gallery/gallery_4/0075_A_cute_simple_curious_meerkat,_standing_up_straight,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..a85484af660edd086ab65540dfebeb7d813e8b17 Binary files /dev/null and b/generative-models/outputs/gallery/gallery_4/0075_A_cute_simple_curious_meerkat,_standing_up_straight,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 
differ diff --git a/generative-models/outputs/gallery/gallery_4/0079_A_cute_simple_gentle_deer_calf,_standing_still,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 b/generative-models/outputs/gallery/gallery_4/0079_A_cute_simple_gentle_deer_calf,_standing_still,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..9f3aea6a3ad47a4c834f7485086e00880de29cbe Binary files /dev/null and b/generative-models/outputs/gallery/gallery_4/0079_A_cute_simple_gentle_deer_calf,_standing_still,_clean_white_background,_isolated_on_white,_minimalist_aesthetic,_smooth_textures,_clear_outline,_bright_studio_lighting,_high_resolution.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_diag.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_diag.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..3e9ee43205cf52db91403100c9f49681a71df79e Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_diag.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_process_input.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_process_input.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..9e14420ffba4c6ec2b439f2ce5dc047fdf92c106 Binary files /dev/null and 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_process_input.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_t000.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_t000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..d121713791c15a8c40c2ae7829f6779925eca8dc Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_t000.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_v001.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_v001.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..30ebf4a69b2a3b3f3d6589241ced60b3521547c5 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_v001.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_v002.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_v002.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..fafd3fe0cbaaa7c0534eef01bd8caff4fb3ecb90 Binary files /dev/null and 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_v002.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_v003.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_v003.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..448d1e9f1b938011b08dcf0a5144a350bff3166e Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_v003.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_v004.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_v004.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..3e432591cf7b76f1137d643d422ccc6fcbcd3876 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_v004.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_v005.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_v005.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..1181f66a635f78de84a7bd7c70bccebb0099f527 Binary files /dev/null and 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_v005.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_v006.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_v006.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..e02e786dc27b1a07269edb6fad0499109bf784ae Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_v006.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_v007.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_v007.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..9216894e7f0b8ca5e102a0e0f2444220312448c6 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_v007.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_v008.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_v008.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..ac23b23a055f2c82db9d6bf1a2f974984962c143 Binary files /dev/null and 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/000000_v008.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/_done b/generative-models/outputs/outputs_4d/outputs_bad_5/0006_A_cute_minimalist_moth__single_object__simple_smooth_design__pure_white_background/_done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_diag.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_diag.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..27324cb7e9dec6975330f6316338e3411e11d9cf Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_diag.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_process_input.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_process_input.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..a63e82362c1650e92e1b4228776e37beb2f0421b Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_process_input.mp4 differ diff --git 
a/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_t000.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_t000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..3572f8bb894afddedf157973af60458332e7e942 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_t000.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_v001.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_v001.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..94537fd8098e4805f33172cd58c60f168f6920ac Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_v001.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_v002.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_v002.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..840238134d3fb723459545edd3648b4c4e3cccdd Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_v002.mp4 differ diff --git 
a/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_v003.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_v003.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..4f14824dec80af248abd1fccba99d9a98c61be88 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_v003.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_v004.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_v004.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..a0bca03b9b279d46854330beba7ac54032e050cc Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_v004.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_v005.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_v005.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..eb55fb9bf9da99bf769c19d48213a1de33f00150 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_v005.mp4 differ diff --git 
a/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_v006.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_v006.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..af69f27c15f17ef721a73a37f28e7fd9407a91b7 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_v006.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_v007.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_v007.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..4d3e77e5ac70618014cc59b418ca687faa65249b Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_v007.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_v008.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_v008.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..af97027bf8e37f2b9ff878bd07b00b200342477a Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/000000_v008.mp4 differ diff --git 
a/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/_done b/generative-models/outputs/outputs_4d/outputs_bad_5/0007_A_cute_minimalist_toucan_bird__single_object__simple_smooth_design__pure_white_background/_done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0009_A_softly_rounded_hippo_with_a_gently_open_mouth/000000_v004.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0009_A_softly_rounded_hippo_with_a_gently_open_mouth/000000_v004.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..fa82142f21bdbd56af44b00967b50c9f13710ee8 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0009_A_softly_rounded_hippo_with_a_gently_open_mouth/000000_v004.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_diag.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_diag.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..2843e0fdd419a769c1e1091dd0d96fb6e8c845e7 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_diag.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_process_input.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_process_input.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..557b6515621e8e8774af9cca8562c1dc1b997b87 Binary files /dev/null and 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_process_input.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_t000.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_t000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..95a5e4f7f73937f484f70739edf23b167006a19a Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_t000.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_v001.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_v001.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..5637ce662f32a2f3805cbe2933877faa55f1dbb5 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_v001.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_v002.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_v002.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..a9631b3aaf8f67df727d9333785cf355d6e76bfd Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_v002.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_v003.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_v003.mp4 new file mode 100644 index 
0000000000000000000000000000000000000000..4388bb3bfcb93279e445d18eef996e51ee437e30 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_v003.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_v004.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_v004.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..024a3016f389c10393168297902b6e4104e101db Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_v004.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_v005.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_v005.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..ca5acd278277bca4e9d5c756ee697ed7caf8e6bd Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_v005.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_v006.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_v006.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..ac76e51cca540666e6d43a2f5901fdf8ffc2e65f Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_v006.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_v007.mp4 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_v007.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..1bf439c68286676a479fe72550929817fa02c7f0 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_v007.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_v008.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_v008.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..a4ef1f134d91640e5d00fdae705d26f5b3899391 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/000000_v008.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/_done b/generative-models/outputs/outputs_4d/outputs_bad_5/0010_A_minimalist_goldfish_with_smooth__flowing_fins/_done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_diag.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_diag.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..f0253a6f91a68a963516676277287e3e030f50b0 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_diag.mp4 differ diff --git 
a/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_process_input.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_process_input.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..c6495fe3ef3f6b31da0c6292491a4aebce3d4237 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_process_input.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_t000.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_t000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..9f701347e82ee718036b0d9fd6d22fce6fdd9f74 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_t000.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_v001.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_v001.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..738989aa487cc15ce37e4178d45715dc97f290d9 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_v001.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_v002.mp4 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_v002.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..e665dbd51669096566a181adede166dc1eaec5f5 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_v002.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_v003.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_v003.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..c6b528ddf45896f685d7d19a500b670b024c41cd Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_v003.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_v004.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_v004.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..be839062e1eebdffde4b5b7a768d360b1059d4f1 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_v004.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_v005.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_v005.mp4 new file mode 100644 index 
0000000000000000000000000000000000000000..8d39c5ef212b313224ef3ea4bd5fef680a4cd57f Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_v005.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_v006.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_v006.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..b9aa463c6a47466db0550fcae0c1d6877be8b209 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_v006.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_v007.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_v007.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..08bad36ff47437803ff27ba51799432b796960f8 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_v007.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_v008.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_v008.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..6a6e953ddf63fdece5f5d9a6cd113fbacfa28b57 Binary files /dev/null and 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./000000_v008.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./_done b/generative-models/outputs/outputs_4d/outputs_bad_5/0015_A_cute_round_penguin__simple_3D_stylized__white_background__minimalism./_done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_diag.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_diag.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..bb05b1ac82e329fae903a9b514367dcba456737f Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_diag.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_process_input.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_process_input.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..7d2f64c630144aa5338ce380065246cc97769387 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_process_input.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_t000.mp4 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_t000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..b1e9d1ffd58591c79af0f5e320cf5680212f60ef Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_t000.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_v001.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_v001.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..c00df26584d2a11221ae7b0061f1396de311345b Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_v001.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_v002.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_v002.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..0b4c6ee25d8e7eb6292304db54b68cc818539589 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_v002.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_v003.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_v003.mp4 new 
file mode 100644 index 0000000000000000000000000000000000000000..ed502898dea32b6d388638098bdcea31c11115cb Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_v003.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_v004.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_v004.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..8849942537cc178198d3bb734d9eaea77a780c1c Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_v004.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_v005.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_v005.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..6a50e520f88ff48c57c70c838717e3e93da38695 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_v005.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_v006.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_v006.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..8f17fa9d354d3ba38a21dd5bbce6d66d23e44505 Binary files /dev/null and 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_v006.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_v007.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_v007.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..a463a3e62f4fce5634d42355a231ec25e60e9767 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_v007.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_v008.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_v008.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..801ece9f173e2041edf9132f94baada27437fba2 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./000000_v008.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./_done b/generative-models/outputs/outputs_4d/outputs_bad_5/0017_A_small_yellow_bird_singing__simple_3D_stylized__white_background__minimalism./_done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_diag.mp4 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_diag.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..94ee077478e40e4311e454ec6e675a134a6a417e Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_diag.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_process_input.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_process_input.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..0b31d25f7ee73e4eab3b9ec96f3b9260ac09fd20 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_process_input.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_t000.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_t000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..f2a5029c426d62efe20e4b5f974f268c7a904bab Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_t000.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_v001.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_v001.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..2e7564ae9898beae7de8692ee90265b3155ce6a2 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_v001.mp4 differ diff --git 
a/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_v002.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_v002.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..1c397370dca679a80e3dcaa0003b3d20f0cbb229 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_v002.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_v003.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_v003.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..5bd09382ffd9f064af4ee904ec0aa6c238a4aa47 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_v003.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_v004.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_v004.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..728ed86570daf09551e2f151d753cbeb4f83712d Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_v004.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_v005.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_v005.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..aaccd8adc5e7bd4bb21348bce28291aee5081a82 Binary files /dev/null and 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_v005.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_v006.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_v006.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..739cf87c022f2708155e09aaa9f259bcee5fc63d Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_v006.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_v007.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_v007.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..e00241c15ef0f5f1f280868eaeafdcec866a39a4 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_v007.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_v008.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_v008.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..4501e8cdf2b16b68f7b1178654d3d545b529c80d Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/000000_v008.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/_done b/generative-models/outputs/outputs_4d/outputs_bad_5/0018_A_simple_stylized_blossom_spirit_moving_gently/_done new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_diag.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_diag.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..289ecbdefe74e39e9d61773b24ac718639e1d82f Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_diag.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_process_input.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_process_input.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..80b2b27952e43b5a9eb5387b8da860e4ca98f331 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_process_input.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_t000.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_t000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..083c8cb676a270eedf67dc2d7dd642893fe8db27 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_t000.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_v001.mp4 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_v001.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..bd6fb24de4191d8bb9eae41db9bf315759749e54 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_v001.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_v002.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_v002.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..37a086f3bd48a3b8de859cf8d8dde34b86acd81e Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_v002.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_v003.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_v003.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..92465c609edecfa6a693d2b58b7a7ab482d38196 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_v003.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_v004.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_v004.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..56822e067e501edd449b99f546b3ab20e283afe9 Binary files /dev/null and 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_v004.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_v005.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_v005.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..f532de97052413e030f95bdc36bedb3ecdd25f73 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_v005.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_v006.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_v006.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..6643ecf5e4c62391556c80cf5dca7caf6b57e45f Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_v006.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_v007.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_v007.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..a8009ad64732f54c7c495a6ea98a7fc8247cca33 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_v007.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_v008.mp4 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_v008.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..dae34bb0d86e9161bbbbdd49a6cae728c358d07d Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/000000_v008.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/_done b/generative-models/outputs/outputs_4d/outputs_bad_5/0025_A_clean__minimalist_stylized_robot_giving_a_subtle_wave/_done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_diag.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_diag.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..1fc3eaf097a7b29c5e794f3e2d1b20891c03f2d9 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_diag.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_process_input.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_process_input.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..4f9cd34dd06a1456ade96ac0b587b1d2c8f3587a Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_process_input.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_t000.mp4 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_t000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..c51cb9233165387d30c342ee805344575bb417cc Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_t000.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_v001.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_v001.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..9896ebdc0220d06540c2c95452f7ebdd58158c41 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_v001.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_v002.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_v002.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..08ea436b61565bff9aebd0f847d55ebb568789cc Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_v002.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_v003.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_v003.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..1de35c2c38962df025a1abb339ed9a4d46e0f76d Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_v003.mp4 differ diff --git 
a/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_v004.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_v004.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..7fe906ef15ae73332370149ea1576b7f9b361b06 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_v004.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_v005.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_v005.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..019802ad2d7c14c158983dbbb98ed9c4c4c901cc Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_v005.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_v006.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_v006.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..6c7cbb9966292d14bea3f700dd5329add7e09bcf Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_v006.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_v007.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_v007.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..34a0c6b0083e4fd6b3224917e5462d9fbb6e6fcc Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_v007.mp4 
differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_v008.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_v008.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..b293c19d80aa6b9927923d49ed2bb92ec69421e6 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/000000_v008.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/_done b/generative-models/outputs/outputs_4d/outputs_bad_5/0030_A_smoothly_rounded_chick_looking_downward/_done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_diag.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_diag.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..e720c435ddbae1e4bf9db52d8b341c6ff2c62ca1 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_diag.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_process_input.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_process_input.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..8ee8e8e9393af53c36383e11da1a4c6684970504 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_process_input.mp4 differ diff --git 
a/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_t000.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_t000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..52be71d83783ea2dc5994ea2e4b354025c1d08e4 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_t000.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_v001.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_v001.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..a0facf8e0f3371f2673b114e7015d26064e356cb Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_v001.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_v002.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_v002.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..7768237b385dea7ab5a73456615869f8dde28d25 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_v002.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_v003.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_v003.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..ba4d752c06e3636250cf84d5e70b21d3578acb6d Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_v003.mp4 
differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_v004.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_v004.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..a7985bba71cddc785be38ccf4984ebe55d9220f1 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_v004.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_v005.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_v005.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..3d0301cb2ac6f7872a3fafc50ded65e40319d39a Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_v005.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_v006.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_v006.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..97fa68e15d47eb75aaa02fada66facedea7c25e0 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_v006.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_v007.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_v007.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..d633c4176315fb1277450e6705d73410950b0f69 Binary files /dev/null and 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_v007.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_v008.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_v008.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..f3e057a6c3e9ab415bfe5fd8d302b84e9dcf7c7e Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/000000_v008.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/_done b/generative-models/outputs/outputs_4d/outputs_bad_5/0032_A_softly_rounded_raccoon_standing_upright/_done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_diag.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_diag.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..19f1edc9c069cc365bf4d4e3b08b3481a40d45ba Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_diag.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_process_input.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_process_input.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..5e2678afab06c8d032c1da51ea3a23ca522b0fdb Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_process_input.mp4 
differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_t000.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_t000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..d476e9ff1c6dd1f4204463c8618185667f2dd325 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_t000.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_v001.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_v001.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..3bf7d8302bf003d1dc46a2ffb76d0ab8bd3651dd Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_v001.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_v002.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_v002.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..35e613e6484f427b683beb377b354e7fae62e7f0 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_v002.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_v003.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_v003.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..0a32c6066131caa97975078e373aa2bec76d3e28 Binary files /dev/null and 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_v003.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_v004.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_v004.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..9a5c368597a78bae1ca4dc5feb32bb605f14787e Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_v004.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_v005.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_v005.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..e28a10b50bac0fefbbd94545d48accd746ab36ce Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_v005.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_v006.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_v006.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..bd0531ea99daef556b0890a5260d1c3fec3edb2f Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_v006.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_v007.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_v007.mp4 new file mode 100644 index 
0000000000000000000000000000000000000000..45d8cdc922856c15e3c035026d8a94ddae66177f Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_v007.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_v008.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_v008.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..831ce18670dd4366e90f91291883013ec45d2ebe Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/000000_v008.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/_done b/generative-models/outputs/outputs_4d/outputs_bad_5/0033_A_minimalist_robotic_puppy_tilting_its_head/_done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_diag.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_diag.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..be307b8f3642b6012a69515beb59dbc7ab371b64 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_diag.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_process_input.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_process_input.mp4 new file mode 100644 index 
0000000000000000000000000000000000000000..d366bfeb420af4fb7788d0137fbe9b9ae4f8b6b4 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_process_input.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_t000.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_t000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..8ecffc24553d9790c8599aae72d58501f5bf5c07 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_t000.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_v001.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_v001.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..5c117d1cd85b76271e4cbf57c3057fca08c886e3 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_v001.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_v002.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_v002.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..c810fc052fd38b4837502426cd670e9e200a8a84 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_v002.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_v003.mp4 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_v003.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..9e7f427dccf0b3ecaca75567cb59cb2da1221d1d Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_v003.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_v004.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_v004.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..185f4ea521eb297aae05d83acb98efeb4f0a8a5f Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_v004.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_v005.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_v005.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..35a0f77b8d30cff1b0a9ec57c1d283f126ae190d Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_v005.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_v006.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_v006.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..e25fb8d4fb3123ac054eb7f0b5bf3132abf724da Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_v006.mp4 
differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_v007.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_v007.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..928925ed46342c9e06517dfda53d62a8a3943012 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_v007.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_v008.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_v008.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..3a2b337857847192d6d752d26718a3fede0b5a15 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/000000_v008.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/_done b/generative-models/outputs/outputs_4d/outputs_bad_5/0034_A_small_minimalist_toy_boat_resting_on_a_gentle_wave/_done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0036_A_softly_rounded_stylized_wolf_cub_walking_gracefully/000000_t000.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0036_A_softly_rounded_stylized_wolf_cub_walking_gracefully/000000_t000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..b1203e310831b1252631f191186d40c0ee686f50 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0036_A_softly_rounded_stylized_wolf_cub_walking_gracefully/000000_t000.mp4 differ diff --git 
a/generative-models/outputs/outputs_4d/outputs_bad_5/0036_A_softly_rounded_stylized_wolf_cub_walking_gracefully/000000_v001.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0036_A_softly_rounded_stylized_wolf_cub_walking_gracefully/000000_v001.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..8bf4f1ce7f715b6d340486ac99ff16409e69d4f0 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0036_A_softly_rounded_stylized_wolf_cub_walking_gracefully/000000_v001.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0036_A_softly_rounded_stylized_wolf_cub_walking_gracefully/000000_v002.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0036_A_softly_rounded_stylized_wolf_cub_walking_gracefully/000000_v002.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..ff124cc88e39c1bf19da86daf9d903ccce66bda1 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0036_A_softly_rounded_stylized_wolf_cub_walking_gracefully/000000_v002.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0036_A_softly_rounded_stylized_wolf_cub_walking_gracefully/000000_v003.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0036_A_softly_rounded_stylized_wolf_cub_walking_gracefully/000000_v003.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..c333a90a1226d3f15b4be8dc773b8a89ceb48246 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0036_A_softly_rounded_stylized_wolf_cub_walking_gracefully/000000_v003.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0036_A_softly_rounded_stylized_wolf_cub_walking_gracefully/000000_v004.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0036_A_softly_rounded_stylized_wolf_cub_walking_gracefully/000000_v004.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..58666c148b1db2b45e585351a737dc7990fb5dbd Binary files 
/dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0036_A_softly_rounded_stylized_wolf_cub_walking_gracefully/000000_v004.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0036_A_softly_rounded_stylized_wolf_cub_walking_gracefully/000000_v005.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0036_A_softly_rounded_stylized_wolf_cub_walking_gracefully/000000_v005.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..d7f1622aaba73226fe0df14c739dc2731045ef72 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0036_A_softly_rounded_stylized_wolf_cub_walking_gracefully/000000_v005.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0036_A_softly_rounded_stylized_wolf_cub_walking_gracefully/000000_v006.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0036_A_softly_rounded_stylized_wolf_cub_walking_gracefully/000000_v006.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..555eae1cf7b64e12e3df710db61c4bfa4e47f9e8 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0036_A_softly_rounded_stylized_wolf_cub_walking_gracefully/000000_v006.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0036_A_softly_rounded_stylized_wolf_cub_walking_gracefully/000000_v007.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0036_A_softly_rounded_stylized_wolf_cub_walking_gracefully/000000_v007.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..55c80cd72dfbe410277d1954b1010927a99d7405 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0036_A_softly_rounded_stylized_wolf_cub_walking_gracefully/000000_v007.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0036_A_softly_rounded_stylized_wolf_cub_walking_gracefully/000000_v008.mp4 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0036_A_softly_rounded_stylized_wolf_cub_walking_gracefully/000000_v008.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..5c5633c82ed84565f114f4ce46c058e1e9029318 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0036_A_softly_rounded_stylized_wolf_cub_walking_gracefully/000000_v008.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0036_A_softly_rounded_stylized_wolf_cub_walking_gracefully/_done b/generative-models/outputs/outputs_4d/outputs_bad_5/0036_A_softly_rounded_stylized_wolf_cub_walking_gracefully/_done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0037_A_smoothly_rounded_porcupine_with_neat__stylized_quills/000000_v002.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0037_A_smoothly_rounded_porcupine_with_neat__stylized_quills/000000_v002.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..818118647488afccb2b61f28d37fda3acf79b94d Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0037_A_smoothly_rounded_porcupine_with_neat__stylized_quills/000000_v002.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0037_A_smoothly_rounded_porcupine_with_neat__stylized_quills/000000_v006.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0037_A_smoothly_rounded_porcupine_with_neat__stylized_quills/000000_v006.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..23a5685dae2c06cf140b018af58aded15c3f852d Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0037_A_smoothly_rounded_porcupine_with_neat__stylized_quills/000000_v006.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0037_A_smoothly_rounded_porcupine_with_neat__stylized_quills/000000_v007.mp4 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0037_A_smoothly_rounded_porcupine_with_neat__stylized_quills/000000_v007.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..74cbcbf371afe3ee1df1f88386d046b59994fcc6 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0037_A_smoothly_rounded_porcupine_with_neat__stylized_quills/000000_v007.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0040_A_softly_rounded_octopus_with_gently_curving_tentacles/000000_t000.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0040_A_softly_rounded_octopus_with_gently_curving_tentacles/000000_t000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..5eb99d6ef750b2adc9ee29789ad9c2b86b633a1b Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0040_A_softly_rounded_octopus_with_gently_curving_tentacles/000000_t000.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0040_A_softly_rounded_octopus_with_gently_curving_tentacles/000000_v001.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0040_A_softly_rounded_octopus_with_gently_curving_tentacles/000000_v001.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..88c275aa833703a45e73af17aedb44381ae3c173 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0040_A_softly_rounded_octopus_with_gently_curving_tentacles/000000_v001.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0040_A_softly_rounded_octopus_with_gently_curving_tentacles/000000_v002.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0040_A_softly_rounded_octopus_with_gently_curving_tentacles/000000_v002.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..ee765bc3790ae8ed2607af3866b9041f04fb6f1c Binary files /dev/null and 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0040_A_softly_rounded_octopus_with_gently_curving_tentacles/000000_v002.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0040_A_softly_rounded_octopus_with_gently_curving_tentacles/000000_v003.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0040_A_softly_rounded_octopus_with_gently_curving_tentacles/000000_v003.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..bc819caf2ee0a7193afb288caa7e27daf20d4e7d Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0040_A_softly_rounded_octopus_with_gently_curving_tentacles/000000_v003.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0040_A_softly_rounded_octopus_with_gently_curving_tentacles/000000_v004.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0040_A_softly_rounded_octopus_with_gently_curving_tentacles/000000_v004.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..b513cb876125137f64431341b6ebd85cb3a28117 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0040_A_softly_rounded_octopus_with_gently_curving_tentacles/000000_v004.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0040_A_softly_rounded_octopus_with_gently_curving_tentacles/000000_v005.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0040_A_softly_rounded_octopus_with_gently_curving_tentacles/000000_v005.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..a0ec298bbed4a65044a7a6a381b82b76a2a692c9 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0040_A_softly_rounded_octopus_with_gently_curving_tentacles/000000_v005.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0040_A_softly_rounded_octopus_with_gently_curving_tentacles/000000_v006.mp4 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0040_A_softly_rounded_octopus_with_gently_curving_tentacles/000000_v006.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..ac950f32b2655a958ddb53a519b964a28c2dbcbe Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0040_A_softly_rounded_octopus_with_gently_curving_tentacles/000000_v006.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0040_A_softly_rounded_octopus_with_gently_curving_tentacles/000000_v007.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0040_A_softly_rounded_octopus_with_gently_curving_tentacles/000000_v007.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..d04700e1d9d4481c0a2702d43556b236380991db Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0040_A_softly_rounded_octopus_with_gently_curving_tentacles/000000_v007.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0040_A_softly_rounded_octopus_with_gently_curving_tentacles/000000_v008.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0040_A_softly_rounded_octopus_with_gently_curving_tentacles/000000_v008.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..ea77c86d0c79601f9fce36ba8ffd78acc33a65c6 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0040_A_softly_rounded_octopus_with_gently_curving_tentacles/000000_v008.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0040_A_softly_rounded_octopus_with_gently_curving_tentacles/_done b/generative-models/outputs/outputs_4d/outputs_bad_5/0040_A_softly_rounded_octopus_with_gently_curving_tentacles/_done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_diag.mp4 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_diag.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..a06711420384bb0f4d432681a6d54aa4c1078a79 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_diag.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_process_input.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_process_input.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..3b851d317b34d9306b5ef4848bed8fc91fd77a1c Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_process_input.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_t000.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_t000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..28a8f17e97d78cf106e5054a7091b4f9977379c9 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_t000.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_v001.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_v001.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..eec70f52ba4dde1d0b6746847d7ecf178519d4f6 Binary files /dev/null and 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_v001.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_v002.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_v002.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..a0ba099908f4b652b18a08fb4a4552dc1ad0c893 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_v002.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_v003.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_v003.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..8f9342442794f854e54eac1e7ed081732690d69d Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_v003.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_v004.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_v004.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..e6f91c717f2096e7cb8d2a71450ce9926856202f Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_v004.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_v005.mp4 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_v005.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..3fc413eb901902c76c01f2dd457b56e9d087ed76 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_v005.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_v006.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_v006.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..59e8f890220d14ba6f0613d3050917e8bd22b157 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_v006.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_v007.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_v007.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..d4c010a986f2ae6e6f120caadbdb119594eecaf9 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_v007.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_v008.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_v008.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..e607da35f7dc0a7906402d97410bc6dea1967075 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/000000_v008.mp4 
differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/_done b/generative-models/outputs/outputs_4d/outputs_bad_5/0041_A_small_stylized_eagle_soaring_with_clean_wing_lines/_done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_diag.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_diag.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..bbc4bcee2236421828f784037c86f9df4babec40 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_diag.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_process_input.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_process_input.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..f074d2eb269b37f4e733b87a35921c7459346f17 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_process_input.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_t000.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_t000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..1b64527333fa095768568945e03d9f54cf6e7c91 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_t000.mp4 differ diff --git 
a/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_v001.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_v001.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..b6c791ae73cee12db1ad39cedf3cb78e45dde474 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_v001.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_v002.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_v002.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..56ea7fa6ab0784758eda4b30444bd2793655123d Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_v002.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_v003.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_v003.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..d0bc343f754dad8c9410f67152dcd7f7dc1e4136 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_v003.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_v004.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_v004.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..14a74033f605b353a5cba45280b4591f11b57caa Binary files /dev/null and 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_v004.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_v005.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_v005.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..840fc12cd16ec4bb08ae91131e5a6aa7be95ca1d Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_v005.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_v006.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_v006.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..50f244ce038c618535d3faefa02281c2d0701306 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_v006.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_v007.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_v007.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..10423a9193910874f54c90b8b2de12b651ba27ae Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_v007.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_v008.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_v008.mp4 new file mode 100644 index 
0000000000000000000000000000000000000000..83e4cf48a8163db4353e5a358ae7befba305927f Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/000000_v008.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/_done b/generative-models/outputs/outputs_4d/outputs_bad_5/0042_A_minimalist_snail_with_a_clean_spiral_shell/_done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0043_A_minimalist_wizard_figure_holding_a_softly_glowing_staff/000000_process_input.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0043_A_minimalist_wizard_figure_holding_a_softly_glowing_staff/000000_process_input.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..3e052a04300f8fc05c9393ad6bc6a5b754fbb7c0 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0043_A_minimalist_wizard_figure_holding_a_softly_glowing_staff/000000_process_input.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0043_A_minimalist_wizard_figure_holding_a_softly_glowing_staff/000000_t000.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0043_A_minimalist_wizard_figure_holding_a_softly_glowing_staff/000000_t000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..33243f1504678ddd19ef2339e84924379b17bf05 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0043_A_minimalist_wizard_figure_holding_a_softly_glowing_staff/000000_t000.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0043_A_minimalist_wizard_figure_holding_a_softly_glowing_staff/000000_v001.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0043_A_minimalist_wizard_figure_holding_a_softly_glowing_staff/000000_v001.mp4 new file mode 100644 
index 0000000000000000000000000000000000000000..54a91ccb7a9dae26c297dcd823fe7b6fb6afa6ea Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0043_A_minimalist_wizard_figure_holding_a_softly_glowing_staff/000000_v001.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0043_A_minimalist_wizard_figure_holding_a_softly_glowing_staff/000000_v008.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0043_A_minimalist_wizard_figure_holding_a_softly_glowing_staff/000000_v008.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..8caf22e2554da7d637c3a49d775d43c92cb28c3f Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0043_A_minimalist_wizard_figure_holding_a_softly_glowing_staff/000000_v008.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_diag.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_diag.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..96d09dae32087ebb2ab95db16e0d2eed2136a75f Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_diag.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_process_input.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_process_input.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..e3b0b684d27d7c1a98cdaf11225739faf721b233 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_process_input.mp4 differ diff --git 
a/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_t000.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_t000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..4c858a6a51b63ccff00f5a05c8164b142bcfa09f Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_t000.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_v001.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_v001.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..6b36135e7e3531400f82d375bc47f88855cfa0fb Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_v001.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_v002.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_v002.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..95bd87b8f7ad807333460aaed6e5b5e0ade0f75b Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_v002.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_v003.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_v003.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..403b84cd4fd287138519eea180904b999940e548 Binary files /dev/null and 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_v003.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_v004.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_v004.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..e370385e9c51e96e7df8df7752e1f59828892df1 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_v004.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_v005.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_v005.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..8cfcf1194fca09b5b877e264593df5cba275d3fc Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_v005.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_v006.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_v006.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..732f6d6796fddca6e8e17f1840ecb33fc966f4d1 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_v006.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_v007.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_v007.mp4 new file mode 100644 index 
0000000000000000000000000000000000000000..88f5b5b38e80c5a49bee0ea1222223914f633502 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_v007.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_v008.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_v008.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..32c1bcfdab857e6ccc77f27019013d0664c14c2f Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/000000_v008.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/_done b/generative-models/outputs/outputs_4d/outputs_bad_5/0046_A_simple_stylized_blossom_spirit_moving_gently/_done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_diag.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_diag.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..35a1864af9d634976e3b329d9a6479172c77e413 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_diag.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_process_input.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_process_input.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..d85269b7e1ffd86921488cb36d7575e24f0a7f82 Binary 
files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_process_input.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_t000.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_t000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..907cd53f5b874694e2c364cde3ab35c7b9520c1d Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_t000.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_v001.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_v001.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..8224761b5007a8e840d76c5c11bfd092313b7342 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_v001.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_v002.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_v002.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..7b5858e62274311988994901e347736ed92d70e0 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_v002.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_v003.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_v003.mp4 new file mode 100644 index 
0000000000000000000000000000000000000000..3d2476eaa595b4bc50330fc39de4320eddafbfe3 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_v003.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_v004.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_v004.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..5045be74e05abd92a261bd7e56a0a953c0a51741 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_v004.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_v005.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_v005.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..9a29f64155ef222fe8ae8c805b6bb665af40e138 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_v005.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_v006.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_v006.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..e9eea9708a213352a3958a809117ce907b91e93d Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_v006.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_v007.mp4 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_v007.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..762679cbcec82396774f816af3172b4cfd37000c Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_v007.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_v008.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_v008.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..18171d41f8f597555ff1b0415259a96a358c089b Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/000000_v008.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/_done b/generative-models/outputs/outputs_4d/outputs_bad_5/0047_A_small_minimalist_space_cat_in_a_neat_helmet/_done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0048_A_simple_stylized_phoenix_with_a_warm_subtle_glow/000000_t000.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0048_A_simple_stylized_phoenix_with_a_warm_subtle_glow/000000_t000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..344d60865ecaab0b46e9c44f478e3996c59d8dbe Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0048_A_simple_stylized_phoenix_with_a_warm_subtle_glow/000000_t000.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0048_A_simple_stylized_phoenix_with_a_warm_subtle_glow/000000_v002.mp4 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0048_A_simple_stylized_phoenix_with_a_warm_subtle_glow/000000_v002.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..219269193e6fa8d02cbbf09f9dad26dabfb5f5b7 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0048_A_simple_stylized_phoenix_with_a_warm_subtle_glow/000000_v002.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0048_A_simple_stylized_phoenix_with_a_warm_subtle_glow/000000_v003.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0048_A_simple_stylized_phoenix_with_a_warm_subtle_glow/000000_v003.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..c0452fc47db5322d798d2c5d0abd840e5eea00e3 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0048_A_simple_stylized_phoenix_with_a_warm_subtle_glow/000000_v003.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0048_A_simple_stylized_phoenix_with_a_warm_subtle_glow/000000_v006.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0048_A_simple_stylized_phoenix_with_a_warm_subtle_glow/000000_v006.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..84526e7accb49f500ac985fefd57041b46642b3c Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0048_A_simple_stylized_phoenix_with_a_warm_subtle_glow/000000_v006.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0048_A_simple_stylized_phoenix_with_a_warm_subtle_glow/000000_v007.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0048_A_simple_stylized_phoenix_with_a_warm_subtle_glow/000000_v007.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..7307c8584250a0e3ac267df57b458669ba7a845d Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0048_A_simple_stylized_phoenix_with_a_warm_subtle_glow/000000_v007.mp4 differ diff --git 
a/generative-models/outputs/outputs_4d/outputs_bad_5/0048_A_simple_stylized_phoenix_with_a_warm_subtle_glow/_done b/generative-models/outputs/outputs_4d/outputs_bad_5/0048_A_simple_stylized_phoenix_with_a_warm_subtle_glow/_done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_diag.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_diag.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..383cca7647ab3589394c5cc47a59c2488f3349c3 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_diag.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_process_input.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_process_input.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..4b1987ef30cd2a08050dae3ce036df3c9ff9e3b8 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_process_input.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_t000.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_t000.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..67796cab79db04a7069c78f4eddb7334a831ea69 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_t000.mp4 differ diff --git 
a/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_v001.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_v001.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..4ca15835cb1368934797e11e297c1a0e5185ec26 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_v001.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_v002.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_v002.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..4abc87cd3b91eb289fca17af5a26e4a9115b1e9a Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_v002.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_v003.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_v003.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..6503673bb36c1e7a806eec56de3848c99bf1c1fb Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_v003.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_v004.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_v004.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..560d23ad30dfe9723ebd34d1c9d586b4d63715b9 Binary files /dev/null and 
b/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_v004.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_v005.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_v005.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..e0bd0aee286b8e5a923f11ba7583e79bc57112ec Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_v005.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_v006.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_v006.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..91d2593a594b3a2cf9843c4a3025c1637fa339e7 Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_v006.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_v007.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_v007.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..b60e3f645ebd3d8779fbcb16a0c9ce09a161ba2e Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_v007.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_v008.mp4 b/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_v008.mp4 new file mode 100644 index 
0000000000000000000000000000000000000000..42070bc06d6e829a855bdc294190f5e7b8761a3e Binary files /dev/null and b/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/000000_v008.mp4 differ diff --git a/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/_done b/generative-models/outputs/outputs_4d/outputs_bad_5/0049_A_softly_rounded_caterpillar_moving_forward/_done new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/generative-models/outputs/sv4d2/000000_process_input.mp4 b/generative-models/outputs/sv4d2/000000_process_input.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..29c5f85ce32a2c61ec630431e871706156b6ec86 Binary files /dev/null and b/generative-models/outputs/sv4d2/000000_process_input.mp4 differ diff --git a/generative-models/outputs/sv4d2/000000_v001.mp4 b/generative-models/outputs/sv4d2/000000_v001.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..e15434bf92d2ddff9a27c9f736b951e03383a436 Binary files /dev/null and b/generative-models/outputs/sv4d2/000000_v001.mp4 differ diff --git a/generative-models/outputs/sv4d2/000000_v002.mp4 b/generative-models/outputs/sv4d2/000000_v002.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..0abf99feb4d21f63dae7f89550ce83be6853c767 Binary files /dev/null and b/generative-models/outputs/sv4d2/000000_v002.mp4 differ diff --git a/generative-models/outputs/sv4d2/000000_v003.mp4 b/generative-models/outputs/sv4d2/000000_v003.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..53d86f922992bf9bdab571940a302a3ac10c0642 Binary files /dev/null and b/generative-models/outputs/sv4d2/000000_v003.mp4 differ diff --git a/generative-models/outputs/sv4d2/000000_v004.mp4 b/generative-models/outputs/sv4d2/000000_v004.mp4 new file mode 100644 index 
0000000000000000000000000000000000000000..d7d0ec7c8c3f618d06ce02c364c42dcdfc6ee2f1 Binary files /dev/null and b/generative-models/outputs/sv4d2/000000_v004.mp4 differ diff --git a/generative-models/outputs/sv4d2/000001_process_input.mp4 b/generative-models/outputs/sv4d2/000001_process_input.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..ebad4e030ec25c5028c322d489e5b67ff6a48776 Binary files /dev/null and b/generative-models/outputs/sv4d2/000001_process_input.mp4 differ diff --git a/generative-models/outputs/sv4d2/000001_v001.mp4 b/generative-models/outputs/sv4d2/000001_v001.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..4076bf767609216781bb464718e58910b884689f Binary files /dev/null and b/generative-models/outputs/sv4d2/000001_v001.mp4 differ diff --git a/generative-models/outputs/sv4d2/000001_v002.mp4 b/generative-models/outputs/sv4d2/000001_v002.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..407ab65a143b451541f9ae5ef7adf772ec110539 Binary files /dev/null and b/generative-models/outputs/sv4d2/000001_v002.mp4 differ diff --git a/generative-models/outputs/sv4d2/000001_v003.mp4 b/generative-models/outputs/sv4d2/000001_v003.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..2d6e7f046e8c649947dbc2f3dacc47e2993323c2 Binary files /dev/null and b/generative-models/outputs/sv4d2/000001_v003.mp4 differ diff --git a/generative-models/outputs/sv4d2/000001_v004.mp4 b/generative-models/outputs/sv4d2/000001_v004.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..119f640fef1b7c9da49d58a0a1a43f2c581716f0 Binary files /dev/null and b/generative-models/outputs/sv4d2/000001_v004.mp4 differ diff --git a/generative-models/outputs/sv4d2_8views/000000_process_input.mp4 b/generative-models/outputs/sv4d2_8views/000000_process_input.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..eeb4a5564aff28235a39df0104068d6fa5b50732 Binary files /dev/null and 
b/generative-models/outputs/sv4d2_8views/000000_process_input.mp4 differ diff --git a/generative-models/outputs/sv4d2_8views/000000_v005.mp4 b/generative-models/outputs/sv4d2_8views/000000_v005.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..1e2f566884d88cd28d832ecaf363b157ca8a4104 Binary files /dev/null and b/generative-models/outputs/sv4d2_8views/000000_v005.mp4 differ diff --git a/generative-models/outputs/sv4d2_8views/000000_v006.mp4 b/generative-models/outputs/sv4d2_8views/000000_v006.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..c114abd655021c8cf4ec5265cb70aecfc025c58b Binary files /dev/null and b/generative-models/outputs/sv4d2_8views/000000_v006.mp4 differ diff --git a/generative-models/outputs/sv4d2_8views/000000_v008.mp4 b/generative-models/outputs/sv4d2_8views/000000_v008.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..5fe9b3be374de381db18444c38db0febada58bea Binary files /dev/null and b/generative-models/outputs/sv4d2_8views/000000_v008.mp4 differ diff --git a/generative-models/outputs/sv4d2_8views/000001_process_input.mp4 b/generative-models/outputs/sv4d2_8views/000001_process_input.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..ebad4e030ec25c5028c322d489e5b67ff6a48776 Binary files /dev/null and b/generative-models/outputs/sv4d2_8views/000001_process_input.mp4 differ diff --git a/generative-models/outputs/sv4d2_8views/000001_v001.mp4 b/generative-models/outputs/sv4d2_8views/000001_v001.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..7ce5e8726417bbd7262091c5111b9c21f4774e7f Binary files /dev/null and b/generative-models/outputs/sv4d2_8views/000001_v001.mp4 differ diff --git a/generative-models/outputs/sv4d2_8views/000001_v002.mp4 b/generative-models/outputs/sv4d2_8views/000001_v002.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..802196571bd8edeb1805039264f21f7c068d621d Binary files /dev/null and 
b/generative-models/outputs/sv4d2_8views/000001_v002.mp4 differ diff --git a/generative-models/outputs/sv4d2_8views/000001_v003.mp4 b/generative-models/outputs/sv4d2_8views/000001_v003.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..dc1e8a51f745d66c0ed687ab0ae0561b9aafe8a3 Binary files /dev/null and b/generative-models/outputs/sv4d2_8views/000001_v003.mp4 differ diff --git a/generative-models/outputs/sv4d2_8views/000001_v004.mp4 b/generative-models/outputs/sv4d2_8views/000001_v004.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..b0117048e58f22ba1c425ab9440a7ec254874aac Binary files /dev/null and b/generative-models/outputs/sv4d2_8views/000001_v004.mp4 differ diff --git a/generative-models/outputs/sv4d2_8views/000001_v005.mp4 b/generative-models/outputs/sv4d2_8views/000001_v005.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..0d0e829cc5dfa8e2d857ca7f667458a7582eaf22 Binary files /dev/null and b/generative-models/outputs/sv4d2_8views/000001_v005.mp4 differ diff --git a/generative-models/outputs/sv4d2_8views/000001_v006.mp4 b/generative-models/outputs/sv4d2_8views/000001_v006.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..3e0428f2eba4aee970af22541170fab2cd03d816 Binary files /dev/null and b/generative-models/outputs/sv4d2_8views/000001_v006.mp4 differ diff --git a/generative-models/outputs/sv4d2_8views/000001_v007.mp4 b/generative-models/outputs/sv4d2_8views/000001_v007.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..031e73bf7ae6e4fb05031a514babc23dd0dac722 Binary files /dev/null and b/generative-models/outputs/sv4d2_8views/000001_v007.mp4 differ diff --git a/generative-models/outputs/sv4d2_8views/000001_v008.mp4 b/generative-models/outputs/sv4d2_8views/000001_v008.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..5aaec959fd51769f64a50445da473e12af4aff57 Binary files /dev/null and 
b/generative-models/outputs/sv4d2_8views/000001_v008.mp4 differ diff --git a/results/LSDIR/DreamClear/results/input/0000016-seed-0.png b/results/LSDIR/DreamClear/results/input/0000016-seed-0.png new file mode 100644 index 0000000000000000000000000000000000000000..d98381733aab9a09a0fc53f92974432ece4cf2d5 Binary files /dev/null and b/results/LSDIR/DreamClear/results/input/0000016-seed-0.png differ diff --git a/results/LSDIR/DreamClear/results/input/0000024-seed-0.png b/results/LSDIR/DreamClear/results/input/0000024-seed-0.png new file mode 100644 index 0000000000000000000000000000000000000000..0d812a5d02c2ebe7865e327de313635e428ad147 Binary files /dev/null and b/results/LSDIR/DreamClear/results/input/0000024-seed-0.png differ diff --git a/results/LSDIR/DreamClear/results/input/0000035-seed-0.png b/results/LSDIR/DreamClear/results/input/0000035-seed-0.png new file mode 100644 index 0000000000000000000000000000000000000000..9b28e44244b008a17c08d00f25f878f3adedd638 Binary files /dev/null and b/results/LSDIR/DreamClear/results/input/0000035-seed-0.png differ diff --git a/results/LSDIR/DreamClear/results/input/0000069-seed-0.png b/results/LSDIR/DreamClear/results/input/0000069-seed-0.png new file mode 100644 index 0000000000000000000000000000000000000000..70438fd9db2076b5ffc1d530aa26b5ea2889a9cd Binary files /dev/null and b/results/LSDIR/DreamClear/results/input/0000069-seed-0.png differ diff --git a/results/LSDIR/DreamClear/results/input/0000108-seed-0.png b/results/LSDIR/DreamClear/results/input/0000108-seed-0.png new file mode 100644 index 0000000000000000000000000000000000000000..139d49681f7081408caa128c2d485ae5bc9b6a9d Binary files /dev/null and b/results/LSDIR/DreamClear/results/input/0000108-seed-0.png differ diff --git a/results/LSDIR/DreamClear/results/input/0000136-seed-0.png b/results/LSDIR/DreamClear/results/input/0000136-seed-0.png new file mode 100644 index 0000000000000000000000000000000000000000..a02487600c66842621db4f13a3af47726ec396de Binary files 
/dev/null and b/results/LSDIR/DreamClear/results/input/0000136-seed-0.png differ diff --git a/results/LSDIR/DreamClear/results/input/0000142-seed-0.png b/results/LSDIR/DreamClear/results/input/0000142-seed-0.png new file mode 100644 index 0000000000000000000000000000000000000000..3936895e4ac3c5d329167241d0774d25162ca4a2 Binary files /dev/null and b/results/LSDIR/DreamClear/results/input/0000142-seed-0.png differ diff --git a/results/LSDIR/DreamClear/results/input/0000154-seed-0.png b/results/LSDIR/DreamClear/results/input/0000154-seed-0.png new file mode 100644 index 0000000000000000000000000000000000000000..e35d6a306c0da28c1949ad16fda990051b592144 Binary files /dev/null and b/results/LSDIR/DreamClear/results/input/0000154-seed-0.png differ diff --git a/results/LSDIR/DreamClear/results/input/0000157-seed-0.png b/results/LSDIR/DreamClear/results/input/0000157-seed-0.png new file mode 100644 index 0000000000000000000000000000000000000000..fcf557f337a28f60ae1f55f6aa73a9c3a6f04896 Binary files /dev/null and b/results/LSDIR/DreamClear/results/input/0000157-seed-0.png differ diff --git a/results/LSDIR/DreamClear/results/input/0000164-seed-0.png b/results/LSDIR/DreamClear/results/input/0000164-seed-0.png new file mode 100644 index 0000000000000000000000000000000000000000..2f07d9d6b9cef885c768bb9fe2503813370860fc Binary files /dev/null and b/results/LSDIR/DreamClear/results/input/0000164-seed-0.png differ diff --git a/results/LSDIR/DreamClear/results/input/0000167-seed-0.png b/results/LSDIR/DreamClear/results/input/0000167-seed-0.png new file mode 100644 index 0000000000000000000000000000000000000000..a32f815e1ae1c67295158591c95c62a03e1550cb Binary files /dev/null and b/results/LSDIR/DreamClear/results/input/0000167-seed-0.png differ diff --git a/results/LSDIR/DreamClear/results/input/0000184-seed-0.png b/results/LSDIR/DreamClear/results/input/0000184-seed-0.png new file mode 100644 index 0000000000000000000000000000000000000000..dfdaecb296f80e4b63c74924a5d3042d01dff33d 
Binary files /dev/null and b/results/LSDIR/DreamClear/results/input/0000184-seed-0.png differ diff --git a/results/LSDIR/DreamClear/results/input/0000192-seed-0.png b/results/LSDIR/DreamClear/results/input/0000192-seed-0.png new file mode 100644 index 0000000000000000000000000000000000000000..cec4c72290b078d6dfbc72632f605a46db5d72a9 Binary files /dev/null and b/results/LSDIR/DreamClear/results/input/0000192-seed-0.png differ diff --git a/results/LSDIR/DreamClear/results/input/0000199-seed-0.png b/results/LSDIR/DreamClear/results/input/0000199-seed-0.png new file mode 100644 index 0000000000000000000000000000000000000000..499aed1c6f320231f2ae3ed31dc5fdaaf3109699 Binary files /dev/null and b/results/LSDIR/DreamClear/results/input/0000199-seed-0.png differ diff --git a/results/LSDIR/DreamClear/results/input/0000204-seed-0.png b/results/LSDIR/DreamClear/results/input/0000204-seed-0.png new file mode 100644 index 0000000000000000000000000000000000000000..7449337f3d93ea00bcd55b21a684f8632a1fd91c Binary files /dev/null and b/results/LSDIR/DreamClear/results/input/0000204-seed-0.png differ diff --git a/results/LSDIR/DreamClear/results/input/0000218-seed-0.png b/results/LSDIR/DreamClear/results/input/0000218-seed-0.png new file mode 100644 index 0000000000000000000000000000000000000000..88ca6b5f1b558cde84144f4607293c62954997b8 Binary files /dev/null and b/results/LSDIR/DreamClear/results/input/0000218-seed-0.png differ diff --git a/results/LSDIR/DreamClear/results/input/0000228-seed-0.png b/results/LSDIR/DreamClear/results/input/0000228-seed-0.png new file mode 100644 index 0000000000000000000000000000000000000000..cf9e2d9313186744892d4080145056e00735cb5b Binary files /dev/null and b/results/LSDIR/DreamClear/results/input/0000228-seed-0.png differ diff --git a/results/LSDIR/DreamClear/results/input/0000231-seed-0.png b/results/LSDIR/DreamClear/results/input/0000231-seed-0.png new file mode 100644 index 
0000000000000000000000000000000000000000..74b880148cd991ee09dddff3b0c72ecc3d8bb771 Binary files /dev/null and b/results/LSDIR/DreamClear/results/input/0000231-seed-0.png differ diff --git a/results/LSDIR/DreamClear/results/input/0000245-seed-0.png b/results/LSDIR/DreamClear/results/input/0000245-seed-0.png new file mode 100644 index 0000000000000000000000000000000000000000..d61b793b15f8ff1dbac5cb7fe1985212b1d9a0a3 Binary files /dev/null and b/results/LSDIR/DreamClear/results/input/0000245-seed-0.png differ diff --git a/results/LSDIR/DreamClear/results/input/0000246-seed-0.png b/results/LSDIR/DreamClear/results/input/0000246-seed-0.png new file mode 100644 index 0000000000000000000000000000000000000000..025cd50a67065676d8304a30d324e4f9a34684fe Binary files /dev/null and b/results/LSDIR/DreamClear/results/input/0000246-seed-0.png differ