tuandunghcmut commited on
Commit
f435a72
·
verified ·
1 Parent(s): 9eb384f

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. PaddleMIX/applications/README.md +98 -0
  2. PaddleMIX/applications/README_en.md +87 -0
  3. PaddleMIX/applications/gradio_autolable.py +187 -0
  4. PaddleMIX/deploy/README.md +110 -0
  5. PaddleMIX/docs/CHANGELOG.md +44 -0
  6. PaddleMIX/docs/FAQ.md +0 -0
  7. PaddleMIX/paddlemix/__init__.py +20 -0
  8. PaddleMIX/ppdiffusers/README.md +1278 -0
  9. PaddleMIX/ppdiffusers/VERSION +1 -0
  10. PaddleMIX/ppdiffusers/requirements.txt +18 -0
  11. PaddleMIX/ppdiffusers/setup.py +71 -0
  12. PaddleMIX/scripts/build_wheel.sh +136 -0
  13. a_main_folder/lavis_examples/albef_feature_extraction.ipynb +0 -0
  14. a_main_folder/lavis_examples/albef_vqa.ipynb +0 -0
  15. a_main_folder/lavis_examples/albef_zero_shot_classification.ipynb +0 -0
  16. a_main_folder/lavis_examples/blip2_feature_extraction.ipynb +145 -0
  17. a_main_folder/lavis_examples/blip2_image_text_matching.ipynb +141 -0
  18. a_main_folder/lavis_examples/blip2_instructed_generation.ipynb +0 -0
  19. a_main_folder/lavis_examples/blip_feature_extraction.ipynb +0 -0
  20. a_main_folder/lavis_examples/blip_image_captioning.ipynb +0 -0
  21. a_main_folder/lavis_examples/blip_image_text_matching.ipynb +0 -0
  22. a_main_folder/lavis_examples/blip_text_localization.ipynb +0 -0
  23. a_main_folder/lavis_examples/blip_vqa.ipynb +0 -0
  24. a_main_folder/lavis_examples/blip_zero_shot_classification.ipynb +0 -0
  25. a_main_folder/lavis_examples/clip_feature_extraction.ipynb +0 -0
  26. a_main_folder/lavis_examples/clip_zero_shot_classification.ipynb +0 -0
  27. a_main_folder/litserve/.lightning_studio/.studiorc +4 -0
  28. a_main_folder/litserve/.lightning_studio/on_start.sh +13 -0
  29. a_main_folder/litserve/.lightning_studio/on_stop.sh +8 -0
  30. a_main_folder/litserve/aurasr.ipynb +215 -0
  31. a_main_folder/litserve/aurasr/.lightning_studio/.studiorc +4 -0
  32. a_main_folder/litserve/aurasr/.lightning_studio/on_start.sh +13 -0
  33. a_main_folder/litserve/aurasr/.lightning_studio/on_stop.sh +8 -0
  34. a_main_folder/litserve/aurasr/client.py +30 -0
  35. a_main_folder/litserve/aurasr/input.jpg +0 -0
  36. a_main_folder/litserve/aurasr/server.py +33 -0
  37. a_main_folder/llm2vec/test.ipynb +80 -0
  38. a_main_folder/ultralytics/input.jpg +0 -0
  39. a_main_folder/ultralytics/test.ipynb +0 -0
  40. open_clip/src/open_clip/hf_configs.py +67 -0
  41. open_clip/src/open_clip/model_configs/RN101.json +21 -0
  42. open_clip/src/open_clip/model_configs/RN50x16.json +21 -0
  43. open_clip/src/open_clip/model_configs/ViT-B-16-SigLIP-i18n-256.json +29 -0
  44. open_clip/src/open_clip/model_configs/ViT-B-16-quickgelu.json +17 -0
  45. open_clip/src/open_clip/model_configs/ViT-B-16.json +16 -0
  46. open_clip/src/open_clip/model_configs/ViT-B-32-plus-256.json +16 -0
  47. open_clip/src/open_clip/model_configs/ViT-B-32-quickgelu.json +17 -0
  48. open_clip/src/open_clip/model_configs/ViT-B-32.json +16 -0
  49. open_clip/src/open_clip/model_configs/ViT-H-14-378-quickgelu.json +18 -0
  50. open_clip/src/open_clip/model_configs/ViT-H-14-CLIPA.json +26 -0
PaddleMIX/applications/README.md ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ **简体中文** | [English](./README_en.md)
2
+ <p align="center">
3
+ <img src="https://github.com/PaddlePaddle/PaddleMIX/assets/22989727/2cd19298-1c52-4d73-a0f7-dcdab6a8ec90" align="middle" width = "600" />
4
+ </p>
5
+
6
+ <p align="center">
7
+ <a href="./LICENSE"><img src="https://img.shields.io/badge/license-Apache%202-dfd.svg"></a>
8
+ <a href=""><img src="https://img.shields.io/badge/python-3.7+-aff.svg"></a>
9
+ <a href=""><img src="https://img.shields.io/badge/os-linux%2C%20win%2C%20mac-pink.svg"></a>
10
+ <a href="https://github.com/PaddlePaddle/PaddleMIX/stargazers"><img src="https://img.shields.io/github/stars/PaddlePaddle/PaddleMIX?color=ccf"></a>
11
+ </p>
12
+
13
+ <h4 align="center">
14
+ <a href=#特性> 特性 </a> |
15
+ <a href=#快速开始> 快速开始 </a>
16
+ </h4>
17
+
18
+
19
+
20
+ **PaddleMIX**应用示例基于paddlemix、ppdiffusers和paddlenlp开发,**简单易用**且**功能强大**。聚合业界**优质预训练模型**并提供**开箱即用**的开发体验,覆盖跨模态和多场景的模型库搭配,可满足开发者**灵活定制**的需求。
21
+
22
+ <img src="https://github.com/user-attachments/assets/4c695140-bf4c-46db-bbb5-5dd8197be947" align="center" />
23
+
24
+ ## 快速开始
25
+
26
+ 请先确认是否已安装 [PaddleMIX](../README.md/#安装) 和 [ppdiffusers](../README.md/#安装)
27
+
28
+ ### 1. appflow 依赖安装
29
+ ```shell
30
+ pip install -r paddlemix/appflow/requirements.txt
31
+ ```
32
+
33
+
34
+ ### 2.一键预测
35
+
36
+ PaddleMIX提供一键预测功能,无需训练,这里以开放世界检测分割为例。直接在终端运行如下命令,即可完成模型推理。
37
+
38
+ ```python
39
+ >>> python
40
+ >>> from paddlemix.appflow import Appflow
41
+ >>> from ppdiffusers.utils import load_image
42
+
43
+ >>> task = Appflow(app="openset_det_sam",
44
+ models=["GroundingDino/groundingdino-swint-ogc","Sam/SamVitH-1024"],
45
+ static_mode=False) #如果开启静态图推理,设置为True,默认动态图
46
+ >>> url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
47
+ >>> image_pil = load_image(url)
48
+ >>> result = task(image=image_pil,prompt="dog")
49
+ ```
50
+
51
+ 参数说明
52
+ | 参数 | 是否必须| 含义 |
53
+ |-------|-------|---------------------------------------------------------------------------------------------|
54
+ | --app | Yes| 应用名称 |
55
+ | --models | Yes | 需要使用的模型,可以是单个模型,也可以多个组合 |
56
+ | --static_mode | Option | 是否静态图推理,默认False |
57
+ | --precision | Option | 当 static_mode == True 时使用,默认fp32,可选择trt_fp32、trt_fp16 |
58
+
59
+
60
+ ## 特性
61
+
62
+ #### <a href=#开箱即用的工具集> 开箱即用的工具集 </a>
63
+
64
+ #### <a href=#跨模态多场景应用> 跨模态多场景应用 </a>
65
+
66
+
67
+
68
+ ### 开箱即用的工具集
69
+
70
+ Appflow提供丰富的开箱即用工具集,覆盖跨模态多场景应用,提供产业级的效果与极致的推理性能。
71
+
72
+ ![appflow](https://github.com/LokeZhou/PaddleMIX/assets/13300429/f80a7aa0-4cd5-4f86-90d6-2fc6da3eb42f)
73
+
74
+
75
+
76
+
77
+ ### 跨模态多场景应用
78
+ | 应用名称 | 调用模型 | 静态图推理 |
79
+ | :--------------------------------- | -------------------------------- | ----------|
80
+ | [视觉语言对话(Vision-Language-Chat)](./VLChat/README.md) | `qwen-vl-chat-7b` | 🚧 |
81
+ | [开放世界检测分割(Openset-Det-Sam)](./CVinW/README.md/#开放世界检测分割grounded-sam-detect-and-segment-everything-with-text-prompt) | `grounded sam` | ✅ |
82
+ | [自动标注(AutoLabel)](./Automatic_label/README.md/#自动标注autolabel) | `blip2 grounded sam` | ✅ |
83
+ | [检测框引导的图像编辑(Det-Guided-Inpainting)](./Inpainting/README.md/#检测框引导的图像编辑det-guided-inpainting) | `chatglm-6b stable-diffusion-2-inpainting grounded sam` | ✅ |
84
+ | [文图生成(Text-to-Image Generation)](./text2image/README.md/#文图生成text-to-image-generation) | `runwayml/stable-diffusion-v1-5 stabilityai/stable-diffusion-xl-base-1.0` | [fastdeploy](../ppdiffusers/deploy/README.md/#文图生成text-to-image-generation) |
85
+ | [文本引导的图像放大(Text-Guided Image Upscaling)](./image2image/README.md/#文本引导的图像放大text-guided-image-upscaling) | `ldm-super-resolution-4x-openimages`| ❌ |
86
+ | [文本引导的图像编辑(Text-Guided Image Inpainting)](./Inpainting/README.md/#文本引导的图像编辑text-guided-image-inpainting) | `stable-diffusion-2-inpainting` | [fastdeploy](../ppdiffusers/deploy/README.md/#文本引导的图像编辑text-guided-image-inpainting) |
87
+ | [文本引导的图像变换(Image-to-Image Text-Guided Generation)](./image2image/README.md/#文本引导的图像变换image-to-image-text-guided-generation) | `stable-diffusion-v1-5` | [fastdeploy](../ppdiffusers/deploy/README.md/#文本引导的图像变换image-to-image-text-guided-generation) |
88
+ | [文本条件的视频生成(Text-to-Video Generation)](./text2video/README.md/#文本条件的视频生成text-to-video-generation) | `text-to-video-ms-1.7b` | ❌ |
89
+ | [音频生成图像(Audio-to-Image Generation)](./Audio2Img/README.md/#audio-to-image) | `imagebind stable-diffusion-2-1-unclip` | |
90
+ | [音频描述(Audio-to-Caption Generation)](./Audio2Caption/README.md/#音频描述audio-to-caption-generation) | `chatglm-6b whisper` | |
91
+ | [音频对话(Audio-to-Chat Generation)](./AudioChat/README.md/#音频对话audio-to-chat-generation) | `chatglm-6b whisper fastspeech2` | |
92
+ | [音乐生成(Music Generation)](./MusicGeneration/README.md/#音乐生成music-generation) | `chatglm-6b minigpt4 audioldm` | |
93
+
94
+ 更多应用持续开发中......
95
+
96
+ * ✅: Supported
97
+ * 🚧: In Progress
98
+ * ❌: Not Supported
PaddleMIX/applications/README_en.md ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ **English** | [简体中文](./README.md)
2
+ <p align="center">
3
+ <img src="https://github.com/PaddlePaddle/PaddleMIX/assets/22989727/2cd19298-1c52-4d73-a0f7-dcdab6a8ec90" align="middle" width = "600" />
4
+ </p>
5
+
6
+ <p align="center">
7
+ <a href="./LICENSE"><img src="https://img.shields.io/badge/license-Apache%202-dfd.svg"></a>
8
+ <a href=""><img src="https://img.shields.io/badge/python-3.7+-aff.svg"></a>
9
+ <a href=""><img src="https://img.shields.io/badge/os-linux%2C%20win%2C%20mac-pink.svg"></a>
10
+ <a href="https://github.com/PaddlePaddle/PaddleMIX/stargazers"><img src="https://img.shields.io/github/stars/PaddlePaddle/PaddleMIX?color=ccf"></a>
11
+ </p>
12
+
13
+ <h4 align="center">
14
+ <a href=#Features> Features </a> |
15
+ <a href=#quick-start> Quick Start </a>
16
+ </h4>
17
+
18
+ The **PaddleMIX** application examples are developed on top of paddlemix, ppdiffusers, and PaddleNLP, and are **simple**, **easy to use**, and **powerful**. They aggregate high-quality industry pre-trained models and provide an out-of-the-box development experience, covering cross-modal and multi-scenario model combinations to meet developers' needs for flexible customization.
19
+
20
+ <img src="https://github.com/user-attachments/assets/4c695140-bf4c-46db-bbb5-5dd8197be947" align="center" />
21
+
22
+
23
+ ## Quick Start
24
+ Please first confirm that [PaddleMIX](../README_EN.md/#installation) and [ppdiffusers](../README_EN.md/#installation) have been installed.
25
+
26
+ ### 1.requirements
27
+ ```shell
28
+ pip install -r paddlemix/appflow/requirements.txt
29
+ ```
30
+
31
+ ### 2.Appflow
32
+
33
+ PaddleMIX provides Appflow, which requires no training and produces results directly from the input data:
34
+
35
+ ```
36
+ >>> python
37
+ >>> from paddlemix.appflow import Appflow
38
+ >>> from ppdiffusers.utils import load_image
39
+
40
+ >>> task = Appflow(app="openset_det_sam",
41
+ models=["GroundingDino/groundingdino-swint-ogc","Sam/SamVitH-1024"],
42
+ static_mode=False) #如果开启静态图推理,设置为True,默认动态图
43
+ >>> url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
44
+ >>> image_pil = load_image(url)
45
+ >>> result = task(image=image_pil,prompt="dog")
46
+ ```
47
+
48
+ Parameter Description
49
+ | parameter | required| meaning |
50
+ |-------|-------|---------------------------------------------------------------------------------------------|
51
+ | --app | Yes| app name |
52
+ | --models | Yes | model list,can be a single model or multiple combinations |
53
+ | --static_mode | Option | static graph inference, default : False |
54
+ | --precision | Option | used when static_mode == True; default: fp32, options: trt_fp32, trt_fp16 |
55
+
56
+ ## Features
57
+
58
+ #### <a href=#out-of-box-toolset> Out-of-Box Toolset </a>
59
+
60
+ #### <a href=#multi-modal-and-scenario> Multi Modal And Scenario </a>
61
+
62
+
63
+
64
+ ### Out-of-Box Toolset
65
+
66
+ Appflow provides a rich set of out-of-the-box tools covering cross-modal and multi-scenario applications, delivering industry-grade results and excellent inference performance.
67
+ ![appflow](https://github.com/LokeZhou/PaddleMIX/assets/13300429/f80a7aa0-4cd5-4f86-90d6-2fc6da3eb42f)
68
+
69
+ ### Multi Modal And Scenario
70
+ | name | models | static mode |
71
+ | :--------------------------------- | -------------------------------- | ----------|
72
+ | [视觉语言对话(Vision-Language-Chat)](./VLChat/README.md) | `qwen-vl-chat-7b` | 🚧 |
73
+ | [开放世界检测分割(Openset-Det-Sam)](./CVinW/README.md/#开放世界检测分割grounded-sam-detect-and-segment-everything-with-text-prompt) | `grounded sam` | ✅ |
74
+ | [自动标注(AutoLabel)](./Automatic_label/README.md/#自动标注autolabel) | `blip2 grounded sam` | ✅ |
75
+ | [检测框引导的图像编辑(Det-Guided-Inpainting)](./Inpainting/README.md/#检测框引导的图像编辑det-guided-inpainting) | `chatglm-6b stable-diffusion-2-inpainting grounded sam` | ✅ |
76
+ | [文图生成(Text-to-Image Generation)](./text2image/README.md/#文图生成text-to-image-generation) | `runwayml/stable-diffusion-v1-5` | [fastdeploy](../ppdiffusers/deploy/README.md/#文图生成text-to-image-generation) |
77
+ | [文本引导的图像放大(Text-Guided Image Upscaling)](./image2image/README.md/#文本引导的图像放大text-guided-image-upscaling) | `ldm-super-resolution-4x-openimages`| ❌ |
78
+ | [文本引导的图像编辑(Text-Guided Image Inpainting)](./Inpainting/README.md/#文本引导的图像编辑text-guided-image-inpainting) | `stable-diffusion-2-inpainting` | [fastdeploy](../ppdiffusers/deploy/README.md/#文本引导的图像编辑text-guided-image-inpainting) |
79
+ | [文本引导的图像变换(Image-to-Image Text-Guided Generation)](./image2image/README.md/#文本引导的图像变换image-to-image-text-guided-generation) | `stable-diffusion-v1-5` | [fastdeploy](../ppdiffusers/deploy/README.md/#文本引导的图像变换image-to-image-text-guided-generation) |
80
+ | [文本条件的视频生成(Text-to-Video Generation)](./text2video/README.md/#文本条件的视频生成text-to-video-generation) | `text-to-video-ms-1.7b` | ❌ |
81
+
82
+
83
+ More applications under continuous development......
84
+
85
+ * ✅: Supported
86
+ * 🚧: In Progress
87
+ * ❌: Not Supported
PaddleMIX/applications/gradio_autolable.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from paddlemix.appflow import Appflow
2
+ from ppdiffusers.utils import load_image
3
+ import paddle
4
+ import cv2
5
+
6
+ import os
7
+ import json
8
+ from zipfile import ZipFile
9
+ import zipfile
10
+ import numpy as np
11
+ from PIL import Image, ImageDraw
12
+ import gradio as gr
13
+ import traceback
14
+ import math
15
+ import tempfile
16
+
17
+
18
+ task = Appflow(app="auto_label",
19
+ models=["paddlemix/blip2-caption-opt2.7b","GroundingDino/groundingdino-swint-ogc","Sam/SamVitH-1024"])
20
+
21
+
22
def auto_label(img, prompt):
    """Run the shared auto-label Appflow pipeline on a single PIL image.

    ``prompt`` is forwarded as the BLIP-2 caption prompt; the returned dict
    is whatever the Appflow task produces (labels, boxes, seg_masks, image).
    """
    return task(image=img, blip2_prompt=prompt)
25
+
26
+
27
def result2json(result, filename):
    """Convert an auto-label result dict into a labelme-style annotation dict.

    Parameters
    ----------
    result : dict
        Output of the auto_label pipeline. Must contain 'image' (PIL image),
        'labels' (list of strings, each possibly ending in a "(confidence)"
        suffix), 'boxes' (array-like rows of [xmin, ymin, xmax, ymax]) and
        'seg_masks' (tensor-like boolean masks; ``m.numpy()[0]`` is 2-D).
        -- shapes assumed from the indexing below; confirm against the task.
    filename : str
        Value stored in the JSON 'imagePath' field.

    Returns
    -------
    dict
        labelme-compatible dict with one 'rectangle' and one 'polygon'
        shape appended per detected object.
    """
    label_data = {
        'version': '0.0.0',
        'flags': {},
        'shapes': [],
        'imagePath': filename,
        'imageHeight': result['image'].size[1],
        'imageWidth': result['image'].size[0],
    }

    for i in range(len(result['labels'])):
        # Strip the trailing "(confidence)" suffix from the label, if present.
        # rfind matches the original manual scan, which kept the LAST '('.
        label = result['labels'][i]
        spl_idx = label.rfind('(')
        if spl_idx != -1:
            label = label[:spl_idx]

        # Bounding box as a two-point rectangle shape.
        xmin, ymin, xmax, ymax = result['boxes'][i].tolist()
        label_data['shapes'].append(
            {'label': label,
             'points': [[xmin, ymin], [xmax, ymax]],
             'group_id': None,
             'shape_type': 'rectangle',
             'flags': {}
             }
        )

        # Trace the mask's outer contours and flatten them into a point list.
        seg_mask = result['seg_masks'][i].numpy()[0]
        mask_img = seg_mask.astype('uint8') * 255
        contours, _ = cv2.findContours(mask_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        points = [point[0].tolist() for contour in contours for point in contour]

        # Polygon shape for the same object.
        label_data['shapes'].append(
            {'label': label,
             'points': points,
             'group_id': None,
             'shape_type': 'polygon',
             'flags': {}
             }
        )

    return label_data
81
+
82
+
83
def generate_mask(img, result_masks):
    """Overlay each segmentation mask onto *img* as a flat color region.

    Colors cycle through the R/G/B channels; the channel intensity is stepped
    up once per group of three masks so successive masks stay distinguishable.
    Returns a new ``PIL.Image``; *img* itself is not modified.
    """
    # Intensity step per group of three masks (one group per R/G/B cycle).
    divide_part = int(255 / (math.ceil(len(result_masks) / 3) + 1))
    np_img = np.array(img)
    for i, mask in enumerate(result_masks):
        color = [0, 0, 0]
        channel = i % 3        # which of R/G/B this mask gets
        group = i // 3 + 1     # intensity multiplier for this group
        color[channel] = divide_part * group
        # mask.numpy()[0] is assumed to be a 2-D boolean array -- confirm.
        np_img[mask.numpy()[0]] = color
        # NOTE: removed leftover debug `print(color)` from the original.
    return Image.fromarray(np_img)
97
+
98
+
99
def al_fun(img, prompt):
    """Label one image; return (bbox-annotated image, mask image, JSON path).

    *img* arrives as a numpy array from gradio; the annotation JSON is
    written into the module-level ``tmpdir`` so gradio can serve it.
    """
    pil_img = Image.fromarray(img.astype('uint8')).convert('RGB')
    result = auto_label(pil_img, prompt)
    label_data = result2json(result, "tmpimg")

    # Draw detection boxes directly on the converted input image.
    painter = ImageDraw.Draw(pil_img)
    for box in result['boxes']:
        painter.rectangle(box.tolist(), width=10)

    # Colorized mask overlay rendered from the pipeline's own image.
    mask_img = generate_mask(result['image'], result['seg_masks'])

    # Persist the annotation JSON for download.
    labeled_file = os.path.join(tmpdir, 'labeled_date.json')
    with open(labeled_file, 'w') as f:
        json.dump(label_data, f, indent=4)
    return pil_img, mask_img, labeled_file
115
+
116
+
117
def al_file_fun(file_in, prompt):
    """Batch-label uploaded image files.

    Parameters
    ----------
    file_in : list
        Gradio file wrappers; each exposes a ``.name`` attribute holding the
        temp-file path of one uploaded image.
    prompt : str
        BLIP-2 caption prompt forwarded to the auto_label pipeline.

    Returns
    -------
    str
        Path to a zip archive (in ``tmpdir``) containing one labelme JSON
        per uploaded image.
    """
    out_zip_file = os.path.join(tmpdir, "labeled.zip")
    with ZipFile(out_zip_file, "w") as zip_obj:
        for img_file in file_in:
            image_pil = Image.open(img_file.name)
            result = auto_label(image_pil, prompt)
            basename = os.path.basename(img_file.name)
            label_data = result2json(result, basename)
            # BUG FIX: the original wrote '.josn' so downstream tools could
            # not find the annotation files; corrected to '.json'.
            labeled_file = os.path.join(tmpdir, basename + '.json')
            with open(labeled_file, 'w') as f:
                json.dump(label_data, f, indent=4)
            zip_obj.write(labeled_file)
    return out_zip_file
129
+
130
+
131
def al_zip_fun(zip_in, prompt):
    """Label every image found inside the uploaded zip archive(s).

    Each archive is extracted into ``tmpdir``; every image file found under
    it is run through the auto_label pipeline, its labelme JSON is added to
    ``labeled.zip``, and the extracted image is deleted afterwards.

    Returns the path to the resulting zip of JSON annotations.
    """
    # NOTE: removed leftover debug logging that appended file names to a
    # local 'test.txt' on every call in the original.
    for archive in zip_in:
        zipfile.ZipFile(archive.name).extractall(tmpdir)

    image_exts = {'jpg', 'png', 'jpeg', 'JPG', 'PNG', 'JPEG'}
    out_zip_file = os.path.join(tmpdir, "labeled.zip")
    with ZipFile(out_zip_file, "w") as zip_obj:
        for root, _, files in os.walk(tmpdir, topdown=False):
            for name in files:
                if name.split('.')[-1] not in image_exts:
                    continue
                img_path = os.path.join(root, name)
                json_path = os.path.join(root, name + '.json')

                image_pil = Image.open(img_path)
                result = auto_label(image_pil, prompt)
                label_data = result2json(result, img_path)
                with open(json_path, 'w') as f:
                    json.dump(label_data, f, indent=4)
                zip_obj.write(json_path)
                # Delete the extracted image so later walks don't relabel it.
                os.remove(img_path)
    return out_zip_file
155
+
156
+
157
# Gradio UI: three tabs all driving the same auto-label pipeline.
# NOTE(review): indentation was lost in the source paste; the Row/Tab
# nesting below is the conventional gradio layout -- confirm against the
# rendered app.
with gr.Blocks() as demo:
    gr.Markdown("# 自动标注(AutoLabel)")

    # Tab 1: single image -> bbox preview, mask preview, JSON download.
    with gr.Tab("单张图片标注"):
        with gr.Row():
            al_image_in = gr.Image(label="输入图片")
            al_image_out1 = gr.Image(label="BBox标注图片")
            al_image_out2 = gr.Image(label="Mask标注图片")
        al_text_in = gr.Text(label="Prompt", value="describe the image")
        al_file_out_ = gr.File(label="标注文件")
        al_button = gr.Button()
        al_button.click(
            fn=al_fun,
            inputs=[al_image_in, al_text_in],
            outputs=[al_image_out1, al_image_out2, al_file_out_],
        )

    # Tab 2: several images -> one zip of JSON annotations.
    with gr.Tab("上传多张图片批量标注"):
        with gr.Row():
            al_file_in = gr.Files(
                label="上传多张图片",
                file_types=['.jpg', '.png', '.jpeg', '.JPG', '.PNG', '.JPEG'],
            )
            al_file_out = gr.File(label="标注结果")
        al_file_text_in = gr.Text(label="Prompt", value="describe the image")
        al_file_button = gr.Button()
        al_file_button.click(
            fn=al_file_fun,
            inputs=[al_file_in, al_file_text_in],
            outputs=[al_file_out],
        )

    # Tab 3: zip archive(s) of images -> one zip of JSON annotations.
    with gr.Tab("上传压缩包批量标注"):
        with gr.Row():
            al_zip_in = gr.Files(label="上传压缩包", file_types=['.zip'])
            al_zip_out = gr.File(label="标注结果")
        al_zip_text_in = gr.Text(label="Prompt", value="describe the image")
        al_zip_button = gr.Button()
        al_zip_button.click(
            fn=al_zip_fun,
            inputs=[al_zip_in, al_zip_text_in],
            outputs=[al_zip_out],
        )


# Serve downloads out of a temp directory; ``tmpdir`` is read by the
# handler functions above. (``global`` at module scope is a no-op but is
# kept for clarity of intent.)
global tmpdir
with tempfile.TemporaryDirectory(dir='.') as tmpdir:
    demo.launch()
PaddleMIX/deploy/README.md ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # PaddleMIX推理部署
2
+
3
+ [[English](README_en.md)]
4
+
5
+ PaddleMIX基于Paddle Inference,提供了python的部署方案。部署方式分为两种:
6
+ - 通过 **APPflow** ,设置static_mode = True 变量开启静态图推理,同时可配合trt加速推理;该方式部分模型不支持静态图以及trt,具体模型可参考[跨模态多场景应用](../applications/README.md/#跨模态多场景应用);
7
+
8
+ - 单模型部署
9
+
10
+
11
+ ## 1.APPflow部署
12
+
13
+ 在使用 PaddleMIX 一键预测 **APPflow** 时,可通过设置 static_mode = True 变量开启静态图推理,同时可配合trt加速推理。
14
+
15
+ ### 1.1 示例
16
+
17
+ ```python
18
+ >>> from paddlemix.appflow import Appflow
19
+ >>> from PIL import Image
20
+
21
+ >>> task = Appflow(app="openset_det_sam",
22
+ models=["GroundingDino/groundingdino-swint-ogc","Sam/SamVitH-1024"],
23
+ static_mode=True,
24
+ precision="fp32")
25
+ >>> image_pil = Image.open("beauty.png").convert("RGB")
26
+ >>> result = task(image=image_pil,prompt="women")
27
+ ```
28
+
29
+ ### 1.2 参数说明
30
+ | 参数 | 是否必须| 含义 |
31
+ |-------|-------|---------------------------------------------------------------------------------------------|
32
+ | --app | Yes| 应用名称 |
33
+ | --models | Yes | 需要使用的模型,可以是单个模型,也可以多个组合 |
34
+ | --static_mode | Option | 是否静态图推理,默认False |
35
+ | --precision | Option | 当 static_mode == True 时使用,默认fp32,可选择trt_fp32、trt_fp16 |
36
+
37
+ 说明:
38
+ - 部分模型不支持静态图以及trt,具体可参考[跨模态多场景应用](../applications/README.md)
39
+ - 生成的静态图将在模型名字对应的文件夹下 如:GroundingDino/groundingdino-swint-ogc/
40
+
41
+
42
+ ## 2. 单模型预测部署
43
+
44
+ Python端预测部署主要包含两个步骤:
45
+ - 导出预测模型
46
+ - 基于Python进行预测
47
+
48
+ 当前支持模型:
49
+ - [blip2](./blip2/README.md)
50
+ - [groundingdino](./groundingdino/README.md)
51
+ - [sam](./sam/README.md)
52
+ - [qwen_vl](./qwen_vl/README.md)
53
+
54
+ 以 groundingdino 为例子。
55
+
56
+ ### 2.1 导出预测模型
57
+
58
+ ```bash
59
+ cd deploy/groundingdino
60
+ # 导出groundingdino模型
61
+ python export.py \
62
+ --dino_type GroundingDino/groundingdino-swint-ogc
63
+ ```
64
+ 导出后目录下,包括 `model_state.pdiparams`, `model_state.pdiparams.info`, `model_state.pdmodel`等文件。
65
+
66
+ ### 2.2 基于python的预测
67
+
68
+ ```bash
69
+ python predict.py \
70
+ --text_encoder_type GroundingDino/groundingdino-swint-ogc \
71
+ --model_path output_groundingdino/GroundingDino/groundingdino-swint-ogc \
72
+ --input_image https://bj.bcebos.com/v1/paddlenlp/models/community/GroundingDino/000000004505.jpg \
73
+ --output_dir ./groundingdino_predict_output \
74
+ --prompt "bus"
75
+
76
+ ```
77
+
78
+ ## 3. 推理 BenchMark
79
+
80
+ > Note:
81
+ > 测试环境为:
82
+ Paddle 3.0,
83
+ PaddleMIX release/2.0
84
+ PaddleNLP2.7.2
85
+ A100 80G单卡。
86
+
87
+ ### 3.1 benchmark命令
88
+
89
+ 在 `deploy` 对应模型目录下的运行后加 --benchmark,
90
+ 如 GroundingDino 的benchmark命令为:
91
+
92
+ ```bash
93
+ cd deploy/groundingdino
94
+ python predict.py \
95
+ --text_encoder_type GroundingDino/groundingdino-swint-ogc \
96
+ --model_path output_groundingdino/GroundingDino/groundingdino-swint-ogc \
97
+ --input_image https://bj.bcebos.com/v1/paddlenlp/models/community/GroundingDino/000000004505.jpg \
98
+ --output_dir ./groundingdino_predict_output \
99
+ --prompt "bus" \
100
+ --benchmark True
101
+ ```
102
+
103
+ # A100性能数据
104
+ |模型|图片分辨率|数据类型 |Paddle Deploy |
105
+ |-|-|-|-|
106
+ |qwen-vl-7b|448*448|fp16|669.8 ms|
107
+ |llava-1.5-7b|336*336|fp16|981.2 ms|
108
+ |llava-1.6-7b|336*336|fp16|778.7 ms|
109
+ |groundingDino/groundingdino-swint-ogc|800*1193|fp32|100 ms|
110
+ |Sam/SamVitH-1024|1024*1024|fp32|121 ms|
PaddleMIX/docs/CHANGELOG.md ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 版本更新信息
2
+
3
+ ## 最新版本信息
4
+
5
+ ### 2.0(07/26/2024)
6
+
7
+ #### 多模态理解
8
+
9
+ 1. 新增模型:LLaVA: v1.5-7b, v1.5-13b, v1.6-7b,CogAgent, CogVLM, Qwen-VL, InternLM-XComposer2
10
+ 2. 数据集增强:新增chatml_dataset图文对话数据读取方案,可自定义chat_template文件适配,支持混合数据集
11
+ 3. 工具链升级:新增Auto模块,统一SFT训练流程,兼容全参数、lora训练。新增mixtoken训练策略,SFT吞吐量提升5.6倍。支持Qwen-VL,LLaVA推理部署,较torch推理性能提升2.38倍
12
+
13
+ #### 多模态生成
14
+
15
+ 1. 视频生成能力:支持Sora相关技术,支持DiT、SiT、UViT训练推理,新增NaViT、MAGVIT-v2模型; 新增视频生成模型SVD、Open Sora,支持模型微调和推理; 新增姿态可控视频生成模型AnimateAnyone、即插即用视频生成模型AnimateDiff、GIF视频生成模型Hotshot-XL;
16
+ 2. 文生图模型库:新增高速推理文图生成模型LCM,适配SD/SDXL训练和推理;
17
+ 3. 工具链升级:发布ppdiffusers 0.24.1版本,新增peft,accelerate后端; 权重加载/保存全面升级,支持分布式、模型切片、safetensors等场景。
18
+ 4. 生态兼容:提供基于ppdiffusers开发的ComfyUI插件,支持了常见的模型加载转换、文生图、图生图、图像局部修改等任务。新增Stable Diffusion 1.5系列节点;新增Stable Diffusion XL系列节点。新增4个图像生成的workflow案例。
19
+
20
+ #### DataCopilot(多模态数据处理工具箱)
21
+
22
+ 1. 多模态数据集类型MMDataset,支持加载和导出Json、H5、Jsonl等多种数据存储格式,内置并发(map, filter)数据处理接口等
23
+ 2. 多模态数据格式工具,支持自定义数据结构,数据转换,离线格式检查
24
+ 3. 多模态数据分析工具,支持基本的统计信息,数据可视化功能,以及注册自定义功能
25
+
26
+ ### 1.0(11/15/2023)
27
+
28
+ #### 核心能力
29
+
30
+ 1. 大规模预训练: BLIP-2支持数据并行、sharding、模型并行,流水线并行训练;支持千亿参数规模训练; EVA-CLIP支持数据并行、sharding、模型并行训练; Stable Diffusion支持数据并行、sharding、BF16 O2训练; CLIP,Coca支持数据并行训练
31
+ 2. 有监督精调: Stable Diffusion,SDXL 支持LoRA精调
32
+ 3. 推理部署: 支持BLIP-2,miniGPT-4,Grounding DINO, SAM,Stable Diffusion动转静导出部署
33
+
34
+ #### 前沿模型
35
+ 1. 新增CLIP系列跨模态大模型:CLIP,EVA-CLIP,Coca
36
+ 2. 新增图生文跨模态大模型:BLIP-2,miniGPT-4,VisualGLM
37
+ 3. 新增跨模态视觉模型:Grounding DINO, SAM
38
+ 4. 新增融合更多模态大模型:ImageBind
39
+ 5. 新增文生图模型:SDXL,支持Text2Image、Img2Img、Inpainting、InstructPix2Pix等任务,支持DreamBooth Lora训练; 新增UniDiffuser,通过统一的多模态扩散过程支持文生图、图生文等任务; 新增文本条件视频生成模型LVDM,支持训练与推理; 新增文图生成模型Kandinsky 2.2,Consistency models; Controlnet升级,支持ControlNetImg2Img、ControlNetInpaint、 StableDiffusionXLControlNet等。
40
+
41
+ #### 特色应用
42
+ 1. 新增跨模态大模型应用流水线AppFlow
43
+ 2. 新增基于chat的图像编辑应用
44
+ 3. 新增自动标注应用
PaddleMIX/docs/FAQ.md ADDED
File without changes
PaddleMIX/paddlemix/__init__.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # copyright (c) 2023 paddlepaddle authors. all rights reserved.
2
+ # copyright 2023 the salesforce team authors and the huggingface team. all rights reserved.
3
+ #
4
+ # licensed under the apache license, version 2.0 (the "license");
5
+ # you may not use this file except in compliance with the license.
6
+ # you may obtain a copy of the license at
7
+ #
8
+ # http://www.apache.org/licenses/license-2.0
9
+ #
10
+ # unless required by applicable law or agreed to in writing, software
11
+ # distributed under the license is distributed on an "as is" basis,
12
+ # without warranties or conditions of any kind, either express or implied.
13
+ # see the license for the specific language governing permissions and
14
+ # limitations under the license.
15
+
16
+ from .datasets import *
17
+ from .models import *
18
+ from .optimization import *
19
+ from .processors import *
20
+ from .triton_ops import *
PaddleMIX/ppdiffusers/README.md ADDED
@@ -0,0 +1,1278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="center">
2
+ <img src="https://user-images.githubusercontent.com/11793384/215372703-4385f66a-abe4-44c7-9626-96b7b65270c8.png" width="40%" height="40%" />
3
+ </div>
4
+
5
+ <p align="center">
6
+ <a href="https://pypi.org/project/ppdiffusers/"><img src="https://img.shields.io/pypi/pyversions/ppdiffusers"></a>
7
+ <a href=""><img src="https://img.shields.io/badge/os-linux%2C%20win%2C%20mac-yellow.svg"></a>
8
+ <a href="https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/LICENSE"><img src="https://img.shields.io/badge/license-Apache%202-dfd.svg"></a>
9
+ </p>
10
+
11
+ <h4 align="center">
12
+ <a href=#特性> 特性 </a> |
13
+ <a href=#安装> 安装 </a> |
14
+ <a href=#快速开始> 快速开始 </a> |
15
+ <a href=#模型部署> 模型部署</a>
16
+ </h4>
17
+
18
+ # PPDiffusers: Diffusers toolbox implemented based on PaddlePaddle
19
+
20
+ **PPDiffusers**是一款支持多种模态(如文本图像跨模态、图像、语音)扩散模型(Diffusion Model)训练和推理的国产化工具箱,依托于[**PaddlePaddle**](https://www.paddlepaddle.org.cn/)框架和[**PaddleNLP**](https://github.com/PaddlePaddle/PaddleNLP)自然语言处理开发库。
21
+
22
+ ## News 📢
23
+ * 🔥 **2024.10.18 发布 0.29.0 版本,新增图像生成模型[Stable Diffusion 3 (SD3)](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/examples/text_to_image/README_sd3.md),支持DreamBooth训练及高性能推理;SD3、SDXL适配昇腾910B,提供国产计算芯片上的训推能力;DIT支持[高性能推理](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/examples/class_conditional_image_generation/DiT/README.md#23-paddle-inference-%E9%AB%98%E6%80%A7%E8%83%BD%E6%8E%A8%E7%90%86);支持PaddleNLP 3.0 beta版本。**
24
+
25
+ * 🔥 **2024.07.15 发布 0.24.1 版本,新增[Open-Sora](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/Open-Sora),支持模型训练和推理;全面支持Paddle 3.0。**
26
+
27
+ * 🔥 **2024.04.17 发布 0.24.0 版本,支持[Sora相关技术](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/sora),支持[DiT](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/class_conditional_image_generation/DiT)、[SiT](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/class_conditional_image_generation/DiT#exploring-flow-and-diffusion-based-generative-models-with-scalable-interpolant-transformers-sit)、[UViT](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/text_to_image_mscoco_uvit)训练推理,新增[NaViT](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/navit)、[MAGVIT-v2](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/video_tokenizer/magvit2)模型;
28
+ 视频生成能力全面升级;
29
+ 新增视频生成模型[SVD](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/stable_video_diffusion),支持模型微调和推理;
30
+ 新增姿态可控视频生成模型[AnimateAnyone](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/AnimateAnyone)、即插即用视频生成模型[AnimateDiff](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/examples/inference/text_to_video_generation_animediff.py)、GIF视频生成模型[Hotshot-XL](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/community/Hotshot-XL);
31
+ 新增高速推理文图生成模型[LCM](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/consistency_distillation),支持SD/SDXL训练和推理;
32
+ [模型推理部署](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/deploy)全面升级;新增peft,accelerate后端;
33
+ 权重加载/保存全面升级,支持分布式、模型切片、safetensors等场景,相关能力已集成DiT、 [IP-Adapter](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/ip_adapter)、[PhotoMaker](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/PhotoMaker)、[InstantID](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/InstantID)等。**
34
+ * 🔥 **2023.12.12 发布 0.19.4 版本,修复已知的部分 BUG,修复 0D Tensor 的 Warning,新增 SDXL 的 FastdeployPipeline。**
35
+ * 🔥 **2023.09.27 发布 0.19.3 版本,新增[SDXL](#文本图像多模),支持Text2Image、Img2Img、Inpainting、InstructPix2Pix等任务,支持DreamBooth Lora训练;
36
+ 新增[UniDiffuser](#文本图像多模),通过统一的多模态扩散过程支持文生图、图生文等任务;
37
+ 新增文本条件视频生成模型[LVDM](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/text_to_video_lvdm),支持训练与推理;
38
+ 新增文图生成模型[Kandinsky 2.2](#文本图像多模),[Consistency models](#文本图像多模);
39
+ Stable Diffusion支持[BF16 O2训练](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/stable_diffusion),效果对齐FP32;
40
+ [LoRA加载升级](#加载HF-LoRA权重),支持加载SDXL的LoRA权重;
41
+ [Controlnet](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/ppdiffusers/pipelines/controlnet)升级,支持ControlNetImg2Img、ControlNetInpaint、StableDiffusionXLControlNet等。**
42
+
43
+
44
+
45
+
46
+ ## 特性
47
+ #### 📦 SOTA扩散模型Pipelines集合
48
+ 我们提供**SOTA(State-of-the-Art)** 的扩散模型Pipelines集合。
49
+ 目前**PPDiffusers**已经集成了**100+Pipelines**,支持文图生成(Text-to-Image Generation)、文本引导的图像编辑(Text-Guided Image Inpainting)、文本引导的图像变换(Image-to-Image Text-Guided Generation)、文本条件的视频生成(Text-to-Video Generation)、超分(Super Resolution)、文本条件的音频生成(Text-to-Audio Generation)在内的**10余项**任务,覆盖**文本、图像、视频、音频**等多种模态。
50
+ 如果想要了解当前支持的所有**Pipelines**以及对应的来源信息,可以阅读[🔥 PPDiffusers Pipelines](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/pipelines/README.md)文档。
51
+
52
+
53
+ #### 🔊 提供丰富的Noise Scheduler
54
+ 我们提供了丰富的**噪声调度器(Noise Scheduler)**,可以对**速度**与**质量**进行权衡,用户可在推理时根据需求快速切换使用。
55
+ 当前**PPDiffusers**已经集成了**14+Scheduler**,不仅支持 [DDPM](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/schedulers/scheduling_ddpm.py)、[DDIM](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim.py) 和 [PNDM](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/schedulers/scheduling_pndm.py),还支持最新的 [🔥 DPMSolver](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_multistep.py)!
56
+
57
+ #### 🎛️ 提供多种扩散模型组件
58
+ 我们提供了**多种扩散模型**组件,如[UNet1DModel](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/models/unet_1d.py)、[UNet2DModel](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/models/unet_2d.py)、[UNet2DConditionModel](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/models/unet_2d_condition.py)、[UNet3DConditionModel](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/models/unet_3d_condition.py)、[VQModel](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/models/vae.py)、[AutoencoderKL](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/models/vae.py)等。
59
+
60
+
61
+ #### 📖 提供丰富的训练和推理教程
62
+ 我们提供了丰富的训练教程,不仅支持扩散模型的二次开发微调,如基于[Textual Inversion](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/textual_inversion)和[DreamBooth](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/dreambooth)使用3-5张图定制化训练生成图像的风格或物体,还支持[🔥 Latent Diffusion Model](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/text_to_image_laion400m)、[🔥 ControlNet](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/controlnet)、[🔥 T2I-Adapter](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/t2i-adapter) 等扩散模型的训练!
63
+ 此外,我们还提供了丰富的[🔥 Pipelines推理样例](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/examples/inference)。
64
+
65
+ #### 🚀 支持FastDeploy高性能部署
66
+ 我们提供基于[FastDeploy](https://github.com/PaddlePaddle/FastDeploy)的[🔥 高性能Stable Diffusion Pipeline](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion.py),更多有关FastDeploy进行多推理引擎后端高性能部署的信息请参考[🔥 高性能FastDeploy推理教程](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers/deploy)。
67
+
68
+ ## 安装
69
+
70
+ ### 环境依赖
71
+ ```
72
+ pip install -r requirements.txt
73
+ ```
74
+ 关于PaddlePaddle安装的详细教程请查看[Installation](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html)。
75
+
76
+ ### pip安装
77
+
78
+ ```shell
79
+ pip install --upgrade ppdiffusers
80
+ ```
81
+
82
+ ### 手动安装
83
+ ```shell
84
+ git clone https://github.com/PaddlePaddle/PaddleMIX
85
+ cd PaddleMIX/ppdiffusers
86
+ python setup.py install
87
+ ```
88
+ ### 设置代理
89
+ ```shell
90
+ export HF_HUB_ENABLE_HF_TRANSFER=1
91
+ export HF_ENDPOINT=https://hf-mirror.com
92
+ ```
93
+
94
+ ## 快速开始
95
+ 我们将以扩散模型的典型代表**Stable Diffusion**为例,带你快速了解PPDiffusers。
96
+
97
+ **Stable Diffusion**基于**潜在扩散模型(Latent Diffusion Models)**,专门用于**文图生成(Text-to-Image Generation)任务**。该模型是由来自 [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/), [LAION](https://laion.ai/)以及[RunwayML](https://runwayml.com/)的工程师共同开发完成,目前发布了v1和v2两个版本。v1版本采用了LAION-5B数据集子集(分辨率为 512x512)进行训练,并具有以下架构设置:自动编码器下采样因子为8,UNet大小为860M,文本编码器为CLIP ViT-L/14。v2版本相较于v1版本在生成图像的质量和分辨率等进行了改善。
98
+
99
+ ### Stable Diffusion重点模型权重
100
+
101
+ <details><summary>&emsp; Stable Diffusion 模型支持的权重(英文) </summary>
102
+
103
+ **我们只需要将下面的"xxxx",替换成所需的权重名,即可快速使用!**
104
+ ```python
105
+ from ppdiffusers import *
106
+
107
+ pipe_text2img = StableDiffusionPipeline.from_pretrained("xxxx")
108
+ pipe_img2img = StableDiffusionImg2ImgPipeline.from_pretrained("xxxx")
109
+ pipe_inpaint_legacy = StableDiffusionInpaintPipelineLegacy.from_pretrained("xxxx")
110
+ pipe_mega = StableDiffusionMegaPipeline.from_pretrained("xxxx")
111
+
112
+ # pipe_mega.text2img() 等于 pipe_text2img()
113
+ # pipe_mega.img2img() 等于 pipe_img2img()
114
+ # pipe_mega.inpaint_legacy() 等于 pipe_inpaint_legacy()
115
+ ```
116
+
117
+ | PPDiffusers支持的模型名称 | 支持加载的Pipeline | 备注 | huggingface.co地址 |
118
+ | :-------------------------------------------: | :--------------------------------------------------------------------: | --- | :-----------------------------------------: |
119
+ | CompVis/stable-diffusion-v1-4 | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | Stable-Diffusion-v1-4 使用 Stable-Diffusion-v1-2 的权重进行初始化。随后在"laion-aesthetics v2 5+"数据集上以 **512x512** 分辨率微调了 **225k** 步数,对文本使用了 **10%** 的dropout(即:训练过程中文图对中的文本有 10% 的概率会变成空文本)。模型使用了[CLIP ViT-L/14](https://huggingface.co/openai/clip-vit-large-patch14)作为文本编码器。| [地址](https://huggingface.co/CompVis/stable-diffusion-v1-4) |
120
+ | CompVis/ldm-text2im-large-256 | LDMTextToImagePipeline | [LDM论文](https://arxiv.org/pdf/2112.10752.pdf) LDM-KL-8-G* 权重。| [地址](https://huggingface.co/CompVis/ldm-text2im-large-256) |
121
+ | CompVis/ldm-super-resolution-4x-openimages | LDMSuperResolutionPipeline | [LDM论文](https://arxiv.org/pdf/2112.10752.pdf) LDM-VQ-4 权重,[原始权重链接](https://ommer-lab.com/files/latent-diffusion/sr_bsr.zip)。| [地址](https://huggingface.co/CompVis/ldm-super-resolution-4x-openimages) |
122
+ | runwayml/stable-diffusion-v1-5 | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | Stable-Diffusion-v1-5 使用 Stable-Diffusion-v1-2 的权重进行初始化。随后在"laion-aesthetics v2 5+"数据集上以 **512x512** 分辨率微调了 **595k** 步数,对文本使用了 **10%** 的dropout(即:训练过程中文图对中的文本有 10% 的概率会变成空文本)。模型同样也使用了[CLIP ViT-L/14](https://huggingface.co/openai/clip-vit-large-patch14)作为文本编码器。| [地址](https://huggingface.co/runwayml/stable-diffusion-v1-5) |
123
+ | runwayml/stable-diffusion-inpainting | StableDiffusionInpaintPipeline | Stable-Diffusion-Inpainting 使用 Stable-Diffusion-v1-2 的权重进行初始化。首先进行了 **595k** 步的常规训练(实际也就是 Stable-Diffusion-v1-5 的权重),然后进行了 **440k** 步的 inpainting 修复训练。对于 inpainting 修复训练,给 UNet 额外增加了 **5** 输入通道(其中 **4** 个用于被 Mask 遮盖住的图片,**1** 个用于 Mask 本身)。在训练期间,会随机生成 Mask,并有 **25%** 概率会将原始图片全部 Mask 掉。| [地址](https://huggingface.co/runwayml/stable-diffusion-inpainting) |
124
+ | stabilityai/stable-diffusion-2-base | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | 该模型首先在 [LAION-5B 256x256 子集上](https://laion.ai/blog/laion-5b/) (过滤条件:[punsafe = 0.1 的 LAION-NSFW 分类器](https://github.com/LAION-AI/CLIP-based-NSFW-Detector) 和 审美分数大于等于 4.5 )从头开始训练 **550k** 步,然后又在分辨率 **>= 512x512** 的同一数据集上进一步训练 **850k** 步。| [地址](https://huggingface.co/stabilityai/stable-diffusion-2-base) |
125
+ | stabilityai/stable-diffusion-2 | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | stable-diffusion-2 使用 stable-diffusion-2-base 权重进行初始化,首先在同一数据集上(**512x512** 分辨率)使用 [v-objective](https://arxiv.org/abs/2202.00512) 训练了 **150k** 步。然后又在 **768x768** 分辨率上使用 [v-objective](https://arxiv.org/abs/2202.00512) 继续训练了 **140k** 步。| [地址](https://huggingface.co/stabilityai/stable-diffusion-2) |
126
+ | stabilityai/stable-diffusion-2-inpainting | StableDiffusionInpaintPipeline |stable-diffusion-2-inpainting 使用 stable-diffusion-2-base 权重初始化,并且额外训练了 **200k** 步。训练过程使用了 [LAMA](https://github.com/saic-mdal/lama) 中提出的 Mask 生成策略,并且使用 Mask 图片的 Latent 表示(经过 VAE 编码)作为附加条件。| [地址](https://huggingface.co/stabilityai/stable-diffusion-2-inpainting) |
127
+ | stabilityai/stable-diffusion-x4-upscaler | StableDiffusionUpscalePipeline | 该模型在**LAION 10M** 子集上(>2048x2048)训练了 1.25M 步。该模型还在分辨率为 **512x512** 的图像上使用 [Text-guided Latent Upscaling Diffusion Model](https://arxiv.org/abs/2112.10752) 进行了训练。除了**文本输入**之外,它还接收 **noise_level** 作为输入参数,因此我们可以使用 [预定义的 Scheduler](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler/blob/main/low_res_scheduler/scheduler_config.json) 向低分辨率的输入图片添加噪声。| [地址](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler) |
128
+ | hakurei/waifu-diffusion | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | waifu-diffusion-v1-2 使用 stable-diffusion-v1-4 权重初始化,并且在**高质量动漫**图像数据集上进行微调后得到的模型。用于微调的数据是 **680k** 文本图像样本,这些样本是通过 **booru 网站** 下载的。| [地址](https://huggingface.co/hakurei/waifu-diffusion) |
129
+ | hakurei/waifu-diffusion-v1-3 | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | waifu-diffusion-v1-3 是 waifu-diffusion-v1-2 基础上进一步训练得到的。他们对数据集进行了额外操作:(1)删除下划线;(2)删除括号;(3)用逗号分隔每个booru 标签;(4)随机化标签顺序。| [地址](https://huggingface.co/hakurei/waifu-diffusion) |
130
+ | naclbit/trinart_stable_diffusion_v2_60k | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | trinart_stable_diffusion 使用 stable-diffusion-v1-4 权重初始化,在 40k **高分辨率漫画/动漫风格**的图片数据集上微调了 8 个 epoch。V2 版模型使用 **dropouts**、**10k+ 图像**和**新的标记策略**训练了**更长时间**。| [地址](https://huggingface.co/naclbit/trinart_stable_diffusion_v2) |
131
+ | naclbit/trinart_stable_diffusion_v2_95k | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | **95k** 步数的结果,其他同上。| [地址](https://huggingface.co/naclbit/trinart_stable_diffusion_v2) |
132
+ | naclbit/trinart_stable_diffusion_v2_115k | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | **115k** 步数的结果,其他同上。| [地址](https://huggingface.co/naclbit/trinart_stable_diffusion_v2) |
133
+ | Deltaadams/Hentai-Diffusion | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | None| [地址](https://huggingface.co/Deltaadams/Hentai-Diffusion) |
134
+ | ringhyacinth/nail-set-diffuser | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | 美甲领域的扩散模型,训练数据使用了 [Weekend](https://weibo.com/u/5982308498)| [地址](https://huggingface.co/ringhyacinth/nail-set-diffuser) |
135
+ | Linaqruf/anything-v3.0 | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | 该模型可通过输入几个文本提示词就能生成**高质量、高度详细的动漫风格图片**,该模型支持使用 **danbooru 标签文本** 生成图像。| [地址](https://huggingface.co/Linaqruf/anything-v3.0) |
136
+
137
+ </details>
138
+ <details><summary>&emsp; Stable Diffusion 模型支持的权重(中文和多语言) </summary>
139
+
140
+
141
+ | PPDiffusers支持的模型名称 | 支持加载的Pipeline | 备注 | huggingface.co地址 |
142
+ | :-------------------------------------------: | :--------------------------------------------------------------------: | --- | :-----------------------------------------: |
143
+ | BAAI/AltDiffusion | AltDiffusionPipeline、AltDiffusionImg2ImgPipeline | 该模型使用 [AltCLIP](https://github.com/FlagAI-Open/FlagAI/tree/master/examples/AltCLIP/README.md) 作为文本编码器,在 Stable Diffusion 基础上训练了**双语Diffusion模型**,其中训练数据来自 [WuDao数据集](https://data.baai.ac.cn/details/WuDaoCorporaText) 和 [LAION](https://huggingface.co/datasets/ChristophSchuhmann/improved_aesthetics_6plus) 。| [地址](https://huggingface.co/BAAI/AltDiffusion) |
144
+ | BAAI/AltDiffusion-m9 | AltDiffusionPipeline、AltDiffusionImg2ImgPipeline |该模型使用9种语言的 [AltCLIP-m9](https://github.com/FlagAI-Open/FlagAI/tree/master/examples/AltCLIP/README.md) 作为文本编码器,其他同上。| [地址](https://huggingface.co/BAAI/AltDiffusion-m9) |
145
+ | IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1 | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | 他们将 [Noah-Wukong](https://wukong-dataset.github.io/wukong-dataset/) 数据集 (100M) 和 [Zero](https://zero.so.com/) 数据集 (23M) 用作预训练的数据集,先用 [IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese](https://huggingface.co/IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese) 对这两个数据集的图文对相似性进行打分,取 CLIP Score 大于 0.2 的图文对作为训练集。 他们使用 [IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese](https://huggingface.co/IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese) 作为初始化的text encoder,冻住 [stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) ([论文](https://arxiv.org/abs/2112.10752)) 模型的其他部分,只训练 text encoder,以便保留原始模型的生成能力且实现中文概念的对齐。该模型目前在0.2亿图文对上训练了一个 epoch。 在 32 x A100 上训练了大约100小时,该版本只是一个初步的版本。| [地址](https://huggingface.co/IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1) |
146
+ | IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-EN-v0.1 | StableDiffusionPipeline、StableDiffusionImg2ImgPipeline、StableDiffusionInpaintPipelineLegacy、StableDiffusionMegaPipeline、StableDiffusionPipelineAllinOne | 他们将 [Noah-Wukong](https://wukong-dataset.github.io/wukong-dataset/) 数据集 (100M) 和 [Zero](https://zero.so.com/) 数据集 (23M) 用作预训练的数据集,先用 [IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese](https://huggingface.co/IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese) 对这两个数据集的图文对相似性进行打分,取 CLIP Score 大于 0.2 的图文对作为训练集。 他们使用 [stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) ([论文](https://arxiv.org/abs/2112.10752)) 模型进行继续训练,其中训练分为**两个stage**。**第一个stage** 中冻住模型的其他部分,只训练 text encoder ,以便保留原始模型的生成能力且实现中文概念的对齐。**第二个stage** 中将全部模型解冻,一起训练 text encoder 和 diffusion model ,以便 diffusion model 更好的适配中文引导。第一个 stage 他们训练了 80 小时,第二个 stage 训练了 100 小时,两个stage都是用了8 x A100,该版本是一个初步的版本。| [地址](https://huggingface.co/IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-EN-v0.1) |
147
+ </details>
148
+
149
+
150
+ ### 加载HF Diffusers权重
151
+ ```python
152
+ from ppdiffusers import StableDiffusionPipeline
153
+ # 设置from_hf_hub为True,表示从huggingface hub下载,from_diffusers为True表示加载的是diffusers版Pytorch权重
154
+ pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", from_hf_hub=True, from_diffusers=True)
155
+ ```
156
+
157
+ ### 加载原库的Lightning权重
158
+ ```python
159
+ from ppdiffusers import StableDiffusionPipeline
160
+ # 可输入网址 或 本地ckpt、safetensors文件
161
+ pipe = StableDiffusionPipeline.from_single_file("https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/ppdiffusers/chilloutmix_NiPrunedFp32Fix.safetensors")
162
+ ```
163
+
164
+ ### 加载HF LoRA权重
165
+ ```python
166
+ from ppdiffusers import DiffusionPipeline
167
+
168
+ pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", paddle_dtype=paddle.float16)
169
+
170
+ pipe.load_lora_weights("stabilityai/stable-diffusion-xl-base-1.0",
171
+ weight_name="sd_xl_offset_example-lora_1.0.safetensors",
172
+ from_diffusers=True)
173
+ ```
174
+
175
+ ### 加载Civitai社区的LoRA权重
176
+ ```python
177
+ from ppdiffusers import StableDiffusionPipeline
178
+ pipe = StableDiffusionPipeline.from_pretrained("TASUKU2023/Chilloutmix")
179
+ # 加载lora权重
180
+ pipe.load_lora_weights("./",
181
+ weight_name="Moxin_10.safetensors",
182
+ from_diffusers=True)
183
+ pipe.fuse_lora()
184
+ ```
185
+
186
+ ### XFormers加速
187
+ 为了使用**XFormers加速**,我们需要安装`develop`版本的`paddle`,Linux系统的安装命令如下:
188
+ ```sh
189
+ python -m pip install paddlepaddle-gpu==0.0.0.post117 -f https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html
190
+ ```
191
+
192
+ ```python
193
+ import paddle
194
+ from ppdiffusers import StableDiffusionPipeline
195
+ pipe = StableDiffusionPipeline.from_pretrained("TASUKU2023/Chilloutmix", paddle_dtype=paddle.float16)
196
+ # 开启xformers加速 默认选择"cutlass"加速
197
+ pipe.enable_xformers_memory_efficient_attention()
198
+ # flash 需要使用 A100、A10、3060、3070、3080、3090 等以上显卡。
199
+ # pipe.enable_xformers_memory_efficient_attention("flash")
200
+ ```
201
+
202
+ ### ToME + ControlNet
203
+ ```python
204
+ # 安装develop的ppdiffusers
205
+ # pip install "ppdiffusers>=0.24.0"
206
+ import paddle
207
+ from ppdiffusers import ControlNetModel, StableDiffusionControlNetPipeline
208
+ from ppdiffusers.utils import load_image
209
+
210
+ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
211
+ pipe = StableDiffusionControlNetPipeline.from_pretrained(
212
+ "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet, paddle_dtype=paddle.float16
213
+ )
214
+
215
+ # Apply ToMe with a 50% merging ratio
216
+ pipe.apply_tome(ratio=0.5) # Can also use pipe.unet in place of pipe here
217
+
218
+ # 我们可以开启 xformers
219
+ # pipe.enable_xformers_memory_efficient_attention()
220
+ generator = paddle.Generator().manual_seed(0)
221
+ prompt = "bird"
222
+ image = load_image(
223
+ "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
224
+ )
225
+
226
+ image = pipe(prompt, image, generator=generator).images[0]
227
+
228
+ image.save("bird.png")
229
+ ```
230
+
231
+ ### 文图生成 (Text-to-Image Generation)
232
+
233
+ ```python
234
+ import paddle
235
+ from ppdiffusers import StableDiffusionPipeline
236
+
237
+ pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2")
238
+
239
+ # 设置随机种子,我们可以复现下面的结果!
240
+ paddle.seed(5232132133)
241
+ prompt = "a portrait of shiba inu with a red cap growing on its head. intricate. lifelike. soft light. sony a 7 r iv 5 5 mm. cinematic post - processing "
242
+ image = pipe(prompt, guidance_scale=7.5, height=768, width=768).images[0]
243
+
244
+ image.save("shiba_dog_with_a_red_cap.png")
245
+ ```
246
+ <div align="center">
247
+ <img width="500" alt="image" src="https://user-images.githubusercontent.com/50394665/204796701-d7911f76-8670-47d5-8d1b-8368b046c5e4.png">
248
+ </div>
249
+
250
+ ### 文本引导的图像变换(Image-to-Image Text-Guided Generation)
251
+
252
+ <details><summary>&emsp;Image-to-Image Text-Guided Generation Demo </summary>
253
+
254
+ ```python
255
+ import paddle
256
+ from ppdiffusers import StableDiffusionImg2ImgPipeline
257
+ from ppdiffusers.utils import load_image
258
+
259
+ pipe = StableDiffusionImg2ImgPipeline.from_pretrained("Linaqruf/anything-v3.0", safety_checker=None)
260
+
261
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/image_Kurisu.png"
262
+ image = load_image(url).resize((512, 768))
263
+
264
+ # 设置随机种子,我们可以复现下面的结果!
265
+ paddle.seed(42)
266
+ prompt = "Kurisu Makise, looking at viewer, long hair, standing, 1girl, hair ornament, hair flower, cute, jacket, white flower, white dress"
267
+ negative_prompt = "lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry"
268
+
269
+ image = pipe(prompt=prompt, negative_prompt=negative_prompt, image=image, strength=0.75, guidance_scale=7.5).images[0]
270
+ image.save("image_Kurisu_img2img.png")
271
+ ```
272
+ <div align="center">
273
+ <img width="500" alt="image" src="https://user-images.githubusercontent.com/50394665/204799529-cd89dcdb-eb1d-4247-91ac-b0f7bad777f8.png">
274
+ </div>
275
+ </details>
276
+
277
+ ### 文本引导的图像编辑(Text-Guided Image Inpainting)
278
+
279
+ 注意!当前有两种版本的图像编辑代码,一个是Legacy版本,一个是正式版本,下面将分别介绍两种代码如何使用!
280
+
281
+ <details><summary>&emsp;Legacy版本代码</summary>
282
+
283
+ ```python
284
+ import paddle
285
+ from ppdiffusers import StableDiffusionInpaintPipelineLegacy
286
+ from ppdiffusers.utils import load_image
287
+
288
+ # 可选模型权重
289
+ # CompVis/stable-diffusion-v1-4
290
+ # runwayml/stable-diffusion-v1-5
291
+ # stabilityai/stable-diffusion-2-base (原始策略 512x512)
292
+ # stabilityai/stable-diffusion-2 (v-objective 768x768)
293
+ # Linaqruf/anything-v3.0
294
+ # ......
295
+ img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
296
+ mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png"
297
+
298
+ image = load_image(img_url).resize((512, 512))
299
+ mask_image = load_image(mask_url).resize((512, 512))
300
+
301
+ pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("stabilityai/stable-diffusion-2-base", safety_checker=None)
302
+
303
+ # 设置随机种子,我们可以复现下面的结果!
304
+ paddle.seed(10245)
305
+ prompt = "a red cat sitting on a bench"
306
+ image = pipe(prompt=prompt, image=image, mask_image=mask_image, strength=0.75).images[0]
307
+
308
+ image.save("a_red_cat_legacy.png")
309
+ ```
310
+ <div align="center">
311
+ <img width="900" alt="image" src="https://user-images.githubusercontent.com/50394665/204802186-5a6d302b-83aa-4247-a5bb-ebabfcc3abc4.png">
312
+ </div>
313
+
314
+ </details>
315
+
316
+ <details><summary>&emsp;正式版本代码</summary>
317
+
318
+ Tips: 下面的使用方法是新版本的代码,也是官方推荐的代码,注意必须配合 **runwayml/stable-diffusion-inpainting** 和 **stabilityai/stable-diffusion-2-inpainting** 才可正常使用。
319
+ ```python
320
+ import paddle
321
+ from ppdiffusers import StableDiffusionInpaintPipeline
322
+ from ppdiffusers.utils import load_image
323
+
324
+ # 可选模型权重
325
+ # runwayml/stable-diffusion-inpainting
326
+ # stabilityai/stable-diffusion-2-inpainting
327
+ img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
328
+ mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png"
329
+
330
+ image = load_image(img_url).resize((512, 512))
331
+ mask_image = load_image(mask_url).resize((512, 512))
332
+
333
+ pipe = StableDiffusionInpaintPipeline.from_pretrained("stabilityai/stable-diffusion-2-inpainting")
334
+
335
+ # 设置随机种子,我们可以复现下面的结果!
336
+ paddle.seed(1024)
337
+ prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
338
+ image = pipe(prompt=prompt, image=image, mask_image=mask_image).images[0]
339
+
340
+ image.save("a_yellow_cat.png")
341
+ ```
342
+ <div align="center">
343
+ <img width="900" alt="image" src="https://user-images.githubusercontent.com/50394665/204801946-6cd043bc-f3db-42cf-82cd-6a6171484523.png">
344
+ </div>
345
+ </details>
346
+
347
+ ### 文本引导的图像放大 & 超分(Text-Guided Image Upscaling & Super-Resolution)
348
+
349
+ <details><summary>&emsp;Text-Guided Image Upscaling Demo</summary>
350
+
351
+ ```python
352
+ import paddle
353
+ from ppdiffusers import StableDiffusionUpscalePipeline
354
+ from ppdiffusers.utils import load_image
355
+
356
+ pipe = StableDiffusionUpscalePipeline.from_pretrained("stabilityai/stable-diffusion-x4-upscaler")
357
+
358
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/low_res_cat.png"
359
+ # 我们人工将原始图片缩小成 128x128 分辨率,最终保存的图片会放大4倍!
360
+ low_res_img = load_image(url).resize((128, 128))
361
+
362
+ prompt = "a white cat"
363
+ image = pipe(prompt=prompt, image=low_res_img).images[0]
364
+
365
+ image.save("upscaled_white_cat.png")
366
+ ```
367
+ <div align="center">
368
+ <img width="200" alt="image" src="https://user-images.githubusercontent.com/50394665/204806180-b7f1b9cf-8a62-4577-b5c4-91adda08a13b.png">
369
+ <img width="400" alt="image" src="https://user-images.githubusercontent.com/50394665/204806202-8c110be3-5f48-4946-95ea-21ad5a9a2340.png">
370
+ </div>
371
+ </details>
372
+
373
+ <details><summary>&emsp;Super-Resolution Demo</summary>
374
+
375
+ ```python
376
+ import paddle
377
+ from ppdiffusers import LDMSuperResolutionPipeline
378
+ from ppdiffusers.utils import load_image
379
+
380
+ pipe = LDMSuperResolutionPipeline.from_pretrained("CompVis/ldm-super-resolution-4x-openimages")
381
+
382
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
383
+
384
+ # 我们人工将原始图片缩小成 128x128 分辨率,最终保存的图片会放大4倍!
385
+ low_res_img = load_image(url).resize((128, 128))
386
+
387
+ image = pipe(image=low_res_img, num_inference_steps=100).images[0]
388
+
389
+ image.save("ldm-super-resolution-image.png")
390
+ ```
391
+ <div align="center">
392
+ <img width="200" alt="image" src="https://user-images.githubusercontent.com/50394665/204804426-5e28b571-aa41-4f56-ba26-68cca75fdaae.png">
393
+ <img width="400" alt="image" src="https://user-images.githubusercontent.com/50394665/204804148-fe7c293b-6cd7-4942-ae9c-446369fe8410.png">
394
+ </div>
395
+
396
+ </details>
397
+
398
+ ## 模型推理部署
399
+ 除了**Paddle动态图**运行之外,很多模型还支持将模型导出并使用推理引擎运行。我们提供基于[FastDeploy](https://github.com/PaddlePaddle/FastDeploy)上的**StableDiffusion**模型部署示例,涵盖文生图、图生图、图像编辑等任务,用户可以按照我们提供[StableDiffusion模型导出教程](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/deploy/export.md)将模型导出,然后使用`FastDeployStableDiffusionMegaPipeline`进行高性能推理部署!
400
+
401
+ <details><summary>&emsp; 已预先导出的FastDeploy版Stable Diffusion权重 </summary>
402
+
403
+ **注意:当前导出的vae encoder带有随机因素!**
404
+
405
+ - CompVis/stable-diffusion-v1-4@fastdeploy
406
+ - runwayml/stable-diffusion-v1-5@fastdeploy
407
+ - runwayml/stable-diffusion-inpainting@fastdeploy
408
+ - stabilityai/stable-diffusion-2-base@fastdeploy
409
+ - stabilityai/stable-diffusion-2@fastdeploy
410
+ - stabilityai/stable-diffusion-2-inpainting@fastdeploy
411
+ - Linaqruf/anything-v3.0@fastdeploy
412
+ - hakurei/waifu-diffusion-v1-3@fastdeploy
413
+
414
+ </details>
415
+
416
+ <details><summary>&emsp; FastDeploy Demo </summary>
417
+
418
+ ```python
419
+ import paddle
420
+ import fastdeploy as fd
421
+ from ppdiffusers import FastDeployStableDiffusionMegaPipeline
422
+ from ppdiffusers.utils import load_image
423
+
424
+ def create_runtime_option(device_id=0, backend="paddle", use_cuda_stream=True):
425
+ option = fd.RuntimeOption()
426
+ if backend == "paddle":
427
+ option.use_paddle_backend()
428
+ else:
429
+ option.use_ort_backend()
430
+ if device_id == -1:
431
+ option.use_cpu()
432
+ else:
433
+ option.use_gpu(device_id)
434
+ if use_cuda_stream:
435
+ paddle_stream = paddle.device.cuda.current_stream(device_id).cuda_stream
436
+ option.set_external_raw_stream(paddle_stream)
437
+ return option
438
+
439
+ runtime_options = {
440
+ "text_encoder": create_runtime_option(0, "paddle"), # use gpu:0
441
+ "vae_encoder": create_runtime_option(0, "paddle"), # use gpu:0
442
+ "vae_decoder": create_runtime_option(0, "paddle"), # use gpu:0
443
+ "unet": create_runtime_option(0, "paddle"), # use gpu:0
444
+ }
445
+
446
+ fd_pipe = FastDeployStableDiffusionMegaPipeline.from_pretrained(
447
+ "Linaqruf/anything-v3.0@fastdeploy", runtime_options=runtime_options
448
+ )
449
+
450
+ # text2img
451
+ prompt = "a portrait of shiba inu with a red cap growing on its head. intricate. lifelike. soft light. sony a 7 r iv 5 5 mm. cinematic post - processing "
452
+ image_text2img = fd_pipe.text2img(prompt=prompt, num_inference_steps=50).images[0]
453
+ image_text2img.save("image_text2img.png")
454
+
455
+ # img2img
456
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/image_Kurisu.png"
457
+ image = load_image(url).resize((512, 512))
458
+ prompt = "Kurisu Makise, looking at viewer, long hair, standing, 1girl, hair ornament, hair flower, cute, jacket, white flower, white dress"
459
+ negative_prompt = "lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry"
460
+
461
+ image_img2img = fd_pipe.img2img(
462
+ prompt=prompt, negative_prompt=negative_prompt, image=image, strength=0.75, guidance_scale=7.5
463
+ ).images[0]
464
+ image_img2img.save("image_img2img.png")
465
+
466
+ # inpaint_legacy
467
+ img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
468
+ mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png"
469
+ image = load_image(img_url).resize((512, 512))
470
+ mask_image = load_image(mask_url).resize((512, 512))
471
+ prompt = "a red cat sitting on a bench"
472
+
473
+ image_inpaint_legacy = fd_pipe.inpaint_legacy(
474
+ prompt=prompt, image=image, mask_image=mask_image, strength=0.75, num_inference_steps=50
475
+ ).images[0]
476
+ image_inpaint_legacy.save("image_inpaint_legacy.png")
477
+ ```
478
+ </details>
479
+ <div align="center">
480
+ <img width="900" alt="image" src="https://user-images.githubusercontent.com/50394665/205297240-46b80992-34af-40cd-91a6-ae76589d0e21.png">
481
+ </div>
482
+
483
+
484
+ ## 更多任务分类展示
485
+ ### 文本图像多模
486
+
487
+ <details open>
488
+ <summary>&emsp;文图生成(Text-to-Image Generation)</summary>
489
+
490
+ #### text_to_image_generation-stable_diffusion
491
+
492
+ ```python
493
+ from ppdiffusers import StableDiffusionPipeline
494
+
495
+ # 加载模型和scheduler
496
+ pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
497
+
498
+ # 执行pipeline进行推理
499
+ prompt = "a photo of an astronaut riding a horse on mars"
500
+ image = pipe(prompt).images[0]
501
+
502
+ # 保存图片
503
+ image.save("astronaut_rides_horse_sd.png")
504
+ ```
505
+ <div align="center">
506
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/209322401-6ecfeaaa-6878-4302-b592-07a31de4e590.png">
507
+ </div>
508
+
509
+ #### text_to_image_generation-stable_diffusion_xl
510
+
511
+ ```python
512
+ import paddle
513
+ from ppdiffusers import StableDiffusionXLPipeline
514
+
515
+ pipe = StableDiffusionXLPipeline.from_pretrained(
516
+ "stabilityai/stable-diffusion-xl-base-1.0",
517
+ paddle_dtype=paddle.float16,
518
+ variant="fp16"
519
+ )
520
+ prompt = "a photo of an astronaut riding a horse on mars"
521
+ generator = paddle.Generator().manual_seed(42)
522
+ image = pipe(prompt=prompt, generator=generator, num_inference_steps=50).images[0]
523
+ image.save('sdxl_text2image.png')
524
+ ```
525
+ <div align="center">
526
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/d72729f9-8685-48f9-a238-e4ddf6d264f3">
527
+ </div>
528
+
529
+ #### text_to_image_generation-sdxl_base_with_refiner
530
+
531
+ ```python
532
+ from ppdiffusers import DiffusionPipeline
533
+ import paddle
534
+
535
+ # load both base & refiner
536
+ base = DiffusionPipeline.from_pretrained(
537
+ "stabilityai/stable-diffusion-xl-base-1.0",
538
+ paddle_dtype=paddle.float16,
539
+ )
540
+ refiner = DiffusionPipeline.from_pretrained(
541
+ "stabilityai/stable-diffusion-xl-refiner-1.0",
542
+ text_encoder_2=base.text_encoder_2,
543
+ vae=base.vae,
544
+ paddle_dtype=paddle.float16,
545
+ variant="fp16",
546
+ )
547
+
548
+ # Define how many steps and what % of steps to be run on each experts (80/20) here
549
+ n_steps = 40
550
+ high_noise_frac = 0.8
551
+
552
+ prompt = "A majestic lion jumping from a big stone at night"
553
+ prompt = "a photo of an astronaut riding a horse on mars"
554
+ generator = paddle.Generator().manual_seed(42)
555
+
556
+ # run both experts
557
+ image = base(
558
+ prompt=prompt,
559
+ output_type="latent",
560
+ generator=generator,
561
+ ).images
562
+
563
+ image = refiner(
564
+ prompt=prompt,
565
+ image=image,
566
+ generator=generator,
567
+ ).images[0]
568
+ image.save('text_to_image_generation-sdxl-base-with-refiner-result.png')
569
+ ```
570
+ <div align="center">
571
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/8ef36826-ed94-4856-a356-af1677f60d1b">
572
+ </div>
573
+
574
+ #### text_to_image_generation-kandinsky2_2
575
+ ```python
576
+ from ppdiffusers import KandinskyV22Pipeline, KandinskyV22PriorPipeline
577
+
578
+ pipe_prior = KandinskyV22PriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior")
579
+ prompt = "red cat, 4k photo"
580
+ out = pipe_prior(prompt)
581
+ image_emb = out.image_embeds
582
+ zero_image_emb = out.negative_image_embeds
583
+ pipe = KandinskyV22Pipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder")
584
+ image = pipe(
585
+ image_embeds=image_emb,
586
+ negative_image_embeds=zero_image_emb,
587
+ height=768,
588
+ width=768,
589
+ num_inference_steps=50,
590
+ ).images
591
+ image[0].save("text_to_image_generation-kandinsky2_2-result-cat.png")
592
+ ```
593
+ <div align="center">
594
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/188f76dd-4bd7-4a33-8f30-b893c7a9e249">
595
+ </div>
596
+
597
+ #### text_to_image_generation-unidiffuser
598
+ ```python
599
+ import paddle
600
+ from paddlenlp.trainer import set_seed
601
+
602
+ from ppdiffusers import UniDiffuserPipeline
603
+
604
+ model_id_or_path = "thu-ml/unidiffuser-v1"
605
+ pipe = UniDiffuserPipeline.from_pretrained(model_id_or_path, paddle_dtype=paddle.float16)
606
+ set_seed(42)
607
+
608
+ # Text variation can be performed with a text-to-image generation followed by a image-to-text generation:
609
+ # 1. Text-to-image generation
610
+ prompt = "an elephant under the sea"
611
+ sample = pipe(prompt=prompt, num_inference_steps=20, guidance_scale=8.0)
612
+ t2i_image = sample.images[0]
613
+ t2i_image.save("t2i_image.png")
614
+ ```
615
+ <div align="center">
616
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/a6eb11d2-ad27-4263-8cb4-b0d8dd42b36c">
617
+ </div>
618
+
619
+ #### text_to_image_generation-deepfloyd_if
620
+
621
+ ```python
622
+ import paddle
623
+
624
+ from ppdiffusers import DiffusionPipeline, IFPipeline, IFSuperResolutionPipeline
625
+ from ppdiffusers.utils import pd_to_pil
626
+
627
+ # Stage 1: generate images
628
+ pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", paddle_dtype=paddle.float16)
629
+ pipe.enable_xformers_memory_efficient_attention()
630
+ prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"'
631
+ prompt_embeds, negative_embeds = pipe.encode_prompt(prompt)
632
+ image = pipe(
633
+ prompt_embeds=prompt_embeds,
634
+ negative_prompt_embeds=negative_embeds,
635
+ output_type="pd",
636
+ ).images
637
+
638
+ # save intermediate image
639
+ pil_image = pd_to_pil(image)
640
+ pil_image[0].save("text_to_image_generation-deepfloyd_if-result-if_stage_I.png")
641
+ # save gpu memory
642
+ pipe.to(paddle_device="cpu")
643
+
644
+ # Stage 2: super resolution stage1
645
+ super_res_1_pipe = IFSuperResolutionPipeline.from_pretrained(
646
+ "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", paddle_dtype=paddle.float16
647
+ )
648
+ super_res_1_pipe.enable_xformers_memory_efficient_attention()
649
+
650
+ image = super_res_1_pipe(
651
+ image=image,
652
+ prompt_embeds=prompt_embeds,
653
+ negative_prompt_embeds=negative_embeds,
654
+ output_type="pd",
655
+ ).images
656
+ # save intermediate image
657
+ pil_image = pd_to_pil(image)
658
+ pil_image[0].save("text_to_image_generation-deepfloyd_if-result-if_stage_II.png")
659
+ # save gpu memory
660
+ super_res_1_pipe.to(paddle_device="cpu")
661
+ ```
662
+ <div align="center">
663
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/246785766-700dfad9-159d-4bfb-bfc7-c18df938a052.png">
664
+ </div>
665
+ <div align="center">
666
+ <center>if_stage_I</center>
667
+ </div>
668
+ <div align="center">
669
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/246785773-3359ca5f-dadf-4cc8-b318-ff1f9d4a2d35.png">
670
+ </div>
671
+ <div align="center">
672
+ <center>if_stage_II</center>
673
+ <!-- <img alt="image" src="https://user-images.githubusercontent.com/20476674/246785774-8870829a-354b-4a87-9d67-93af315f51e6.png">
674
+ <center>if_stage_III</center> -->
675
+ </div>
676
+ </details>
677
+
678
+
679
+ <details><summary>&emsp;文本引导的图像放大(Text-Guided Image Upscaling)</summary>
680
+
681
+ #### text_guided_image_upscaling-stable_diffusion_2
682
+
683
+ ```python
684
+ from ppdiffusers import StableDiffusionUpscalePipeline
685
+ from ppdiffusers.utils import load_image
686
+
687
+ pipe = StableDiffusionUpscalePipeline.from_pretrained("stabilityai/stable-diffusion-x4-upscaler")
688
+
689
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/low_res_cat.png"
690
+ low_res_img = load_image(url).resize((128, 128))
691
+
692
+ prompt = "a white cat"
693
+ upscaled_image = pipe(prompt=prompt, image=low_res_img).images[0]
694
+ upscaled_image.save("upsampled_cat_sd2.png")
695
+ ```
696
+ <div align="center">
697
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/209324085-0d058b70-89b0-43c2-affe-534eedf116cf.png">
698
+ <center>原图像</center>
699
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/209323862-ce2d8658-a52b-4f35-90cb-aa7d310022e7.png">
700
+ <center>生成图像</center>
701
+ </div>
702
+ </details>
703
+
704
+ <details><summary>&emsp;文本引导的图像编辑(Text-Guided Image Inpainting)</summary>
705
+
706
+ #### text_guided_image_inpainting-stable_diffusion_2
707
+
708
+ ```python
709
+ import paddle
710
+
711
+ from ppdiffusers import PaintByExamplePipeline
712
+ from ppdiffusers.utils import load_image
713
+
714
+ img_url = "https://paddlenlp.bj.bcebos.com/models/community/Fantasy-Studio/data/image_example_1.png"
715
+ mask_url = "https://paddlenlp.bj.bcebos.com/models/community/Fantasy-Studio/data/mask_example_1.png"
716
+ example_url = "https://paddlenlp.bj.bcebos.com/models/community/Fantasy-Studio/data/reference_example_1.jpeg"
717
+
718
+ init_image = load_image(img_url).resize((512, 512))
719
+ mask_image = load_image(mask_url).resize((512, 512))
720
+ example_image = load_image(example_url).resize((512, 512))
721
+
722
+ pipe = PaintByExamplePipeline.from_pretrained("Fantasy-Studio/Paint-by-Example")
723
+
724
+ # 使用fp16加快生成速度
725
+ with paddle.amp.auto_cast(True):
726
+ image = pipe(image=init_image, mask_image=mask_image, example_image=example_image).images[0]
727
+ image.save("image_guided_image_inpainting-paint_by_example-result.png")
728
+ ```
729
+ <div align="center">
730
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/247118364-5d91f433-f9ac-4514-b5f0-cb4599905847.png" width=300>
731
+ <center>原图像</center>
732
+ <div align="center">
733
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/247118361-0f78d6db-6896-4f8d-b1bd-8350192f7a4e.png" width=300>
734
+ <center>掩码图像</center>
735
+ <div align="center">
736
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/247118368-305a048d-ddc3-4a5f-8915-58591ef680f0.jpeg" width=300>
737
+ <center>参考图像</center>
738
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/247117963-e5b9b754-39a3-480b-a557-46a2f9310e79.png" width=300>
739
+ <center>生成图像</center>
740
+ </div>
741
+ </details>
742
+
743
+
744
+ <details><summary>&emsp;文本引导的图像变换(Image-to-Image Text-Guided Generation)</summary>
745
+
746
+ #### text_guided_image_inpainting-kandinsky2_2
747
+ ```python
748
+ import numpy as np
749
+ import paddle
750
+
751
+ from ppdiffusers import KandinskyV22InpaintPipeline, KandinskyV22PriorPipeline
752
+ from ppdiffusers.utils import load_image
753
+
754
+ pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
755
+ "kandinsky-community/kandinsky-2-2-prior", paddle_dtype=paddle.float16
756
+ )
757
+ prompt = "a hat"
758
+ image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False)
759
+ pipe = KandinskyV22InpaintPipeline.from_pretrained(
760
+ "kandinsky-community/kandinsky-2-2-decoder-inpaint", paddle_dtype=paddle.float16
761
+ )
762
+ init_image = load_image(
763
+ "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png"
764
+ )
765
+ mask = np.zeros((768, 768), dtype=np.float32)
766
+ mask[:250, 250:-250] = 1
767
+ out = pipe(
768
+ image=init_image,
769
+ mask_image=mask,
770
+ image_embeds=image_emb,
771
+ negative_image_embeds=zero_image_emb,
772
+ height=768,
773
+ width=768,
774
+ num_inference_steps=50,
775
+ )
776
+ image = out.images[0]
777
+ image.save("text_guided_image_inpainting-kandinsky2_2-result-cat_with_hat.png")
778
+ ```
779
+ <div align="center">
780
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/64a943d5-167b-4433-91c3-3cf9279714db">
781
+ <center>原图像</center>
782
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/f469c127-52f4-4173-a693-c06b92a052aa">
783
+ <center>生成图像</center>
784
+ </div>
785
+
786
+ #### image_to_image_text_guided_generation-stable_diffusion
787
+ ```python
788
+ import paddle
789
+
790
+ from ppdiffusers import StableDiffusionImg2ImgPipeline
791
+ from ppdiffusers.utils import load_image
792
+
793
+ # 加载pipeline
794
+ pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
795
+
796
+ # 下载初始图片
797
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png"
798
+
799
+ init_image = load_image(url).resize((768, 512))
800
+
801
+ prompt = "A fantasy landscape, trending on artstation"
802
+ # 使用fp16加快生成速度
803
+ with paddle.amp.auto_cast(True):
804
+ image = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images[0]
805
+
806
+ image.save("fantasy_landscape.png")
807
+ ```
808
+ <div align="center">
809
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/209327142-d8e1d0c7-3bf8-4a08-a0e8-b11451fc84d8.png">
810
+ <center>原图像</center>
811
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/209325799-d9ff279b-0d57-435f-bda7-763e3323be23.png">
812
+ <center>生成图像</center>
813
+ </div>
814
+
815
+ #### image_to_image_text_guided_generation-stable_diffusion_xl
816
+ ```python
817
+ import paddle
818
+ from ppdiffusers import StableDiffusionXLImg2ImgPipeline
819
+ from ppdiffusers.utils import load_image
820
+
821
+ pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained(
822
+ "stabilityai/stable-diffusion-xl-refiner-1.0",
823
+ paddle_dtype=paddle.float16,
824
+ # from_hf_hub=True,
825
+ # from_diffusers=True,
826
+ variant="fp16"
827
+ )
828
+ url = "https://paddlenlp.bj.bcebos.com/models/community/westfish/develop-0-19-3/000000009.png"
829
+ init_image = load_image(url).convert("RGB")
830
+ prompt = "a photo of an astronaut riding a horse on mars"
831
+ image = pipe(prompt, image=init_image).images[0]
832
+ image.save('sdxl_image2image.png')
833
+ ```
834
+ <div align="center">
835
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/41bd9381-2799-4bed-a5e2-ba312a2f8da9">
836
+ <center>原图像</center>
837
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/db672d03-2e3a-46ac-97fd-d80cca18dbbe">
838
+ <center>生成图像</center>
839
+ </div>
840
+
841
+ #### image_to_image_text_guided_generation-kandinsky2_2
842
+ ```python
843
+ import paddle
844
+
845
+ from ppdiffusers import KandinskyV22Img2ImgPipeline, KandinskyV22PriorPipeline
846
+ from ppdiffusers.utils import load_image
847
+
848
+ pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
849
+ "kandinsky-community/kandinsky-2-2-prior", paddle_dtype=paddle.float16
850
+ )
851
+ prompt = "A red cartoon frog, 4k"
852
+ image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False)
853
+ pipe = KandinskyV22Img2ImgPipeline.from_pretrained(
854
+ "kandinsky-community/kandinsky-2-2-decoder", paddle_dtype=paddle.float16
855
+ )
856
+
857
+ init_image = load_image(
858
+ "https://hf-mirror.com/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/frog.png"
859
+ )
860
+ image = pipe(
861
+ image=init_image,
862
+ image_embeds=image_emb,
863
+ negative_image_embeds=zero_image_emb,
864
+ height=768,
865
+ width=768,
866
+ num_inference_steps=100,
867
+ strength=0.2,
868
+ ).images
869
+ image[0].save("image_to_image_text_guided_generation-kandinsky2_2-result-red_frog.png")
870
+ ```
871
+ <div align="center">
872
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/aae57109-94ad-408e-ae75-8cce650cebe5">
873
+ <center>原图像</center>
874
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/23cf2c4e-416f-4f21-82a6-e57de11b5e83">
875
+ <center>生成图像</center>
876
+ </div>
877
+
878
+ </details>
879
+ </details>
880
+
881
+ <details><summary>&emsp;文本图像双引导图像生成(Dual Text and Image Guided Generation)</summary>
882
+
883
+ #### dual_text_and_image_guided_generation-versatile_diffusion
884
+ ```python
885
+ from ppdiffusers import VersatileDiffusionDualGuidedPipeline
886
+ from ppdiffusers.utils import load_image
887
+
888
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/benz.jpg"
889
+ image = load_image(url)
890
+ text = "a red car in the sun"
891
+
892
+ pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained("shi-labs/versatile-diffusion")
893
+ pipe.remove_unused_weights()
894
+
895
+ text_to_image_strength = 0.75
896
+ image = pipe(prompt=text, image=image, text_to_image_strength=text_to_image_strength).images[0]
897
+ image.save("versatile-diffusion-red_car.png")
898
+ ```
899
+ <div align="center">
900
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/209325965-2475e9c4-a524-4970-8498-dfe10ff9cf24.jpg" >
901
+ <center>原图像</center>
902
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/209325293-049098d0-d591-4abc-b151-9291ac2636da.png">
903
+ <center>生成图像</center>
904
+ </div>
905
+ </details>
906
+
907
+ ### 文本视频多模
908
+
909
+ <details open>
910
+ <summary>&emsp;文本条件的视频生成(Text-to-Video Generation)</summary>
911
+
912
+ #### text_to_video_generation-lvdm
913
+
914
+ ```python
915
+ import paddle
916
+
917
+ from ppdiffusers import LVDMTextToVideoPipeline
918
+
919
+ # 加载模型和scheduler
920
+ pipe = LVDMTextToVideoPipeline.from_pretrained("westfish/lvdm_text2video_orig_webvid_2m")
921
+
922
+ # 执行pipeline进行推理
923
+ seed = 2013
924
+ generator = paddle.Generator().manual_seed(seed)
925
+ samples = pipe(
926
+ prompt="cutting in kitchen",
927
+ num_frames=16,
928
+ height=256,
929
+ width=256,
930
+ num_inference_steps=50,
931
+ generator=generator,
932
+ guidance_scale=15,
933
+ eta=1,
934
+ save_dir=".",
935
+ save_name="text_to_video_generation-lvdm-result-ddim_lvdm_text_to_video_ucf",
936
+ encoder_type="2d",
937
+ scale_factor=0.18215,
938
+ shift_factor=0,
939
+ )
940
+ ```
941
+ <div align="center">
942
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/270906907-2b9d53c1-0272-4c7a-81b2-cd962d23bbee.gif">
943
+ </div>
944
+
945
+ #### text_to_video_generation-synth
946
+
947
+ ```python
948
+ import imageio
949
+
950
+ from ppdiffusers import DPMSolverMultistepScheduler, TextToVideoSDPipeline
951
+
952
+ pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b")
953
+ pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
954
+
955
+ prompt = "An astronaut riding a horse."
956
+ video_frames = pipe(prompt, num_inference_steps=25).frames
957
+ imageio.mimsave("text_to_video_generation-synth-result-astronaut_riding_a_horse.mp4", video_frames, fps=8)
958
+ ```
959
+ <div align="center">
960
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/281259277-0ebe29a3-4eba-48ee-a98b-292e60de3c98.gif">
961
+ </div>
962
+
963
+
964
+ #### text_to_video_generation-synth with zeroscope_v2_XL
965
+
966
+ ```python
967
+ import imageio
968
+
969
+ from ppdiffusers import DPMSolverMultistepScheduler, TextToVideoSDPipeline
970
+
971
+ # from ppdiffusers.utils import export_to_video
972
+
973
+ pipe = TextToVideoSDPipeline.from_pretrained("cerspense/zeroscope_v2_XL")
974
+ pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
975
+
976
+ prompt = "An astronaut riding a horse."
977
+ video_frames = pipe(prompt, num_inference_steps=50, height=320, width=576, num_frames=24).frames
978
+ imageio.mimsave("text_to_video_generation-synth-result-astronaut_riding_a_horse.mp4", video_frames, fps=8)
979
+ ```
980
+ <div align="center">
981
+ <img width="300" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/43ebbca0-9f07-458b-809a-acf296a2539b">
982
+ </div>
983
+
984
+ #### text_to_video_generation-zero
985
+
986
+ ```python
987
+ import imageio
988
+
989
+ # pip install imageio[ffmpeg]
990
+ import paddle
991
+
992
+ from ppdiffusers import TextToVideoZeroPipeline
993
+
994
+ model_id = "runwayml/stable-diffusion-v1-5"
995
+ pipe = TextToVideoZeroPipeline.from_pretrained(model_id, paddle_dtype=paddle.float16)
996
+
997
+ prompt = "A panda is playing guitar on times square"
998
+ result = pipe(prompt=prompt).images
999
+ result = [(r * 255).astype("uint8") for r in result]
1000
+ imageio.mimsave("text_to_video_generation-zero-result-panda.mp4", result, fps=4)
1001
+ ```
1002
+ <div align="center">
1003
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/246779321-c2b0c2b4-e383-40c7-a4d8-f417e8062b35.gif">
1004
+ </div>
1005
+
1006
+ </details>
1007
+
1008
+ ### 文本音频多模
1009
+ <details>
1010
+ <summary>&emsp;文本条件的音频生成(Text-to-Audio Generation)</summary>
1011
+
1012
+ #### text_to_audio_generation-audio_ldm
1013
+
1014
+ ```python
1015
+ import paddle
1016
+ import scipy
1017
+
1018
+ from ppdiffusers import AudioLDM2Pipeline
1019
+
1020
+ pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2", paddle_dtype=paddle.float16)
1021
+
1022
+ prompt = "Musical constellations twinkling in the night sky, forming a cosmic melody."
1023
+ negative_prompt = "Low quality."
1024
+ audio = pipe(prompt, negative_prompt=negative_prompt, num_inference_steps=200, audio_length_in_s=10).audios[0]
1025
+
1026
+ output_path = f"{prompt}.wav"
1027
+ # save the audio sample as a .wav file
1028
+ scipy.io.wavfile.write(output_path, rate=16000, data=audio)
1029
+ ```
1030
+ <div align = "center">
1031
+ <thead>
1032
+ </thead>
1033
+ <tbody>
1034
+ <tr>
1035
+ <td align = "center">
1036
+ <a href="https://paddlenlp.bj.bcebos.com/models/community/paddlemix/ppdiffusers/AudioLDM2-Music.wav" rel="nofollow">
1037
+ <img align="center" src="https://user-images.githubusercontent.com/20476674/209344877-edbf1c24-f08d-4e3b-88a4-a27e1fd0a858.png" width="200" style="max-width: 100%;"></a><br>
1038
+ </td>
1039
+ </tr>
1040
+ </tbody>
1041
+ </div>
1042
+ </details>
1043
+
1044
+ 可以使用以下代码转换[huggingface](https://huggingface.co/docs/diffusers/api/pipelines/audioldm2)的模型,一键在paddle中使用
1045
+ ```python
1046
+ pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2-music", from_hf_hub=True, from_diffusers=True).save_pretrained("cvssp/audioldm2-music")
1047
+ ```
1048
+ ### 图像
1049
+
1050
+ <details><summary>&emsp;无条件图像生成(Unconditional Image Generation)</summary>
1051
+
1052
+ #### unconditional_image_generation-latent_diffusion_uncond
1053
+
1054
+ ```python
1055
+ from ppdiffusers import LDMPipeline
1056
+
1057
+ # 加载模型和scheduler
1058
+ pipe = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256")
1059
+
1060
+ # 执行pipeline进行推理
1061
+ image = pipe(num_inference_steps=200).images[0]
1062
+
1063
+ # 保存图片
1064
+ image.save("ldm_generated_image.png")
1065
+ ```
1066
+ <div align="center">
1067
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/209327936-7fe914e0-0ea0-4e21-a433-24eaed6ee94c.png">
1068
+ </div>
1069
+ </details>
1070
+
1071
+ <details><summary>&emsp;超分(Super Resolution)</summary>
1072
+
1073
+ #### super_resolution-latent_diffusion
1074
+ ```python
1075
+ import paddle
1076
+
1077
+ from ppdiffusers import LDMSuperResolutionPipeline
1078
+ from ppdiffusers.utils import load_image
1079
+
1080
+ # 加载pipeline
1081
+ pipe = LDMSuperResolutionPipeline.from_pretrained("CompVis/ldm-super-resolution-4x-openimages")
1082
+
1083
+ # 下载初始图片
1084
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
1085
+
1086
+ init_image = load_image(url).resize((128, 128))
1087
+ init_image.save("original-image.png")
1088
+
1089
+ # 使用fp16加快生成速度
1090
+ with paddle.amp.auto_cast(True):
1091
+ image = pipe(init_image, num_inference_steps=100, eta=1).images[0]
1092
+
1093
+ image.save("super-resolution-image.png")
1094
+ ```
1095
+ <div align="center">
1096
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/209328660-9700fdc3-72b3-43bd-9a00-23b370ba030b.png">
1097
+ <center>原图像</center>
1098
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/209328479-4eaea5d8-aa4a-4f31-aa2a-b47e3c730f15.png">
1099
+ <center>生成图像</center>
1100
+ </div>
1101
+ </details>
1102
+
1103
+
1104
+ <details><summary>&emsp;图像编辑(Image Inpainting)</summary>
1105
+
1106
+ #### image_inpainting-repaint
1107
+ ```python
1108
+ from ppdiffusers import RePaintPipeline, RePaintScheduler
1109
+ from ppdiffusers.utils import load_image
1110
+
1111
+ img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/celeba_hq_256.png"
1112
+ mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/mask_256.png"
1113
+
1114
+ # Load the original image and the mask as PIL images
1115
+ original_image = load_image(img_url).resize((256, 256))
1116
+ mask_image = load_image(mask_url).resize((256, 256))
1117
+
1118
+ scheduler = RePaintScheduler.from_pretrained("google/ddpm-ema-celebahq-256", subfolder="scheduler")
1119
+ pipe = RePaintPipeline.from_pretrained("google/ddpm-ema-celebahq-256", scheduler=scheduler)
1120
+
1121
+ output = pipe(
1122
+ original_image=original_image,
1123
+ mask_image=mask_image,
1124
+ num_inference_steps=250,
1125
+ eta=0.0,
1126
+ jump_length=10,
1127
+ jump_n_sample=10,
1128
+ )
1129
+ inpainted_image = output.images[0]
1130
+
1131
+ inpainted_image.save("repaint-image.png")
1132
+ ```
1133
+ <div align="center">
1134
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/209329052-b6fc2aaf-1a59-49a3-92ef-60180fdffd81.png">
1135
+ <center>原图像</center>
1136
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/209329048-4fe12176-32a0-4800-98f2-49bd8d593799.png">
1137
+ <center>mask图像</center>
1138
+ <img alt="image" src="https://user-images.githubusercontent.com/20476674/209329241-b7e4d99e-468a-4b95-8829-d77ee14bfe98.png">
1139
+ <center>生成图像</center>
1140
+ </div>
1141
+ </details>
1142
+
1143
+
1144
+
1145
+ <details><summary>&emsp;图像变化(Image Variation)</summary>
1146
+
1147
+ #### image_variation-versatile_diffusion
1148
+ ```python
1149
+ from ppdiffusers import VersatileDiffusionImageVariationPipeline
1150
+ from ppdiffusers.utils import load_image
1151
+
1152
+ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/benz.jpg"
1153
+ image = load_image(url)
1154
+
1155
+ pipe = VersatileDiffusionImageVariationPipeline.from_pretrained("shi-labs/versatile-diffusion")
1156
+
1157
+ image = pipe(image).images[0]
1158
+ image.save("versatile-diffusion-car_variation.png")
1159
+ ```
1160
+ <div align="center">
1161
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/209331434-51f6cdbd-b8e4-4faa-8e49-1cc852e35603.jpg">
1162
+ <center>原图像</center>
1163
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/209331591-f6cc4cd8-8430-4627-8d22-bf404fb2bfdd.png">
1164
+ <center>生成图像</center>
1165
+ </div>
1166
+ </details>
1167
+
1168
+
1169
+
1170
+
1171
+
1172
+ ### 音频
1173
+ <details>
1174
+ <summary>&emsp;无条件音频生成(Unconditional Audio Generation)</summary>
1175
+
1176
+ #### unconditional_audio_generation-audio_diffusion
1177
+
1178
+ ```python
1179
+ from scipy.io.wavfile import write
1180
+ from ppdiffusers import AudioDiffusionPipeline
1181
+ import paddle
1182
+
1183
+ # 加载模型和scheduler
1184
+ pipe = AudioDiffusionPipeline.from_pretrained("teticio/audio-diffusion-ddim-256")
1185
+ pipe.set_progress_bar_config(disable=None)
1186
+ generator = paddle.Generator().manual_seed(42)
1187
+
1188
+ output = pipe(generator=generator)
1189
+ audio = output.audios[0]
1190
+ image = output.images[0]
1191
+
1192
+ # 保存音频到本地
1193
+ for i, audio in enumerate(audio):
1194
+ write(f"audio_diffusion_test{i}.wav", pipe.mel.config.sample_rate, audio.transpose())
1195
+
1196
+ # 保存图片
1197
+ image.save("audio_diffusion_test.png")
1198
+ ```
1199
+ <div align = "center">
1200
+ <thead>
1201
+ </thead>
1202
+ <tbody>
1203
+ <tr>
1204
+ <td align = "center">
1205
+ <a href="https://paddlenlp.bj.bcebos.com/models/community/teticio/data/audio_diffusion_test0.wav" rel="nofollow">
1206
+ <img align="center" src="https://user-images.githubusercontent.com/20476674/209344877-edbf1c24-f08d-4e3b-88a4-a27e1fd0a858.png" width="200" style="max-width: 100%;"></a><br>
1207
+ </td>
1208
+ </tr>
1209
+ </tbody>
1210
+ </div>
1211
+
1212
+ <div align="center">
1213
+ <img width="300" alt="image" src="https://user-images.githubusercontent.com/20476674/209342125-93e8715e-895b-4115-9e1e-e65c6c2cd95a.png">
1214
+ </div>
1215
+
1216
+
1217
+ #### unconditional_audio_generation-spectrogram_diffusion
1218
+
1219
+ ```python
1220
+ import paddle
1221
+ import scipy
1222
+
1223
+ from ppdiffusers import MidiProcessor, SpectrogramDiffusionPipeline
1224
+ from ppdiffusers.utils.download_utils import ppdiffusers_url_download
1225
+
1226
+ # Download MIDI from: wget https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/beethoven_hammerklavier_2.mid
1227
+ mid_file_path = ppdiffusers_url_download(
1228
+ "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/beethoven_hammerklavier_2.mid", cache_dir="."
1229
+ )
1230
+ pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion", paddle_dtype=paddle.float16)
1231
+ processor = MidiProcessor()
1232
+ output = pipe(processor(mid_file_path))
1233
+ audio = output.audios[0]
1234
+
1235
+ output_path = "unconditional_audio_generation-spectrogram_diffusion-result-beethoven_hammerklavier_2.wav"
1236
+ # save the audio sample as a .wav file
1237
+ scipy.io.wavfile.write(output_path, rate=16000, data=audio)
1238
+ ```
1239
+ <div align = "center">
1240
+ <thead>
1241
+ </thead>
1242
+ <tbody>
1243
+ <tr>
1244
+ <td align = "center">
1245
+ <a href="https://paddlenlp.bj.bcebos.com/models/community/westfish/develop_ppdiffusers_data/beethoven_hammerklavier_2.wav" rel="nofollow">
1246
+ <img align="center" src="https://user-images.githubusercontent.com/20476674/209344877-edbf1c24-f08d-4e3b-88a4-a27e1fd0a858.png" width="200" style="max-width: 100%;"></a><br>
1247
+ </td>
1248
+ </tr>
1249
+ </tbody>
1250
+ </div>
1251
+ </details>
1252
+
1253
+
1254
+
1255
+ ## License
1256
+ PPDiffusers 遵循 [Apache-2.0开源协议](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/LICENSE)。
1257
+
1258
+ Stable Diffusion 遵循 [The CreativeML OpenRAIL M 开源协议](https://huggingface.co/spaces/CompVis/stable-diffusion-license)。
1259
+ > The CreativeML OpenRAIL M is an [Open RAIL M license](https://www.licenses.ai/blog/2022/8/18/naming-convention-of-responsible-ai-licenses), adapted from the work that [BigScience](https://bigscience.huggingface.co/) and [the RAIL Initiative](https://www.licenses.ai/) are jointly carrying in the area of responsible AI licensing. See also [the article about the BLOOM Open RAIL license](https://bigscience.huggingface.co/blog/the-bigscience-rail-license) on which this license is based.
1260
+
1261
+ Stable Diffusion 3遵循 [Stability Community 开源协议](https://stability.ai/license)。
1262
+ > Community License: Free for research, non-commercial, and commercial use for organisations or individuals with less than $1M annual revenue. You only need a paid Enterprise license if your yearly revenues exceed USD$1M and you use Stability AI models in commercial products or services. Read more: https://stability.ai/license
1263
+
1264
+ ## Acknowledge
1265
+ 我们借鉴了🤗 Hugging Face的[Diffusers](https://github.com/huggingface/diffusers)关于预训练扩散模型使用的优秀设计,在此对Hugging Face作者及其开源社区表示感谢。
1266
+
1267
+ ## Citation
1268
+
1269
+ ```bibtex
1270
+ @misc{ppdiffusers,
1271
+ author = {PaddlePaddle Authors},
1272
+ title = {PPDiffusers: State-of-the-art diffusion model toolkit based on PaddlePaddle},
1273
+ year = {2022},
1274
+ publisher = {GitHub},
1275
+ journal = {GitHub repository},
1276
+ howpublished = {\url{https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers}}
1277
+ }
1278
+ ```
PaddleMIX/ppdiffusers/VERSION ADDED
@@ -0,0 +1 @@
 
 
1
+ 0.29.0
PaddleMIX/ppdiffusers/requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ paddlenlp>=3.0.0b2
2
+ safetensors>=0.3.1
3
+ ftfy
4
+ regex
5
+ Pillow
6
+ opencv-python
7
+ av
8
+ # for test
9
+ parameterized
10
+ requests_mock
11
+ omegaconf
12
+ note_seq
13
+ urllib3<=2.0.0
14
+ einops>=0.6.1
15
+ paddlesde
16
+ ligo-segments
17
+ huggingface_hub==0.23.0
18
+ hf_transfer
PaddleMIX/ppdiffusers/setup.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Packaging script for the ppdiffusers wheel."""
import os

from setuptools import find_packages, setup

description = "PPDiffusers: Diffusers toolbox implemented based on PaddlePaddle"


def read(file: str) -> str:
    """Return the stripped text of *file*, resolved relative to this script.

    Resolving against ``__file__`` (instead of the current working
    directory) keeps the build working when setup.py is invoked from
    another directory, e.g. by pip.
    """
    current_dir = os.path.dirname(__file__)
    path = os.path.join(current_dir, file)
    with open(path, "r", encoding="utf-8") as f:
        content = f.read().strip()
    return content


def read_version() -> str:
    """Read the version of ppdiffusers from the VERSION file."""
    return read("VERSION")


def read_readme() -> str:
    """Return the long description (README.md) shown on PyPI."""
    return read("README.md")


def read_requirements() -> list:
    """Return install requirements as a list of requirement strings.

    Blank lines and comment lines (e.g. the ``# for test`` annotation in
    requirements.txt) are dropped so they do not leak into the package
    metadata.
    """
    content = read("requirements.txt")
    packages = [
        line.strip()
        for line in content.split("\n")
        if line.strip() and not line.strip().startswith("#")
    ]
    return packages


setup(
    name="ppdiffusers",
    packages=find_packages(),
    version=read_version(),
    author="PaddleMIX Team",
    author_email="paddlemix@baidu.com",
    description=description,
    long_description=read_readme(),
    long_description_content_type="text/markdown",
    # was "https://github.com/PaddlePaddle/PaddleMIX/ppdiffusers", which is
    # not a valid GitHub path; use the tree URL the README/citation points at.
    url="https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers",
    keywords=["ppdiffusers", "paddle", "paddlemix"],
    # Previously a raw newline-separated string read relative to the CWD at
    # import time; use the __file__-relative helper and a proper list.
    install_requires=read_requirements(),
    python_requires=">=3.6",
    entry_points={"console_scripts": ["ppdiffusers-cli=ppdiffusers.commands.ppdiffusers_cli:main"]},
    classifiers=[
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
    ],
    license="Apache 2.0",
)
PaddleMIX/scripts/build_wheel.sh ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash

# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Build, install and sanity-check the paddlemix wheel.

#=================================================
# Utils
#=================================================


# directory config
DIST_DIR="dist"
BUILD_DIR="build"
EGG_DIR="paddlemix.egg-info"

# command line log config
RED='\033[0;31m'
BLUE='\033[0;34m'
GREEN='\033[1;32m'
BOLD='\033[1m'
NONE='\033[0m'

# Abort unless the `python` on PATH is Python >= 3.5.
function python_version_check() {
    PY_MAIN_VERSION=`python -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $1}'`
    PY_SUB_VERSION=`python -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $2}'`
    echo -e "find python version ${PY_MAIN_VERSION}.${PY_SUB_VERSION}"
    if [ "$PY_MAIN_VERSION" -ne "3" -o "$PY_SUB_VERSION" -lt "5" ]; then
        echo -e "${RED}FAIL:${NONE} please use Python >= 3.5 !"
        exit 1
    fi
}

# Remove stale build artifacts and any previously installed paddlemix.
function init() {
    echo -e "${BLUE}[init]${NONE} removing building directory..."
    rm -rf $DIST_DIR $BUILD_DIR $EGG_DIR
    if [ `pip list | grep paddlemix | wc -l` -gt 0 ]; then
        echo -e "${BLUE}[init]${NONE} uninstalling paddlemix..."
        pip uninstall -y paddlemix
    fi
    echo -e "${BLUE}[init]${NONE} ${GREEN}init success\n"
}

# Build the sdist/wheel (with ppdiffusers pinned as a dependency) and
# pip-install the resulting wheel. requirements.txt is restored afterwards.
function build_and_install() {
    echo -e "${BLUE}[build]${NONE} building paddlemix wheel..."
    # add ppdiffusers as dependency to paddlemix
    cp requirements.txt requirements.bak
    echo 'ppdiffusers==0.19.3' >> requirements.txt
    python setup.py sdist bdist_wheel
    if [ $? -ne 0 ]; then
        echo -e "${RED}[FAIL]${NONE} build paddlemix wheel failed !"
        exit 1
    fi
    # was: "build paddldet wheel success" (typo, copied from PaddleDetection)
    echo -e "${BLUE}[build]${NONE} ${GREEN}build paddlemix wheel success\n"
    mv requirements.bak requirements.txt

    echo -e "${BLUE}[install]${NONE} installing paddlemix..."
    cd $DIST_DIR

    find . -name "paddlemix*.whl" | xargs pip install
    if [ $? -ne 0 ]; then
        cd ..
        echo -e "${RED}[FAIL]${NONE} install paddlemix wheel failed !"
        exit 1
    fi
    echo -e "${BLUE}[install]${NONE} ${GREEN}paddlemix install success\n"
    cd ..
}

# Run the test suite against the *installed* wheel.
function unittest() {
    echo -e "${BLUE}[unittest]${NONE} run unittests..."

    # NOTE: perform unittests make sure installed paddlemix is used
    python -m unittest discover -v

    echo -e "${BLUE}[unittest]${NONE} ${GREEN}unittests success\n${NONE}"
}

# Remove intermediate build artifacts and uninstall the wheel.
function cleanup() {
    rm -rf $BUILD_DIR $EGG_DIR
    pip uninstall -y paddlemix
}

# EXIT-trap error handler: report failure, cd back out of a build directory
# if we are inside one, and remove every artifact.
function abort() {
    echo -e "${RED}[FAIL]${NONE} build wheel and unittest failed !
please check your code" 1>&2

    # was: basename "$pwd" (lowercase $pwd is normally unset; the shell
    # builtin is $PWD) and [ cur_dir==$TEST_DIR -o cur_dir==$DIST_DIR ]
    # (unexpanded literal strings — the test was always true).
    cur_dir=`basename "$PWD"`
    if [ "$cur_dir" = "$TEST_DIR" -o "$cur_dir" = "$DIST_DIR" ]; then
        cd ..
    fi

    # NOTE(review): $TEST_DIR is never assigned in this script — presumably a
    # leftover from the PaddleDetection build script; harmless when unset.
    rm -rf $BUILD_DIR $EGG_DIR $DIST_DIR $TEST_DIR
    pip uninstall -y paddlemix
}

python_version_check

trap 'abort' 0
set -e

init
build_and_install
# unittest
cleanup

# get Paddle version
PADDLE_VERSION=`python -c "import paddle; print(paddle.version.full_version)"`
PADDLE_COMMIT=`python -c "import paddle; print(paddle.version.commit)"`
PADDLE_COMMIT=`git rev-parse --short $PADDLE_COMMIT`

# get PaddleMIX branch
PPDET_BRANCH=`git rev-parse --abbrev-ref HEAD`
PPDET_COMMIT=`git rev-parse --short HEAD`

# get Python version
PYTHON_VERSION=`python -c "import platform; print(platform.python_version())"`

echo -e "\n${GREEN}paddlemix wheel compiled and checked success !${NONE}
${BLUE}Python version:${NONE} $PYTHON_VERSION
${BLUE}Paddle version:${NONE} $PADDLE_VERSION ($PADDLE_COMMIT)
${BLUE}paddlemix branch:${NONE} $PPDET_BRANCH ($PPDET_COMMIT)\n"

echo -e "${GREEN}wheel saved under${NONE} ${RED}${BOLD}./dist"

trap : 0
a_main_folder/lavis_examples/albef_feature_extraction.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
a_main_folder/lavis_examples/albef_vqa.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
a_main_folder/lavis_examples/albef_zero_shot_classification.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
a_main_folder/lavis_examples/blip2_feature_extraction.ipynb ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import torch\n",
10
+ "from PIL import Image\n",
11
+ "\n",
12
+ "from lavis.models import load_model_and_preprocess"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "markdown",
17
+ "metadata": {},
18
+ "source": [
19
+ "#### Load an example image"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": null,
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "raw_image = Image.open(\"../docs/_static/merlion.png\").convert(\"RGB\")\n",
29
+ "caption = \"a large fountain spewing water into the air\"\n",
30
+ "\n",
31
+ "display(raw_image.resize((596, 437)))"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": null,
37
+ "metadata": {},
38
+ "outputs": [],
39
+ "source": [
40
+ "# setup device to use\n",
41
+ "device = torch.device(\"cuda\") if torch.cuda.is_available() else \"cpu\""
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": null,
47
+ "metadata": {},
48
+ "outputs": [],
49
+ "source": [
50
+ "model, vis_processors, txt_processors = load_model_and_preprocess(name=\"blip2_feature_extractor\", model_type=\"pretrain\", is_eval=True, device=device)\n",
51
+ "image = vis_processors[\"eval\"](raw_image).unsqueeze(0).to(device)\n",
52
+ "text_input = txt_processors[\"eval\"](caption)\n",
53
+ "sample = {\"image\": image, \"text_input\": [text_input]}"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "markdown",
58
+ "metadata": {},
59
+ "source": [
60
+ "#### Multimodal features"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "code",
65
+ "execution_count": null,
66
+ "metadata": {},
67
+ "outputs": [],
68
+ "source": [
69
+ "features_multimodal = model.extract_features(sample)\n",
70
+ "print(features_multimodal.multimodal_embeds.shape)\n",
71
+ "# torch.Size([1, 32, 768]), 32 is the number of queries"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "markdown",
76
+ "metadata": {},
77
+ "source": [
78
+ "#### Unimodal features"
79
+ ]
80
+ },
81
+ {
82
+ "cell_type": "code",
83
+ "execution_count": null,
84
+ "metadata": {},
85
+ "outputs": [],
86
+ "source": [
87
+ "features_image = model.extract_features(sample, mode=\"image\")\n",
88
+ "features_text = model.extract_features(sample, mode=\"text\")\n",
89
+ "print(features_image.image_embeds.shape)\n",
90
+ "# torch.Size([1, 32, 768])\n",
91
+ "print(features_text.text_embeds.shape)\n",
92
+ "# torch.Size([1, 12, 768])"
93
+ ]
94
+ },
95
+ {
96
+ "cell_type": "markdown",
97
+ "metadata": {},
98
+ "source": [
99
+ "#### Normalized low-dimensional unimodal features"
100
+ ]
101
+ },
102
+ {
103
+ "cell_type": "code",
104
+ "execution_count": null,
105
+ "metadata": {},
106
+ "outputs": [],
107
+ "source": [
108
+ "# low-dimensional projected features\n",
109
+ "print(features_image.image_embeds_proj.shape)\n",
110
+ "# torch.Size([1, 32, 256])\n",
111
+ "print(features_text.text_embeds_proj.shape)\n",
112
+ "# torch.Size([1, 12, 256])\n",
113
+ "similarity = (features_image.image_embeds_proj @ features_text.text_embeds_proj[:,0,:].t()).max()\n",
114
+ "print(similarity)\n",
115
+ "# tensor([[0.3642]])"
116
+ ]
117
+ }
118
+ ],
119
+ "metadata": {
120
+ "kernelspec": {
121
+ "display_name": "Python 3 (ipykernel)",
122
+ "language": "python",
123
+ "name": "python3"
124
+ },
125
+ "language_info": {
126
+ "codemirror_mode": {
127
+ "name": "ipython",
128
+ "version": 3
129
+ },
130
+ "file_extension": ".py",
131
+ "mimetype": "text/x-python",
132
+ "name": "python",
133
+ "nbconvert_exporter": "python",
134
+ "pygments_lexer": "ipython3",
135
+ "version": "3.8.13"
136
+ },
137
+ "vscode": {
138
+ "interpreter": {
139
+ "hash": "d4d1e4263499bec80672ea0156c357c1ee493ec2b1c70f0acce89fc37c4a6abe"
140
+ }
141
+ }
142
+ },
143
+ "nbformat": 4,
144
+ "nbformat_minor": 2
145
+ }
a_main_folder/lavis_examples/blip2_image_text_matching.ipynb ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import torch\n",
10
+ "from PIL import Image\n",
11
+ "\n",
12
+ "from lavis.models import load_model_and_preprocess\n",
13
+ "from lavis.processors import load_processor"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "markdown",
18
+ "metadata": {},
19
+ "source": [
20
+ "#### Load an example image and text"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": null,
26
+ "metadata": {},
27
+ "outputs": [],
28
+ "source": [
29
+ "raw_image = Image.open(\"../docs/_static/merlion.png\").convert(\"RGB\")\n",
30
+ "display(raw_image.resize((596, 437)))"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": null,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "# setup device to use\n",
40
+ "device = torch.device(\"cuda\") if torch.cuda.is_available() else \"cpu\""
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": null,
46
+ "metadata": {},
47
+ "outputs": [],
48
+ "source": [
49
+ "caption = \"merlion in Singapore\""
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "markdown",
54
+ "metadata": {},
55
+ "source": [
56
+ "#### Load model and preprocessors"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": null,
62
+ "metadata": {},
63
+ "outputs": [],
64
+ "source": [
65
+ "model, vis_processors, text_processors = load_model_and_preprocess(\"blip2_image_text_matching\", \"pretrain\", device=device, is_eval=True)\n",
66
+ "# model, vis_processors, text_processors = load_model_and_preprocess(\"blip2_image_text_matching\", \"coco\", device=device, is_eval=True)"
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "markdown",
71
+ "metadata": {},
72
+ "source": [
73
+ "#### Preprocess image and text inputs"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": null,
79
+ "metadata": {},
80
+ "outputs": [],
81
+ "source": [
82
+ "img = vis_processors[\"eval\"](raw_image).unsqueeze(0).to(device)\n",
83
+ "txt = text_processors[\"eval\"](caption)"
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "markdown",
88
+ "metadata": {},
89
+ "source": [
90
+ "#### Compute image-text matching (ITM) score"
91
+ ]
92
+ },
93
+ {
94
+ "cell_type": "code",
95
+ "execution_count": null,
96
+ "metadata": {},
97
+ "outputs": [],
98
+ "source": [
99
+ "itm_output = model({\"image\": img, \"text_input\": txt}, match_head=\"itm\")\n",
100
+ "itm_scores = torch.nn.functional.softmax(itm_output, dim=1)\n",
101
+ "print(f'The image and text are matched with a probability of {itm_scores[:, 1].item():.3%}')"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": null,
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "itc_score = model({\"image\": img, \"text_input\": txt}, match_head='itc')\n",
111
+ "print('The image feature and text feature has a cosine similarity of %.4f'%itc_score)"
112
+ ]
113
+ }
114
+ ],
115
+ "metadata": {
116
+ "kernelspec": {
117
+ "display_name": "Python 3 (ipykernel)",
118
+ "language": "python",
119
+ "name": "python3"
120
+ },
121
+ "language_info": {
122
+ "codemirror_mode": {
123
+ "name": "ipython",
124
+ "version": 3
125
+ },
126
+ "file_extension": ".py",
127
+ "mimetype": "text/x-python",
128
+ "name": "python",
129
+ "nbconvert_exporter": "python",
130
+ "pygments_lexer": "ipython3",
131
+ "version": "3.8.13"
132
+ },
133
+ "vscode": {
134
+ "interpreter": {
135
+ "hash": "d4d1e4263499bec80672ea0156c357c1ee493ec2b1c70f0acce89fc37c4a6abe"
136
+ }
137
+ }
138
+ },
139
+ "nbformat": 4,
140
+ "nbformat_minor": 2
141
+ }
a_main_folder/lavis_examples/blip2_instructed_generation.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
a_main_folder/lavis_examples/blip_feature_extraction.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
a_main_folder/lavis_examples/blip_image_captioning.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
a_main_folder/lavis_examples/blip_image_text_matching.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
a_main_folder/lavis_examples/blip_text_localization.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
a_main_folder/lavis_examples/blip_vqa.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
a_main_folder/lavis_examples/blip_zero_shot_classification.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
a_main_folder/lavis_examples/clip_feature_extraction.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
a_main_folder/lavis_examples/clip_zero_shot_classification.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
a_main_folder/litserve/.lightning_studio/.studiorc ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # This script is only for your user and runs in every shell you open.
2
+ # Use it to personalize your shell.
3
+ #
4
+ # Example: export MY_KEY=abcd-1234
a_main_folder/litserve/.lightning_studio/on_start.sh ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # This script runs every time your Studio starts, from your home directory.
4
+
5
+ # List files under fast_load that need to load quickly on start (e.g. model checkpoints).
6
+ #
7
+ # ! fast_load
8
+ # <your file here>
9
+
10
+ # Add your startup commands below.
11
+ #
12
+ # Example: streamlit run my_app.py
13
+ # Example: gradio my_app.py
a_main_folder/litserve/.lightning_studio/on_stop.sh ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # This script runs every time your Studio sleeps, from your home directory.
4
+
5
+ # Add your shutdown commands below.
6
+ #
7
+ # Example: docker down my-container
8
+ # Example: sudo service mysql stop
a_main_folder/litserve/aurasr.ipynb ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)\n",
13
+ "INFO: Started server process [1819673]\n",
14
+ "INFO: Started server process [1819665]\n",
15
+ "INFO: Waiting for application startup.\n",
16
+ "INFO: Started server process [1819688]\n",
17
+ "INFO: Waiting for application startup.\n",
18
+ "INFO: Application startup complete.\n",
19
+ "INFO: Waiting for application startup.\n",
20
+ "INFO: Application startup complete.\n",
21
+ "INFO: Application startup complete.\n",
22
+ "INFO: Started server process [1819696]\n",
23
+ "INFO: Waiting for application startup.\n",
24
+ "INFO: Application startup complete.\n"
25
+ ]
26
+ },
27
+ {
28
+ "name": "stdout",
29
+ "output_type": "stream",
30
+ "text": [
31
+ "Swagger UI is available at http://0.0.0.0:8000/docs\n"
32
+ ]
33
+ },
34
+ {
35
+ "name": "stderr",
36
+ "output_type": "stream",
37
+ "text": [
38
+ "Traceback (most recent call last):\n",
39
+ " File \"<string>\", line 1, in <module>\n",
40
+ " File \"/dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/multiprocessing/spawn.py\", line 116, in spawn_main\n",
41
+ " exitcode = _main(fd, parent_sentinel)\n",
42
+ " File \"/dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/multiprocessing/spawn.py\", line 126, in _main\n",
43
+ " self = reduction.pickle.load(from_parent)\n",
44
+ "AttributeError: Can't get attribute 'AuraSRLitAPI' on <module '__main__' (built-in)>\n",
45
+ "Traceback (most recent call last):\n",
46
+ " File \"<string>\", line 1, in <module>\n",
47
+ " File \"/dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/multiprocessing/spawn.py\", line 116, in spawn_main\n",
48
+ " exitcode = _main(fd, parent_sentinel)\n",
49
+ " File \"/dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/multiprocessing/spawn.py\", line 126, in _main\n",
50
+ " self = reduction.pickle.load(from_parent)\n",
51
+ "AttributeError: Can't get attribute 'AuraSRLitAPI' on <module '__main__' (built-in)>\n",
52
+ "Traceback (most recent call last):\n",
53
+ " File \"<string>\", line 1, in <module>\n",
54
+ " File \"/dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/multiprocessing/spawn.py\", line 116, in spawn_main\n",
55
+ " exitcode = _main(fd, parent_sentinel)\n",
56
+ " File \"/dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/multiprocessing/spawn.py\", line 126, in _main\n",
57
+ " self = reduction.pickle.load(from_parent)\n",
58
+ "AttributeError: Can't get attribute 'AuraSRLitAPI' on <module '__main__' (built-in)>\n",
59
+ "Traceback (most recent call last):\n",
60
+ " File \"<string>\", line 1, in <module>\n",
61
+ " File \"/dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/multiprocessing/spawn.py\", line 116, in spawn_main\n",
62
+ " exitcode = _main(fd, parent_sentinel)\n",
63
+ " File \"/dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/multiprocessing/spawn.py\", line 126, in _main\n",
64
+ " self = reduction.pickle.load(from_parent)\n",
65
+ "AttributeError: Can't get attribute 'AuraSRLitAPI' on <module '__main__' (built-in)>\n"
66
+ ]
67
+ },
68
+ {
69
+ "name": "stdout",
70
+ "output_type": "stream",
71
+ "text": [
72
+ "Shutting down LitServe\n"
73
+ ]
74
+ },
75
+ {
76
+ "ename": "KeyboardInterrupt",
77
+ "evalue": "",
78
+ "output_type": "error",
79
+ "traceback": [
80
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
81
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
82
+ "Cell \u001b[0;32mIn[1], line 37\u001b[0m\n\u001b[1;32m 35\u001b[0m api \u001b[38;5;241m=\u001b[39m AuraSRLitAPI()\n\u001b[1;32m 36\u001b[0m server \u001b[38;5;241m=\u001b[39m ls\u001b[38;5;241m.\u001b[39mLitServer(api, timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[0;32m---> 37\u001b[0m \u001b[43mserver\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43mport\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m8000\u001b[39;49m\u001b[43m)\u001b[49m\n",
83
+ "File \u001b[0;32m/dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages/litserve/server.py:488\u001b[0m, in \u001b[0;36mLitServer.run\u001b[0;34m(self, host, port, num_api_servers, log_level, generate_client_file, api_server_worker_type, **kwargs)\u001b[0m\n\u001b[1;32m 486\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSwagger UI is available at http://0.0.0.0:\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mport\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/docs\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 487\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m s \u001b[38;5;129;01min\u001b[39;00m servers:\n\u001b[0;32m--> 488\u001b[0m \u001b[43ms\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 489\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mShutting down LitServe\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
84
+ "File \u001b[0;32m/dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/multiprocessing/process.py:149\u001b[0m, in \u001b[0;36mBaseProcess.join\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 147\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_parent_pid \u001b[38;5;241m==\u001b[39m os\u001b[38;5;241m.\u001b[39mgetpid(), \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcan only join a child process\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 148\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_popen \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcan only join a started process\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m--> 149\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_popen\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 150\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m res \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 151\u001b[0m _children\u001b[38;5;241m.\u001b[39mdiscard(\u001b[38;5;28mself\u001b[39m)\n",
85
+ "File \u001b[0;32m/dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/multiprocessing/popen_fork.py:43\u001b[0m, in \u001b[0;36mPopen.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 41\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 42\u001b[0m \u001b[38;5;66;03m# This shouldn't block if wait() returned successfully.\u001b[39;00m\n\u001b[0;32m---> 43\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpoll\u001b[49m\u001b[43m(\u001b[49m\u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mWNOHANG\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0.0\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 44\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturncode\n",
86
+ "File \u001b[0;32m/dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/multiprocessing/popen_fork.py:27\u001b[0m, in \u001b[0;36mPopen.poll\u001b[0;34m(self, flag)\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturncode \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m---> 27\u001b[0m pid, sts \u001b[38;5;241m=\u001b[39m \u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwaitpid\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpid\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mflag\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 28\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m:\n\u001b[1;32m 29\u001b[0m \u001b[38;5;66;03m# Child process not yet created. See #1731717\u001b[39;00m\n\u001b[1;32m 30\u001b[0m \u001b[38;5;66;03m# e.errno == errno.ECHILD == 10\u001b[39;00m\n\u001b[1;32m 31\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n",
87
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
88
+ ]
89
+ }
90
+ ],
91
+ "source": [
92
+ "# !pip install aura_sr\n",
93
+ "\n",
94
+ "import litserve as ls\n",
95
+ "import torch\n",
96
+ "from io import BytesIO\n",
97
+ "\n",
98
+ "\n",
99
+ "from PIL import Image\n",
100
+ "from fastapi import Response\n",
101
+ "from aura_sr import AuraSR\n",
102
+ "\n",
103
+ "class AuraSRLitAPI(ls.LitAPI):\n",
104
+ " def setup(self, device):\n",
105
+ " # Load the model\n",
106
+ " self.aura_sr = AuraSR.from_pretrained(\"fal-ai/AuraSR\")\n",
107
+ "\n",
108
+ " def decode_request(self, request):\n",
109
+ " # Extract file from request\n",
110
+ " return request[\"content\"].file\n",
111
+ "\n",
112
+ " def predict(self, image_data):\n",
113
+ " # Generate the upscaled image\n",
114
+ " image = Image.open(image_data)\n",
115
+ " upscaled_image = self.aura_sr.upscale_4x(image)\n",
116
+ " \n",
117
+ " return upscaled_image\n",
118
+ "\n",
119
+ " def encode_response(self, image):\n",
120
+ " buffered = BytesIO()\n",
121
+ " image.save(buffered, format=\"PNG\")\n",
122
+ " return Response(content=buffered.getvalue(), headers={\"Content-Type\": \"image/png\"})\n",
123
+ "\n",
124
+ "# Starting the server\n",
125
+ "if __name__ == \"__main__\":\n",
126
+ " api = AuraSRLitAPI()\n",
127
+ " server = ls.LitServer(api, timeout=False)\n",
128
+ " server.run(port=8000)"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "code",
133
+ "execution_count": 2,
134
+ "metadata": {},
135
+ "outputs": [
136
+ {
137
+ "name": "stdout",
138
+ "output_type": "stream",
139
+ "text": [
140
+ "Requirement already satisfied: aura_sr in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (0.0.4)\n",
141
+ "Requirement already satisfied: torch>=2.0 in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from aura_sr) (2.5.1)\n",
142
+ "Requirement already satisfied: torchvision in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from aura_sr) (0.20.1)\n",
143
+ "Requirement already satisfied: numpy in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from aura_sr) (1.26.4)\n",
144
+ "Requirement already satisfied: einops in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from aura_sr) (0.8.0)\n",
145
+ "Requirement already satisfied: huggingface-hub in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from aura_sr) (0.27.0)\n",
146
+ "Requirement already satisfied: safetensors in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from aura_sr) (0.4.5)\n",
147
+ "Requirement already satisfied: filelock in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from torch>=2.0->aura_sr) (3.16.1)\n",
148
+ "Requirement already satisfied: typing-extensions>=4.8.0 in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from torch>=2.0->aura_sr) (4.12.2)\n",
149
+ "Requirement already satisfied: networkx in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from torch>=2.0->aura_sr) (3.4.2)\n",
150
+ "Requirement already satisfied: jinja2 in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from torch>=2.0->aura_sr) (3.1.5)\n",
151
+ "Requirement already satisfied: fsspec in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from torch>=2.0->aura_sr) (2024.9.0)\n",
152
+ "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.4.127 in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from torch>=2.0->aura_sr) (12.4.127)\n",
153
+ "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.4.127 in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from torch>=2.0->aura_sr) (12.4.127)\n",
154
+ "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.4.127 in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from torch>=2.0->aura_sr) (12.4.127)\n",
155
+ "Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from torch>=2.0->aura_sr) (9.1.0.70)\n",
156
+ "Requirement already satisfied: nvidia-cublas-cu12==12.4.5.8 in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from torch>=2.0->aura_sr) (12.4.5.8)\n",
157
+ "Requirement already satisfied: nvidia-cufft-cu12==11.2.1.3 in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from torch>=2.0->aura_sr) (11.2.1.3)\n",
158
+ "Requirement already satisfied: nvidia-curand-cu12==10.3.5.147 in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from torch>=2.0->aura_sr) (10.3.5.147)\n",
159
+ "Requirement already satisfied: nvidia-cusolver-cu12==11.6.1.9 in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from torch>=2.0->aura_sr) (11.6.1.9)\n",
160
+ "Requirement already satisfied: nvidia-cusparse-cu12==12.3.1.170 in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from torch>=2.0->aura_sr) (12.3.1.170)\n",
161
+ "Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from torch>=2.0->aura_sr) (2.21.5)\n",
162
+ "Requirement already satisfied: nvidia-nvtx-cu12==12.4.127 in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from torch>=2.0->aura_sr) (12.4.127)\n",
163
+ "Requirement already satisfied: nvidia-nvjitlink-cu12==12.4.127 in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from torch>=2.0->aura_sr) (12.4.127)\n",
164
+ "Requirement already satisfied: triton==3.1.0 in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from torch>=2.0->aura_sr) (3.1.0)\n",
165
+ "Requirement already satisfied: sympy==1.13.1 in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from torch>=2.0->aura_sr) (1.13.1)\n",
166
+ "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from sympy==1.13.1->torch>=2.0->aura_sr) (1.3.0)\n",
167
+ "Requirement already satisfied: packaging>=20.9 in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from huggingface-hub->aura_sr) (24.2)\n",
168
+ "Requirement already satisfied: pyyaml>=5.1 in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from huggingface-hub->aura_sr) (6.0.2)\n",
169
+ "Requirement already satisfied: requests in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from huggingface-hub->aura_sr) (2.32.3)\n",
170
+ "Requirement already satisfied: tqdm>=4.42.1 in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from huggingface-hub->aura_sr) (4.67.1)\n",
171
+ "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from torchvision->aura_sr) (11.0.0)\n",
172
+ "Requirement already satisfied: MarkupSafe>=2.0 in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from jinja2->torch>=2.0->aura_sr) (3.0.2)\n",
173
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from requests->huggingface-hub->aura_sr) (3.4.1)\n",
174
+ "Requirement already satisfied: idna<4,>=2.5 in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from requests->huggingface-hub->aura_sr) (3.10)\n",
175
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from requests->huggingface-hub->aura_sr) (2.3.0)\n",
176
+ "Requirement already satisfied: certifi>=2017.4.17 in /dscilab_dungvo/workspace/bin/envs/litserve/lib/python3.10/site-packages (from requests->huggingface-hub->aura_sr) (2024.12.14)\n",
177
+ "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.\u001b[0m\u001b[33m\n",
178
+ "\u001b[0m"
179
+ ]
180
+ }
181
+ ],
182
+ "source": [
183
+ "!pip install --upgrade aura_sr"
184
+ ]
185
+ },
186
+ {
187
+ "cell_type": "code",
188
+ "execution_count": null,
189
+ "metadata": {},
190
+ "outputs": [],
191
+ "source": []
192
+ }
193
+ ],
194
+ "metadata": {
195
+ "kernelspec": {
196
+ "display_name": "litserve",
197
+ "language": "python",
198
+ "name": "python3"
199
+ },
200
+ "language_info": {
201
+ "codemirror_mode": {
202
+ "name": "ipython",
203
+ "version": 3
204
+ },
205
+ "file_extension": ".py",
206
+ "mimetype": "text/x-python",
207
+ "name": "python",
208
+ "nbconvert_exporter": "python",
209
+ "pygments_lexer": "ipython3",
210
+ "version": "3.10.16"
211
+ }
212
+ },
213
+ "nbformat": 4,
214
+ "nbformat_minor": 2
215
+ }
a_main_folder/litserve/aurasr/.lightning_studio/.studiorc ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # This script is only for your user and runs in every shell you open.
2
+ # Use it to personalize your shell.
3
+ #
4
+ # Example: export MY_KEY=abcd-1234
a_main_folder/litserve/aurasr/.lightning_studio/on_start.sh ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # This script runs every time your Studio starts, from your home directory.
4
+
5
+ # List files under fast_load that need to load quickly on start (e.g. model checkpoints).
6
+ #
7
+ # ! fast_load
8
+ # <your file here>
9
+
10
+ # Add your startup commands below.
11
+ #
12
+ # Example: streamlit run my_app.py
13
+ # Example: gradio my_app.py
a_main_folder/litserve/aurasr/.lightning_studio/on_stop.sh ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # This script runs every time your Studio sleeps, from your home directory.
4
+
5
+ # Add your shutdown commands below.
6
+ #
7
+ # Example: docker down my-container
8
+ # Example: sudo service mysql stop
a_main_folder/litserve/aurasr/client.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import requests
3
+ from datetime import datetime
4
+
5
+ # Update this URL to your server's URL if hosted remotely
6
+ API_URL = "http://127.0.0.1:8000/predict"
7
+
8
+ def send_generate_request(path):
9
+ inputFile = open(path, 'rb')
10
+ inputData = inputFile.read()
11
+ inputFile.close()
12
+
13
+ response = requests.post(API_URL, files={"content": inputData})
14
+ if response.status_code == 200:
15
+ timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S").lower()
16
+ filename = f"output-{timestamp}.png"
17
+
18
+ with open(filename, "wb") as output_file:
19
+ output_file.write(response.content)
20
+
21
+ print(f"Image saved to {filename}")
22
+ else:
23
+ print(f"Error: Response with status code {response.status_code} - {response.text}")
24
+
25
+ if __name__ == "__main__":
26
+ parser = argparse.ArgumentParser(description="Send an image to the AuraSR server and receive the upscaled image.")
27
+ parser.add_argument("--path", required=True, help="Path to the input image file")
28
+ args = parser.parse_args()
29
+
30
+ send_generate_request(args.path)
a_main_folder/litserve/aurasr/input.jpg ADDED
a_main_folder/litserve/aurasr/server.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import litserve as ls
2
+ import torch
3
+ from io import BytesIO
4
+ from PIL import Image
5
+ from fastapi import Response
6
+ from aura_sr import AuraSR
7
+
8
+ class AuraSRLitAPI(ls.LitAPI):
9
+ def setup(self, device):
10
+ # Load the model
11
+ self.aura_sr = AuraSR.from_pretrained("fal-ai/AuraSR")
12
+
13
+ def decode_request(self, request):
14
+ # Extract file from request
15
+ return request["content"].file
16
+
17
+ def predict(self, image_data):
18
+ # Generate the upscaled image
19
+ image = Image.open(image_data)
20
+ upscaled_image = self.aura_sr.upscale_4x(image)
21
+
22
+ return upscaled_image
23
+
24
+ def encode_response(self, image):
25
+ buffered = BytesIO()
26
+ image.save(buffered, format="PNG")
27
+ return Response(content=buffered.getvalue(), headers={"Content-Type": "image/png"})
28
+
29
+ # Starting the server
30
+ if __name__ == "__main__":
31
+ api = AuraSRLitAPI()
32
+ server = ls.LitServer(api, timeout=False)
33
+ server.run(port=8000)
a_main_folder/llm2vec/test.ipynb ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "/dscilab_dungvo/workspace/bin/envs/llm2vec/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
13
+ " from .autonotebook import tqdm as notebook_tqdm\n"
14
+ ]
15
+ }
16
+ ],
17
+ "source": [
18
+ "import os\n",
19
+ "import llm2vec"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": 3,
25
+ "metadata": {},
26
+ "outputs": [
27
+ {
28
+ "name": "stderr",
29
+ "output_type": "stream",
30
+ "text": [
31
+ "/dscilab_dungvo/workspace/bin/envs/llm2vec/lib/python3.10/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
32
+ " warnings.warn(\n",
33
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
34
+ "Downloading shards: 100%|██████████████████████| 4/4 [28:51<00:00, 432.87s/it]\n",
35
+ "Loading checkpoint shards: 100%|████████████████| 4/4 [00:06<00:00, 1.65s/it]\n"
36
+ ]
37
+ }
38
+ ],
39
+ "source": [
40
+ "import torch\n",
41
+ "from llm2vec import LLM2Vec\n",
42
+ "\n",
43
+ "l2v = LLM2Vec.from_pretrained(\n",
44
+ " \"McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp\",\n",
45
+ " peft_model_name_or_path=\"McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-unsup-simcse\",\n",
46
+ " device_map=\"cuda\" if torch.cuda.is_available() else \"cpu\",\n",
47
+ " torch_dtype=torch.bfloat16,\n",
48
+ ")"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": null,
54
+ "metadata": {},
55
+ "outputs": [],
56
+ "source": []
57
+ }
58
+ ],
59
+ "metadata": {
60
+ "kernelspec": {
61
+ "display_name": "llm2vec",
62
+ "language": "python",
63
+ "name": "python3"
64
+ },
65
+ "language_info": {
66
+ "codemirror_mode": {
67
+ "name": "ipython",
68
+ "version": 3
69
+ },
70
+ "file_extension": ".py",
71
+ "mimetype": "text/x-python",
72
+ "name": "python",
73
+ "nbconvert_exporter": "python",
74
+ "pygments_lexer": "ipython3",
75
+ "version": "3.10.16"
76
+ }
77
+ },
78
+ "nbformat": 4,
79
+ "nbformat_minor": 2
80
+ }
a_main_folder/ultralytics/input.jpg ADDED
a_main_folder/ultralytics/test.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
open_clip/src/open_clip/hf_configs.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # HF architecture dict:
2
+ arch_dict = {
3
+ # https://huggingface.co/docs/transformers/model_doc/roberta#roberta
4
+ "roberta": {
5
+ "config_names": {
6
+ "context_length": "max_position_embeddings",
7
+ "vocab_size": "vocab_size",
8
+ "width": "hidden_size",
9
+ "heads": "num_attention_heads",
10
+ "layers": "num_hidden_layers",
11
+ "layer_attr": "layer",
12
+ "token_embeddings_attr": "embeddings"
13
+ },
14
+ "pooler": "mean_pooler",
15
+ },
16
+ # https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaConfig
17
+ "xlm-roberta": {
18
+ "config_names": {
19
+ "context_length": "max_position_embeddings",
20
+ "vocab_size": "vocab_size",
21
+ "width": "hidden_size",
22
+ "heads": "num_attention_heads",
23
+ "layers": "num_hidden_layers",
24
+ "layer_attr": "layer",
25
+ "token_embeddings_attr": "embeddings"
26
+ },
27
+ "pooler": "mean_pooler",
28
+ },
29
+ # https://huggingface.co/docs/transformers/model_doc/mt5#mt5
30
+ "mt5": {
31
+ "config_names": {
32
+ # unlimited seqlen
33
+ # https://github.com/google-research/text-to-text-transfer-transformer/issues/273
34
+ # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/t5/modeling_t5.py#L374
35
+ "context_length": "",
36
+ "vocab_size": "vocab_size",
37
+ "width": "d_model",
38
+ "heads": "num_heads",
39
+ "layers": "num_layers",
40
+ "layer_attr": "block",
41
+ "token_embeddings_attr": "embed_tokens"
42
+ },
43
+ "pooler": "mean_pooler",
44
+ },
45
+ # https://huggingface.co/docs/transformers/model_doc/bert
46
+ "bert": {
47
+ "config_names": {
48
+ "context_length": "max_position_embeddings",
49
+ "vocab_size": "vocab_size",
50
+ "width": "hidden_size",
51
+ "heads": "num_attention_heads",
52
+ "layers": "num_hidden_layers",
53
+ },
54
+ "pooler": "cls_pooler",
55
+ },
56
+ # https://huggingface.co/docs/transformers/model_doc/m2m_100
57
+ "m2m_100": {
58
+ "config_names": {
59
+ "context_length": "max_position_embeddings",
60
+ "vocab_size": "vocab_size",
61
+ "width": "d_model",
62
+ "heads": "encoder_attention_heads",
63
+ "layers": "encoder_layers",
64
+ },
65
+ "pooler": "cls_pooler",
66
+ },
67
+ }
open_clip/src/open_clip/model_configs/RN101.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 512,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": [
6
+ 3,
7
+ 4,
8
+ 23,
9
+ 3
10
+ ],
11
+ "width": 64,
12
+ "patch_size": null
13
+ },
14
+ "text_cfg": {
15
+ "context_length": 77,
16
+ "vocab_size": 49408,
17
+ "width": 512,
18
+ "heads": 8,
19
+ "layers": 12
20
+ }
21
+ }
open_clip/src/open_clip/model_configs/RN50x16.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "vision_cfg": {
4
+ "image_size": 384,
5
+ "layers": [
6
+ 6,
7
+ 8,
8
+ 18,
9
+ 8
10
+ ],
11
+ "width": 96,
12
+ "patch_size": null
13
+ },
14
+ "text_cfg": {
15
+ "context_length": 77,
16
+ "vocab_size": 49408,
17
+ "width": 768,
18
+ "heads": 12,
19
+ "layers": 12
20
+ }
21
+ }
open_clip/src/open_clip/model_configs/ViT-B-16-SigLIP-i18n-256.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "init_logit_bias": -10,
4
+ "custom_text": true,
5
+ "vision_cfg": {
6
+ "image_size": 256,
7
+ "timm_model_name": "vit_base_patch16_siglip_256",
8
+ "timm_model_pretrained": false,
9
+ "timm_pool": "map",
10
+ "timm_proj": "none"
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 64,
14
+ "vocab_size": 250000,
15
+ "hf_tokenizer_name": "timm/ViT-B-16-SigLIP-i18n-256",
16
+ "tokenizer_kwargs": {
17
+ "clean": "canonicalize"
18
+ },
19
+ "width": 768,
20
+ "heads": 12,
21
+ "layers": 12,
22
+ "no_causal_mask": true,
23
+ "proj_bias": true,
24
+ "pool_type": "last",
25
+ "norm_kwargs":{
26
+ "eps": 1e-6
27
+ }
28
+ }
29
+ }
open_clip/src/open_clip/model_configs/ViT-B-16-quickgelu.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 512,
3
+ "quick_gelu": true,
4
+ "vision_cfg": {
5
+ "image_size": 224,
6
+ "layers": 12,
7
+ "width": 768,
8
+ "patch_size": 16
9
+ },
10
+ "text_cfg": {
11
+ "context_length": 77,
12
+ "vocab_size": 49408,
13
+ "width": 512,
14
+ "heads": 8,
15
+ "layers": 12
16
+ }
17
+ }
open_clip/src/open_clip/model_configs/ViT-B-16.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 512,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 12,
6
+ "width": 768,
7
+ "patch_size": 16
8
+ },
9
+ "text_cfg": {
10
+ "context_length": 77,
11
+ "vocab_size": 49408,
12
+ "width": 512,
13
+ "heads": 8,
14
+ "layers": 12
15
+ }
16
+ }
open_clip/src/open_clip/model_configs/ViT-B-32-plus-256.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 640,
3
+ "vision_cfg": {
4
+ "image_size": 256,
5
+ "layers": 12,
6
+ "width": 896,
7
+ "patch_size": 32
8
+ },
9
+ "text_cfg": {
10
+ "context_length": 77,
11
+ "vocab_size": 49408,
12
+ "width": 640,
13
+ "heads": 10,
14
+ "layers": 12
15
+ }
16
+ }
open_clip/src/open_clip/model_configs/ViT-B-32-quickgelu.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 512,
3
+ "quick_gelu": true,
4
+ "vision_cfg": {
5
+ "image_size": 224,
6
+ "layers": 12,
7
+ "width": 768,
8
+ "patch_size": 32
9
+ },
10
+ "text_cfg": {
11
+ "context_length": 77,
12
+ "vocab_size": 49408,
13
+ "width": 512,
14
+ "heads": 8,
15
+ "layers": 12
16
+ }
17
+ }
open_clip/src/open_clip/model_configs/ViT-B-32.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 512,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 12,
6
+ "width": 768,
7
+ "patch_size": 32
8
+ },
9
+ "text_cfg": {
10
+ "context_length": 77,
11
+ "vocab_size": 49408,
12
+ "width": 512,
13
+ "heads": 8,
14
+ "layers": 12
15
+ }
16
+ }
open_clip/src/open_clip/model_configs/ViT-H-14-378-quickgelu.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "quick_gelu": true,
4
+ "vision_cfg": {
5
+ "image_size": 378,
6
+ "layers": 32,
7
+ "width": 1280,
8
+ "head_width": 80,
9
+ "patch_size": 14
10
+ },
11
+ "text_cfg": {
12
+ "context_length": 77,
13
+ "vocab_size": 49408,
14
+ "width": 1024,
15
+ "heads": 16,
16
+ "layers": 24
17
+ }
18
+ }
open_clip/src/open_clip/model_configs/ViT-H-14-CLIPA.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 32,
6
+ "width": 1280,
7
+ "head_width": 80,
8
+ "patch_size": 14,
9
+ "no_ln_pre": true,
10
+ "pool_type": "avg",
11
+ "final_ln_after_pool": true
12
+ },
13
+ "text_cfg": {
14
+ "context_length": 32,
15
+ "vocab_size": 32000,
16
+ "hf_tokenizer_name": "bert-base-uncased",
17
+ "tokenizer_kwargs": {
18
+ "strip_sep_token": true
19
+ },
20
+ "width": 1024,
21
+ "heads": 16,
22
+ "layers": 24,
23
+ "pool_type": "last",
24
+ "no_causal_mask": true
25
+ }
26
+ }