acmyu committed
Commit 3366cca · verified · 1 Parent(s): d298085

initial commit

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .gitattributes +1 -0
  2. LICENSE +201 -0
  3. README.md +12 -12
  4. app.py +48 -7
  5. caculate_metrics_256.py +27 -0
  6. caculate_metrics_512.py +27 -0
  7. evaluate.py +186 -0
  8. inception.py +138 -0
  9. main.py +1097 -0
  10. metrics.json +1538 -0
  11. metrics.py +522 -0
  12. pose-frames.py +16 -0
  13. pose.py +15 -0
  14. requirements.txt +8 -0
  15. run_stage1.sh +18 -0
  16. run_stage2.sh +18 -0
  17. run_stage3.sh +15 -0
  18. run_test_stage1.sh +9 -0
  19. run_test_stage2.sh +13 -0
  20. run_test_stage3.sh +12 -0
  21. sd.py +13 -0
  22. setup.txt +41 -0
  23. single_extract_pose.py +35 -0
  24. src/__init__.py +0 -0
  25. src/__pycache__/__init__.cpython-311.pyc +0 -0
  26. src/configs/dwpose-l_384x288.py +257 -0
  27. src/configs/stage1_config.py +181 -0
  28. src/configs/stage2_config.py +192 -0
  29. src/configs/stage3_config.py +217 -0
  30. src/configs/yolox_l_8xb8-300e_coco.py +245 -0
  31. src/controlnet_aux/__init__.py +18 -0
  32. src/controlnet_aux/__pycache__/__init__.cpython-311.pyc +0 -0
  33. src/controlnet_aux/__pycache__/util.cpython-311.pyc +0 -0
  34. src/controlnet_aux/canny/__init__.py +36 -0
  35. src/controlnet_aux/canny/__pycache__/__init__.cpython-311.pyc +0 -0
  36. src/controlnet_aux/dwpose/__init__.py +92 -0
  37. src/controlnet_aux/dwpose/__pycache__/__init__.cpython-311.pyc +0 -0
  38. src/controlnet_aux/dwpose/__pycache__/util.cpython-311.pyc +0 -0
  39. src/controlnet_aux/dwpose/__pycache__/wholebody.cpython-311.pyc +0 -0
  40. src/controlnet_aux/dwpose/dwpose_config/dwpose-l_384x288.py +257 -0
  41. src/controlnet_aux/dwpose/util.py +303 -0
  42. src/controlnet_aux/dwpose/wholebody.py +121 -0
  43. src/controlnet_aux/dwpose/yolox_config/yolox_l_8xb8-300e_coco.py +245 -0
  44. src/controlnet_aux/hed/__init__.py +129 -0
  45. src/controlnet_aux/hed/__pycache__/__init__.cpython-311.pyc +0 -0
  46. src/controlnet_aux/leres/__init__.py +118 -0
  47. src/controlnet_aux/leres/__pycache__/__init__.cpython-311.pyc +0 -0
  48. src/controlnet_aux/leres/leres/LICENSE +23 -0
  49. src/controlnet_aux/leres/leres/Resnet.py +199 -0
  50. src/controlnet_aux/leres/leres/Resnext_torch.py +237 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ src/controlnet_aux/tests/test_image.png filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md CHANGED
@@ -1,12 +1,12 @@
1
- ---
2
- title: KeyframesAI2
3
- emoji: 📊
4
- colorFrom: green
5
- colorTo: red
6
- sdk: gradio
7
- sdk_version: 5.41.1
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ title: KeyframesAI
3
+ emoji: 📈
4
+ colorFrom: gray
5
+ colorTo: gray
6
+ sdk: gradio
7
+ sdk_version: 5.22.0
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,7 +1,48 @@
1
- import gradio as gr
2
-
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
-
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
1
+ from main import run_app, run_train, run_inference
2
+
3
+ import spaces
4
+ from PIL import Image
5
+ import cv2
6
+ import os
7
+ import gradio as gr
8
+
9
+ with gr.Blocks() as demo:
10
+ with gr.Row():
11
+ with gr.Column():
12
+ char_imgs = gr.Gallery(type="pil", label="Images of the Character")
13
+ mocap = gr.Video(label="Motion-Capture Video")
14
+ tr_steps = gr.Number(label="Training steps", value=10)
15
+ inf_steps = gr.Number(label="Inference steps", value=10)
16
+ fps = gr.Number(label="Output frame rate", value=12)
17
+ modelId = gr.Text(label="Model Id", value="fine_tuned_pcdms")
18
+ remove_bg = gr.Checkbox(label="Remove background", value=False)
19
+ resize_inputs = gr.Checkbox(label="Resize images to match video", value=True)
20
+ train_btn = gr.Button(value="Train")
21
+ inference_btn = gr.Button(value="Inference")
22
+ submit_btn = gr.Button(value="Generate")
23
+ with gr.Column():
24
+ animation = gr.Video(label="Result")
25
+ frames = gr.Gallery(type="pil", label="Frames")
26
+
27
+ submit_btn.click(
28
+ run_app, inputs=[char_imgs, mocap, tr_steps, inf_steps, fps, remove_bg, resize_inputs], outputs=[animation, frames]
29
+ )
30
+
31
+ train_btn.click(
32
+ run_train, inputs=[char_imgs, tr_steps, remove_bg, resize_inputs, modelId], outputs=[]
33
+ )
34
+
35
+ inference_btn.click(
36
+ run_inference, inputs=[char_imgs, mocap, inf_steps, fps, remove_bg, resize_inputs, modelId], outputs=[animation, frames]
37
+ )
38
+
39
+
40
+
41
+
42
+ demo.launch(share=True)
43
+
44
+
45
+
46
+
47
+
48
+
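The interface above wires three buttons to run_app, run_train and run_inference imported from main.py. Below is a minimal headless sketch of the same flow; it assumes the gallery values arrive as plain PIL images and that the argument order matches the click handlers above, and the file names are placeholders:

    # Hedged sketch: drive the same entry points without the Gradio UI.
    # Assumes run_train/run_inference accept the argument order used in the
    # click handlers above and that character images are plain PIL images.
    from PIL import Image
    from main import run_train, run_inference

    char_imgs = [Image.open(p) for p in ("char_1.png", "char_2.png")]  # placeholder paths
    model_id = "fine_tuned_pcdms"

    # Fine-tune for 10 steps, keep backgrounds, resize inputs to match the video
    run_train(char_imgs, 10, False, True, model_id)

    # Animate with 10 inference steps at 12 fps using the fine-tuned weights
    animation, frames = run_inference(char_imgs, "mocap.mp4", 10, 12, False, True, model_id)
    print(animation, len(frames))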
caculate_metrics_256.py ADDED
@@ -0,0 +1,27 @@
1
+ from metrics import FID, LPIPS, Reconstruction_Metrics, preprocess_path_for_deform_task
2
+ import torch
3
+
4
+
5
+ device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
6
+ fid = FID()
7
+ lpips_obj = LPIPS()
8
+ rec = Reconstruction_Metrics()
9
+
10
+ real_path = './datasets/deepfashing/train_lst_256_png'
11
+ gt_path = '/datasets/deepfashing/test_lst_256_png'
12
+
13
+
14
+ distorated_path = './PCDMs_Results/stage3_256_results'
15
+ results_save_path = distorated_path + '_results.txt' # save path
16
+
17
+
18
+ gt_list, distorated_list = preprocess_path_for_deform_task(gt_path, distorated_path)
19
+ print(len(gt_list), len(distorated_list))
20
+
21
+ FID = fid.calculate_from_disk(distorated_path, real_path, img_size=(176,256))
22
+ LPIPS = lpips_obj.calculate_from_disk(distorated_list, gt_list, img_size=(176,256), sort=False)
23
+ REC = rec.calculate_from_disk(distorated_list, gt_list, distorated_path, img_size=(176,256), sort=False, debug=False)
24
+
25
+ print ("FID: "+str(FID)+"\nLPIPS: "+str(LPIPS)+"\nSSIM: "+str(REC))
26
+ with open(results_save_path, 'a') as ff:
27
+ ff.write("\nFID: "+str(FID)+"\nLPIPS: "+str(LPIPS)+"\nSSIM: "+str(REC))
caculate_metrics_512.py ADDED
@@ -0,0 +1,27 @@
1
+ from metrics import FID, LPIPS, Reconstruction_Metrics, preprocess_path_for_deform_task
2
+ import torch
3
+
4
+
5
+ device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
6
+ fid = FID()
7
+ lpips_obj = LPIPS()
8
+ rec = Reconstruction_Metrics()
9
+
10
+ real_path = './datasets/deepfashing/train_lst_512_png'
11
+ gt_path = '/datasets/deepfashing/test_lst_512_png'
12
+
13
+
14
+ distorated_path = './PCDMs_Results/stage3_512_results'
15
+ results_save_path = distorated_path + '_results.txt' # save path
16
+
17
+
18
+ gt_list, distorated_list = preprocess_path_for_deform_task(gt_path, distorated_path)
19
+ print(len(gt_list), len(distorated_list))
20
+
21
+ FID = fid.calculate_from_disk(distorated_path, real_path, img_size=(352,512))
22
+ LPIPS = lpips_obj.calculate_from_disk(distorated_list, gt_list, img_size=(352,512), sort=False)
23
+ REC = rec.calculate_from_disk(distorated_list, gt_list, distorated_path, img_size=(352,512), sort=False, debug=False)
24
+
25
+ print ("FID: "+str(FID)+"\nLPIPS: "+str(LPIPS)+"\nSSIM: "+str(REC))
26
+ with open(results_save_path, 'a') as ff:
27
+ ff.write("\nFID: "+str(FID)+"\nLPIPS: "+str(LPIPS)+"\nSSIM: "+str(REC))
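caculate_metrics_256.py and caculate_metrics_512.py differ only in the dataset paths and the target image size (note that gt_path starts with '/datasets/…' while real_path is relative; both are presumably meant to point into ./datasets). A hedged sketch that folds both scripts into one parameterized helper, reusing the same metrics API; the paths are assumptions carried over from the scripts above:

    # Hedged sketch: one helper covering both resolution-specific scripts above.
    from metrics import FID, LPIPS, Reconstruction_Metrics, preprocess_path_for_deform_task

    def evaluate_resolution(real_path, gt_path, distorted_path, img_size):
        fid, lpips_obj, rec = FID(), LPIPS(), Reconstruction_Metrics()
        gt_list, distorted_list = preprocess_path_for_deform_task(gt_path, distorted_path)
        fid_score = fid.calculate_from_disk(distorted_path, real_path, img_size=img_size)
        lpips_score = lpips_obj.calculate_from_disk(distorted_list, gt_list, img_size=img_size, sort=False)
        rec_scores = rec.calculate_from_disk(distorted_list, gt_list, distorted_path,
                                             img_size=img_size, sort=False, debug=False)
        return fid_score, lpips_score, rec_scores

    for res, size in ((256, (176, 256)), (512, (352, 512))):
        print(res, evaluate_resolution(
            f"./datasets/deepfashing/train_lst_{res}_png",
            f"./datasets/deepfashing/test_lst_{res}_png",
            f"./PCDMs_Results/stage3_{res}_results",
            size))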
evaluate.py ADDED
@@ -0,0 +1,186 @@
1
+ from main import extract_frames, run
2
+
3
+ from PIL import Image
4
+ import numpy as np
5
+ from skimage.metrics import structural_similarity as ssim
6
+ from skimage.metrics import peak_signal_noise_ratio as psnr
7
+ import torch
8
+ import torchvision.transforms as transforms
9
+ import lpips
10
+ from pytorch_fid.fid_score import calculate_fid_given_paths
11
+ import os
12
+ import json
13
+
14
+ # Convert PIL to numpy
15
+ def pil_to_np(img):
16
+ return np.array(img).astype(np.float32) / 255.0
17
+
18
+ # SSIM
19
+ def compute_ssim(img1, img2):
20
+ img1_np = pil_to_np(img1)
21
+ img2_np = pil_to_np(img2)
22
+
23
+ h, w = img1_np.shape[:2]
24
+ min_dim = min(h, w)
25
+ win_size = min(7, min_dim if min_dim % 2 == 1 else min_dim - 1) # ensure odd
26
+
27
+ return ssim(img1_np, img2_np, win_size=win_size, channel_axis=-1, data_range=1.0)
28
+
29
+ # PSNR
30
+ def compute_psnr(img1, img2):
31
+ img1_np = pil_to_np(img1)
32
+ img2_np = pil_to_np(img2)
33
+ return psnr(img1_np, img2_np, data_range=1.0)
34
+
35
+ # LPIPS
36
+ lpips_model = lpips.LPIPS(net='alex')
37
+ lpips_transform = transforms.Compose([
38
+ transforms.Resize((256, 256)),
39
+ transforms.ToTensor(),
40
+ transforms.Normalize([0.5]*3, [0.5]*3)
41
+ ])
42
+ def compute_lpips(img1, img2):
43
+
44
+ img1_tensor = lpips_transform(img1).unsqueeze(0)
45
+ img2_tensor = lpips_transform(img2).unsqueeze(0)
46
+ return lpips_model(img1_tensor, img2_tensor).item()
47
+
48
+ # FID: Save images to temp folders for FID calculation
49
+ def compute_fid(img1, img2):
50
+ os.makedirs('temp/img1', exist_ok=True)
51
+ os.makedirs('temp/img2', exist_ok=True)
52
+ img1.save('temp/img1/0.png')
53
+ img2.save('temp/img2/0.png')
54
+ fid = calculate_fid_given_paths(['temp/img1', 'temp/img2'], batch_size=1, device='cpu', dims=2048)
55
+ return fid
56
+
57
+
58
+ with open('metrics.json', 'r') as file:
59
+ metrics = json.load(file)
60
+
61
+ def get_score(item, image_paths, video_path, train_steps=100, inference_steps=10, fps=12, bg_remove=False):
62
+ print(item)
63
+
64
+ images = []
65
+ for path in image_paths:
66
+ img = Image.open(path)
67
+ images.append(img)
68
+
69
+ gt_frames = extract_frames(video_path, fps)
70
+
71
+ os.makedirs('out/'+item, exist_ok=True)
72
+
73
+
74
+ for i, frame in enumerate(gt_frames):
75
+ frame.save("out/"+item+"/frame_"+str(i)+".png")
76
+
77
+ results = run(images, video_path, train_steps=100, inference_steps=10, fps=12, bg_remove=False, finetune=True)
78
+
79
+ for i, result in enumerate(results):
80
+ result.save("out/"+item+"/result_"+str(i)+".png")
81
+
82
+ results_base = run(images, video_path, train_steps=100, inference_steps=10, fps=12, bg_remove=False, finetune=False)
83
+
84
+ for i, result in enumerate(results_base):
85
+ result.save("out/"+item+"/base_"+str(i)+".png")
86
+
87
+ """
88
+ img1=gt_frames[0]
89
+ img2=Image.open("out/base_0.png")
90
+ print("SSIM:", compute_ssim(img1, img2))
91
+ print("PSNR:", compute_psnr(img1, img2))
92
+ print("LPIPS:", compute_lpips(img1, img2))
93
+ print("FID:", compute_fid(img1, img2))
94
+ """
95
+
96
+ ssim = []
97
+ psnr = []
98
+ lpips = []
99
+ fid = []
100
+ ssim2 = []
101
+ psnr2 = []
102
+ lpips2 = []
103
+ fid2 = []
104
+ for gt, result, base in zip(gt_frames, results, results_base):
105
+ ssim.append(float(compute_ssim(gt, result)))
106
+ psnr.append(float(compute_psnr(gt, result)))
107
+ lpips.append(float(compute_lpips(gt, result)))
108
+ fid.append(float(compute_fid(gt, result)))
109
+
110
+ ssim2.append(float(compute_ssim(gt, base)))
111
+ psnr2.append(float(compute_psnr(gt, base)))
112
+ lpips2.append(float(compute_lpips(gt, base)))
113
+ fid2.append(float(compute_fid(gt, base)))
114
+
115
+
116
+ print("SSIM:", sum(ssim)/len(ssim))
117
+ print("PSNR:", sum(psnr)/len(psnr))
118
+ print("LPIPS:", sum(lpips)/len(lpips))
119
+ print("FID:", sum(fid)/len(fid))
120
+ print('baseline:')
121
+ print("SSIM:", sum(ssim2)/len(ssim2))
122
+ print("PSNR:", sum(psnr2)/len(psnr2))
123
+ print("LPIPS:", sum(lpips2)/len(lpips2))
124
+ print("FID:", sum(fid2)/len(fid2))
125
+
126
+ metrics[item] = {'ft': {}, 'base': {}}
127
+ metrics[item]['ft']['ssim'] = {'avg': sum(ssim)/len(ssim), 'vals': ssim}
128
+ metrics[item]['ft']['psnr'] = {'avg': sum(psnr)/len(psnr), 'vals': psnr}
129
+ metrics[item]['ft']['lpips'] = {'avg': sum(lpips)/len(lpips), 'vals': lpips}
130
+ metrics[item]['ft']['fid'] = {'avg': sum(fid)/len(fid), 'vals': fid}
131
+ metrics[item]['base']['ssim'] = {'avg': sum(ssim2)/len(ssim2), 'vals': ssim2}
132
+ metrics[item]['base']['psnr'] = {'avg': sum(psnr2)/len(psnr2), 'vals': psnr2}
133
+ metrics[item]['base']['lpips'] = {'avg': sum(lpips2)/len(lpips2), 'vals': lpips2}
134
+ metrics[item]['base']['fid'] = {'avg': sum(fid2)/len(fid2), 'vals': fid2}
135
+
136
+ with open('metrics.json', "w", encoding="utf-8") as json_file:
137
+ json.dump(metrics, json_file, ensure_ascii=False, indent=4)
138
+
139
+
140
+
141
+
142
+ items = ['sidewalk', 'aaa', 'azri', 'dead', 'frankgirl', 'kobold', 'ramona', 'renee', 'walk', 'woody']
143
+ for item in items:
144
+ if item in metrics:
145
+ continue
146
+ get_score(item, ['test/'+item+'/1.jpg', 'test/'+item+'/2.jpg', 'test/'+item+'/3.jpg'], 'test/'+item+'/v.mp4')
147
+
148
+
149
+
150
+ ssim = []
151
+ psnr = []
152
+ lpips = []
153
+ fid = []
154
+ ssim2 = []
155
+ psnr2 = []
156
+ lpips2 = []
157
+ fid2 = []
158
+ for item in metrics.keys():
159
+ ssim.append(metrics[item]['ft']['ssim']['avg'])
160
+ psnr.append(metrics[item]['ft']['psnr']['avg'])
161
+ lpips.append(metrics[item]['ft']['lpips']['avg'])
162
+ fid.append(metrics[item]['ft']['fid']['avg'])
163
+
164
+ ssim2.append(metrics[item]['base']['ssim']['avg'])
165
+ psnr2.append(metrics[item]['base']['psnr']['avg'])
166
+ lpips2.append(metrics[item]['base']['lpips']['avg'])
167
+ fid2.append(metrics[item]['base']['fid']['avg'])
168
+
169
+ print(item)
170
+ print("SSIM:", metrics[item]['ft']['ssim']['avg'], metrics[item]['base']['ssim']['avg'])
171
+ print("PSNR:", metrics[item]['ft']['psnr']['avg'], metrics[item]['base']['psnr']['avg'])
172
+ print("LPIPS:", metrics[item]['ft']['lpips']['avg'], metrics[item]['base']['lpips']['avg'])
173
+ print("FID:", metrics[item]['ft']['fid']['avg'], metrics[item]['base']['fid']['avg'])
174
+
175
+ print('Results:')
176
+ print("SSIM:", sum(ssim)/len(ssim))
177
+ print("PSNR:", sum(psnr)/len(psnr))
178
+ print("LPIPS:", sum(lpips)/len(lpips))
179
+ print("FID:", sum(fid)/len(fid))
180
+ print('baseline:')
181
+ print("SSIM:", sum(ssim2)/len(ssim2))
182
+ print("PSNR:", sum(psnr2)/len(psnr2))
183
+ print("LPIPS:", sum(lpips2)/len(lpips2))
184
+ print("FID:", sum(fid2)/len(fid2))
185
+
186
+
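The helpers at the top of evaluate.py each compare a pair of PIL images; FID here is computed from two single-image folders, so it is only a rough indicator rather than a true distribution distance. A minimal sketch of using them on one ground-truth/result pair is below (paths are placeholders; evaluate.py also runs its benchmark loop at module level, so the helpers would need to be factored out or guarded by __main__ before importing them like this):

    # Hedged sketch: score one generated frame against its ground-truth frame.
    # Assumes the metric helpers have been moved into a module that does not
    # execute the benchmark loop on import.
    from PIL import Image
    from evaluate import compute_ssim, compute_psnr, compute_lpips, compute_fid

    gt = Image.open("out/walk/frame_0.png").convert("RGB")       # placeholder paths
    pred = Image.open("out/walk/result_0.png").convert("RGB")

    print("SSIM :", compute_ssim(gt, pred))    # closer to 1.0 is better
    print("PSNR :", compute_psnr(gt, pred))    # higher is better, in dB
    print("LPIPS:", compute_lpips(gt, pred))   # lower is better
    print("FID  :", compute_fid(gt, pred))     # single-image FID, indicative only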
inception.py ADDED
@@ -0,0 +1,138 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from torchvision import models
5
+
6
+
7
+ class InceptionV3(nn.Module):
8
+ """Pretrained InceptionV3 network returning feature maps"""
9
+
10
+ # Index of default block of inception to return,
11
+ # corresponds to output of final average pooling
12
+ DEFAULT_BLOCK_INDEX = 3
13
+
14
+ # Maps feature dimensionality to their output blocks indices
15
+ BLOCK_INDEX_BY_DIM = {
16
+ 64: 0, # First max pooling features
17
+ 192: 1, # Second max pooling features
18
+ 768: 2, # Pre-aux classifier features
19
+ 2048: 3 # Final average pooling features
20
+ }
21
+
22
+ def __init__(self,
23
+ output_blocks=[DEFAULT_BLOCK_INDEX],
24
+ resize_input=True,
25
+ normalize_input=True,
26
+ requires_grad=False):
27
+ """Build pretrained InceptionV3
28
+ Parameters
29
+ ----------
30
+ output_blocks : list of int
31
+ Indices of blocks to return features of. Possible values are:
32
+ - 0: corresponds to output of first max pooling
33
+ - 1: corresponds to output of second max pooling
34
+ - 2: corresponds to output which is fed to aux classifier
35
+ - 3: corresponds to output of final average pooling
36
+ resize_input : bool
37
+ If true, bilinearly resizes input to width and height 299 before
38
+ feeding input to model. As the network without fully connected
39
+ layers is fully convolutional, it should be able to handle inputs
40
+ of arbitrary size, so resizing might not be strictly needed
41
+ normalize_input : bool
42
+ If true, normalizes the input to the statistics the pretrained
43
+ Inception network expects
44
+ requires_grad : bool
45
+ If true, parameters of the model require gradient. Possibly useful
46
+ for finetuning the network
47
+ """
48
+ super(InceptionV3, self).__init__()
49
+
50
+ self.resize_input = resize_input
51
+ self.normalize_input = normalize_input
52
+ self.output_blocks = sorted(output_blocks)
53
+ self.last_needed_block = max(output_blocks)
54
+
55
+ assert self.last_needed_block <= 3, \
56
+ 'Last possible output block index is 3'
57
+
58
+ self.blocks = nn.ModuleList()
59
+
60
+ inception = models.inception_v3(pretrained=True)
61
+ # Block 0: input to maxpool1
62
+ block0 = [
63
+ inception.Conv2d_1a_3x3,
64
+ inception.Conv2d_2a_3x3,
65
+ inception.Conv2d_2b_3x3,
66
+ nn.MaxPool2d(kernel_size=3, stride=2)
67
+ ]
68
+ self.blocks.append(nn.Sequential(*block0))
69
+
70
+ # Block 1: maxpool1 to maxpool2
71
+ if self.last_needed_block >= 1:
72
+ block1 = [
73
+ inception.Conv2d_3b_1x1,
74
+ inception.Conv2d_4a_3x3,
75
+ nn.MaxPool2d(kernel_size=3, stride=2)
76
+ ]
77
+ self.blocks.append(nn.Sequential(*block1))
78
+
79
+ # Block 2: maxpool2 to aux classifier
80
+ if self.last_needed_block >= 2:
81
+ block2 = [
82
+ inception.Mixed_5b,
83
+ inception.Mixed_5c,
84
+ inception.Mixed_5d,
85
+ inception.Mixed_6a,
86
+ inception.Mixed_6b,
87
+ inception.Mixed_6c,
88
+ inception.Mixed_6d,
89
+ inception.Mixed_6e,
90
+ ]
91
+ self.blocks.append(nn.Sequential(*block2))
92
+
93
+ # Block 3: aux classifier to final avgpool
94
+ if self.last_needed_block >= 3:
95
+ block3 = [
96
+ inception.Mixed_7a,
97
+ inception.Mixed_7b,
98
+ inception.Mixed_7c,
99
+ nn.AdaptiveAvgPool2d(output_size=(1, 1))
100
+ ]
101
+ self.blocks.append(nn.Sequential(*block3))
102
+
103
+ for param in self.parameters():
104
+ param.requires_grad = requires_grad
105
+
106
+ def forward(self, inp):
107
+ """Get Inception feature maps
108
+ Parameters
109
+ ----------
110
+ inp : torch.autograd.Variable
111
+ Input tensor of shape Bx3xHxW. Values are expected to be in
112
+ range (0, 1)
113
+ Returns
114
+ -------
115
+ List of torch.autograd.Variable, corresponding to the selected output
116
+ block, sorted ascending by index
117
+ """
118
+ outp = []
119
+ x = inp
120
+
121
+ if self.resize_input:
122
+ x = F.upsample(x, size=(299, 299), mode='bilinear')
123
+
124
+ if self.normalize_input:
125
+ x = x.clone()
126
+ x[:, 0] = x[:, 0] * (0.229 / 0.5) + (0.485 - 0.5) / 0.5
127
+ x[:, 1] = x[:, 1] * (0.224 / 0.5) + (0.456 - 0.5) / 0.5
128
+ x[:, 2] = x[:, 2] * (0.225 / 0.5) + (0.406 - 0.5) / 0.5
129
+
130
+ for idx, block in enumerate(self.blocks):
131
+ x = block(x)
132
+ if idx in self.output_blocks:
133
+ outp.append(x)
134
+
135
+ if idx == self.last_needed_block:
136
+ break
137
+
138
+ return outp
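InceptionV3 above returns the intermediate feature maps selected by output_blocks; block 3 (the 2048-dimensional final-average-pool features) is the standard choice for FID. A short sketch of extracting those features, with a random batch standing in for real images:

    # Hedged sketch: pull the 2048-d pool features typically used for FID.
    import torch
    from inception import InceptionV3

    block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[2048]   # -> 3, final average pooling
    model = InceptionV3(output_blocks=[block_idx]).eval()

    images = torch.rand(4, 3, 299, 299)                # stand-in batch, values in [0, 1]
    with torch.no_grad():
        features = model(images)[0]                    # shape (4, 2048, 1, 1)
    features = features.squeeze(-1).squeeze(-1)        # shape (4, 2048)
    print(features.shape)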
main.py ADDED
@@ -0,0 +1,1097 @@
1
+ import logging
2
+ import math
3
+ import os
4
+ from typing import Any, Dict, List, Optional, Tuple, Union
5
+ from diffusers.models.controlnet import ControlNetConditioningEmbedding
6
+ import torch
7
+ from torch import nn
8
+ import torch.nn.functional as F
9
+ import torch.utils.checkpoint
10
+ import transformers
11
+ from accelerate import Accelerator
12
+ from accelerate.logging import get_logger
13
+ from accelerate.utils import ProjectConfiguration, set_seed
14
+
15
+ from tqdm.auto import tqdm
16
+ from src.configs.stage2_config import args
17
+
18
+ import diffusers
19
+ from diffusers import (
20
+ AutoencoderKL,
21
+ DDPMScheduler,
22
+ )
23
+ from diffusers.optimization import get_scheduler
24
+ from diffusers.utils import check_min_version, is_wandb_available
25
+ from src.dataset.stage2_dataset import InpaintDataset, InpaintCollate_fn
26
+ from transformers import CLIPVisionModelWithProjection
27
+ from transformers import Dinov2Model
28
+ from src.models.stage2_inpaint_unet_2d_condition import Stage2_InapintUNet2DConditionModel
29
+
30
+
31
+
32
+ import glob
33
+ import os
34
+ import torch
35
+ from torch import nn
36
+ from PIL import Image, ImageOps
37
+ import numpy as np
38
+ from diffusers import UniPCMultistepScheduler
39
+ from src.models.stage2_inpaint_unet_2d_condition import Stage2_InapintUNet2DConditionModel
40
+
41
+ from torchvision import transforms
42
+ from diffusers.models.controlnet import ControlNetConditioningEmbedding
43
+ from transformers import CLIPImageProcessor
44
+ from transformers import Dinov2Model
45
+ from diffusers import AutoencoderKL, DDPMScheduler, UNet2DConditionModel,ControlNetModel,DDIMScheduler
46
+ from src.pipelines.PCDMs_pipeline import PCDMsPipeline
47
+ #from single_extract_pose import inference_pose
48
+
49
+
50
+ import spaces
51
+ from easy_dwpose import DWposeDetector
52
+ from PIL import Image
53
+ import cv2
54
+ import os
55
+ import gradio as gr
56
+ import rembg
57
+ import uuid
58
+ import gc
59
+ from numba import cuda
60
+
61
+ from huggingface_hub import hf_hub_download
62
+
63
+
64
+ # Inputs ===================================================================================================
65
+
66
+ input_img = "sm.png"
67
+ train_imgs = ["target.png"]
68
+ in_vid = "walk.mp4"
69
+ out_vid = 'out.mp4'
70
+
71
+ """
72
+ train_steps = 100
73
+ inference_steps = 10
74
+ fps = 12
75
+ """
76
+
77
+ debug = False
78
+ save_model = True
79
+ max_batch_size = 8
80
+
81
+ # Pose detection ==============================================================================================
82
+
83
+ def load_models():
84
+ dwpose = DWposeDetector(device="cpu")
85
+ rembg_session = rembg.new_session("u2netp")
86
+
87
+ pcdms_model = hf_hub_download(repo_id="acmyu/PCDMs", filename="pcdms_ckpt.pt")
88
+
89
+ # Load scheduler
90
+ noise_scheduler = DDPMScheduler.from_pretrained("stabilityai/stable-diffusion-2-1-base", subfolder="scheduler")
91
+
92
+ # Load model
93
+ image_encoder_p = Dinov2Model.from_pretrained('facebook/dinov2-giant')
94
+ image_encoder_g = CLIPVisionModelWithProjection.from_pretrained('laion/CLIP-ViT-H-14-laion2B-s32B-b79K')#("openai/clip-vit-base-patch32")
95
+
96
+ vae = AutoencoderKL.from_pretrained("stabilityai/stable-diffusion-2-1-base", subfolder="vae")
97
+ unet = Stage2_InapintUNet2DConditionModel.from_pretrained(
98
+ "stabilityai/stable-diffusion-2-1-base",
99
+ torch_dtype=torch.float16,
100
+ subfolder="unet",
101
+ in_channels=9,
102
+ low_cpu_mem_usage=False,
103
+ ignore_mismatched_sizes=True)
104
+
105
+
106
+ return dwpose, rembg_session, pcdms_model, noise_scheduler, image_encoder_p, image_encoder_g, vae, unet
107
+
108
+
109
+ #load_models()
110
+
111
+
112
+ def resize_and_pad(img, target_img):
113
+ tw, th = target_img.size
114
+ w, h = img.size
115
+
116
+ if tw/th > w/h:
117
+ tw = int(th * w/h)
118
+ elif tw/th < w/h:
119
+ th = int(tw * h/w)
120
+
121
+ img = img.resize((tw, th), Image.BICUBIC)
122
+
123
+ tw, th = target_img.size
124
+ new_img = Image.new("RGB", (tw, th), (0, 0, 0))
125
+ left = (tw - img.width) // 2
126
+ top = (th - img.height) // 2
127
+ new_img.paste(img, (left, top))
128
+
129
+ return new_img
130
+
131
+
132
+ def remove_zero_pad(image):
133
+ image = np.array(image)
134
+ dummy = np.argwhere(image != 0) # assume background is zero
135
+ max_y = dummy[:, 0].max()
136
+ min_y = dummy[:, 0].min()
137
+ min_x = dummy[:, 1].min()
138
+ max_x = dummy[:, 1].max()
139
+ crop_image = image[min_y:max_y, min_x:max_x]
140
+
141
+ return Image.fromarray(crop_image)
142
+
143
+
144
+ def get_pose(img, dwpose, outfile, crop=False):
145
+ #pil_image = Image.open("imgs/"+img).convert("RGB")
146
+ #skeleton = dwpose(pil_image, output_type="np", include_hands=True, include_face=False)
147
+
148
+ #img.thumbnail((512,512))
149
+ out_img = dwpose(img, include_hands=True, include_face=False)
150
+
151
+ #print(pose['bodies'])
152
+
153
+ if crop:
154
+ bbox = out_img.getbbox()
155
+ out_img = out_img.crop(bbox)
156
+ out_img = ImageOps.expand(out_img, border=int(out_img.width*0.2), fill=(0,0,0))
157
+
158
+ return out_img
159
+
160
+
161
+ def extract_frames(video_path, fps):
162
+ video_capture = cv2.VideoCapture(video_path)
163
+ frame_count = 0
164
+ frames = []
165
+
166
+ fps_in = video_capture.get(cv2.CAP_PROP_FPS)
167
+ fps_out = fps
168
+
169
+ index_in = -1
170
+ index_out = -1
171
+
172
+ while True:
173
+ success = video_capture.grab()
174
+ if not success: break
175
+ index_in += 1
176
+
177
+ out_due = int(index_in / fps_in * fps_out)
178
+ if out_due > index_out:
179
+ success, frame = video_capture.retrieve()
180
+ if not success:
181
+ break
182
+ index_out += 1
183
+
184
+ frame_count += 1
185
+ frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
186
+
187
+ video_capture.release()
188
+ print(f"Extracted {frame_count} frames")
189
+ return frames
190
+
191
+
192
+ def removebg(img, rembg_session):
193
+ result = Image.new("RGB", img.size, "#ffffff")
194
+ out = rembg.remove(img, session=rembg_session)
195
+ result.paste(out, mask=out)
196
+ return result
197
+
198
+
199
+ def prepare_inputs_train(images, bg_remove, dwpose, rembg_session):
200
+ if bg_remove:
201
+ images = [removebg(img, rembg_session) for img in images]
202
+
203
+ in_img = images[0]
204
+ in_pose = get_pose(in_img, dwpose, "in_pose.png")
205
+ train_poses = []
206
+ train_imgs = [resize_and_pad(img, in_img) for img in images[1:]]
207
+
208
+ for i, img in enumerate(train_imgs):
209
+ train_poses.append(get_pose(img, dwpose, "tr_pose"+str(i)+".png"))
210
+
211
+ return in_img, in_pose, train_imgs, train_poses
212
+
213
+
214
+ def prepare_inputs_inference(in_img, in_vid, fps, dwpose, resize='target', is_app=False):
215
+ progress=gr.Progress(track_tqdm=True)
216
+
217
+ print("prepare_inputs_inference")
218
+
219
+ in_pose = get_pose(in_img, dwpose, "in_pose.png")
220
+
221
+ frames = extract_frames(in_vid, fps)
222
+ #frames = [removebg(img, rembg_session) for img in frames]
223
+ if debug:
224
+ for i, frame in enumerate(frames):
225
+ frame.save("out/frame_"+str(i)+".png")
226
+
227
+ print("vid: ", in_vid, fps)
228
+
229
+ progress_bar = tqdm(range(len(frames)), initial=0, desc="Frames")
230
+ target_poses = []
231
+ max_left = max_top = 999999
232
+ max_right = max_bottom = 0
233
+ it = frames
234
+ if is_app:
235
+ it = progress.tqdm(frames, desc="Pose Detection")
236
+ for f in it:
237
+ tpose = get_pose(f, dwpose, "tar_pose"+str(len(target_poses))+".png")
238
+ target_poses.append(tpose)
239
+ progress_bar.update(1)
240
+
241
+ bbox = tpose.getbbox()
242
+ left, top, right, bottom = bbox
243
+ max_left = min(max_left, left)
244
+ max_top = min(max_top, top)
245
+ max_right = max(max_right, right)
246
+ max_bottom = max(max_bottom, bottom)
247
+
248
+ target_poses_cropped = []
249
+ for tpose in target_poses:
250
+ if resize=='target':
251
+ tpose = tpose.crop((max_left, max_top, max_right, max_bottom))
252
+ tpose = ImageOps.expand(tpose, border=int(tpose.width*0.2), fill=(0,0,0))
253
+
254
+ tpose = resize_and_pad(tpose, in_img)
255
+
256
+
257
+ if debug:
258
+ tpose.save("out/"+"tar_pose"+str(len(target_poses_cropped))+".png")
259
+ target_poses_cropped.append(tpose)
260
+
261
+ return target_poses_cropped, in_pose
262
+
263
+
264
+ def prepare_inputs(images, in_vid, fps, bg_remove, dwpose, rembg_session, resize='target', is_app=False):
265
+
266
+ in_img, in_pose, train_imgs, train_poses = prepare_inputs_train(images, bg_remove, dwpose, rembg_session)
267
+
268
+ target_poses_cropped, _ = prepare_inputs_inference(in_img, in_vid, fps, dwpose, resize, is_app)
269
+
270
+
271
+ return in_img, in_pose, train_imgs, train_poses, target_poses_cropped
272
+
273
+
274
+ # Training ===================================================================================================
275
+
276
+ # Will error if the minimal version of diffusers is not installed. Remove at your own risk.
277
+ check_min_version("0.18.0.dev0")
278
+
279
+ logger = get_logger(__name__)
280
+
281
+
282
+ class ImageProjModel_p(torch.nn.Module):
283
+ """SD model with image prompt"""
284
+
285
+ def __init__(self, in_dim, hidden_dim, out_dim, dropout = 0.):
286
+ super().__init__()
287
+
288
+ self.net = nn.Sequential(
289
+ nn.Linear(in_dim, hidden_dim),
290
+ nn.GELU(),
291
+ nn.Dropout(dropout),
292
+ nn.LayerNorm(hidden_dim),
293
+ nn.Linear(hidden_dim, out_dim),
294
+ nn.Dropout(dropout)
295
+ )
296
+
297
+ def forward(self, x):
298
+ return self.net(x)
299
+
300
+ class ImageProjModel_g(torch.nn.Module):
301
+ """SD model with image prompt"""
302
+
303
+ def __init__(self, in_dim, hidden_dim, out_dim, dropout = 0.):
304
+ super().__init__()
305
+
306
+ self.net = nn.Sequential(
307
+ nn.Linear(in_dim, hidden_dim),
308
+ nn.GELU(),
309
+ nn.Dropout(dropout),
310
+ nn.LayerNorm(hidden_dim),
311
+ nn.Linear(hidden_dim, out_dim),
312
+ nn.Dropout(dropout)
313
+ )
314
+
315
+ def forward(self, x): # b, 257,1280
316
+ return self.net(x)
317
+
318
+
319
+ class SDModel(torch.nn.Module):
320
+ """SD model with image prompt"""
321
+ def __init__(self, unet) -> None:
322
+ super().__init__()
323
+ self.image_proj_model_p = ImageProjModel_p(in_dim=1536, hidden_dim=768, out_dim=1024)
324
+
325
+ self.unet = unet
326
+ self.pose_proj = ControlNetConditioningEmbedding(
327
+ conditioning_embedding_channels=320,
328
+ block_out_channels=(16, 32, 96, 256),
329
+ conditioning_channels=3)
330
+
331
+
332
+ def forward(self, noisy_latents, timesteps, simg_f_p, timg_f_g, pose_f):
333
+
334
+ extra_image_embeddings_p = self.image_proj_model_p(simg_f_p)
335
+ extra_image_embeddings_g = timg_f_g
336
+
337
+ print(extra_image_embeddings_p.size())
338
+ print(extra_image_embeddings_g.size())
339
+
340
+ encoder_image_hidden_states = torch.cat([extra_image_embeddings_p ,extra_image_embeddings_g], dim=1)
341
+ pose_cond = self.pose_proj(pose_f)
342
+
343
+ pred_noise = self.unet(noisy_latents, timesteps, class_labels=timg_f_g, encoder_hidden_states=encoder_image_hidden_states,my_pose_cond=pose_cond).sample
344
+ return pred_noise
345
+
346
+ def load_training_checkpoint(model, pcdms_model, tag=None, **kwargs):
347
+ #model_sd = torch.load(load_dir, map_location="cpu")["module"]
348
+ model_sd = torch.load(
349
+ pcdms_model,
350
+ map_location="cpu"
351
+ )["module"]
352
+
353
+
354
+ image_proj_model_dict = {}
355
+ pose_proj_dict = {}
356
+ unet_dict = {}
357
+ for k in model_sd.keys():
358
+ if k.startswith("pose_proj"):
359
+ pose_proj_dict[k.replace("pose_proj.", "")] = model_sd[k]
360
+
361
+ elif k.startswith("image_proj_model_p"):
362
+ image_proj_model_dict[k.replace("image_proj_model_p.", "")] = model_sd[k]
363
+
364
+ elif k.startswith("image_proj_model."):
365
+ image_proj_model_dict[k.replace("image_proj_model.", "")] = model_sd[k]
366
+
367
+
368
+ elif k.startswith("unet"):
369
+ unet_dict[k.replace("unet.", "")] = model_sd[k]
370
+ else:
371
+ print(k)
372
+
373
+ model.pose_proj.load_state_dict(pose_proj_dict)
374
+ model.image_proj_model_p.load_state_dict(image_proj_model_dict)
375
+ model.unet.load_state_dict(unet_dict)
376
+
377
+ return model, 0, 0
378
+
379
+
380
+ def checkpoint_model(checkpoint_folder, ckpt_id, model, epoch, last_global_step, **kwargs):
381
+ """Utility function for checkpointing model + optimizer dictionaries
382
+ The main purpose for this is to be able to resume training from that instant again
383
+ """
384
+ checkpoint_state_dict = {
385
+ "epoch": epoch,
386
+ "last_global_step": last_global_step,
387
+ }
388
+ # Add extra kwargs too
389
+ checkpoint_state_dict.update(kwargs)
390
+
391
+ success = model.save_checkpoint(checkpoint_folder, ckpt_id, checkpoint_state_dict)
392
+ status_msg = f"checkpointing: checkpoint_folder={checkpoint_folder}, ckpt_id={ckpt_id}"
393
+ if success:
394
+ logging.info(f"Success {status_msg}")
395
+ else:
396
+ logging.warning(f"Failure {status_msg}")
397
+ return
398
+
399
+
400
+ @spaces.GPU(duration=600)
401
+ def train(modelId, in_image, in_pose, train_images, train_poses, train_steps, pcdms_model, noise_scheduler, image_encoder_p, image_encoder_g, vae, unet, finetune=True, is_app=False):
402
+ logging_dir = 'outputs/logging'
403
+ print('start train')
404
+
405
+
406
+ progress=gr.Progress(track_tqdm=True)
407
+
408
+ accelerator = Accelerator(
409
+ log_with=args.report_to,
410
+ project_dir=logging_dir,
411
+ mixed_precision=args.mixed_precision,
412
+ gradient_accumulation_steps=args.gradient_accumulation_steps
413
+ )
414
+
415
+ # Make one log on every process with the configuration for debugging.
416
+ #logging.basicConfig(
417
+ # format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
418
+ # datefmt="%m/%d/%Y %H:%M:%S",
419
+ # level=logging.INFO, )
420
+
421
+ print(accelerator.state)
422
+ if accelerator.is_local_main_process:
423
+ transformers.utils.logging.set_verbosity_warning()
424
+ diffusers.utils.logging.set_verbosity_info()
425
+ else:
426
+ transformers.utils.logging.set_verbosity_error()
427
+ diffusers.utils.logging.set_verbosity_error()
428
+
429
+ # If passed along, set the training seed now.
430
+ set_seed(42)
431
+
432
+ # Handle the repository creation
433
+ if accelerator.is_main_process:
434
+ os.makedirs('outputs', exist_ok=True)
435
+
436
+
437
+ """
438
+ unet = Stage2_InapintUNet2DConditionModel.from_pretrained("stabilityai/stable-diffusion-2-1-base", subfolder="unet",
439
+ in_channels=9, class_embed_type="projection" ,projection_class_embeddings_input_dim=1024,
440
+ low_cpu_mem_usage=False, ignore_mismatched_sizes=True)
441
+ """
442
+ image_encoder_p.requires_grad_(False)
443
+ image_encoder_g.requires_grad_(False)
444
+ vae.requires_grad_(False)
445
+
446
+ sd_model = SDModel(unet=unet)
447
+ sd_model.train()
448
+
449
+
450
+ if args.gradient_checkpointing:
451
+ sd_model.enable_gradient_checkpointing()
452
+
453
+
454
+ # Enable TF32 for faster training on Ampere GPUs,
455
+ # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
456
+ if args.allow_tf32:
457
+ torch.backends.cuda.matmul.allow_tf32 = True
458
+
459
+ learning_rate = 1e-4
460
+ train_batch_size = min(len(train_images), max_batch_size) #len(train_images) % 16
461
+
462
+
463
+ # Optimizer creation
464
+ params_to_optimize = sd_model.parameters()
465
+ optimizer = torch.optim.AdamW(
466
+ params_to_optimize,
467
+ lr=learning_rate,
468
+ betas=(args.adam_beta1, args.adam_beta2),
469
+ weight_decay=args.adam_weight_decay,
470
+ eps=args.adam_epsilon,
471
+ )
472
+
473
+ inputs = [{
474
+ "source_image": in_image,
475
+ "source_pose": in_pose,
476
+ "target_image": timg,
477
+ "target_pose": tpose,
478
+ } for timg, tpose in zip(train_images, train_poses)]
479
+
480
+ """
481
+ inputs = {[
482
+ "source_image": Image.open('imgs/sm.png'),
483
+ "source_pose": Image.open('imgs/sm_pose.jpg'),
484
+ "target_image": Image.open('imgs/target.png'),
485
+ "target_pose": Image.open('imgs/target_pose.jpg'),
486
+ ]}
487
+ """
488
+
489
+ #print(inputs)
490
+
491
+ dataset = InpaintDataset(
492
+ inputs,
493
+ 'imgs/',
494
+ size=(args.img_width, args.img_height), # w h
495
+ imgp_drop_rate=0.1,
496
+ imgg_drop_rate=0.1,
497
+ )
498
+
499
+ """
500
+ dataset = InpaintDataset(
501
+ args.json_path,
502
+ args.image_root_path,
503
+ size=(args.img_width, args.img_height), # w h
504
+ imgp_drop_rate=0.1,
505
+ imgg_drop_rate=0.1,
506
+ )
507
+ """
508
+
509
+ train_sampler = torch.utils.data.distributed.DistributedSampler(
510
+ dataset, num_replicas=accelerator.num_processes, rank=accelerator.process_index, shuffle=True)
511
+
512
+ train_dataloader = torch.utils.data.DataLoader(
513
+ dataset,
514
+ sampler=train_sampler,
515
+ collate_fn=InpaintCollate_fn,
516
+ batch_size=train_batch_size,
517
+ num_workers=0,)
518
+
519
+
520
+ # Scheduler and math around the number of training steps.
521
+ overrode_max_train_steps = False
522
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
523
+ if args.max_train_steps is None:
524
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
525
+ overrode_max_train_steps = True
526
+ args.max_train_steps = train_steps
527
+
528
+ lr_scheduler = get_scheduler(
529
+ args.lr_scheduler,
530
+ optimizer=optimizer,
531
+ num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
532
+ num_training_steps=args.max_train_steps * accelerator.num_processes,
533
+ num_cycles=args.lr_num_cycles,
534
+ power=args.lr_power,
535
+ )
536
+
537
+ # Prepare everything with our `accelerator`.
538
+ sd_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(sd_model, optimizer, train_dataloader, lr_scheduler)
539
+
540
+ # For mixed precision training we cast the text_encoder and vae weights to half-precision
541
+ # as these models are only used for inference, keeping weights in full precision is not required.
542
+ weight_dtype = torch.float32
543
+ """
544
+ if accelerator.mixed_precision == "fp16":
545
+ weight_dtype = torch.float16
546
+ elif accelerator.mixed_precision == "bf16":
547
+ weight_dtype = torch.bfloat16
548
+ """
549
+
550
+ # Move vae, unet and text_encoder to device and cast to weight_dtype
551
+ vae.to(accelerator.device, dtype=weight_dtype)
552
+ sd_model.unet.to(accelerator.device, dtype=weight_dtype)
553
+ image_encoder_p.to(accelerator.device, dtype=weight_dtype)
554
+ image_encoder_g.to(accelerator.device, dtype=weight_dtype)
555
+
556
+ # We need to recalculate our total training steps as the size of the training dataloader may have changed.
557
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
558
+ if overrode_max_train_steps:
559
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
560
+ # Afterwards we recalculate our number of training epochs
561
+ args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
562
+
563
+
564
+ args.num_train_epochs = train_steps
565
+
566
+
567
+ # Train!
568
+ total_batch_size = (
569
+ train_batch_size
570
+ * accelerator.num_processes
571
+ * args.gradient_accumulation_steps
572
+ )
573
+
574
+ print("***** Running training *****")
575
+ print(f" Num batches each epoch = {len(train_dataloader)}")
576
+ print(f" Num Epochs = {args.num_train_epochs}")
577
+ print(f" Instantaneous batch size per device = {train_batch_size}")
578
+ print(
579
+ f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
580
+ )
581
+ print(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
582
+ print(f" Total optimization steps = {args.max_train_steps}")
583
+
584
+
585
+ if args.resume_from_checkpoint:
586
+ # New Code #
587
+ # Loads the DeepSpeed checkpoint from the specified path
588
+ prior_model, last_epoch, last_global_step = load_training_checkpoint(
589
+ sd_model,
590
+ pcdms_model,
591
+ **{"load_optimizer_states": True, "load_lr_scheduler_states": True},
592
+ )
593
+ print(f"Resumed from checkpoint: {args.resume_from_checkpoint}, global step: {last_global_step}")
594
+ starting_epoch = last_epoch
595
+ global_steps = last_global_step
596
+ sd_model = sd_model
597
+ else:
598
+ global_steps = 0
599
+ starting_epoch = 0
600
+ sd_model = sd_model
601
+
602
+ progress_bar = tqdm(range(global_steps, args.max_train_steps), initial=global_steps, desc="Steps",
603
+ # Only show the progress bar once on each machine.
604
+ disable=not accelerator.is_local_main_process, )
605
+
606
+ bsz = train_batch_size
607
+
608
+ if not finetune or train_steps == 0:
609
+ accelerator.wait_for_everyone()
610
+ accelerator.end_training()
611
+ return {k: v.cpu() for k, v in sd_model.state_dict().items()}
612
+
613
+
614
+ it = range(starting_epoch, args.num_train_epochs)
615
+ if is_app:
616
+ it = progress.tqdm(it, desc="Fine-tuning")
617
+ for epoch in it:
618
+ for step, batch in enumerate(train_dataloader):
619
+ with accelerator.accumulate(sd_model):
620
+ with torch.no_grad():
621
+ # Convert images to latent space
622
+ latents = vae.encode(batch["source_target_image"].to(dtype=weight_dtype)).latent_dist.sample()
623
+ latents = latents * vae.config.scaling_factor
624
+
625
+ # Get the masked image latents
626
+ masked_latents = vae.encode(batch["vae_source_mask_image"].to(dtype=weight_dtype)).latent_dist.sample()
627
+ masked_latents = masked_latents * vae.config.scaling_factor
628
+
629
+ bsz = batch["target_image"].size(dim=0)
630
+
631
+ # mask
632
+ mask1 = torch.ones((bsz, 1, int(args.img_height / 8), int(args.img_width / 8))).to(accelerator.device, dtype=weight_dtype)
633
+ mask0 = torch.zeros((bsz, 1, int(args.img_height / 8), int(args.img_width / 8))).to(accelerator.device, dtype=weight_dtype)
634
+ mask = torch.cat([mask1, mask0], dim=3)
635
+ # Get the image embedding for conditioning
636
+ cond_image_feature_p = image_encoder_p(batch["source_image"].to(accelerator.device, dtype=weight_dtype))
637
+ cond_image_feature_p = (cond_image_feature_p.last_hidden_state)
638
+
639
+
640
+ cond_image_feature_g = image_encoder_g(batch["target_image"].to(accelerator.device, dtype=weight_dtype), ).image_embeds
641
+ cond_image_feature_g =cond_image_feature_g.unsqueeze(1)
642
+
643
+ # Sample noise that we'll add to the latents
644
+ noise = torch.randn_like(latents)
645
+ if args.noise_offset:
646
+ # https://www.crosslabs.org//blog/diffusion-with-offset-noise
647
+ noise += args.noise_offset * torch.randn(
648
+ (latents.shape[0], latents.shape[1], 1, 1), device=latents.device
649
+ )
650
+
651
+ # Sample a random timestep for each image
652
+ #timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (train_batch_size,),device=latents.device, )
653
+ timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,),device=latents.device, )
654
+ timesteps = timesteps.long()
655
+
656
+
657
+
658
+ # Add noise to the latents according to the noise magnitude at each timestep (this is the forward diffusion process)
659
+ noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
660
+
661
+ #print(noisy_latents.size(), mask.size(), masked_latents.size())
662
+
663
+ noisy_latents = torch.cat([noisy_latents, mask, masked_latents], dim=1)
664
+ # Get the text embedding for conditioning
665
+
666
+
667
+ cond_pose = batch["source_target_pose"].to(dtype=weight_dtype)
668
+
669
+ #print(noisy_latents.size())
670
+ #print(cond_image_feature_p.size())
671
+ #print(cond_image_feature_g.size())
672
+ #print(cond_pose.size())
673
+
674
+ # Predict the noise residual
675
+ model_pred = sd_model(noisy_latents, timesteps, cond_image_feature_p, cond_image_feature_g, cond_pose)
676
+
677
+ # Get the target for loss depending on the prediction type
678
+ if noise_scheduler.config.prediction_type == "epsilon":
679
+ target = noise
680
+ elif noise_scheduler.config.prediction_type == "v_prediction":
681
+ target = noise_scheduler.get_velocity(latents, noise, timesteps)
682
+ else:
683
+ raise ValueError(
684
+ f"Unknown prediction type {noise_scheduler.config.prediction_type}"
685
+ )
686
+
687
+ loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
688
+
689
+ accelerator.backward(loss)
690
+ if accelerator.sync_gradients:
691
+ params_to_clip = sd_model.parameters()
692
+ accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
693
+ optimizer.step()
694
+ lr_scheduler.step()
695
+ optimizer.zero_grad(set_to_none=args.set_grads_to_none)
696
+
697
+ # Checks if the accelerator has performed an optimization step behind the scenes
698
+ if accelerator.sync_gradients:
699
+ global_steps += 1
700
+
701
+ if global_steps >= args.max_train_steps:
702
+ break
703
+
704
+
705
+ logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
706
+ print(logs)
707
+ progress_bar.set_postfix(**logs)
708
+
709
+ progress_bar.update(1)
710
+
711
+ # Create the pipeline using the trained modules and save it.
712
+ accelerator.wait_for_everyone()
713
+ accelerator.end_training()
714
+
715
+
716
+
717
+ if save_model: #if global_steps % args.checkpointing_steps == 0 or global_steps == args.max_train_steps:
718
+ print('saving', modelId)
719
+
720
+ checkpoint_state_dict = {
721
+ "epoch": 0,
722
+ "module": {k: v.cpu() for k, v in sd_model.state_dict().items()}, #sd_model.state_dict(),
723
+ }
724
+ print(list(sd_model.state_dict().keys())[:20])
725
+ torch.save(checkpoint_state_dict, modelId+".pt")
726
+
727
+ gc.collect()
728
+ torch.cuda.empty_cache()
729
+ #device = cuda.get_current_device()
730
+ #device.reset()
731
+ print('done train')
732
+ return
733
+
734
+ gc.collect()
735
+ torch.cuda.empty_cache()
736
+ return {k: v.cpu() for k, v in sd_model.state_dict().items()}
737
+
738
+
739
+
740
+
741
+ # Pose-transfer ===================================================================================================
742
+
743
+
744
+ device = "cuda"
745
+
746
+ class ImageProjModel(torch.nn.Module):
747
+ """Projection head that maps image-encoder embeddings into the SD prompt-embedding space"""
748
+ def __init__(self, in_dim, hidden_dim, out_dim, dropout = 0.):
749
+ super().__init__()
750
+
751
+ self.net = nn.Sequential(
752
+ nn.Linear(in_dim, hidden_dim),
753
+ nn.GELU(),
754
+ nn.Dropout(dropout),
755
+ nn.LayerNorm(hidden_dim),
756
+ nn.Linear(hidden_dim, out_dim),
757
+ nn.Dropout(dropout)
758
+ )
759
+
760
+ def forward(self, x):
761
+ return self.net(x)
762
+
763
+ def image_grid(imgs, rows, cols):
764
+ assert len(imgs) == rows * cols
765
+ w, h = imgs[0].size
766
+ print(w, h)
767
+ grid = Image.new("RGB", size=(cols * w, rows * h))
768
+ grid_w, grid_h = grid.size
769
+
770
+ for i, img in enumerate(imgs):
771
+ grid.paste(img, box=(i % cols * w, i // cols * h))
772
+ return grid
773
+
774
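+ # Split the combined fine-tuned checkpoint into per-module state dicts (image projection, pose projection, UNet)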
+ def load_mydict(modelId, finetuned_model):
775
+ if save_model:
776
+ model_ckpt_path = modelId+'.pt'
777
+ model_sd = torch.load(model_ckpt_path, map_location="cpu")["module"]
778
+ else:
779
+ model_sd = finetuned_model #torch.load(model_ckpt_path, map_location="cpu")["module"]
780
+
781
+ image_proj_model_dict = {}
782
+ pose_proj_dict = {}
783
+ unet_dict = {}
784
+ for k in model_sd.keys():
785
+ if k.startswith("pose_proj"):
786
+ pose_proj_dict[k.replace("pose_proj.", "")] = model_sd[k]
787
+
788
+ elif k.startswith("image_proj_model_p"):
789
+ image_proj_model_dict[k.replace("image_proj_model_p.", "")] = model_sd[k]
790
+ elif k.startswith("image_proj_model"):
791
+ image_proj_model_dict[k.replace("image_proj_model.", "")] = model_sd[k]
792
+
793
+
794
+ elif k.startswith("unet"):
795
+ unet_dict[k.replace("unet.", "")] = model_sd[k]
796
+ else:
797
+ print(k)
798
+ return image_proj_model_dict, pose_proj_dict, unet_dict
799
+
800
+
801
+
802
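+ # Request a GPU for up to 600 s per call when running on Hugging Face Spaces (ZeroGPU)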
+ @spaces.GPU(duration=600)
803
+ def inference(modelId, in_image, in_pose, target_poses, inference_steps, finetuned_model, vae, unet, image_encoder, is_app=False):
804
+ print('start inference')
805
+ progress=gr.Progress(track_tqdm=True)
806
+
807
+ if not save_model:
808
+ finetuned_model = {k: v.cuda() for k, v in finetuned_model.items()}
809
+
810
+ device = "cuda"
811
+ pretrained_model_name_or_path ="stabilityai/stable-diffusion-2-1-base"
812
+ image_encoder_path = "facebook/dinov2-giant"
813
+ #model_ckpt_path = "./pcdms_ckpt.pt" # ckpt path
814
+ model_ckpt_path = modelId+'.pt'
815
+
816
+
817
+ clip_image_processor = CLIPImageProcessor()
818
+ img_transform = transforms.Compose([
819
+ transforms.ToTensor(),
820
+ transforms.Normalize([0.5], [0.5]),
821
+ ])
822
+
823
+ generator = torch.Generator(device=device).manual_seed(42)
824
+
825
+ """
826
+ unet = Stage2_InapintUNet2DConditionModel.from_pretrained(pretrained_model_name_or_path, torch_dtype=torch.float16,subfolder="unet",in_channels=9, low_cpu_mem_usage=False, ignore_mismatched_sizes=True).to(device)
827
+ vae = AutoencoderKL.from_pretrained(pretrained_model_name_or_path,subfolder="vae").to(device, dtype=torch.float16)
828
+ image_encoder = Dinov2Model.from_pretrained(image_encoder_path).to(device, dtype=torch.float16)
829
+ """
830
+ noise_scheduler = DDIMScheduler(
831
+ num_train_timesteps=1000,
832
+ beta_start=0.00085,
833
+ beta_end=0.012,
834
+ beta_schedule="scaled_linear",
835
+ clip_sample=False,
836
+ set_alpha_to_one=False,
837
+ steps_offset=1,
838
+ )
839
+
840
+ unet = unet.to(device, dtype=torch.float16)
841
+ vae = vae.to(device, dtype=torch.float16)
842
+ image_encoder = image_encoder.to(device, dtype=torch.float16)
843
+
844
+
845
+ image_proj_model = ImageProjModel(in_dim=1536, hidden_dim=768, out_dim=1024).to(device).to(dtype=torch.float16)
846
+ pose_proj_model = ControlNetConditioningEmbedding(
847
+ conditioning_embedding_channels=320,
848
+ block_out_channels=(16, 32, 96, 256),
849
+ conditioning_channels=3).to(device).to(dtype=torch.float16)
850
+
851
+
852
+ # load weight
853
+ print('loading', modelId)
854
+ image_proj_model_dict, pose_proj_dict, unet_dict = load_mydict(modelId, finetuned_model)
855
+ print('loaded', modelId)
856
+ image_proj_model.load_state_dict(image_proj_model_dict)
857
+ pose_proj_model.load_state_dict(pose_proj_dict)
858
+ unet.load_state_dict(unet_dict)
859
+
860
+
861
+ pipe = PCDMsPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base", unet=unet, torch_dtype=torch.float16, scheduler=noise_scheduler,feature_extractor=None,safety_checker=None).to(device)
862
+
863
+ print('====================== model load finish ===================')
864
+
865
+ results = []
866
+ progress_bar = tqdm(range(len(target_poses)), initial=0, desc="Frames")
867
+
868
+
869
+ it = target_poses
870
+ if is_app:
871
+ it = progress.tqdm(it, desc="Pose Transfer")
872
+ for pose in it:
873
+
874
+ num_samples = 1
875
+ image_size = (512, 512)
876
+ #s_img_path = 'imgs/'+input_img # debug-only path; the source image is taken from in_image below
877
+ #target_pose_img = 'imgs/pose_'+str(n)+'.png' # input image 2
878
+
879
+ #t_pose = inference_pose(target_pose_img, image_size=(image_size[1], image_size[0])).resize(image_size, Image.BICUBIC)
880
+ #t_pose = Image.open(target_pose_img).convert("RGB").resize((image_size), Image.BICUBIC)
881
+ t_pose = pose.convert("RGB").resize(image_size, Image.BICUBIC)
882
+ #t_pose = resize_and_pad(pose.convert("RGB"))
883
+
884
+
885
+ #s_img = Image.open(s_img_path)
886
+ width_orig, height_orig = in_image.size
887
+ s_img = in_image.convert("RGB").resize(image_size, Image.BICUBIC)
888
+ #s_img = resize_and_pad(in_image.convert("RGB"))
889
+ black_image = Image.new("RGB", s_img.size, (0, 0, 0)).resize(image_size, Image.BICUBIC)
890
+
891
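+ # Side-by-side conditioning canvas: source image on the left, black placeholder for the target on the right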
+ s_img_t_mask = Image.new("RGB", (s_img.width * 2, s_img.height))
892
+ s_img_t_mask.paste(s_img, (0, 0))
893
+ s_img_t_mask.paste(black_image, (s_img.width, 0))
894
+
895
+ #s_pose = inference_pose(s_img_path, image_size=(image_size[1], image_size[0])).resize(image_size, Image.BICUBIC)
896
+ #s_pose = Image.open('imgs/sm_pose.jpg').convert("RGB").resize(image_size, Image.BICUBIC)
897
+ s_pose = in_pose.convert("RGB").resize(image_size, Image.BICUBIC)
898
+ #s_pose = resize_and_pad(in_pose.convert("RGB"))
899
+ print('source pose width: {}, height: {}'.format(s_pose.width, s_pose.height))
900
+ #t_pose = Image.open(target_pose_img).convert("RGB").resize((image_size), Image.BICUBIC)
901
+
902
+ st_pose = Image.new("RGB", (s_pose.width * 2, s_pose.height))
903
+ st_pose.paste(s_pose, (0, 0))
904
+ st_pose.paste(t_pose, (s_pose.width, 0))
905
+
906
+
907
+ clip_s_img = clip_image_processor(images=s_img, return_tensors="pt").pixel_values
908
+ vae_image = torch.unsqueeze(img_transform(s_img_t_mask), 0)
909
+ cond_st_pose = torch.unsqueeze(img_transform(st_pose), 0)
910
+
911
+ mask1 = torch.ones((1, 1, int(image_size[0] / 8), int(image_size[1] / 8))).to(device, dtype=torch.float16)
912
+ mask0 = torch.zeros((1, 1, int(image_size[0] / 8), int(image_size[1] / 8))).to(device, dtype=torch.float16)
913
+ mask = torch.cat([mask1, mask0], dim=3)
914
+
915
+
916
+ with torch.inference_mode():
917
+ cond_pose = pose_proj_model(cond_st_pose.to(dtype=torch.float16, device=device))
918
+ simg_mask_latents = pipe.vae.encode(vae_image.to(device, dtype=torch.float16)).latent_dist.sample()
919
+ simg_mask_latents = simg_mask_latents * 0.18215
920
+
921
+ images_embeds = image_encoder(clip_s_img.to(device, dtype=torch.float16)).last_hidden_state
922
+ image_prompt_embeds = image_proj_model(images_embeds)
923
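+ # Zeroed image features give the unconditional embedding used for classifier-free guidance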
+ uncond_image_prompt_embeds = image_proj_model(torch.zeros_like(images_embeds))
924
+
925
+ bs_embed, seq_len, _ = image_prompt_embeds.shape
926
+ image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1)
927
+ image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
928
+ uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1)
929
+ uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
930
+
931
+ output, _ = pipe(
932
+ simg_mask_latents= simg_mask_latents,
933
+ mask = mask,
934
+ cond_pose = cond_pose,
935
+ prompt_embeds=image_prompt_embeds,
936
+ negative_prompt_embeds=uncond_image_prompt_embeds,
937
+ height=image_size[1],
938
+ width=image_size[0]*2,
939
+ num_images_per_prompt=num_samples,
940
+ guidance_scale=2.0,
941
+ generator=generator,
942
+ num_inference_steps=inference_steps,
943
+ )
944
+
945
+ output = output.images[-1]
946
+
947
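+ # Keep only the right half of the side-by-side output, i.e. the generated target view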
+ result = output.crop((image_size[0], 0, image_size[0] * 2, image_size[1]))
948
+ result = result.resize((width_orig, height_orig), Image.BICUBIC)
949
+ #result = remove_zero_pad(result)
950
+
951
+ if debug:
952
+ result.save('out/'+str(len(results))+'.png')
953
+ results.append(result)
954
+ progress_bar.update(1)
955
+
956
+ gc.collect()
957
+ torch.cuda.empty_cache()
958
+
959
+ return results
960
+
961
+
962
+ def gen_vid(frames, video_name, fps, codec):
963
+ progress=gr.Progress(track_tqdm=True)
964
+
965
+ frame = cv2.cvtColor(np.array(frames[0]), cv2.COLOR_RGB2BGR)
966
+ height, width, layers = frame.shape
967
+
968
+ #video = cv2.VideoWriter(video_name, 0, 1, (width,height))
969
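+ # Choose the FourCC to match the container: mp4v for .mp4, VP90 (VP9) for .webm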
+ if codec == 'mp4':
970
+ video = cv2.VideoWriter(video_name, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))
971
+ else:
972
+ video = cv2.VideoWriter(video_name, cv2.VideoWriter_fourcc(*'VP90'), fps, (width, height))
973
+
974
+ for r in progress.tqdm(frames, desc="Creating video"):
975
+ image = cv2.cvtColor(np.array(r), cv2.COLOR_RGB2BGR)
976
+ video.write(image)
977
+
978
+ video.release()  # finalize the container so the file is playable
979
980
+
981
+
982
+
983
+ def run(images, video_path, train_steps=100, inference_steps=10, fps=12, bg_remove=False, resize_inputs=True, finetune=True, is_app=False):
984
+ print("==== Load Models ====")
985
+ dwpose, rembg_session, pcdms_model, noise_scheduler, image_encoder_p, image_encoder_g, vae, unet = load_models()
986
+
987
+ print("==== Pose Detection ====")
988
+ if resize_inputs:
989
+ resize = 'target'
990
+ else:
991
+ resize = 'none'
992
+ in_img, in_pose, train_imgs, train_poses, target_poses = prepare_inputs(images, video_path, fps, bg_remove, dwpose, rembg_session, resize=resize, is_app=is_app)
993
+
994
+ if save_model:
995
+ train("fine_tuned_pcdms", in_img, in_pose, train_imgs, train_poses, train_steps, pcdms_model, noise_scheduler, image_encoder_p, image_encoder_g, vae, unet, finetune, is_app)
996
+ print("==== Pose Transfer ====")
997
+ results = inference("fine_tuned_pcdms", in_img, in_pose, target_poses, inference_steps, None, vae, unet, image_encoder_p, is_app)
998
+
999
+ else:
1000
+ print("==== Finetuning ====")
1001
+ finetuned_model = train("fine_tuned_pcdms", in_img, in_pose, train_imgs, train_poses, train_steps, pcdms_model, noise_scheduler, image_encoder_p, image_encoder_g, vae, unet, finetune, is_app)
1002
+
1003
+ print("==== Pose Transfer ====")
1004
+ results = inference("fine_tuned_pcdms", in_img, in_pose, target_poses, inference_steps, finetuned_model, vae, unet, image_encoder_p, is_app)
1005
+
1006
+ return results
1007
+
1008
+
1009
+ def run_train(images, train_steps=100, bg_remove=False, resize_inputs=True, modelId="fine_tuned_pcdms"):
1010
+ finetune=True
1011
+ is_app=True
1012
+ images = [img[0] for img in images]
1013
+
1014
+ dwpose, rembg_session, pcdms_model, noise_scheduler, image_encoder_p, image_encoder_g, vae, unet = load_models()
1015
+
1016
+ if resize_inputs:
1017
+ resize = 'target'
1018
+ else:
1019
+ resize = 'none'
1020
+
1021
+ in_img, in_pose, train_imgs, train_poses = prepare_inputs_train(images, bg_remove, dwpose, rembg_session)
1022
+
1023
+ train(modelId, in_img, in_pose, train_imgs, train_poses, train_steps, pcdms_model, noise_scheduler, image_encoder_p, image_encoder_g, vae, unet, finetune, is_app)
1024
+
1025
+
1026
+ def run_inference(images, video_path, inference_steps=10, fps=12, bg_remove=False, resize_inputs=True, modelId="fine_tuned_pcdms"):
1027
+ is_app=True
1028
+ images = [img[0] for img in images]
1029
+ in_img = images[0]
1030
+
1031
+ dwpose, rembg_session, pcdms_model, noise_scheduler, image_encoder_p, image_encoder_g, vae, unet = load_models()
1032
+
1033
+ target_poses, in_pose = prepare_inputs_inference(in_img, video_path, fps, dwpose, 'target', is_app)
1034
+
1035
+ results = inference(modelId, in_img, in_pose, target_poses, inference_steps, None, vae, unet, image_encoder_p, is_app)
1036
+
1037
+ if debug:
1038
+ gen_vid(results, out_vid+'.mp4', fps, 'mp4')
1039
+ else:
1040
+ gen_vid(results, out_vid+'.webm', fps, 'webm')
1041
+
1042
+ print("Done!")
1043
+
1044
+ return out_vid+'.webm', results
1045
+
1046
+
1047
+ def run_app(images, video_path, train_steps=100, inference_steps=10, fps=12, bg_remove=False, resize_inputs=True):
1048
+
1049
+ images = [img[0] for img in images]
1050
+
1051
+ results = run(images, video_path, train_steps, inference_steps, fps, bg_remove, resize_inputs, finetune=True, is_app=True)
1052
+
1053
+
1054
+ print("==== Video generation ====")
1055
+ out_vid = f"out_{uuid.uuid4()}"
1056
+
1057
+ if debug:
1058
+ gen_vid(results, out_vid+'.mp4', fps, 'mp4')
1059
+ else:
1060
+ gen_vid(results, out_vid+'.webm', fps, 'webm')
1061
+
1062
+
1063
+
1064
+ print("Done!")
1065
+
1066
+ return out_vid+'.webm', results
1067
+
1068
+
1069
+
1070
+ """
1071
+ train_steps = 100
1072
+ inference_steps = 10
1073
+ fps = 12
1074
+ """
1075
+
1076
+ """
1077
+ iface = gr.Interface(
1078
+ fn=run,
1079
+ inputs=[
1080
+ gr.Gallery(type="pil", label="Images of the Character"),
1081
+ gr.Video(label="Motion-Capture Video"),
1082
+ gr.Number(label="Training steps", value=100),
1083
+ gr.Number(label="Inference steps", value=10),
1084
+ gr.Number(label="Output frame rate", value=12),
1085
+ gr.Checkbox(label="Remove background", value=False),
1086
+ ],
1087
+ outputs=[gr.Video(label="Result"), gr.Gallery(type="pil", label="Frames")],
1088
+ title="Keyframes AI",
1089
+ description="Upload images of your character and a motion-capture video to generate an animation of the character.",
1090
+ )
1091
+ """
1092
+
1093
+
1094
+
1095
+
1096
+
1097
+
metrics.json ADDED
@@ -0,0 +1,1538 @@
1
+ {
2
+ "sidewalk": {
3
+ "ft": {
4
+ "ssim": {
5
+ "avg": 0.9425153732299805,
6
+ "vals": [
7
+ 0.9539688229560852,
8
+ 0.9410380721092224,
9
+ 0.9339408278465271,
10
+ 0.9381983876228333,
11
+ 0.937977135181427,
12
+ 0.937738835811615,
13
+ 0.9322819709777832,
14
+ 0.9447341561317444,
15
+ 0.9701671004295349,
16
+ 0.9403354525566101,
17
+ 0.9407327771186829,
18
+ 0.9384147524833679,
19
+ 0.9429612159729004,
20
+ 0.9541351199150085,
21
+ 0.9593266844749451,
22
+ 0.9423799514770508,
23
+ 0.9342405200004578,
24
+ 0.9316573143005371,
25
+ 0.9376768469810486,
26
+ 0.9355674386024475,
27
+ 0.9092764258384705,
28
+ 0.9486138820648193,
29
+ 0.9740718007087708,
30
+ 0.9381063580513,
31
+ 0.9418080449104309,
32
+ 0.9435502886772156,
33
+ 0.9383361339569092,
34
+ 0.9491550326347351,
35
+ 0.9579421877861023,
36
+ 0.9433062672615051,
37
+ 0.9372091889381409,
38
+ 0.9316847920417786,
39
+ 0.9353565573692322,
40
+ 0.9314845204353333,
41
+ 0.9296603202819824,
42
+ 0.947252094745636,
43
+ 0.9745006561279297,
44
+ 0.9371557235717773,
45
+ 0.9355664849281311,
46
+ 0.9372367858886719,
47
+ 0.9420561790466309,
48
+ 0.9548425674438477
49
+ ]
50
+ },
51
+ "psnr": {
52
+ "avg": 29.313762982030134,
53
+ "vals": [
54
+ 31.79728361792486,
55
+ 29.12789926111764,
56
+ 28.150918151496114,
57
+ 28.197009952290536,
58
+ 28.30431945800427,
59
+ 28.3766962913435,
60
+ 27.95230686775166,
61
+ 29.60480299071412,
62
+ 33.870620988359505,
63
+ 28.2723202748136,
64
+ 28.65414745402314,
65
+ 29.44914322199964,
66
+ 28.918554998476544,
67
+ 30.829394826527686,
68
+ 32.453438778794194,
69
+ 29.07800808482171,
70
+ 28.126848239355724,
71
+ 28.219538928254238,
72
+ 28.15327266691132,
73
+ 28.37061664435462,
74
+ 22.572103732095137,
75
+ 30.218097049078665,
76
+ 35.291027583657176,
77
+ 28.264550325776455,
78
+ 28.548636901711063,
79
+ 30.077494275071125,
80
+ 28.765906730485675,
81
+ 30.473295981007006,
82
+ 32.31943212041054,
83
+ 29.03166397751077,
84
+ 28.20738376155079,
85
+ 27.52313906207686,
86
+ 28.068443800302507,
87
+ 28.02290963487483,
88
+ 27.137965527670815,
89
+ 29.891409757083256,
90
+ 35.52381896568835,
91
+ 28.314135705730163,
92
+ 28.17076848548687,
93
+ 29.376791831995504,
94
+ 28.654625695022126,
95
+ 30.817302643645203
96
+ ]
97
+ },
98
+ "lpips": {
99
+ "avg": 0.0587034212159259,
100
+ "vals": [
101
+ 0.03719184175133705,
102
+ 0.05626270920038223,
103
+ 0.06762276589870453,
104
+ 0.07581378519535065,
105
+ 0.06386660039424896,
106
+ 0.06519967317581177,
107
+ 0.06447210162878036,
108
+ 0.044685374945402145,
109
+ 0.02818106673657894,
110
+ 0.06283751875162125,
111
+ 0.05770150199532509,
112
+ 0.05407204478979111,
113
+ 0.06568614393472672,
114
+ 0.041273392736911774,
115
+ 0.03479328751564026,
116
+ 0.05250183865427971,
117
+ 0.06724748760461807,
118
+ 0.07919331640005112,
119
+ 0.06452416628599167,
120
+ 0.06678760051727295,
121
+ 0.15427803993225098,
122
+ 0.04608333483338356,
123
+ 0.026074513792991638,
124
+ 0.06458891183137894,
125
+ 0.06116756051778793,
126
+ 0.050854653120040894,
127
+ 0.06727714836597443,
128
+ 0.04326387867331505,
129
+ 0.037447478622198105,
130
+ 0.0521712489426136,
131
+ 0.0674269050359726,
132
+ 0.07980889827013016,
133
+ 0.06633622944355011,
134
+ 0.06805833429098129,
135
+ 0.07028443366289139,
136
+ 0.043847665190696716,
137
+ 0.025734560564160347,
138
+ 0.06363274157047272,
139
+ 0.06513398140668869,
140
+ 0.0585525706410408,
141
+ 0.06366591155529022,
142
+ 0.039940472692251205
143
+ ]
144
+ },
145
+ "fid": {
146
+ "avg": 42.43536594462592,
147
+ "vals": [
148
+ 20.22960865639975,
149
+ 47.958607601608676,
150
+ 57.62921688800561,
151
+ 95.16926849364827,
152
+ 52.86706191106997,
153
+ 40.88534322140695,
154
+ 66.72949840511895,
155
+ 24.21628873128267,
156
+ 18.986568909208156,
157
+ 33.69661265603083,
158
+ 24.557634338057728,
159
+ 54.261117389557846,
160
+ 51.156090096940055,
161
+ 22.082607032341244,
162
+ 24.085151411940856,
163
+ 45.16219303566392,
164
+ 51.10103371191223,
165
+ 94.4284018519586,
166
+ 61.20478944897824,
167
+ 53.09587418802096,
168
+ 45.77148143461985,
169
+ 25.706806643970687,
170
+ 17.209723468709765,
171
+ 38.78578933921177,
172
+ 25.072541814813434,
173
+ 41.178136179334956,
174
+ 37.296019926311956,
175
+ 27.822427143452785,
176
+ 22.649814443169078,
177
+ 37.95954774350004,
178
+ 47.89610979304835,
179
+ 90.26735601619404,
180
+ 50.99027170447254,
181
+ 47.766544543497034,
182
+ 74.45788273316838,
183
+ 22.916864843654203,
184
+ 17.551236020192366,
185
+ 29.612576631133486,
186
+ 32.3629873396745,
187
+ 46.47368512482174,
188
+ 39.25242590132321,
189
+ 23.782172906863416
190
+ ]
191
+ }
192
+ },
193
+ "base": {
194
+ "ssim": {
195
+ "avg": 0.8774991716657367,
196
+ "vals": [
197
+ 0.8641023635864258,
198
+ 0.8767493367195129,
199
+ 0.8762156367301941,
200
+ 0.90242999792099,
201
+ 0.9017764925956726,
202
+ 0.8788102269172668,
203
+ 0.8805813789367676,
204
+ 0.8612765669822693,
205
+ 0.8842142224311829,
206
+ 0.8914286494255066,
207
+ 0.8772357106208801,
208
+ 0.8998511433601379,
209
+ 0.8634107112884521,
210
+ 0.8928578495979309,
211
+ 0.8836771845817566,
212
+ 0.8996903300285339,
213
+ 0.866424560546875,
214
+ 0.8816824555397034,
215
+ 0.8785967230796814,
216
+ 0.8867073655128479,
217
+ 0.8279438614845276,
218
+ 0.8717477321624756,
219
+ 0.8780853748321533,
220
+ 0.8800415992736816,
221
+ 0.8771164417266846,
222
+ 0.8793924450874329,
223
+ 0.8809206485748291,
224
+ 0.8608385920524597,
225
+ 0.893017590045929,
226
+ 0.8689785003662109,
227
+ 0.8729929327964783,
228
+ 0.8567183017730713,
229
+ 0.8588916659355164,
230
+ 0.8677377104759216,
231
+ 0.8715691566467285,
232
+ 0.8649471402168274,
233
+ 0.884833037853241,
234
+ 0.913370668888092,
235
+ 0.8970480561256409,
236
+ 0.8238387107849121,
237
+ 0.8899227976799011,
238
+ 0.8872933387756348
239
+ ]
240
+ },
241
+ "psnr": {
242
+ "avg": 21.225323061553667,
243
+ "vals": [
244
+ 18.731802028716977,
245
+ 22.517723678922334,
246
+ 21.373543493859014,
247
+ 22.758004432454015,
248
+ 22.955530937204767,
249
+ 20.145150147854874,
250
+ 21.634592878743387,
251
+ 20.379519268519918,
252
+ 23.696208467856458,
253
+ 22.98936924589105,
254
+ 21.756553483824376,
255
+ 23.69843738135742,
256
+ 17.770960047525655,
257
+ 22.7351530856508,
258
+ 20.95728828090057,
259
+ 23.440357115988917,
260
+ 20.458007935114516,
261
+ 19.903659717164693,
262
+ 19.636953200826923,
263
+ 21.34791655058212,
264
+ 19.213297117979387,
265
+ 21.474593210996353,
266
+ 23.02245609312815,
267
+ 21.57037201859069,
268
+ 20.49675090194842,
269
+ 21.149239825321867,
270
+ 21.768542181018454,
271
+ 20.75462471848509,
272
+ 23.340499225598812,
273
+ 20.184658098890935,
274
+ 19.84577062228628,
275
+ 18.651530199516753,
276
+ 17.713097904145002,
277
+ 19.501488720897637,
278
+ 20.525847469970195,
279
+ 20.044060320208107,
280
+ 23.33142797550424,
281
+ 25.84051181341309,
282
+ 24.517940621682737,
283
+ 16.233878462859337,
284
+ 21.53839254074825,
285
+ 21.857857163105283
286
+ ]
287
+ },
288
+ "lpips": {
289
+ "avg": 0.17535314992779777,
290
+ "vals": [
291
+ 0.1866557002067566,
292
+ 0.19489219784736633,
293
+ 0.1676192283630371,
294
+ 0.13815467059612274,
295
+ 0.1378740817308426,
296
+ 0.19519048929214478,
297
+ 0.1394621878862381,
298
+ 0.18285232782363892,
299
+ 0.1300656795501709,
300
+ 0.13945598900318146,
301
+ 0.1777152568101883,
302
+ 0.1422179788351059,
303
+ 0.2465370148420334,
304
+ 0.173164963722229,
305
+ 0.16046200692653656,
306
+ 0.1408388763666153,
307
+ 0.1748061180114746,
308
+ 0.15652400255203247,
309
+ 0.18918247520923615,
310
+ 0.13944856822490692,
311
+ 0.3080754280090332,
312
+ 0.15395303070545197,
313
+ 0.13146977126598358,
314
+ 0.21760974824428558,
315
+ 0.16546086966991425,
316
+ 0.18631187081336975,
317
+ 0.17548373341560364,
318
+ 0.2248706817626953,
319
+ 0.14110884070396423,
320
+ 0.1748245805501938,
321
+ 0.17234086990356445,
322
+ 0.1958460807800293,
323
+ 0.22889724373817444,
324
+ 0.1727704405784607,
325
+ 0.15012094378471375,
326
+ 0.20727047324180603,
327
+ 0.1201525330543518,
328
+ 0.09184442460536957,
329
+ 0.11003588140010834,
330
+ 0.34251728653907776,
331
+ 0.18208707869052887,
332
+ 0.19866067171096802
333
+ ]
334
+ },
335
+ "fid": {
336
+ "avg": 104.964235596045,
337
+ "vals": [
338
+ 111.40174829746752,
339
+ 148.92090091433516,
340
+ 128.51386183067834,
341
+ 61.321546832998955,
342
+ 72.54608643303393,
343
+ 119.84550665904197,
344
+ 104.43999062002221,
345
+ 60.195703096095556,
346
+ 64.8998572256939,
347
+ 49.645917433260855,
348
+ 114.28769797616002,
349
+ 66.89825805245846,
350
+ 155.83362493523907,
351
+ 88.44848612474361,
352
+ 104.61621566440316,
353
+ 163.43248950817738,
354
+ 172.53727330437798,
355
+ 101.98603573984371,
356
+ 125.94605415324145,
357
+ 102.60611610457259,
358
+ 104.21668109264168,
359
+ 102.24131609759299,
360
+ 70.50377311449385,
361
+ 161.15530398717667,
362
+ 96.17823491839759,
363
+ 105.78022442714524,
364
+ 120.89941533798705,
365
+ 94.75931214260221,
366
+ 73.59833927600653,
367
+ 112.51851327401378,
368
+ 148.31978447992762,
369
+ 113.95507129041124,
370
+ 82.26377459422716,
371
+ 154.98015860895285,
372
+ 96.10781829434029,
373
+ 100.0667395138281,
374
+ 83.65808540884001,
375
+ 50.67035884888851,
376
+ 64.41624770555333,
377
+ 123.2744913103858,
378
+ 106.94257164137686,
379
+ 123.66830876325491
380
+ ]
381
+ }
382
+ }
383
+ },
384
+ "aaa": {
385
+ "ft": {
386
+ "ssim": {
387
+ "avg": 0.8912452217694875,
388
+ "vals": [
389
+ 0.9061050415039062,
390
+ 0.9104815125465393,
391
+ 0.9111503958702087,
392
+ 0.8999118208885193,
393
+ 0.9075160026550293,
394
+ 0.9037303924560547,
395
+ 0.9100475311279297,
396
+ 0.9105749130249023,
397
+ 0.9070033431053162,
398
+ 0.928657054901123,
399
+ 0.9301331043243408,
400
+ 0.9153156280517578,
401
+ 0.9060927033424377,
402
+ 0.9026443362236023,
403
+ 0.9006186127662659,
404
+ 0.8967500329017639,
405
+ 0.8858065605163574,
406
+ 0.8841550946235657,
407
+ 0.8957152366638184,
408
+ 0.9159244894981384,
409
+ 0.9212849140167236,
410
+ 0.8954355716705322,
411
+ 0.882415771484375,
412
+ 0.8875539898872375,
413
+ 0.8762226700782776,
414
+ 0.8741069436073303,
415
+ 0.8663904070854187,
416
+ 0.8608574867248535,
417
+ 0.8592395782470703,
418
+ 0.8617714047431946,
419
+ 0.8670153021812439,
420
+ 0.8589889407157898,
421
+ 0.8667375445365906,
422
+ 0.8646472096443176,
423
+ 0.8644879460334778,
424
+ 0.8704419732093811,
425
+ 0.8701417446136475
426
+ ]
427
+ },
428
+ "psnr": {
429
+ "avg": 23.218790631433333,
430
+ "vals": [
431
+ 22.60000443641932,
432
+ 23.213327330683896,
433
+ 23.966372308676554,
434
+ 22.61628135760781,
435
+ 23.334516149012956,
436
+ 22.76956956037783,
437
+ 23.522473461596828,
438
+ 23.818158050349368,
439
+ 24.35100933616721,
440
+ 27.585580066737258,
441
+ 27.485016628132442,
442
+ 25.653444844066296,
443
+ 24.116901996142047,
444
+ 23.141365324276826,
445
+ 23.439784068380483,
446
+ 23.33054311947808,
447
+ 22.82591865046196,
448
+ 22.71185043604199,
449
+ 24.538035778803117,
450
+ 27.826709590087102,
451
+ 28.474405666001857,
452
+ 24.769433736339693,
453
+ 23.449171903609013,
454
+ 23.46383536533758,
455
+ 21.324737956273246,
456
+ 21.96812016491501,
457
+ 20.7294582590173,
458
+ 20.641701301001603,
459
+ 20.774969220796272,
460
+ 21.108372009572886,
461
+ 22.00821781586527,
462
+ 20.4745834349127,
463
+ 21.075791929560445,
464
+ 21.104358437386903,
465
+ 21.55546473865332,
466
+ 22.000344888516832,
467
+ 21.32542404177411
468
+ ]
469
+ },
470
+ "lpips": {
471
+ "avg": 0.10130888052486084,
472
+ "vals": [
473
+ 0.14061428606510162,
474
+ 0.14724630117416382,
475
+ 0.1335088610649109,
476
+ 0.13316522538661957,
477
+ 0.12164061516523361,
478
+ 0.11964336037635803,
479
+ 0.10177505016326904,
480
+ 0.10693053156137466,
481
+ 0.0958232507109642,
482
+ 0.05439213290810585,
483
+ 0.05292022228240967,
484
+ 0.07251676917076111,
485
+ 0.09491521865129471,
486
+ 0.11164701730012894,
487
+ 0.1037021279335022,
488
+ 0.09932567179203033,
489
+ 0.0995492935180664,
490
+ 0.0946572870016098,
491
+ 0.07657209038734436,
492
+ 0.04164290428161621,
493
+ 0.04009518772363663,
494
+ 0.06518009305000305,
495
+ 0.07143466174602509,
496
+ 0.07886430621147156,
497
+ 0.09445148706436157,
498
+ 0.11044608056545258,
499
+ 0.1215718686580658,
500
+ 0.12195122241973877,
501
+ 0.12177268415689468,
502
+ 0.13400736451148987,
503
+ 0.1025841161608696,
504
+ 0.11698935180902481,
505
+ 0.11627939343452454,
506
+ 0.11498457193374634,
507
+ 0.11028993129730225,
508
+ 0.10681547224521637,
509
+ 0.11852256953716278
510
+ ]
511
+ },
512
+ "fid": {
513
+ "avg": 128.47027428857402,
514
+ "vals": [
515
+ 210.6965716493914,
516
+ 136.02845249812458,
517
+ 142.9994319683501,
518
+ 102.24152561128506,
519
+ 106.32239193197034,
520
+ 133.25417613786345,
521
+ 148.41552185542787,
522
+ 156.6543860781757,
523
+ 169.9280658509937,
524
+ 96.20484628876847,
525
+ 124.39223402724446,
526
+ 157.57056317546318,
527
+ 182.60839379211808,
528
+ 143.061621048883,
529
+ 129.08572262883925,
530
+ 146.6837514000539,
531
+ 121.28023529371208,
532
+ 133.09083552748842,
533
+ 133.1701816777087,
534
+ 106.30667335984795,
535
+ 92.99936563096986,
536
+ 106.42616281960741,
537
+ 91.02325698154696,
538
+ 82.03384095285311,
539
+ 100.23715859488628,
540
+ 103.07187891625573,
541
+ 126.73029941764449,
542
+ 120.07365192426221,
543
+ 120.96345164031321,
544
+ 116.58719477188482,
545
+ 167.96926051103426,
546
+ 180.28950392287587,
547
+ 105.1495396073212,
548
+ 140.74437360480772,
549
+ 99.20108044973863,
550
+ 113.96675633000953,
551
+ 105.93779079951634
552
+ ]
553
+ }
554
+ },
555
+ "base": {
556
+ "ssim": {
557
+ "avg": 0.8579733484500164,
558
+ "vals": [
559
+ 0.8666160702705383,
560
+ 0.7987958788871765,
561
+ 0.8860695362091064,
562
+ 0.8763647079467773,
563
+ 0.8763935565948486,
564
+ 0.8563421368598938,
565
+ 0.8911734223365784,
566
+ 0.8716912269592285,
567
+ 0.8579948544502258,
568
+ 0.8772208094596863,
569
+ 0.8896298408508301,
570
+ 0.8552396893501282,
571
+ 0.8748952746391296,
572
+ 0.8613572120666504,
573
+ 0.8676340579986572,
574
+ 0.8660659193992615,
575
+ 0.8730340600013733,
576
+ 0.8534577488899231,
577
+ 0.8619491457939148,
578
+ 0.8790878653526306,
579
+ 0.869230329990387,
580
+ 0.8246889114379883,
581
+ 0.8481850624084473,
582
+ 0.8987539410591125,
583
+ 0.8677256107330322,
584
+ 0.8619834780693054,
585
+ 0.8389760851860046,
586
+ 0.8448922634124756,
587
+ 0.8406286239624023,
588
+ 0.805531919002533,
589
+ 0.8549509048461914,
590
+ 0.8503775596618652,
591
+ 0.8362762928009033,
592
+ 0.8465806841850281,
593
+ 0.8307406902313232,
594
+ 0.850950300693512,
595
+ 0.8335282206535339
596
+ ]
597
+ },
598
+ "psnr": {
599
+ "avg": 19.081316861697736,
600
+ "vals": [
601
+ 16.88895564145245,
602
+ 15.797760205800826,
603
+ 18.41262183621939,
604
+ 18.562114490640262,
605
+ 17.917161492127512,
606
+ 18.252547952717784,
607
+ 19.64865146927212,
608
+ 17.6865025569328,
609
+ 17.99347739896675,
610
+ 18.854024365954317,
611
+ 19.934364132621422,
612
+ 18.675988138292176,
613
+ 19.247068668941317,
614
+ 19.570660435858237,
615
+ 17.830968802374684,
616
+ 19.397491216401374,
617
+ 21.611632805892775,
618
+ 20.071126840114104,
619
+ 19.438827872416823,
620
+ 22.09851766389441,
621
+ 20.985060014772596,
622
+ 17.811881997736606,
623
+ 20.828426349668014,
624
+ 21.464988324765418,
625
+ 19.570091780949138,
626
+ 20.7044778911251,
627
+ 18.34111665771247,
628
+ 19.15076106907652,
629
+ 20.11914583622098,
630
+ 16.338984721289947,
631
+ 21.540692338174694,
632
+ 19.138283664226506,
633
+ 17.11142702905118,
634
+ 18.437481626987633,
635
+ 18.706711870048608,
636
+ 20.13241290437168,
637
+ 17.73631581974743
638
+ ]
639
+ },
640
+ "lpips": {
641
+ "avg": 0.2180521393547187,
642
+ "vals": [
643
+ 0.24907447397708893,
644
+ 0.5150208473205566,
645
+ 0.21998971700668335,
646
+ 0.30011799931526184,
647
+ 0.24314486980438232,
648
+ 0.2725614607334137,
649
+ 0.2167254090309143,
650
+ 0.22530333697795868,
651
+ 0.2836877703666687,
652
+ 0.2629338204860687,
653
+ 0.16042795777320862,
654
+ 0.2589859664440155,
655
+ 0.18903297185897827,
656
+ 0.19533997774124146,
657
+ 0.27064773440361023,
658
+ 0.1686403751373291,
659
+ 0.12280213087797165,
660
+ 0.25335609912872314,
661
+ 0.29388710856437683,
662
+ 0.09789006412029266,
663
+ 0.1359485536813736,
664
+ 0.37839996814727783,
665
+ 0.16171419620513916,
666
+ 0.09748921543359756,
667
+ 0.11395677924156189,
668
+ 0.13522587716579437,
669
+ 0.20320472121238708,
670
+ 0.20818984508514404,
671
+ 0.16585132479667664,
672
+ 0.2417808175086975,
673
+ 0.11952673643827438,
674
+ 0.15657898783683777,
675
+ 0.21135613322257996,
676
+ 0.20219209790229797,
677
+ 0.2686466574668884,
678
+ 0.16088804602622986,
679
+ 0.3074091076850891
680
+ ]
681
+ },
682
+ "fid": {
683
+ "avg": 168.1660536989984,
684
+ "vals": [
685
+ 247.82225753332435,
686
+ 261.94302538502393,
687
+ 148.08945162416452,
688
+ 156.38665720624414,
689
+ 164.2083172443518,
690
+ 152.55262134635595,
691
+ 167.74632833375753,
692
+ 186.55618400333205,
693
+ 237.13432800336085,
694
+ 295.59781724649554,
695
+ 183.50749905003778,
696
+ 233.3946841940326,
697
+ 127.19021998668687,
698
+ 237.64574583795806,
699
+ 212.50245640676792,
700
+ 138.55530819236137,
701
+ 214.2925259680326,
702
+ 206.29937532009325,
703
+ 166.21050755154022,
704
+ 78.77898145774333,
705
+ 102.08709635256376,
706
+ 205.22517444539076,
707
+ 103.51534026875984,
708
+ 79.41715249937273,
709
+ 119.52601444212452,
710
+ 103.10200144999655,
711
+ 177.805092182125,
712
+ 132.67670025242347,
713
+ 131.8997749255857,
714
+ 137.54224584649327,
715
+ 132.2298444263562,
716
+ 220.02890730124264,
717
+ 148.88606776876063,
718
+ 136.47310567443643,
719
+ 165.10752223661132,
720
+ 125.63803892245707,
721
+ 184.56961597657588
722
+ ]
723
+ }
724
+ }
725
+ },
726
+ "azri": {
727
+ "ft": {
728
+ "ssim": {
729
+ "avg": 0.9058952973439143,
730
+ "vals": [
731
+ 0.8917362689971924,
732
+ 0.9481084942817688,
733
+ 0.8861625790596008,
734
+ 0.9341784119606018,
735
+ 0.9068629145622253,
736
+ 0.9030451774597168,
737
+ 0.9057510495185852,
738
+ 0.8903522491455078,
739
+ 0.8899074196815491,
740
+ 0.8972444534301758,
741
+ 0.9205942153930664,
742
+ 0.9142631888389587,
743
+ 0.8862788081169128,
744
+ 0.9196522235870361,
745
+ 0.8918246626853943,
746
+ 0.9560407996177673,
747
+ 0.901726484298706,
748
+ 0.8848044276237488,
749
+ 0.8926780819892883,
750
+ 0.8907856941223145,
751
+ 0.9449396729469299,
752
+ 0.8873879313468933,
753
+ 0.9299740791320801,
754
+ 0.904637336730957,
755
+ 0.9079319834709167,
756
+ 0.9113084673881531,
757
+ 0.8961232304573059,
758
+ 0.881411075592041,
759
+ 0.891535222530365,
760
+ 0.9156699776649475,
761
+ 0.9131000638008118,
762
+ 0.8913257122039795,
763
+ 0.9246047139167786,
764
+ 0.8882644772529602,
765
+ 0.9553866982460022,
766
+ 0.8979193568229675,
767
+ 0.884848415851593,
768
+ 0.8858329653739929,
769
+ 0.8967573046684265,
770
+ 0.9442141652107239,
771
+ 0.8944272398948669,
772
+ 0.9422566294670105,
773
+ 0.9031774401664734,
774
+ 0.9035826325416565,
775
+ 0.9115505814552307,
776
+ 0.8966580033302307,
777
+ 0.8833451867103577,
778
+ 0.8935332894325256,
779
+ 0.9135630130767822,
780
+ 0.9111640453338623,
781
+ 0.8814460635185242,
782
+ 0.9066808819770813
783
+ ]
784
+ },
785
+ "psnr": {
786
+ "avg": 22.26210885067453,
787
+ "vals": [
788
+ 21.127093280116206,
789
+ 24.839156863255635,
790
+ 20.600073186747085,
791
+ 25.025180077941343,
792
+ 22.081731981690943,
793
+ 22.42101370484669,
794
+ 22.81864654433271,
795
+ 21.18838097936547,
796
+ 22.058042539733158,
797
+ 21.10834360782046,
798
+ 23.075382471633933,
799
+ 22.544052476196704,
800
+ 19.00360259824341,
801
+ 21.92184820041113,
802
+ 21.430364755014008,
803
+ 28.074184819479914,
804
+ 22.007287073710827,
805
+ 20.596292100864723,
806
+ 22.0403525229489,
807
+ 20.915785732740424,
808
+ 24.24433505144897,
809
+ 20.759471581753495,
810
+ 23.580874095639594,
811
+ 21.765209438099404,
812
+ 22.735731723488712,
813
+ 23.165893102346963,
814
+ 21.193429487248192,
815
+ 20.883114381461837,
816
+ 20.509329225340185,
817
+ 22.74356627523407,
818
+ 23.076752632172514,
819
+ 20.80786955805867,
820
+ 22.356163191215202,
821
+ 21.454672093200195,
822
+ 28.091895600685465,
823
+ 21.89353215478105,
824
+ 20.445727082361902,
825
+ 21.73245024548928,
826
+ 21.415697749014967,
827
+ 25.255915842679492,
828
+ 21.344568596586,
829
+ 25.736038367482795,
830
+ 21.80143662440821,
831
+ 22.556710854506832,
832
+ 23.011730483476125,
833
+ 21.360129615824693,
834
+ 21.183864623422796,
835
+ 20.786549475403824,
836
+ 23.05387236898404,
837
+ 22.61546996399324,
838
+ 18.695781286483605,
839
+ 22.495061945689415
840
+ ]
841
+ },
842
+ "lpips": {
843
+ "avg": 0.0681791385420813,
844
+ "vals": [
845
+ 0.07602706551551819,
846
+ 0.04024939239025116,
847
+ 0.08144165575504303,
848
+ 0.04438445717096329,
849
+ 0.07352830469608307,
850
+ 0.05720607936382294,
851
+ 0.06115525960922241,
852
+ 0.08031078428030014,
853
+ 0.07755088806152344,
854
+ 0.08515866100788116,
855
+ 0.058882661163806915,
856
+ 0.07514132559299469,
857
+ 0.08883378654718399,
858
+ 0.08077041804790497,
859
+ 0.0614573135972023,
860
+ 0.017901955172419548,
861
+ 0.06920649111270905,
862
+ 0.07639990746974945,
863
+ 0.07837355881929398,
864
+ 0.0728088989853859,
865
+ 0.04596266895532608,
866
+ 0.09040576219558716,
867
+ 0.057428933680057526,
868
+ 0.07047676295042038,
869
+ 0.054519202560186386,
870
+ 0.0576803982257843,
871
+ 0.0698535293340683,
872
+ 0.08105408400297165,
873
+ 0.09364423155784607,
874
+ 0.06507238000631332,
875
+ 0.07589171826839447,
876
+ 0.0750972181558609,
877
+ 0.074916310608387,
878
+ 0.062267474830150604,
879
+ 0.021616671234369278,
880
+ 0.06903669983148575,
881
+ 0.07748014479875565,
882
+ 0.08583179116249084,
883
+ 0.06618104875087738,
884
+ 0.05046549439430237,
885
+ 0.07481929659843445,
886
+ 0.042737700045108795,
887
+ 0.07031229138374329,
888
+ 0.05742187052965164,
889
+ 0.058822184801101685,
890
+ 0.08137130737304688,
891
+ 0.08340458571910858,
892
+ 0.08779360353946686,
893
+ 0.05354199558496475,
894
+ 0.0719795972108841,
895
+ 0.09594655781984329,
896
+ 0.06549282371997833
897
+ ]
898
+ },
899
+ "fid": {
900
+ "avg": 87.69454384414368,
901
+ "vals": [
902
+ 77.53360222199552,
903
+ 81.69009799979364,
904
+ 126.35462309879928,
905
+ 128.097754928267,
906
+ 117.15623067707392,
907
+ 98.01858464323817,
908
+ 77.96826502055629,
909
+ 78.41206612900142,
910
+ 110.78376599817574,
911
+ 63.587186575144706,
912
+ 54.89619147757511,
913
+ 84.46815633637291,
914
+ 76.09334109513291,
915
+ 89.06784327761028,
916
+ 112.09989177281398,
917
+ 77.76499620466669,
918
+ 127.0296162823459,
919
+ 56.2819223080697,
920
+ 69.43078527369693,
921
+ 64.65111942898734,
922
+ 62.87374700137458,
923
+ 128.69000277570802,
924
+ 85.75299650509533,
925
+ 101.57266095839137,
926
+ 69.76067120877498,
927
+ 86.34886735845666,
928
+ 68.48340578638442,
929
+ 98.73242401936356,
930
+ 63.17021113035836,
931
+ 65.11579193591135,
932
+ 77.95177629642525,
933
+ 89.78335003024443,
934
+ 86.28969096009206,
935
+ 119.6802369335825,
936
+ 89.66745457666659,
937
+ 132.85212105669805,
938
+ 114.10763350784305,
939
+ 103.6134398480294,
940
+ 59.686968923134835,
941
+ 82.54559469802132,
942
+ 111.1869442643463,
943
+ 85.38777030577579,
944
+ 106.32082608988044,
945
+ 106.12593735864233,
946
+ 86.00820771852335,
947
+ 64.28650309776683,
948
+ 102.70937080791995,
949
+ 47.320553137933764,
950
+ 53.731227883240436,
951
+ 71.80043885301173,
952
+ 96.20161778732583,
953
+ 70.97179633123264
954
+ ]
955
+ }
956
+ },
957
+ "base": {
958
+ "ssim": {
959
+ "avg": 0.765552927668278,
960
+ "vals": [
961
+ 0.7876270413398743,
962
+ 0.6932912468910217,
963
+ 0.7860949039459229,
964
+ 0.8119332194328308,
965
+ 0.7923873066902161,
966
+ 0.7616052031517029,
967
+ 0.7635032534599304,
968
+ 0.752696692943573,
969
+ 0.7297654151916504,
970
+ 0.7741971015930176,
971
+ 0.7830759882926941,
972
+ 0.787074089050293,
973
+ 0.7227017283439636,
974
+ 0.7842667698860168,
975
+ 0.7618840336799622,
976
+ 0.7612975239753723,
977
+ 0.7881305813789368,
978
+ 0.7242482304573059,
979
+ 0.7725765109062195,
980
+ 0.7895841598510742,
981
+ 0.7584860920906067,
982
+ 0.7806427478790283,
983
+ 0.7808322906494141,
984
+ 0.8068225979804993,
985
+ 0.7732831835746765,
986
+ 0.7021427154541016,
987
+ 0.7602499127388,
988
+ 0.7775993943214417,
989
+ 0.7879728674888611,
990
+ 0.7852631211280823,
991
+ 0.7593006491661072,
992
+ 0.7491123080253601,
993
+ 0.8211724162101746,
994
+ 0.791597306728363,
995
+ 0.8033311367034912,
996
+ 0.751656711101532,
997
+ 0.7145156860351562,
998
+ 0.7085480690002441,
999
+ 0.7710719704627991,
1000
+ 0.7261748313903809,
1001
+ 0.7700297236442566,
1002
+ 0.721587598323822,
1003
+ 0.769636869430542,
1004
+ 0.7924219965934753,
1005
+ 0.7783514857292175,
1006
+ 0.7467233538627625,
1007
+ 0.7454271912574768,
1008
+ 0.7511434555053711,
1009
+ 0.7623993754386902,
1010
+ 0.7588648796081543,
1011
+ 0.7819810509681702,
1012
+ 0.792468249797821
1013
+ ]
1014
+ },
1015
+ "psnr": {
1016
+ "avg": 12.04275296711651,
1017
+ "vals": [
1018
+ 11.831093266628356,
1019
+ 12.528152127082233,
1020
+ 11.504680724545494,
1021
+ 12.06579748941828,
1022
+ 12.193623545830743,
1023
+ 12.967612777094432,
1024
+ 11.22349001603837,
1025
+ 12.676957730222881,
1026
+ 10.463052935595755,
1027
+ 12.068058666401921,
1028
+ 11.815425022083929,
1029
+ 11.474556127083932,
1030
+ 11.65343712380346,
1031
+ 12.970802840776267,
1032
+ 10.921165557898965,
1033
+ 11.487784206793332,
1034
+ 11.862155399851087,
1035
+ 12.500533116810459,
1036
+ 13.26478322723994,
1037
+ 11.454244322299186,
1038
+ 12.846474859150605,
1039
+ 11.178912278082944,
1040
+ 11.668886917161911,
1041
+ 13.535748073878615,
1042
+ 11.74158297715729,
1043
+ 11.307205931581416,
1044
+ 11.122995221603784,
1045
+ 12.315128915653881,
1046
+ 11.927970316701074,
1047
+ 11.471458996386659,
1048
+ 10.8378095973721,
1049
+ 12.40354222062087,
1050
+ 13.883183470169415,
1051
+ 12.746526277133636,
1052
+ 13.423830377359124,
1053
+ 12.183072443407536,
1054
+ 11.745039572219284,
1055
+ 11.288605142920026,
1056
+ 10.811118546539486,
1057
+ 11.492946654674247,
1058
+ 12.552522641514392,
1059
+ 12.042564039263278,
1060
+ 11.3939431319692,
1061
+ 12.898514335036811,
1062
+ 11.114529050567972,
1063
+ 11.339446334579078,
1064
+ 12.24930476842462,
1065
+ 13.364881567497516,
1066
+ 13.445838873213505,
1067
+ 12.214397326986626,
1068
+ 12.84755151678836,
1069
+ 11.900215690944371
1070
+ ]
1071
+ },
1072
+ "lpips": {
1073
+ "avg": 0.3510053243774634,
1074
+ "vals": [
1075
+ 0.3419290781021118,
1076
+ 0.4748748242855072,
1077
+ 0.3430050015449524,
1078
+ 0.3214326500892639,
1079
+ 0.28315189480781555,
1080
+ 0.38122087717056274,
1081
+ 0.40785902738571167,
1082
+ 0.283894807100296,
1083
+ 0.28781914710998535,
1084
+ 0.31599509716033936,
1085
+ 0.31139951944351196,
1086
+ 0.40369728207588196,
1087
+ 0.4894903302192688,
1088
+ 0.3339408040046692,
1089
+ 0.3329698443412781,
1090
+ 0.2978940010070801,
1091
+ 0.39562928676605225,
1092
+ 0.3887198865413666,
1093
+ 0.3585241734981537,
1094
+ 0.31963711977005005,
1095
+ 0.31140702962875366,
1096
+ 0.3825278878211975,
1097
+ 0.3666013181209564,
1098
+ 0.24337142705917358,
1099
+ 0.36876243352890015,
1100
+ 0.36285972595214844,
1101
+ 0.33685219287872314,
1102
+ 0.3693651854991913,
1103
+ 0.2829691767692566,
1104
+ 0.30595219135284424,
1105
+ 0.3462897539138794,
1106
+ 0.49221092462539673,
1107
+ 0.35862377285957336,
1108
+ 0.2963302731513977,
1109
+ 0.2860722541809082,
1110
+ 0.33742064237594604,
1111
+ 0.46653813123703003,
1112
+ 0.4688953757286072,
1113
+ 0.3590819835662842,
1114
+ 0.3915751874446869,
1115
+ 0.43592336773872375,
1116
+ 0.4350709915161133,
1117
+ 0.3321887254714966,
1118
+ 0.2887817919254303,
1119
+ 0.3315331041812897,
1120
+ 0.34705182909965515,
1121
+ 0.29848411679267883,
1122
+ 0.3338838815689087,
1123
+ 0.3551400601863861,
1124
+ 0.2849394679069519,
1125
+ 0.276046484708786,
1126
+ 0.32644152641296387
1127
+ ]
1128
+ },
1129
+ "fid": {
1130
+ "avg": 290.9468764478715,
1131
+ "vals": [
1132
+ 317.3580192368432,
1133
+ 285.3201578307224,
1134
+ 227.35443388770344,
1135
+ 284.40224085168455,
1136
+ 250.42273416927597,
1137
+ 332.1637800582306,
1138
+ 421.4826897186527,
1139
+ 175.65665020370423,
1140
+ 462.80212160877016,
1141
+ 313.11887539989885,
1142
+ 216.874426261107,
1143
+ 318.69325303966303,
1144
+ 352.27748822517265,
1145
+ 337.56780348968255,
1146
+ 333.41852463427074,
1147
+ 291.18619781007527,
1148
+ 231.22305793958952,
1149
+ 182.59524666699008,
1150
+ 174.66560996071973,
1151
+ 287.25586265039675,
1152
+ 245.32536993670467,
1153
+ 340.7025359683132,
1154
+ 430.81577040883786,
1155
+ 151.6384776188366,
1156
+ 179.9751120645675,
1157
+ 262.16017361838544,
1158
+ 252.1575376021156,
1159
+ 383.6182322995117,
1160
+ 293.8831833204783,
1161
+ 238.37454832172995,
1162
+ 253.2247312385866,
1163
+ 300.3330815210274,
1164
+ 263.0408874830202,
1165
+ 308.8415900149356,
1166
+ 208.06900694354897,
1167
+ 312.3743463947788,
1168
+ 281.0093276692239,
1169
+ 401.06852280176753,
1170
+ 412.57353143782905,
1171
+ 245.6395460326994,
1172
+ 330.86620573911364,
1173
+ 396.22633300703814,
1174
+ 253.3246381630602,
1175
+ 243.78755013247724,
1176
+ 343.90170319814894,
1177
+ 411.48625023760695,
1178
+ 426.23161153775044,
1179
+ 167.20827449948533,
1180
+ 253.45101894407023,
1181
+ 249.6648801639487,
1182
+ 233.26966302525298,
1183
+ 259.15476030131083
1184
+ ]
1185
+ }
1186
+ }
1187
+ },
1188
+ "dead": {
1189
+ "ft": {
1190
+ "ssim": {
1191
+ "avg": 0.8463698829475202,
1192
+ "vals": [
1193
+ 0.819096565246582,
1194
+ 0.8230092525482178,
1195
+ 0.8352133631706238,
1196
+ 0.9176575541496277,
1197
+ 0.8429977297782898,
1198
+ 0.8367952704429626,
1199
+ 0.8534295558929443,
1200
+ 0.838138997554779,
1201
+ 0.8528039455413818,
1202
+ 0.8470211029052734,
1203
+ 0.8170969486236572,
1204
+ 0.8159563541412354,
1205
+ 0.8493235111236572,
1206
+ 0.8536948561668396,
1207
+ 0.8294236063957214,
1208
+ 0.90614253282547,
1209
+ 0.8337429165840149,
1210
+ 0.8743107914924622,
1211
+ 0.8367175459861755,
1212
+ 0.8243162631988525,
1213
+ 0.8247227668762207,
1214
+ 0.8266720771789551,
1215
+ 0.9225577712059021,
1216
+ 0.8386837840080261,
1217
+ 0.8380022644996643,
1218
+ 0.8386256098747253,
1219
+ 0.8460710048675537,
1220
+ 0.8459582924842834,
1221
+ 0.835181713104248,
1222
+ 0.8186705708503723,
1223
+ 0.8184948563575745,
1224
+ 0.8417790532112122,
1225
+ 0.8489491939544678,
1226
+ 0.8365125060081482,
1227
+ 0.9095039963722229,
1228
+ 0.8317481875419617,
1229
+ 0.8886545300483704,
1230
+ 0.8443787097930908
1231
+ ]
1232
+ },
1233
+ "psnr": {
1234
+ "avg": 23.795184903621124,
1235
+ "vals": [
1236
+ 22.123992625667,
1237
+ 22.383136736110977,
1238
+ 22.815137157510414,
1239
+ 29.058328544219666,
1240
+ 23.458824312069826,
1241
+ 23.35115778300144,
1242
+ 23.899720737917086,
1243
+ 23.251382000597822,
1244
+ 23.8556550518376,
1245
+ 24.496927997944468,
1246
+ 22.229396380994544,
1247
+ 21.835705931258186,
1248
+ 24.044979173806603,
1249
+ 23.87095747368083,
1250
+ 22.755647052287486,
1251
+ 27.255022608267765,
1252
+ 23.384111730889643,
1253
+ 24.47027594982054,
1254
+ 22.969660848990017,
1255
+ 22.118529030027556,
1256
+ 22.65373935684274,
1257
+ 22.26046784537836,
1258
+ 29.458407087230075,
1259
+ 23.06010458055496,
1260
+ 23.57190723261845,
1261
+ 23.327430637055485,
1262
+ 23.589256344441864,
1263
+ 23.48439936889143,
1264
+ 23.067392439373478,
1265
+ 22.096704419878947,
1266
+ 21.619810610831216,
1267
+ 23.50251506589982,
1268
+ 23.94463595644848,
1269
+ 23.01993677097123,
1270
+ 28.729724692646393,
1271
+ 23.230957096762136,
1272
+ 26.24107924644081,
1273
+ 23.730008458437332
1274
+ ]
1275
+ },
1276
+ "lpips": {
1277
+ "avg": 0.07999030775145481,
1278
+ "vals": [
1279
+ 0.09683731198310852,
1280
+ 0.10403041541576385,
1281
+ 0.10699359327554703,
1282
+ 0.02980189025402069,
1283
+ 0.07957077771425247,
1284
+ 0.08596174418926239,
1285
+ 0.06019214540719986,
1286
+ 0.07882507890462875,
1287
+ 0.07737825810909271,
1288
+ 0.08655022084712982,
1289
+ 0.09879130125045776,
1290
+ 0.12470565736293793,
1291
+ 0.055472321808338165,
1292
+ 0.06304865330457687,
1293
+ 0.10779929161071777,
1294
+ 0.030872609466314316,
1295
+ 0.07079628109931946,
1296
+ 0.06584230065345764,
1297
+ 0.10193256288766861,
1298
+ 0.09854554384946823,
1299
+ 0.10243230313062668,
1300
+ 0.10806915909051895,
1301
+ 0.025890624150633812,
1302
+ 0.08075970411300659,
1303
+ 0.077939972281456,
1304
+ 0.06577856838703156,
1305
+ 0.07062944024801254,
1306
+ 0.08819691836833954,
1307
+ 0.10097920894622803,
1308
+ 0.105897456407547,
1309
+ 0.11288004368543625,
1310
+ 0.06032264977693558,
1311
+ 0.063026562333107,
1312
+ 0.10948407649993896,
1313
+ 0.030226124450564384,
1314
+ 0.06960238516330719,
1315
+ 0.053624995052814484,
1316
+ 0.0899435430765152
1317
+ ]
1318
+ },
1319
+ "fid": {
1320
+ "avg": 80.0617442123876,
1321
+ "vals": [
1322
+ 77.72583958495193,
1323
+ 108.01430563351622,
1324
+ 83.80114371601998,
1325
+ 43.79561757437186,
1326
+ 64.21238376481085,
1327
+ 107.90633495184883,
1328
+ 94.90226901876251,
1329
+ 81.66996436668452,
1330
+ 56.44677259421701,
1331
+ 133.90454895719694,
1332
+ 94.53365874681484,
1333
+ 103.25655943118333,
1334
+ 56.07493043457935,
1335
+ 76.20330391887549,
1336
+ 86.46052802881235,
1337
+ 35.14997667649733,
1338
+ 62.05464557602466,
1339
+ 82.75703146138048,
1340
+ 78.68766046633303,
1341
+ 95.9801870274251,
1342
+ 69.69440837156752,
1343
+ 133.97665503608607,
1344
+ 23.992440096968796,
1345
+ 73.95956883955756,
1346
+ 101.69397444074366,
1347
+ 96.09931824946253,
1348
+ 44.15662245914518,
1349
+ 83.0950768685819,
1350
+ 113.04304804723927,
1351
+ 123.68218057444115,
1352
+ 80.64456666041356,
1353
+ 51.48177417838611,
1354
+ 61.27753437751468,
1355
+ 89.59810342754149,
1356
+ 64.8439294361588,
1357
+ 63.560962531495846,
1358
+ 68.41856099127594,
1359
+ 75.58989355384159
1360
+ ]
1361
+ }
1362
+ },
1363
+ "base": {
1364
+ "ssim": {
1365
+ "avg": 0.7146695460143843,
1366
+ "vals": [
1367
+ 0.6920250058174133,
1368
+ 0.6781535148620605,
1369
+ 0.7473878860473633,
1370
+ 0.7988237738609314,
1371
+ 0.7046812176704407,
1372
+ 0.6728485226631165,
1373
+ 0.6648303866386414,
1374
+ 0.6507443785667419,
1375
+ 0.6572958827018738,
1376
+ 0.7580508589744568,
1377
+ 0.7515758872032166,
1378
+ 0.759639322757721,
1379
+ 0.7379322052001953,
1380
+ 0.7911661267280579,
1381
+ 0.7242826819419861,
1382
+ 0.6797776222229004,
1383
+ 0.7349328994750977,
1384
+ 0.7590611577033997,
1385
+ 0.6552085876464844,
1386
+ 0.745119035243988,
1387
+ 0.7352902293205261,
1388
+ 0.7022021412849426,
1389
+ 0.7449350357055664,
1390
+ 0.7506275773048401,
1391
+ 0.7204473614692688,
1392
+ 0.666022002696991,
1393
+ 0.7632265090942383,
1394
+ 0.7753145098686218,
1395
+ 0.6686270833015442,
1396
+ 0.6356704831123352,
1397
+ 0.6632815003395081,
1398
+ 0.7420862317085266,
1399
+ 0.6526774764060974,
1400
+ 0.7651051878929138,
1401
+ 0.7491373419761658,
1402
+ 0.6927105784416199,
1403
+ 0.7062166333198547,
1404
+ 0.6603279113769531
1405
+ ]
1406
+ },
1407
+ "psnr": {
1408
+ "avg": 17.075681593330103,
1409
+ "vals": [
1410
+ 16.02221993829837,
1411
+ 16.391894818273492,
1412
+ 19.158898065865564,
1413
+ 21.168902461771523,
1414
+ 15.849282724811843,
1415
+ 13.438762906336217,
1416
+ 14.041096876172848,
1417
+ 14.614652873958555,
1418
+ 14.836499194441464,
1419
+ 19.42617293541887,
1420
+ 19.6028175608849,
1421
+ 19.50721422138582,
1422
+ 17.430967194872665,
1423
+ 20.64046847536366,
1424
+ 17.37638733796819,
1425
+ 15.173079056741063,
1426
+ 18.277006035337127,
1427
+ 18.80925161485643,
1428
+ 14.095576974205786,
1429
+ 19.420502974160595,
1430
+ 19.454135471046165,
1431
+ 17.222296068007893,
1432
+ 18.225982648513018,
1433
+ 19.51789549326609,
1434
+ 16.618582564745015,
1435
+ 15.364057378648432,
1436
+ 18.71637998732089,
1437
+ 19.61078528607858,
1438
+ 14.215511993158811,
1439
+ 13.88753261057972,
1440
+ 10.856355225225222,
1441
+ 18.09338753925673,
1442
+ 14.179305881135015,
1443
+ 20.07084940675992,
1444
+ 19.650702290598378,
1445
+ 15.665586196724535,
1446
+ 16.945129336951908,
1447
+ 15.299770927402514
1448
+ ]
1449
+ },
1450
+ "lpips": {
1451
+ "avg": 0.2726066539946355,
1452
+ "vals": [
1453
+ 0.2813563346862793,
1454
+ 0.3176368176937103,
1455
+ 0.239101380109787,
1456
+ 0.1326800435781479,
1457
+ 0.31650233268737793,
1458
+ 0.3085799217224121,
1459
+ 0.32426244020462036,
1460
+ 0.32414373755455017,
1461
+ 0.3856937885284424,
1462
+ 0.21604351699352264,
1463
+ 0.2195548415184021,
1464
+ 0.18589559197425842,
1465
+ 0.21749232709407806,
1466
+ 0.1575351059436798,
1467
+ 0.2156955450773239,
1468
+ 0.29644495248794556,
1469
+ 0.22918514907360077,
1470
+ 0.21251118183135986,
1471
+ 0.37587040662765503,
1472
+ 0.193213552236557,
1473
+ 0.2757120132446289,
1474
+ 0.31559503078460693,
1475
+ 0.2390986531972885,
1476
+ 0.2627783417701721,
1477
+ 0.22325637936592102,
1478
+ 0.35516589879989624,
1479
+ 0.2559050917625427,
1480
+ 0.2075577825307846,
1481
+ 0.3593791425228119,
1482
+ 0.39297720789909363,
1483
+ 0.3490094244480133,
1484
+ 0.22949230670928955,
1485
+ 0.3761798143386841,
1486
+ 0.20215649902820587,
1487
+ 0.21151788532733917,
1488
+ 0.33757245540618896,
1489
+ 0.27517855167388916,
1490
+ 0.3411214053630829
1491
+ ]
1492
+ },
1493
+ "fid": {
1494
+ "avg": 171.8519597862991,
1495
+ "vals": [
1496
+ 144.77019624546853,
1497
+ 154.75288495007914,
1498
+ 160.95209947656429,
1499
+ 117.83869765679046,
1500
+ 151.64615714342628,
1501
+ 157.47977926729806,
1502
+ 162.33033867519435,
1503
+ 265.0183384600493,
1504
+ 281.5918475384978,
1505
+ 152.44583427606105,
1506
+ 133.1601149299724,
1507
+ 194.62961438661011,
1508
+ 244.89585280015393,
1509
+ 131.48030011431425,
1510
+ 167.5752450544133,
1511
+ 201.44646187475317,
1512
+ 107.98531116987058,
1513
+ 170.08420689868527,
1514
+ 236.9186629037794,
1515
+ 141.81304882152676,
1516
+ 138.10872705204866,
1517
+ 174.07596100327163,
1518
+ 133.47905631898664,
1519
+ 200.28084705986407,
1520
+ 100.24977206742142,
1521
+ 146.11378626732036,
1522
+ 180.40072384521355,
1523
+ 134.37882246824165,
1524
+ 266.2742357839778,
1525
+ 204.80154766025774,
1526
+ 283.97677018987224,
1527
+ 144.08380223026322,
1528
+ 176.91537688202212,
1529
+ 127.66897079587724,
1530
+ 130.9521420141355,
1531
+ 166.36337398681758,
1532
+ 214.39949796639,
1533
+ 129.03606564387414
1534
+ ]
1535
+ }
1536
+ }
1537
+ }
1538
+ }
metrics.py ADDED
@@ -0,0 +1,522 @@
1
+ import os
2
+ import pathlib
3
+ import torch
4
+ import numpy as np
5
+ import skimage
6
+ from imageio import imread
7
+ from scipy import linalg
8
+ from torch.nn.functional import adaptive_avg_pool2d
9
+ from skimage.metrics import structural_similarity as compare_ssim
10
+ from skimage.metrics import peak_signal_noise_ratio as compare_psnr
11
+ import glob
12
+ import argparse
13
+ import matplotlib.pyplot as plt
14
+ from inception import InceptionV3
15
+ #from scripts.PerceptualSimilarity.models import dist_model as dm
16
+ import lpips
17
+ import pandas as pd
18
+ import json
19
+ import imageio
20
+ import cv2
21
+ print(skimage.__version__)
22
+
23
+ class FID():
24
+ """docstring for FID
25
+ Calculates the Frechet Inception Distance (FID) to evaluate GANs
26
+ The FID metric calculates the distance between two distributions of images.
27
+ Typically, we have summary statistics (mean & covariance matrix) of one
28
+ of these distributions, while the 2nd distribution is given by a GAN.
29
+ When run as a stand-alone program, it compares the distribution of
30
+ images that are stored as PNG/JPEG at a specified location with a
31
+ distribution given by summary statistics (in pickle format).
32
+ The FID is calculated by assuming that X_1 and X_2 are the activations of
33
+ the pool_3 layer of the inception net for generated samples and real world
34
+ samples respectively.
35
+ See --help to see further details.
36
+ Code adapted from https://github.com/bioinf-jku/TTUR to use PyTorch instead
37
+ of Tensorflow
38
+ Copyright 2018 Institute of Bioinformatics, JKU Linz
39
+ Licensed under the Apache License, Version 2.0 (the "License");
40
+ you may not use this file except in compliance with the License.
41
+ You may obtain a copy of the License at
42
+ http://www.apache.org/licenses/LICENSE-2.0
43
+ Unless required by applicable law or agreed to in writing, software
44
+ distributed under the License is distributed on an "AS IS" BASIS,
45
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
46
+ See the License for the specific language governing permissions and
47
+ limitations under the License.
48
+ """
49
+ def __init__(self):
50
+ self.dims = 2048
51
+ self.batch_size = 128
52
+ self.cuda = True
53
+ self.verbose=False
54
+
55
+ block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[self.dims]
56
+ self.model = InceptionV3([block_idx])
57
+ if self.cuda:
58
+ # TODO: put model into specific GPU
59
+ self.model.cuda()
60
+
61
+ def __call__(self, images, gt_path):
62
+ """ images: list of the generated image. The values must lie between 0 and 1.
63
+ gt_path: the path of the ground truth images. The values must lie between 0 and 1.
64
+ """
65
+ if not os.path.exists(gt_path):
66
+ raise RuntimeError('Invalid path: %s' % gt_path)
67
+
68
+
69
+ print('calculate gt_path statistics...')
70
+ m1, s1 = self.compute_statistics_of_path(gt_path, self.verbose)
71
+ print('calculate generated_images statistics...')
72
+ m2, s2 = self.calculate_activation_statistics(images, self.verbose)
73
+ fid_value = self.calculate_frechet_distance(m1, s1, m2, s2)
74
+ return fid_value
75
+
76
+
77
+ def calculate_from_disk(self, generated_path, gt_path, img_size):
78
+ """
79
+ """
80
+ if not os.path.exists(gt_path):
81
+ raise RuntimeError('Invalid path: %s' % gt_path)
82
+ if not os.path.exists(generated_path):
83
+ raise RuntimeError('Invalid path: %s' % generated_path)
84
+
85
+ print ('exp-path - '+generated_path)
86
+
87
+ print('calculate gt_path statistics...')
88
+ m1, s1 = self.compute_statistics_of_path(gt_path, self.verbose, img_size)
89
+ print('calculate generated_path statistics...')
90
+ m2, s2 = self.compute_statistics_of_path(generated_path, self.verbose, img_size)
91
+ print('calculate frechet distance...')
92
+ fid_value = self.calculate_frechet_distance(m1, s1, m2, s2)
93
+ print('fid_distance %f' % (fid_value))
94
+ return fid_value
95
+
96
+
97
+ def compute_statistics_of_path(self, path , verbose, img_size):
98
+
99
+ size_flag = '{}_{}'.format(img_size[0], img_size[1])
100
+ npz_file = os.path.join(path, size_flag + '_statistics.npz')
101
+ if os.path.exists(npz_file):
102
+ f = np.load(npz_file)
103
+ m, s = f['mu'][:], f['sigma'][:]
104
+ f.close()
105
+
106
+ else:
107
+
108
+ path = pathlib.Path(path)
109
+ files = list(path.glob('*.jpg')) + list(path.glob('*.png'))
110
+
111
+ imgs = (np.array([(cv2.resize(imread(str(fn)).astype(np.float32),img_size,interpolation=cv2.INTER_CUBIC)) for fn in files]))/255.0
112
+ # Bring images to shape (B, 3, H, W)
113
+ imgs = imgs.transpose((0, 3, 1, 2))
114
+
115
+ # Rescale images to be between 0 and 1
116
+
117
+
118
+ m, s = self.calculate_activation_statistics(imgs, verbose)
119
+ np.savez(npz_file, mu=m, sigma=s)
120
+
121
+ return m, s
122
+
123
+ def calculate_activation_statistics(self, images, verbose):
124
+ """Calculation of the statistics used by the FID.
125
+ Params:
126
+ -- images : Numpy array of dimension (n_images, 3, hi, wi). The values
127
+ must lie between 0 and 1.
128
+ -- model : Instance of inception model
129
+ -- batch_size : The images numpy array is split into batches with
130
+ batch size batch_size. A reasonable batch size
131
+ depends on the hardware.
132
+ -- dims : Dimensionality of features returned by Inception
133
+ -- cuda : If set to True, use GPU
134
+ -- verbose : If set to True and parameter out_step is given, the
135
+ number of calculated batches is reported.
136
+ Returns:
137
+ -- mu : The mean over samples of the activations of the pool_3 layer of
138
+ the inception model.
139
+ -- sigma : The covariance matrix of the activations of the pool_3 layer of
140
+ the inception model.
141
+ """
142
+ act = self.get_activations(images, verbose)
143
+ mu = np.mean(act, axis=0)
144
+ sigma = np.cov(act, rowvar=False)
145
+ return mu, sigma
146
+
147
+
148
+
149
+ def get_activations(self, images, verbose=False):
150
+ """Calculates the activations of the pool_3 layer for all images.
151
+ Params:
152
+ -- images : Numpy array of dimension (n_images, 3, hi, wi). The values
153
+ must lie between 0 and 1.
154
+ -- model : Instance of inception model
155
+ -- batch_size : the images numpy array is split into batches with
156
+ batch size batch_size. A reasonable batch size depends
157
+ on the hardware.
158
+ -- dims : Dimensionality of features returned by Inception
159
+ -- cuda : If set to True, use GPU
160
+ -- verbose : If set to True and parameter out_step is given, the number
161
+ of calculated batches is reported.
162
+ Returns:
163
+ -- A numpy array of dimension (num images, dims) that contains the
164
+ activations of the given tensor when feeding inception with the
165
+ query tensor.
166
+ """
167
+ self.model.eval()
168
+
169
+ d0 = images.shape[0]
170
+ if self.batch_size > d0:
171
+ print(('Warning: batch size is bigger than the data size. '
172
+ 'Setting batch size to data size'))
173
+ self.batch_size = d0
174
+
175
+ n_batches = d0 // self.batch_size
176
+ n_used_imgs = n_batches * self.batch_size
177
+
178
+ pred_arr = np.empty((n_used_imgs, self.dims))
179
+ for i in range(n_batches):
180
+ if verbose:
181
+ print('\rPropagating batch %d/%d' % (i + 1, n_batches))
182
+ # end='', flush=True)
183
+ start = i * self.batch_size
184
+ end = start + self.batch_size
185
+
186
+ batch = torch.from_numpy(images[start:end]).type(torch.FloatTensor)
187
+ # batch = Variable(batch, volatile=True)
188
+ if self.cuda:
189
+ batch = batch.cuda()
190
+
191
+ pred = self.model(batch)[0]
192
+
193
+ # If model output is not scalar, apply global spatial average pooling.
194
+ # This happens if you choose a dimensionality not equal 2048.
195
+ if pred.shape[2] != 1 or pred.shape[3] != 1:
196
+ pred = adaptive_avg_pool2d(pred, output_size=(1, 1))
197
+
198
+ pred_arr[start:end] = pred.cpu().data.numpy().reshape(self.batch_size, -1)
199
+
200
+ if verbose:
201
+ print(' done')
202
+
203
+ return pred_arr
204
+
205
+
206
+ def calculate_frechet_distance(self, mu1, sigma1, mu2, sigma2, eps=1e-6):
207
+ """Numpy implementation of the Frechet Distance.
208
+ The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1)
209
+ and X_2 ~ N(mu_2, C_2) is
210
+ d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)).
211
+ Stable version by Dougal J. Sutherland.
212
+ Params:
213
+ -- mu1 : Numpy array containing the activations of a layer of the
214
+ inception net (like returned by the function 'get_predictions')
215
+ for generated samples.
216
+ -- mu2 : The sample mean over activations, precalculated on a
217
+ representative data set.
218
+ -- sigma1: The covariance matrix over activations for generated samples.
219
+ -- sigma2: The covariance matrix over activations, precalculated on a
220
+ representative data set.
221
+ Returns:
222
+ -- : The Frechet Distance.
223
+ """
224
+
225
+ mu1 = np.atleast_1d(mu1)
226
+ mu2 = np.atleast_1d(mu2)
227
+
228
+ sigma1 = np.atleast_2d(sigma1)
229
+ sigma2 = np.atleast_2d(sigma2)
230
+
231
+ assert mu1.shape == mu2.shape, \
232
+ 'Training and test mean vectors have different lengths'
233
+ assert sigma1.shape == sigma2.shape, \
234
+ 'Training and test covariances have different dimensions'
235
+
236
+ diff = mu1 - mu2
237
+
238
+ # Product might be almost singular
239
+ covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
240
+ if not np.isfinite(covmean).all():
241
+ msg = ('fid calculation produces singular product; '
242
+ 'adding %s to diagonal of cov estimates') % eps
243
+ print(msg)
244
+ offset = np.eye(sigma1.shape[0]) * eps
245
+ covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
246
+
247
+ # Numerical error might give slight imaginary component
248
+ if np.iscomplexobj(covmean):
249
+ if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
250
+ m = np.max(np.abs(covmean.imag))
251
+ raise ValueError('Imaginary component {}'.format(m))
252
+ covmean = covmean.real
253
+
254
+ tr_covmean = np.trace(covmean)
255
+
256
+ return (diff.dot(diff) + np.trace(sigma1) +
257
+ np.trace(sigma2) - 2 * tr_covmean)
258
+
259
+
260
+ class Reconstruction_Metrics():
261
+ def __init__(self, metric_list=['ssim', 'psnr', 'l1', 'mae'], data_range=1, win_size=51, multichannel=True):
262
+ self.data_range = data_range
263
+ self.win_size = win_size
264
+ self.multichannel = multichannel
265
+ for metric in metric_list:
266
+ if metric in ['ssim', 'psnr', 'l1', 'mae']:
267
+ setattr(self, metric, True)
268
+ else:
269
+ print('unsupported reconstruction metric: %s' % metric)
270
+
271
+
272
+ def __call__(self, inputs, gts):
273
+ """
274
+ inputs: the generated image, size (b,c,w,h), data range(0, data_range)
275
+ gts: the ground-truth image, size (b,c,w,h), data range(0, data_range)
276
+ """
277
+ result = dict()
278
+ [b,n,w,h] = inputs.size()
279
+ inputs = inputs.view(b*n, w, h).detach().cpu().numpy().astype(np.float32).transpose(1,2,0)
280
+ gts = gts.view(b*n, w, h).detach().cpu().numpy().astype(np.float32).transpose(1,2,0)
281
+
282
+ if hasattr(self, 'ssim'):
283
+ ssim_value = compare_ssim(inputs, gts, data_range=self.data_range,
284
+ win_size=self.win_size, multichannel=self.multichannel)
285
+ result['ssim'] = ssim_value
286
+
287
+
288
+ if hasattr(self, 'psnr'):
289
+ psnr_value = compare_psnr(inputs, gts, self.data_range)
290
+ result['psnr'] = psnr_value
291
+
292
+ if hasattr(self, 'l1'):
293
+ l1_value = compare_l1(inputs, gts)
294
+ result['l1'] = l1_value
295
+
296
+ if hasattr(self, 'mae'):
297
+ mae_value = compare_mae(inputs, gts)
298
+ result['mae'] = mae_value
299
+ return result
300
+
301
+
302
+ def calculate_from_disk(self, inputs, gts, save_path=None, img_size=(176,256), sort=True, debug=0):
303
+ """
304
+ inputs: .txt files, folders, image files (string), image files (list)
305
+ gts: .txt files, folders, image files (string), image files (list)
306
+ """
307
+ if sort:
308
+ input_image_list = sorted(get_image_list(inputs))
309
+ gt_image_list = sorted(get_image_list(gts))
310
+ else:
311
+ input_image_list = get_image_list(inputs)
312
+ gt_image_list = get_image_list(gts)
313
+
314
+ size_flag = '{}_{}'.format(img_size[0], img_size[1])
315
+ npz_file = os.path.join(save_path, size_flag + '_metrics.npz')
316
+ if os.path.exists(npz_file):
317
+ f = np.load(npz_file)
318
+ psnr,ssim,ssim_256,mae,l1=f['psnr'],f['ssim'],f['ssim_256'],f['mae'],f['l1']
319
+ else:
320
+ psnr = []
321
+ ssim = []
322
+ ssim_256 = []
323
+ mae = []
324
+ l1 = []
325
+ names = []
326
+
327
+ for index in range(len(input_image_list)):
328
+ name = os.path.basename(input_image_list[index])
329
+ names.append(name)
330
+
331
+
332
+ img_gt = (cv2.resize(imread(str(gt_image_list[index])).astype(np.float32), img_size,interpolation=cv2.INTER_CUBIC)) /255.0
333
+ img_pred = (cv2.resize(imread(str(input_image_list[index])).astype(np.float32), img_size,interpolation=cv2.INTER_CUBIC)) / 255.0
334
+
335
+
336
+ if debug != 0:
337
+ plt.subplot(121)
338
+ plt.imshow(img_gt)
339
+ plt.title('Ground truth')
340
+ plt.subplot(122)
341
+ plt.imshow(img_pred)
342
+ plt.title('Output')
343
+ plt.show()
344
+
345
+ psnr.append(compare_psnr(img_gt, img_pred, data_range=self.data_range))
346
+ ssim.append(compare_ssim(img_gt, img_pred, data_range=self.data_range,
347
+ win_size=self.win_size,multichannel=self.multichannel, channel_axis=2))
348
+ mae.append(compare_mae(img_gt, img_pred))
349
+ l1.append(compare_l1(img_gt, img_pred))
350
+
351
+ img_gt_256 = img_gt*255.0
352
+ img_pred_256 = img_pred*255.0
353
+ ssim_256.append(compare_ssim(img_gt_256, img_pred_256, gaussian_weights=True, sigma=1.2,
354
+ use_sample_covariance=False, multichannel=True, channel_axis=2,
355
+ data_range=img_pred_256.max() - img_pred_256.min()))
356
+
357
+ if np.mod(index, 200) == 0:
358
+ print(
359
+ str(index) + ' images processed',
360
+ "PSNR: %.4f" % round(np.mean(psnr), 4),
361
+ "SSIM_256: %.4f" % round(np.mean(ssim_256), 4),
362
+ "MAE: %.4f" % round(np.mean(mae), 4),
363
+ "l1: %.4f" % round(np.mean(l1), 4),
364
+ )
365
+
366
+ if save_path:
367
+ np.savez(save_path + '/' + size_flag + '_metrics.npz', psnr=psnr, ssim=ssim, ssim_256=ssim_256, mae=mae, l1=l1, names=names)
368
+
369
+ print(
370
+ "PSNR: %.4f" % round(np.mean(psnr), 4),
371
+ "PSNR Variance: %.4f" % round(np.var(psnr), 4),
372
+ "SSIM_256: %.4f" % round(np.mean(ssim_256), 4),
373
+ "SSIM_256 Variance: %.4f" % round(np.var(ssim_256), 4),
374
+ "MAE: %.4f" % round(np.mean(mae), 4),
375
+ "MAE Variance: %.4f" % round(np.var(mae), 4),
376
+ "l1: %.4f" % round(np.mean(l1), 4),
377
+ "l1 Variance: %.4f" % round(np.var(l1), 4)
378
+ )
379
+
380
+ dic = {"psnr":[round(np.mean(psnr), 6)],
381
+ "psnr_variance": [round(np.var(psnr), 6)],
382
+ "ssim_256": [round(np.mean(ssim_256), 6)],
383
+ "ssim_256_variance": [round(np.var(ssim_256), 6)],
384
+ "mae": [round(np.mean(mae), 6)],
385
+ "mae_variance": [round(np.var(mae), 6)],
386
+ "l1": [round(np.mean(l1), 6)],
387
+ "l1_variance": [round(np.var(l1), 6)] }
388
+
389
+ return dic
390
+
391
+
392
+ def get_image_list(flist):
393
+ if isinstance(flist, list):
394
+ return flist
395
+
396
+ # flist: image file path, image directory path, text file flist path
397
+ if isinstance(flist, str):
398
+ if os.path.isdir(flist):
399
+ flist = list(glob.glob(flist + '/*.jpg')) + list(glob.glob(flist + '/*.png'))
400
+ flist.sort()
401
+ return flist
402
+
403
+ if os.path.isfile(flist):
404
+ try:
405
+ return np.genfromtxt(flist, dtype=str)
406
+ except:
407
+ return [flist]
408
+ print('can not read files from %s return empty list'%flist)
409
+ return []
410
+
411
+ def compare_l1(img_true, img_test):
412
+ img_true = img_true.astype(np.float32)
413
+ img_test = img_test.astype(np.float32)
414
+ return np.mean(np.abs(img_true - img_test))
415
+
416
+ def compare_mae(img_true, img_test):
417
+ img_true = img_true.astype(np.float32)
418
+ img_test = img_test.astype(np.float32)
419
+ return np.sum(np.abs(img_true - img_test)) / np.sum(img_true + img_test)
420
+
421
+ def preprocess_path_for_deform_task(gt_path, distorted_path):
422
+ distorted_image_list = sorted(get_image_list(distorted_path))
423
+ gt_list=[]
424
+ distorated_list=[]
425
+
426
+ for distorted_image in distorted_image_list:
427
+ image = os.path.basename(distorted_image)[1:]
428
+ image = image.split('_to_')[-1]
429
+ gt_image = gt_path + '/' + image.replace('jpg', 'png')
430
+ if not os.path.isfile(gt_image):
431
+ print(distorted_image, gt_image)
432
+ print('=====')
433
+ continue
434
+ gt_list.append(gt_image)
435
+ distorated_list.append(distorted_image)
436
+
437
+ return gt_list, distorated_list
438
+
439
+
440
+
441
+ class LPIPS():
442
+ def __init__(self, use_gpu=True):
443
+
444
+ self.model = lpips.LPIPS(net='alex').eval().cuda()
445
+ self.use_gpu=use_gpu
446
+
447
+ def __call__(self, image_1, image_2):
448
+ """
449
+ image_1: images with size (n, 3, w, h) with value [-1, 1]
450
+ image_2: images with size (n, 3, w, h) with value [-1, 1]
451
+ """
452
+ result = self.model.forward(image_1, image_2)
453
+ return result
454
+
455
+ def calculate_from_disk(self, path_1, path_2,img_size, batch_size=64, verbose=False, sort=True):
456
+
457
+ if sort:
458
+ files_1 = sorted(get_image_list(path_1))
459
+ files_2 = sorted(get_image_list(path_2))
460
+ else:
461
+ files_1 = get_image_list(path_1)
462
+ files_2 = get_image_list(path_2)
463
+
464
+
465
+ results=[]
466
+
467
+
468
+ d0 = len(files_1)
469
+ if batch_size > d0:
470
+ print(('Warning: batch size is bigger than the data size. '
471
+ 'Setting batch size to data size'))
472
+ batch_size = d0
473
+
474
+ n_batches = d0 // batch_size
475
+
476
+
477
+ for i in range(n_batches):
478
+ if verbose:
479
+ print('\rPropagating batch %d/%d' % (i + 1, n_batches))
480
+ # end='', flush=True)
481
+ start = i * batch_size
482
+ end = start + batch_size
483
+
484
+ imgs_1 = np.array([cv2.resize(imread(str(fn)).astype(np.float32),img_size,interpolation=cv2.INTER_CUBIC)/255.0 for fn in files_1[start:end]])
485
+ imgs_2 = np.array([cv2.resize(imread(str(fn)).astype(np.float32),img_size,interpolation=cv2.INTER_CUBIC)/255.0 for fn in files_2[start:end]])
486
+
487
+ imgs_1 = imgs_1.transpose((0, 3, 1, 2))
488
+ imgs_2 = imgs_2.transpose((0, 3, 1, 2))
489
+
490
+ img_1_batch = torch.from_numpy(imgs_1).type(torch.FloatTensor)
491
+ img_2_batch = torch.from_numpy(imgs_2).type(torch.FloatTensor)
492
+
493
+ if self.use_gpu:
494
+ img_1_batch = img_1_batch.cuda()
495
+ img_2_batch = img_2_batch.cuda()
496
+
497
+ with torch.no_grad():
498
+ result = self.model.forward(img_1_batch, img_2_batch)
499
+
500
+ results.append(result)
501
+
502
+
503
+ distance = torch.cat(results,0)[:,0,0,0].mean()
504
+
505
+ print('lpips: %.3f'%distance)
506
+ return distance
507
+
508
+
509
+
510
+
511
+
512
+
513
+
514
+
515
+
516
+
517
+
518
+
519
+
520
+
521
+
522
+
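For reference, a minimal usage sketch (not part of the repo) showing how the classes above might be driven from a separate script. The folder paths, image size, and batch size are placeholders, and a CUDA device is assumed because FID and LPIPS move their models to the GPU.

```python
# Hypothetical driver for metrics.py; all paths below are placeholders.
from metrics import FID, LPIPS, Reconstruction_Metrics

gt_dir = "./logs/gt_frames"              # ground-truth images (placeholder)
gen_dir = "./logs/view_stage3/512_512"   # generated images (placeholder)
img_size = (512, 512)                    # (width, height) used when resizing

# FID between the two folders; activation statistics are cached as
# *_statistics.npz inside each folder.
fid_value = FID().calculate_from_disk(gen_dir, gt_dir, img_size)

# PSNR / SSIM / L1 / MAE; per-image values are cached as *_metrics.npz in save_path.
rec_metrics = Reconstruction_Metrics().calculate_from_disk(
    gen_dir, gt_dir, save_path=gen_dir, img_size=img_size)

# LPIPS (AlexNet backbone), averaged over the folder in GPU batches.
lpips_value = LPIPS().calculate_from_disk(gen_dir, gt_dir, img_size, batch_size=32)

print("FID:", fid_value)
print("Reconstruction:", rec_metrics)
print("LPIPS:", float(lpips_value))
```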
pose-frames.py ADDED
@@ -0,0 +1,16 @@
1
+ #from annotator.dwpose import DWposeDetector
2
+ from easy_dwpose import DWposeDetector
3
+ from PIL import Image
4
+
5
+
6
+ device = "cpu"
7
+ dwpose = DWposeDetector(device=device)
8
+
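+ # Run DWpose on frames 1-45 of the clip and save each rendered skeleton image.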
9
+ for n in range(1, 46):
10
+ pil_image = Image.open("videos/dance2/frame ("+str(n)+").png").convert("RGB")
11
+ #skeleton = dwpose(pil_image, output_type="np", include_hands=True, include_face=False)
12
+
13
+ out_img, pose = dwpose(pil_image, include_hands=True, include_face=True)
14
+
15
+ print(pose['bodies'])
16
+ out_img.save('videos/dance'+str(n)+'.png')
pose.py ADDED
@@ -0,0 +1,15 @@
1
+ #from annotator.dwpose import DWposeDetector
2
+ from easy_dwpose import DWposeDetector
3
+ from PIL import Image
4
+
5
+
6
+ device = "cpu"
7
+ dwpose = DWposeDetector(device=device)
8
+
9
+ pil_image = Image.open("imgs/baggy.png").convert("RGB")
10
+ #skeleton = dwpose(pil_image, output_type="np", include_hands=True, include_face=False)
11
+
12
+ out_img, _ = dwpose(pil_image, include_hands=True, include_face=False)
13
+
14
+ #print(pose['bodies'])
15
+ out_img.save("pose.png")
requirements.txt ADDED
@@ -0,0 +1,8 @@
1
+ easy-dwpose
2
+ diffusers
3
+ controlnet-aux
4
+ transformers
5
+ accelerate
6
+ gradio
7
+ rembg[cpu]
8
+ spaces
run_stage1.sh ADDED
@@ -0,0 +1,18 @@
1
+ accelerate launch --gpu_ids 0,1,2,3,4,5,6,7 --use_deepspeed --num_processes 8 \
2
+ stage1_train_prior_model.py \
3
+ --pretrained_model_name_or_path="kandinsky-community/kandinsky-2-2-prior" \
4
+ --image_encoder_path='{image_encoder_path}' \
5
+ --img_path='{image_path}' \
6
+ --json_path='{data.json}' \
7
+ --output_dir="{output_dir}" \
8
+ --img_height=512 \
9
+ --img_width=512 \
10
+ --train_batch_size=128 \
11
+ --gradient_accumulation_steps=1 \
12
+ --max_train_steps=100000 \
13
+ --noise_offset=0.1 \
14
+ --learning_rate=1e-05 \
15
+ --weight_decay=0.01 \
16
+ --lr_scheduler="constant" --num_warmup_steps=2000 \
17
+ --checkpointing_steps=5000 \
18
+ --seed 42
run_stage2.sh ADDED
@@ -0,0 +1,18 @@
1
+
2
+ accelerate launch --gpu_ids 0,1,2,3,4,5,6,7 --num_processes 8 --use_deepspeed --mixed_precision="fp16" stage2_train_inpaint_model.py \
3
+ --pretrained_model_name_or_path="stabilityai/stable-diffusion-2-1-base" \
4
+ --image_encoder_p_path='facebook/dinov2-giant' \
5
+ --image_encoder_g_path='{image_encoder_path}' \
6
+ --json_path='{data.json}' \
7
+ --image_root_path="{image_path}" \
8
+ --output_dir="{output_dir}" \
9
+ --img_height=512 \
10
+ --img_width=512 \
11
+ --learning_rate=1e-4 \
12
+ --train_batch_size=8 \
13
+ --max_train_steps=1000000 \
14
+ --mixed_precision="fp16" \
15
+ --checkpointing_steps=5000 \
16
+ --noise_offset=0.1 \
17
+ --lr_warmup_steps 5000 \
18
+ --seed 42
run_stage3.sh ADDED
@@ -0,0 +1,15 @@
1
+ accelerate launch --gpu_ids 0,1,2,3,4,5,6,7 --num_processes 8 --use_deepspeed --mixed_precision="fp16" stage3_train_refined_model.py \
2
+ --pretrained_model_name_or_path="stabilityai/stable-diffusion-2-1-base" \
3
+ --image_encoder_path='facebook/dinov2-giant' \
4
+ --img_path='{image_path}' \
5
+ --json_path='{data.json}' \
6
+ --gen_t_img_path='{stage2_generate}' \
7
+ --output_dir="{output_dir}" \
8
+ --learning_rate=1e-5 \
9
+ --train_batch_size=16 \
10
+ --max_train_steps=1000000 \
11
+ --mixed_precision="fp16" \
12
+ --checkpointing_steps=5000 \
13
+ --noise_offset=0.1 \
14
+ --report_to=tensorboard \
15
+ --lr_warmup_steps 5000
run_test_stage1.sh ADDED
@@ -0,0 +1,9 @@
1
+ python3 stage1_batchtest_prior_model.py \
2
+ --pretrained_model_name_or_path="kandinsky-community/kandinsky-2-2-prior" \
3
+ --image_encoder_path="{image_encoder_path}" \
4
+ --img_path='{image_path}' \
5
+ --json_path='{data.json}' \
6
+ --pose_path="{normalized_pose_txt}" \
7
+ --save_path="./logs/view_stage1/512_512" \
8
+ --weights_name="{save_ckpt}"
9
+
run_test_stage2.sh ADDED
@@ -0,0 +1,13 @@
1
+ python3 stage2_batchtest_inpaint_model.py \
2
+ --img_weigh 512 \
3
+ --img_height 512 \
4
+ --pretrained_model_name_or_path="stabilityai/stable-diffusion-2-1-base" \
5
+ --image_encoder_g_path='{image_encoder_path}' \
6
+ --image_encoder_p_path='facebook/dinov2-giant' \
7
+ --img_path='{image_path}' \
8
+ --json_path='{data.json}' \
9
+ --pose_path="{pose_path}" \
10
+ --target_embed_path="./logs/view_stage1/512_512/" \
11
+ --save_path="./logs/view_stage2/512_512" \
12
+ --weights_name="{save_ckpt}" \
13
+ --calculate_metrics
run_test_stage3.sh ADDED
@@ -0,0 +1,12 @@
1
+ python3 stage3_batchtest_refined_model.py \
2
+ --img_weigh 512 \
3
+ --img_height 512 \
4
+ --pretrained_model_name_or_path="stabilityai/stable-diffusion-2-1-base" \
5
+ --image_encoder_p_path='facebook/dinov2-giant' \
6
+ --img_path='{image_path}' \
7
+ --json_path='{data.json}' \
8
+ --pose_path="{pose_path}" \
9
+ --gen_t_img_path="./logs/view_stage2/512_512/" \
10
+ --save_path="./logs/view_stage3/512_512" \
11
+ --weights_name="{save_ckpt}" \
12
+ --calculate_metrics
sd.py ADDED
@@ -0,0 +1,13 @@
1
+ from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
2
+ import torch
3
+
4
+ model_id = "stabilityai/stable-diffusion-2-1-base"
5
+
6
+ scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
7
+ pipe = StableDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler, torch_dtype=torch.float16)
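+ # Note: float16 weights generally require a CUDA device; consider torch_dtype=torch.float32 when running on CPU.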
8
+ pipe = pipe.to("cpu")
9
+
10
+ prompt = "a photo of an astronaut riding a horse on mars"
11
+ image = pipe(prompt).images[0]
12
+
13
+ image.save("astronaut_rides_horse.png")
setup.txt ADDED
@@ -0,0 +1,41 @@
1
+ pip install -U xformers --index-url https://download.pytorch.org/whl/cu118
2
+ pip install easy-dwpose
3
+ pip install diffusers==0.24.0
4
+ pip install controlnet-aux==0.0.7
5
+ pip install transformers==4.32.1
6
+ pip install accelerate==0.24.1
7
+ pip install huggingface-hub==0.25.2
8
+
9
+ pip install gdown
10
+
11
+ pip install -U openmim
12
+ mim install mmengine
13
+ mim install "mmcv==2.1.0"
14
+ mim install "mmdet==3.3.0"
15
+ mim install "mmpose==1.3.2"
16
+
17
+ apt-get install libgl1
18
+
19
+
20
+ #s1
21
+ gdown https://drive.google.com/uc?id=11a5-a5C8NWA4m6i1g099fQQrdohz5gQ1
22
+
23
+ #s2
24
+ gdown https://drive.google.com/uc?id=1JhWeScr9bQtoQmB503VDyomaDmxBHail
25
+
26
+ #s3
27
+ gdown https://drive.google.com/uc?id=11JZXfYVlgLFqmE8jCbLWjQWO7LrwYq-I
28
+
29
+ #demo
30
+ gdown https://drive.google.com/uc?id=1JFFy_FBxOFuGFBcB6xMIVwcQb8bfnpO9
31
+
32
+
33
+
34
+ pip install numpy==1.26.4
35
+ pip install -U xformers --index-url https://download.pytorch.org/whl/cu118
36
+
37
+ For PyTorch 2.4.0:
38
+ pip install xformers==0.0.28.dev895 or pip install xformers==0.0.28.dev893
39
+
40
+ 48 GB of RAM is needed for finetuning
41
+
single_extract_pose.py ADDED
@@ -0,0 +1,35 @@
1
+ from src.controlnet_aux import DWposeDetector
2
+ from PIL import Image
3
+ import torchvision.transforms as transforms
4
+ import torch
5
+
6
+ def init_dwpose_detector(device):
7
+ # specify configs, ckpts and device, or it will be downloaded automatically and use cpu by default
8
+ det_config = './src/configs/yolox_l_8xb8-300e_coco.py'
9
+ det_ckpt = './ckpts/yolox_l_8x8_300e_coco_20211126_140236-d3bd2b23.pth'
10
+ pose_config = './src/configs/dwpose-l_384x288.py'
11
+ pose_ckpt = './ckpts/dw-ll_ucoco_384.pth'
12
+
13
+ dwpose_model = DWposeDetector(
14
+ det_config=det_config,
15
+ det_ckpt=det_ckpt,
16
+ pose_config=pose_config,
17
+ pose_ckpt=pose_ckpt,
18
+ device=device
19
+ )
20
+ return dwpose_model.to(device)
21
+
22
+
23
+ def inference_pose(img_path, image_size=(1024, 1024)):
24
+ device = torch.device(f"cuda:{0}")
25
+ model = init_dwpose_detector(device=device)
26
+ pil_image = Image.open(img_path).convert("RGB").resize(image_size, Image.BICUBIC)
27
+ dwpose_image = model(pil_image, output_type='np', image_resolution=image_size[1])
28
+ save_dwpose_image = Image.fromarray(dwpose_image)
29
+ return save_dwpose_image
30
+
31
+
32
+
33
+ inference_pose('imgs/test.png').save("pose.png")
34
+
35
+
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (178 Bytes).
src/configs/dwpose-l_384x288.py ADDED
@@ -0,0 +1,257 @@
1
+ # runtime
2
+ max_epochs = 270
3
+ stage2_num_epochs = 30
4
+ base_lr = 4e-3
5
+
6
+ train_cfg = dict(max_epochs=max_epochs, val_interval=10)
7
+ randomness = dict(seed=21)
8
+
9
+ # optimizer
10
+ optim_wrapper = dict(
11
+ type='OptimWrapper',
12
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
13
+ paramwise_cfg=dict(
14
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
15
+
16
+ # learning rate
17
+ param_scheduler = [
18
+ dict(
19
+ type='LinearLR',
20
+ start_factor=1.0e-5,
21
+ by_epoch=False,
22
+ begin=0,
23
+ end=1000),
24
+ dict(
25
+ # use cosine lr from 150 to 300 epoch
26
+ type='CosineAnnealingLR',
27
+ eta_min=base_lr * 0.05,
28
+ begin=max_epochs // 2,
29
+ end=max_epochs,
30
+ T_max=max_epochs // 2,
31
+ by_epoch=True,
32
+ convert_to_iter_based=True),
33
+ ]
34
+
35
+ # automatically scaling LR based on the actual training batch size
36
+ auto_scale_lr = dict(base_batch_size=512)
37
+
38
+ # codec settings
39
+ codec = dict(
40
+ type='SimCCLabel',
41
+ input_size=(288, 384),
42
+ sigma=(6., 6.93),
43
+ simcc_split_ratio=2.0,
44
+ normalize=False,
45
+ use_dark=False)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='TopdownPoseEstimator',
50
+ data_preprocessor=dict(
51
+ type='PoseDataPreprocessor',
52
+ mean=[123.675, 116.28, 103.53],
53
+ std=[58.395, 57.12, 57.375],
54
+ bgr_to_rgb=True),
55
+ backbone=dict(
56
+ _scope_='mmdet',
57
+ type='CSPNeXt',
58
+ arch='P5',
59
+ expand_ratio=0.5,
60
+ deepen_factor=1.,
61
+ widen_factor=1.,
62
+ out_indices=(4, ),
63
+ channel_attention=True,
64
+ norm_cfg=dict(type='SyncBN'),
65
+ act_cfg=dict(type='SiLU'),
66
+ init_cfg=dict(
67
+ type='Pretrained',
68
+ prefix='backbone.',
69
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
70
+ 'rtmpose/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa
71
+ )),
72
+ head=dict(
73
+ type='RTMCCHead',
74
+ in_channels=1024,
75
+ out_channels=133,
76
+ input_size=codec['input_size'],
77
+ in_featuremap_size=(9, 12),
78
+ simcc_split_ratio=codec['simcc_split_ratio'],
79
+ final_layer_kernel_size=7,
80
+ gau_cfg=dict(
81
+ hidden_dims=256,
82
+ s=128,
83
+ expansion_factor=2,
84
+ dropout_rate=0.,
85
+ drop_path=0.,
86
+ act_fn='SiLU',
87
+ use_rel_bias=False,
88
+ pos_enc=False),
89
+ loss=dict(
90
+ type='KLDiscretLoss',
91
+ use_target_weight=True,
92
+ beta=10.,
93
+ label_softmax=True),
94
+ decoder=codec),
95
+ test_cfg=dict(flip_test=True, ))
96
+
97
+ # base dataset settings
98
+ dataset_type = 'CocoWholeBodyDataset'
99
+ data_mode = 'topdown'
100
+ data_root = '/data/'
101
+
102
+ backend_args = dict(backend='local')
103
+ # backend_args = dict(
104
+ # backend='petrel',
105
+ # path_mapping=dict({
106
+ # f'{data_root}': 's3://openmmlab/datasets/detection/coco/',
107
+ # f'{data_root}': 's3://openmmlab/datasets/detection/coco/'
108
+ # }))
109
+
110
+ # pipelines
111
+ train_pipeline = [
112
+ dict(type='LoadImage', backend_args=backend_args),
113
+ dict(type='GetBBoxCenterScale'),
114
+ dict(type='RandomFlip', direction='horizontal'),
115
+ dict(type='RandomHalfBody'),
116
+ dict(
117
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
118
+ dict(type='TopdownAffine', input_size=codec['input_size']),
119
+ dict(type='mmdet.YOLOXHSVRandomAug'),
120
+ dict(
121
+ type='Albumentation',
122
+ transforms=[
123
+ dict(type='Blur', p=0.1),
124
+ dict(type='MedianBlur', p=0.1),
125
+ dict(
126
+ type='CoarseDropout',
127
+ max_holes=1,
128
+ max_height=0.4,
129
+ max_width=0.4,
130
+ min_holes=1,
131
+ min_height=0.2,
132
+ min_width=0.2,
133
+ p=1.0),
134
+ ]),
135
+ dict(type='GenerateTarget', encoder=codec),
136
+ dict(type='PackPoseInputs')
137
+ ]
138
+ val_pipeline = [
139
+ dict(type='LoadImage', backend_args=backend_args),
140
+ dict(type='GetBBoxCenterScale'),
141
+ dict(type='TopdownAffine', input_size=codec['input_size']),
142
+ dict(type='PackPoseInputs')
143
+ ]
144
+
145
+ train_pipeline_stage2 = [
146
+ dict(type='LoadImage', backend_args=backend_args),
147
+ dict(type='GetBBoxCenterScale'),
148
+ dict(type='RandomFlip', direction='horizontal'),
149
+ dict(type='RandomHalfBody'),
150
+ dict(
151
+ type='RandomBBoxTransform',
152
+ shift_factor=0.,
153
+ scale_factor=[0.75, 1.25],
154
+ rotate_factor=60),
155
+ dict(type='TopdownAffine', input_size=codec['input_size']),
156
+ dict(type='mmdet.YOLOXHSVRandomAug'),
157
+ dict(
158
+ type='Albumentation',
159
+ transforms=[
160
+ dict(type='Blur', p=0.1),
161
+ dict(type='MedianBlur', p=0.1),
162
+ dict(
163
+ type='CoarseDropout',
164
+ max_holes=1,
165
+ max_height=0.4,
166
+ max_width=0.4,
167
+ min_holes=1,
168
+ min_height=0.2,
169
+ min_width=0.2,
170
+ p=0.5),
171
+ ]),
172
+ dict(type='GenerateTarget', encoder=codec),
173
+ dict(type='PackPoseInputs')
174
+ ]
175
+
176
+ datasets = []
177
+ dataset_coco=dict(
178
+ type=dataset_type,
179
+ data_root=data_root,
180
+ data_mode=data_mode,
181
+ ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
182
+ data_prefix=dict(img='coco/train2017/'),
183
+ pipeline=[],
184
+ )
185
+ datasets.append(dataset_coco)
186
+
187
+ scene = ['Magic_show', 'Entertainment', 'ConductMusic', 'Online_class',
188
+ 'TalkShow', 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow',
189
+ 'Singing', 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference']
190
+
191
+ for i in range(len(scene)):
192
+ datasets.append(
193
+ dict(
194
+ type=dataset_type,
195
+ data_root=data_root,
196
+ data_mode=data_mode,
197
+ ann_file='UBody/annotations/'+scene[i]+'/keypoint_annotation.json',
198
+ data_prefix=dict(img='UBody/images/'+scene[i]+'/'),
199
+ pipeline=[],
200
+ )
201
+ )
202
+
203
+ # data loaders
204
+ train_dataloader = dict(
205
+ batch_size=32,
206
+ num_workers=10,
207
+ persistent_workers=True,
208
+ sampler=dict(type='DefaultSampler', shuffle=True),
209
+ dataset=dict(
210
+ type='CombinedDataset',
211
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
212
+ datasets=datasets,
213
+ pipeline=train_pipeline,
214
+ test_mode=False,
215
+ ))
216
+ val_dataloader = dict(
217
+ batch_size=32,
218
+ num_workers=10,
219
+ persistent_workers=True,
220
+ drop_last=False,
221
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
222
+ dataset=dict(
223
+ type=dataset_type,
224
+ data_root=data_root,
225
+ data_mode=data_mode,
226
+ ann_file='coco/annotations/coco_wholebody_val_v1.0.json',
227
+ bbox_file=f'{data_root}coco/person_detection_results/'
228
+ 'COCO_val2017_detections_AP_H_56_person.json',
229
+ data_prefix=dict(img='coco/val2017/'),
230
+ test_mode=True,
231
+ pipeline=val_pipeline,
232
+ ))
233
+ test_dataloader = val_dataloader
234
+
235
+ # hooks
236
+ default_hooks = dict(
237
+ checkpoint=dict(
238
+ save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1))
239
+
240
+ custom_hooks = [
241
+ dict(
242
+ type='EMAHook',
243
+ ema_type='ExpMomentumEMA',
244
+ momentum=0.0002,
245
+ update_buffers=True,
246
+ priority=49),
247
+ dict(
248
+ type='mmdet.PipelineSwitchHook',
249
+ switch_epoch=max_epochs - stage2_num_epochs,
250
+ switch_pipeline=train_pipeline_stage2)
251
+ ]
252
+
253
+ # evaluators
254
+ val_evaluator = dict(
255
+ type='CocoWholeBodyMetric',
256
+ ann_file=data_root + 'coco/annotations/coco_wholebody_val_v1.0.json')
257
+ test_evaluator = val_evaluator
src/configs/stage1_config.py ADDED
@@ -0,0 +1,181 @@
1
+
2
+
3
+ import os
4
+ import argparse
5
+
6
+
7
+
8
+
9
+ parser = argparse.ArgumentParser(description="Simple example of a training script.")
10
+ parser.add_argument(
11
+ "--pretrained_model_name_or_path",
12
+ type=str,
13
+ default=None,
14
+ required=True,
15
+ help="Path to pretrained model or model identifier from huggingface.co/models.",
16
+ )
17
+ parser.add_argument(
18
+ "--pretrained_image_model_path",
19
+ type=str,
20
+ default=None,
21
+ help="Path to pretrained model or model identifier from huggingface.co/models.",
22
+ )
23
+ parser.add_argument(
24
+ "--pretrained_pose_model_path",
25
+ type=str,
26
+ default=None,
27
+ help="Path to pretrained model or model identifier from huggingface.co/models.",
28
+ )
29
+
30
+ parser.add_argument(
31
+ "--unet_config_file",
32
+ type=str,
33
+ default=None,
34
+ help="Config file of UNet model",
35
+ )
36
+ parser.add_argument("--json_path", type=str, default="./datasets/deepfashing/train_data.json", help="json path", )
37
+ parser.add_argument("--img_path", type=str, default="./datasets/deepfashing/all_data_png/", help="image path", )
38
+ parser.add_argument("--image_encoder_path", type=str, default="./OpenCLIP-ViT-H-14",
39
+ help="Path to pretrained model or model identifier from huggingface.co/models.", )
40
+ parser.add_argument("--img_width", type=int, default=512, help="width", )
41
+ parser.add_argument("--img_height", type=int, default=512, help="height", )
42
+ parser.add_argument(
43
+ "--output_dir",
44
+ type=str,
45
+ default="sd-model-finetuned",
46
+ help="The output directory where the model predictions and checkpoints will be written.",
47
+ )
48
+ parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
49
+
50
+ parser.add_argument(
51
+ "--center_crop",
52
+ action="store_true",
53
+ help="Whether to center crop images before resizing to resolution (if not set, random crop will be used)",
54
+ )
55
+ parser.add_argument(
56
+ "--random_flip",
57
+ action="store_true",
58
+ help="whether to randomly flip images horizontally",
59
+ )
60
+ parser.add_argument(
61
+ '--clip_penultimate',
62
+ type=bool,
63
+ default=False,
64
+ help='Use penultimate CLIP layer for text embedding'
65
+ )
66
+ parser.add_argument(
67
+ "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
68
+ )
69
+ parser.add_argument("--num_train_epochs", type=int, default=100000000)
70
+ parser.add_argument(
71
+ "--max_train_steps",
72
+ type=int,
73
+ default=None,
74
+ help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
75
+ )
76
+ parser.add_argument(
77
+ "--gradient_accumulation_steps",
78
+ type=int,
79
+ default=1,
80
+ help="Number of updates steps to accumulate before performing a backward/update pass.",
81
+ )
82
+ parser.add_argument(
83
+ "--gradient_checkpointing",
84
+ action="store_true",
85
+ help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
86
+ )
87
+ parser.add_argument(
88
+ "--learning_rate",
89
+ type=float,
90
+ default=1e-4,
91
+ help="Initial learning rate (after the potential warmup period) to use.",
92
+ )
93
+ parser.add_argument(
94
+ "--weight_decay",
95
+ type=float,
96
+ default=0.01,
97
+ help="Initial learning rate (after the potential warmup period) to use.",
98
+ )
99
+ parser.add_argument(
100
+ "--lr_scheduler",
101
+ type=str,
102
+ default="linear",
103
+ help="The scheduler type to use.",
104
+ choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"],
105
+ )
106
+ parser.add_argument(
107
+ "--num_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
108
+ )
109
+ parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
110
+ parser.add_argument(
111
+ "--logging_dir",
112
+ type=str,
113
+ default="logs",
114
+ help=(
115
+ "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
116
+ " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
117
+ ),
118
+ )
119
+ parser.add_argument(
120
+ "--print_freq",
121
+ type=int,
122
+ default=1,
123
+ help=(
124
+ "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
125
+ " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
126
+ ),
127
+ )
128
+ parser.add_argument(
129
+ "--report_to",
130
+ type=str,
131
+ default="tensorboard",
132
+ help=(
133
+ 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
134
+ ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.'
135
+ "Only applicable when `--with_tracking` is passed."
136
+ ),
137
+ )
138
+ parser.add_argument(
139
+ "--checkpointing_steps",
140
+ type=int,
141
+ default=500,
142
+ help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
143
+ )
144
+ parser.add_argument(
145
+ "--resume_from_checkpoint",
146
+ type=str,
147
+ default=None,
148
+ help="If the training should continue from a checkpoint folder.",
149
+ )
150
+ parser.add_argument(
151
+ "--unet_init_ckpt",
152
+ type=str,
153
+ default=None,
154
+ help="If the training should continue from a checkpoint folder.",
155
+ )
156
+
157
+ parser.add_argument(
158
+ "--mixed_precision",
159
+ type=str,
160
+ default="fp16",
161
+ choices=["no", "fp16", "bf16"],
162
+ help=(
163
+ "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
164
+ " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the"
165
+ " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
166
+ ),
167
+ )
168
+ parser.add_argument(
169
+ "--enable_xformers_memory_efficient_attention",
170
+ action="store_true",
171
+ help="Whether or not to use xformers.",
172
+ )
173
+ parser.add_argument(
174
+ "--max_grad_norm", default=10.0, type=float, help="Max gradient norm."
175
+ )
176
+ parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
177
+ parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
178
+
179
+ args = parser.parse_args()
180
+ print(args)
181
+
src/configs/stage2_config.py ADDED
@@ -0,0 +1,192 @@
1
+ import argparse
2
+ parser = argparse.ArgumentParser(description="Simple example of a ControlNet training script.")
3
+ parser.add_argument(
4
+ "--pretrained_model_name_or_path",
5
+ type=str,
6
+ default="stabilityai/stable-diffusion-2-1-base",
7
+ help="Path to pretrained model or model identifier from huggingface.co/models.",)
8
+
9
+ parser.add_argument(
10
+ "--seed", type=int, default=42, help="A seed for reproducible training."
11
+ )
12
+
13
+ parser.add_argument(
14
+ "--train_batch_size",
15
+ type=int,
16
+ default=8,
17
+ help="Batch size (per device) for the training dataloader.",
18
+ )
19
+ parser.add_argument("--num_train_epochs", type=int, default=10000)
20
+ parser.add_argument(
21
+ "--max_train_steps",
22
+ type=int,
23
+ default=100,
24
+ help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
25
+ )
26
+ parser.add_argument(
27
+ "--checkpointing_steps",
28
+ type=int,
29
+ default=1000,
30
+ help=(
31
+ "Save a checkpoint of the training state every X updates. Checkpoints can be used for resuming training via `--resume_from_checkpoint`. "
32
+ "In the case that the checkpoint is better than the final trained model, the checkpoint can also be used for inference."
33
+ "Using a checkpoint for inference requires separate loading of the original pipeline and the individual checkpointed model components."
34
+ "See https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint for step by step"
35
+ "instructions."
36
+ ),
37
+ )
38
+ parser.add_argument("--json_path", type=str, default="./datasets/deepfashing/train_data.json", help="json path", )
39
+ parser.add_argument("--image_root_path", type=str, default="./datasets/deepfashing/all_data_png/", help="image path", )
40
+ parser.add_argument("--image_encoder_g_path", type=str, default="./OpenCLIP-ViT-H-14",
41
+ help="Path to pretrained model or model identifier from huggingface.co/models.", )
42
+ parser.add_argument("--image_encoder_p_path", type=str, default="./dinov2-giant",
43
+ help="Path to pretrained model or model identifier from huggingface.co/models.", )
44
+ parser.add_argument("--output_dir",type=str,default="out/",help="The output directory where the model predictions and checkpoints will be written.",)
45
+ parser.add_argument("--img_width", type=int, default=512, help="device number", )
46
+ parser.add_argument("--img_height", type=int, default=512, help="device number", )
47
+ parser.add_argument(
48
+ "--resume_from_checkpoint",
49
+ type=str,
50
+ default="pcdms_ckpt.pt",
51
+ help=(
52
+ "Whether training should be resumed from a previous checkpoint. Use a path saved by"
53
+ ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
54
+ ),
55
+ )
56
+ parser.add_argument(
57
+ "--set_grads_to_none",
58
+ action="store_true",
59
+ help=(
60
+ "Save more memory by using setting grads to None instead of zero. Be aware, that this changes certain"
61
+ " behaviors, so disable this argument if it causes any problems. More info:"
62
+ " https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html"
63
+ ),
64
+ )
65
+ parser.add_argument(
66
+ "--gradient_accumulation_steps",
67
+ type=int,
68
+ default=1,
69
+ help="Number of updates steps to accumulate before performing a backward/update pass.",
70
+ )
71
+ parser.add_argument(
72
+ "--gradient_checkpointing",
73
+ action="store_true",
74
+ help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
75
+ )
76
+ parser.add_argument(
77
+ "--learning_rate",
78
+ type=float,
79
+ default=1e-4, #5e-6,
80
+ help="Initial learning rate (after the potential warmup period) to use.",
81
+ )
82
+
83
+ parser.add_argument(
84
+ "--scale_lr",
85
+ action="store_true",
86
+ default=False,
87
+ help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
88
+ )
89
+ parser.add_argument(
90
+ "--lr_scheduler",
91
+ type=str,
92
+ default="constant",
93
+ help=(
94
+ 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
95
+ ' "constant", "constant_with_warmup"]'
96
+ ),
97
+ )
98
+ parser.add_argument(
99
+ "--lr_warmup_steps",
100
+ type=int,
101
+ default=5000,
102
+ help="Number of steps for the warmup in the lr scheduler.",
103
+ )
104
+ parser.add_argument(
105
+ "--lr_num_cycles",
106
+ type=int,
107
+ default=1,
108
+ help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
109
+ )
110
+ parser.add_argument(
111
+ "--lr_power",
112
+ type=float,
113
+ default=1.0,
114
+ help="Power factor of the polynomial scheduler.",
115
+ )
116
+
117
+
118
+ parser.add_argument(
119
+ "--adam_beta1",
120
+ type=float,
121
+ default=0.9,
122
+ help="The beta1 parameter for the Adam optimizer.",
123
+ )
124
+ parser.add_argument(
125
+ "--adam_beta2",
126
+ type=float,
127
+ default=0.999,
128
+ help="The beta2 parameter for the Adam optimizer.",
129
+ )
130
+ parser.add_argument(
131
+ "--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use."
132
+ )
133
+ parser.add_argument(
134
+ "--adam_epsilon",
135
+ type=float,
136
+ default=1e-08,
137
+ help="Epsilon value for the Adam optimizer",
138
+ )
139
+ parser.add_argument(
140
+ "--max_grad_norm", default=1.0, type=float, help="Max gradient norm."
141
+ )
142
+ parser.add_argument(
143
+ "--logging_dir",
144
+ type=str,
145
+ default="logs",
146
+ help=(
147
+ "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
148
+ " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
149
+ ),
150
+ )
151
+ parser.add_argument(
152
+ "--report_to",
153
+ type=str,
154
+ default="tensorboard",
155
+ help=(
156
+ 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
157
+ ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.'
158
+ "Only applicable when `--with_tracking` is passed."
159
+ ),
160
+ )
161
+ parser.add_argument(
162
+ "--allow_tf32",
163
+ action="store_true",
164
+ help=(
165
+ "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
166
+ " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
167
+ ),
168
+ )
169
+ parser.add_argument(
170
+ "--mixed_precision",
171
+ type=str,
172
+ default="fp16",
173
+ choices=["no", "fp16", "bf16"],
174
+ help=(
175
+ "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
176
+ " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the"
177
+ " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
178
+ ),
179
+ )
180
+
181
+
182
+
183
+ parser.add_argument("--noise_offset", type=float, default=0.1, help="The scale of noise offset.")
184
+
185
+
186
+
187
+
188
+ args = parser.parse_args()
189
+ print(args)
190
+
191
+
192
+
src/configs/stage3_config.py ADDED
@@ -0,0 +1,217 @@
1
+ import argparse
2
+ parser = argparse.ArgumentParser(description="Simple example of a ControlNet training script.")
3
+ parser.add_argument(
4
+ "--pretrained_model_name_or_path",
5
+ type=str,
6
+ default=None,
7
+ required=True,
8
+ help="Path to pretrained model or model identifier from huggingface.co/models.",)
9
+
10
+ parser.add_argument("--revision",type=str,default=None,required=False,help=(
11
+ "Revision of pretrained model identifier from huggingface.co/models. Trainable model components should be"
12
+ " float32 precision."),)
13
+ parser.add_argument("--json_path", type=str, default="./datasets/deepfashing/test_data.json", help="json path", )
14
+ parser.add_argument("--img_path", type=str, default="./datasets/deepfashing/train_all_png/", help="image path", )
15
+ parser.add_argument("--gen_t_img_path", type=str,default="./save_data/stage2/guidancescale2_seed42_numsteps20/",help="gen target image path", )
16
+ parser.add_argument("--image_encoder_path", type=str, default="./dinov2-giant",
17
+ help="Path to pretrained model or model identifier from huggingface.co/models.", )
18
+ parser.add_argument("--output_dir",type=str,default="controlnet-model",help="The output directory where the model predictions and checkpoints will be written.",)
19
+
20
+
21
+ parser.add_argument(
22
+ "--seed", type=int, default=None, help="A seed for reproducible training."
23
+ )
24
+ parser.add_argument(
25
+ "--resolution",
26
+ type=int,
27
+ default=512,
28
+ help=(
29
+ "The resolution for input images, all the images in the train/validation dataset will be resized to this"
30
+ " resolution"
31
+ ),
32
+ )
33
+ parser.add_argument(
34
+ "--train_batch_size",
35
+ type=int,
36
+ default=4,
37
+ help="Batch size (per device) for the training dataloader.",
38
+ )
39
+ parser.add_argument("--num_train_epochs", type=int, default=1)
40
+ parser.add_argument("--noise_level", type=int, default=250)
41
+
42
+ parser.add_argument(
43
+ "--max_train_steps",
44
+ type=int,
45
+ default=None,
46
+ help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
47
+ )
48
+ parser.add_argument(
49
+ "--checkpointing_steps",
50
+ type=int,
51
+ default=500,
52
+ help=(
53
+ "Save a checkpoint of the training state every X updates. Checkpoints can be used for resuming training via `--resume_from_checkpoint`. "
54
+ "In the case that the checkpoint is better than the final trained model, the checkpoint can also be used for inference."
55
+ "Using a checkpoint for inference requires separate loading of the original pipeline and the individual checkpointed model components."
56
+ "See https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint for step by step"
57
+ "instructions."
58
+ ),
59
+ )
60
+
61
+ parser.add_argument(
62
+ "--resume_from_checkpoint",
63
+ type=str,
64
+ default=None,
65
+ help=(
66
+ "Whether training should be resumed from a previous checkpoint. Use a path saved by"
67
+ ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
68
+ ),
69
+ )
70
+ parser.add_argument(
71
+ "--gradient_accumulation_steps",
72
+ type=int,
73
+ default=1,
74
+ help="Number of updates steps to accumulate before performing a backward/update pass.",
75
+ )
76
+ parser.add_argument(
77
+ "--gradient_checkpointing",
78
+ action="store_true",
79
+ help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
80
+ )
81
+ parser.add_argument(
82
+ "--learning_rate",
83
+ type=float,
84
+ default=5e-6,
85
+ help="Initial learning rate (after the potential warmup period) to use.",
86
+ )
87
+
88
+ parser.add_argument(
89
+ "--scale_lr",
90
+ action="store_true",
91
+ default=False,
92
+ help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
93
+ )
94
+ parser.add_argument(
95
+ "--lr_scheduler",
96
+ type=str,
97
+ default="constant",
98
+ help=(
99
+ 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
100
+ ' "constant", "constant_with_warmup"]'
101
+ ),
102
+ )
103
+ parser.add_argument(
104
+ "--lr_warmup_steps",
105
+ type=int,
106
+ default=500,
107
+ help="Number of steps for the warmup in the lr scheduler.",
108
+ )
109
+ parser.add_argument(
110
+ "--lr_num_cycles",
111
+ type=int,
112
+ default=1,
113
+ help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
114
+ )
115
+ parser.add_argument(
116
+ "--lr_power",
117
+ type=float,
118
+ default=1.0,
119
+ help="Power factor of the polynomial scheduler.",
120
+ )
121
+
122
+
123
+ parser.add_argument(
124
+ "--adam_beta1",
125
+ type=float,
126
+ default=0.9,
127
+ help="The beta1 parameter for the Adam optimizer.",
128
+ )
129
+ parser.add_argument(
130
+ "--adam_beta2",
131
+ type=float,
132
+ default=0.999,
133
+ help="The beta2 parameter for the Adam optimizer.",
134
+ )
135
+ parser.add_argument(
136
+ "--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use."
137
+ )
138
+ parser.add_argument(
139
+ "--adam_epsilon",
140
+ type=float,
141
+ default=1e-08,
142
+ help="Epsilon value for the Adam optimizer",
143
+ )
144
+ parser.add_argument(
145
+ "--max_grad_norm", default=1.0, type=float, help="Max gradient norm."
146
+ )
147
+
148
+
149
+ parser.add_argument(
150
+ "--logging_dir",
151
+ type=str,
152
+ default="logs",
153
+ help=(
154
+ "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
155
+ " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
156
+ ),
157
+ )
158
+ parser.add_argument(
159
+ "--allow_tf32",
160
+ action="store_true",
161
+ help=(
162
+ "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
163
+ " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
164
+ ),
165
+ )
166
+ parser.add_argument(
167
+ "--report_to",
168
+ type=str,
169
+ default="tensorboard",
170
+ help=(
171
+ 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
172
+ ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
173
+ ),
174
+ )
175
+ parser.add_argument(
176
+ "--mixed_precision",
177
+ type=str,
178
+ default=None,
179
+ choices=["no", "fp16", "bf16"],
180
+ help=(
181
+ "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
182
+             " 1.10 and an Nvidia Ampere GPU. Defaults to the value of the accelerate config of the current system or the"
183
+ " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
184
+ ),
185
+ )
186
+
187
+ parser.add_argument(
188
+ "--set_grads_to_none",
189
+ action="store_true",
190
+ help=(
191
+             "Save more memory by setting grads to None instead of zero. Be aware that this changes certain"
192
+ " behaviors, so disable this argument if it causes any problems. More info:"
193
+ " https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html"
194
+ ),
195
+ )
196
+
197
+ parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
198
+
199
+ parser.add_argument(
200
+ "--tracker_project_name",
201
+ type=str,
202
+ default="train_baseline",
203
+ help=(
204
+             "The `project_name` argument passed to Accelerator.init_trackers. For"
205
+             " more information, see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
206
+ ),
207
+ )
208
+
209
+
210
+ args = parser.parse_args()
211
+ print(args)
212
+ if args.resolution % 8 != 0:
213
+ raise ValueError(
214
+ "`--resolution` must be divisible by 8 for consistently sized encoded images between the VAE and the controlnet encoder."
215
+ )
216
+
217
+
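For context, the `--resolution % 8` check above exists because the Stable Diffusion VAE downsamples images by a factor of 8 before the latents reach the ControlNet encoder. A minimal sketch of the latent sizes this guarantees (the helper name is illustrative, not part of this repo):
# Sketch only: why --resolution must be divisible by 8.
def latent_grid(resolution: int) -> tuple[int, int]:
    # The SD VAE maps an H x W image to an (H // 8) x (W // 8) latent grid,
    # so a non-multiple of 8 would give mismatched shapes downstream.
    if resolution % 8 != 0:
        raise ValueError("`--resolution` must be divisible by 8")
    return resolution // 8, resolution // 8

print(latent_grid(512))  # -> (64, 64)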
src/configs/yolox_l_8xb8-300e_coco.py ADDED
@@ -0,0 +1,245 @@
1
+ img_scale = (640, 640) # width, height
2
+
3
+ # model settings
4
+ model = dict(
5
+ type='YOLOX',
6
+ data_preprocessor=dict(
7
+ type='DetDataPreprocessor',
8
+ pad_size_divisor=32,
9
+ batch_augments=[
10
+ dict(
11
+ type='BatchSyncRandomResize',
12
+ random_size_range=(480, 800),
13
+ size_divisor=32,
14
+ interval=10)
15
+ ]),
16
+ backbone=dict(
17
+ type='CSPDarknet',
18
+ deepen_factor=1.0,
19
+ widen_factor=1.0,
20
+ out_indices=(2, 3, 4),
21
+ use_depthwise=False,
22
+ spp_kernal_sizes=(5, 9, 13),
23
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
24
+ act_cfg=dict(type='Swish'),
25
+ ),
26
+ neck=dict(
27
+ type='YOLOXPAFPN',
28
+ in_channels=[256, 512, 1024],
29
+ out_channels=256,
30
+ num_csp_blocks=3,
31
+ use_depthwise=False,
32
+ upsample_cfg=dict(scale_factor=2, mode='nearest'),
33
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
34
+ act_cfg=dict(type='Swish')),
35
+ bbox_head=dict(
36
+ type='YOLOXHead',
37
+ num_classes=80,
38
+ in_channels=256,
39
+ feat_channels=256,
40
+ stacked_convs=2,
41
+ strides=(8, 16, 32),
42
+ use_depthwise=False,
43
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
44
+ act_cfg=dict(type='Swish'),
45
+ loss_cls=dict(
46
+ type='CrossEntropyLoss',
47
+ use_sigmoid=True,
48
+ reduction='sum',
49
+ loss_weight=1.0),
50
+ loss_bbox=dict(
51
+ type='IoULoss',
52
+ mode='square',
53
+ eps=1e-16,
54
+ reduction='sum',
55
+ loss_weight=5.0),
56
+ loss_obj=dict(
57
+ type='CrossEntropyLoss',
58
+ use_sigmoid=True,
59
+ reduction='sum',
60
+ loss_weight=1.0),
61
+ loss_l1=dict(type='L1Loss', reduction='sum', loss_weight=1.0)),
62
+ train_cfg=dict(assigner=dict(type='SimOTAAssigner', center_radius=2.5)),
63
+ # In order to align the source code, the threshold of the val phase is
64
+ # 0.01, and the threshold of the test phase is 0.001.
65
+ test_cfg=dict(score_thr=0.01, nms=dict(type='nms', iou_threshold=0.65)))
66
+
67
+ # dataset settings
68
+ data_root = 'data/coco/'
69
+ dataset_type = 'CocoDataset'
70
+
71
+ # Example to use different file client
72
+ # Method 1: simply set the data root and let the file I/O module
73
+ # automatically infer from prefix (not support LMDB and Memcache yet)
74
+
75
+ # data_root = 's3://openmmlab/datasets/detection/coco/'
76
+
77
+ # Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
78
+ # backend_args = dict(
79
+ # backend='petrel',
80
+ # path_mapping=dict({
81
+ # './data/': 's3://openmmlab/datasets/detection/',
82
+ # 'data/': 's3://openmmlab/datasets/detection/'
83
+ # }))
84
+ backend_args = None
85
+
86
+ train_pipeline = [
87
+ dict(type='Mosaic', img_scale=img_scale, pad_val=114.0),
88
+ dict(
89
+ type='RandomAffine',
90
+ scaling_ratio_range=(0.1, 2),
91
+ # img_scale is (width, height)
92
+ border=(-img_scale[0] // 2, -img_scale[1] // 2)),
93
+ dict(
94
+ type='MixUp',
95
+ img_scale=img_scale,
96
+ ratio_range=(0.8, 1.6),
97
+ pad_val=114.0),
98
+ dict(type='YOLOXHSVRandomAug'),
99
+ dict(type='RandomFlip', prob=0.5),
100
+ # According to the official implementation, multi-scale
101
+ # training is not considered here but in the
102
+ # 'mmdet/models/detectors/yolox.py'.
103
+ # Resize and Pad are for the last 15 epochs when Mosaic,
104
+ # RandomAffine, and MixUp are closed by YOLOXModeSwitchHook.
105
+ dict(type='Resize', scale=img_scale, keep_ratio=True),
106
+ dict(
107
+ type='Pad',
108
+ pad_to_square=True,
109
+ # If the image is three-channel, the pad value needs
110
+ # to be set separately for each channel.
111
+ pad_val=dict(img=(114.0, 114.0, 114.0))),
112
+ dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False),
113
+ dict(type='PackDetInputs')
114
+ ]
115
+
116
+ train_dataset = dict(
117
+ # use MultiImageMixDataset wrapper to support mosaic and mixup
118
+ type='MultiImageMixDataset',
119
+ dataset=dict(
120
+ type=dataset_type,
121
+ data_root=data_root,
122
+ ann_file='annotations/instances_train2017.json',
123
+ data_prefix=dict(img='train2017/'),
124
+ pipeline=[
125
+ dict(type='LoadImageFromFile', backend_args=backend_args),
126
+ dict(type='LoadAnnotations', with_bbox=True)
127
+ ],
128
+ filter_cfg=dict(filter_empty_gt=False, min_size=32),
129
+ backend_args=backend_args),
130
+ pipeline=train_pipeline)
131
+
132
+ test_pipeline = [
133
+ dict(type='LoadImageFromFile', backend_args=backend_args),
134
+ dict(type='Resize', scale=img_scale, keep_ratio=True),
135
+ dict(
136
+ type='Pad',
137
+ pad_to_square=True,
138
+ pad_val=dict(img=(114.0, 114.0, 114.0))),
139
+ dict(type='LoadAnnotations', with_bbox=True),
140
+ dict(
141
+ type='PackDetInputs',
142
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
143
+ 'scale_factor'))
144
+ ]
145
+
146
+ train_dataloader = dict(
147
+ batch_size=8,
148
+ num_workers=4,
149
+ persistent_workers=True,
150
+ sampler=dict(type='DefaultSampler', shuffle=True),
151
+ dataset=train_dataset)
152
+ val_dataloader = dict(
153
+ batch_size=8,
154
+ num_workers=4,
155
+ persistent_workers=True,
156
+ drop_last=False,
157
+ sampler=dict(type='DefaultSampler', shuffle=False),
158
+ dataset=dict(
159
+ type=dataset_type,
160
+ data_root=data_root,
161
+ ann_file='annotations/instances_val2017.json',
162
+ data_prefix=dict(img='val2017/'),
163
+ test_mode=True,
164
+ pipeline=test_pipeline,
165
+ backend_args=backend_args))
166
+ test_dataloader = val_dataloader
167
+
168
+ val_evaluator = dict(
169
+ type='CocoMetric',
170
+ ann_file=data_root + 'annotations/instances_val2017.json',
171
+ metric='bbox',
172
+ backend_args=backend_args)
173
+ test_evaluator = val_evaluator
174
+
175
+ # training settings
176
+ max_epochs = 300
177
+ num_last_epochs = 15
178
+ interval = 10
179
+
180
+ train_cfg = dict(max_epochs=max_epochs, val_interval=interval)
181
+
182
+ # optimizer
183
+ # default 8 gpu
184
+ base_lr = 0.01
185
+ optim_wrapper = dict(
186
+ type='OptimWrapper',
187
+ optimizer=dict(
188
+ type='SGD', lr=base_lr, momentum=0.9, weight_decay=5e-4,
189
+ nesterov=True),
190
+ paramwise_cfg=dict(norm_decay_mult=0., bias_decay_mult=0.))
191
+
192
+ # learning rate
193
+ param_scheduler = [
194
+ dict(
195
+ # use quadratic formula to warm up 5 epochs
196
+ # and lr is updated by iteration
197
+ # TODO: fix default scope in get function
198
+ type='mmdet.QuadraticWarmupLR',
199
+ by_epoch=True,
200
+ begin=0,
201
+ end=5,
202
+ convert_to_iter_based=True),
203
+ dict(
204
+ # use cosine lr from 5 to 285 epoch
205
+ type='CosineAnnealingLR',
206
+ eta_min=base_lr * 0.05,
207
+ begin=5,
208
+ T_max=max_epochs - num_last_epochs,
209
+ end=max_epochs - num_last_epochs,
210
+ by_epoch=True,
211
+ convert_to_iter_based=True),
212
+ dict(
213
+ # use fixed lr during last 15 epochs
214
+ type='ConstantLR',
215
+ by_epoch=True,
216
+ factor=1,
217
+ begin=max_epochs - num_last_epochs,
218
+ end=max_epochs,
219
+ )
220
+ ]
221
+
222
+ default_hooks = dict(
223
+ checkpoint=dict(
224
+ interval=interval,
225
+ max_keep_ckpts=3 # only keep latest 3 checkpoints
226
+ ))
227
+
228
+ custom_hooks = [
229
+ dict(
230
+ type='YOLOXModeSwitchHook',
231
+ num_last_epochs=num_last_epochs,
232
+ priority=48),
233
+ dict(type='SyncNormHook', priority=48),
234
+ dict(
235
+ type='EMAHook',
236
+ ema_type='ExpMomentumEMA',
237
+ momentum=0.0001,
238
+ update_buffers=True,
239
+ priority=49)
240
+ ]
241
+
242
+ # NOTE: `auto_scale_lr` is for automatically scaling LR,
243
+ # USER SHOULD NOT CHANGE ITS VALUES.
244
+ # base_batch_size = (8 GPUs) x (8 samples per GPU)
245
+ auto_scale_lr = dict(base_batch_size=64)
src/controlnet_aux/__init__.py ADDED
@@ -0,0 +1,18 @@
1
+ __version__ = "0.0.6"
2
+
3
+ from .hed import HEDdetector
4
+ from .leres import LeresDetector
5
+ from .lineart import LineartDetector
6
+ from .lineart_anime import LineartAnimeDetector
7
+ from .midas import MidasDetector
8
+ from .mlsd import MLSDdetector
9
+ from .normalbae import NormalBaeDetector
10
+ from .open_pose import OpenposeDetector
11
+ from .pidi import PidiNetDetector
12
+ from .zoe import ZoeDetector
13
+
14
+ from .canny import CannyDetector
15
+ from .mediapipe_face import MediapipeFaceDetector
16
+ from .segment_anything import SamDetector
17
+ from .shuffle import ContentShuffleDetector
18
+ from .dwpose import DWposeDetector
src/controlnet_aux/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.17 kB). View file
 
src/controlnet_aux/__pycache__/util.cpython-311.pyc ADDED
Binary file (13 kB). View file
 
src/controlnet_aux/canny/__init__.py ADDED
@@ -0,0 +1,36 @@
1
+ import warnings
2
+ import cv2
3
+ import numpy as np
4
+ from PIL import Image
5
+ from ..util import HWC3, resize_image
6
+
7
+ class CannyDetector:
8
+ def __call__(self, input_image=None, low_threshold=100, high_threshold=200, detect_resolution=512, image_resolution=512, output_type=None, **kwargs):
9
+ if "img" in kwargs:
10
+ warnings.warn("img is deprecated, please use `input_image=...` instead.", DeprecationWarning)
11
+ input_image = kwargs.pop("img")
12
+
13
+ if input_image is None:
14
+ raise ValueError("input_image must be defined.")
15
+
16
+ if not isinstance(input_image, np.ndarray):
17
+ input_image = np.array(input_image, dtype=np.uint8)
18
+ output_type = output_type or "pil"
19
+ else:
20
+ output_type = output_type or "np"
21
+
22
+ input_image = HWC3(input_image)
23
+ input_image = resize_image(input_image, detect_resolution)
24
+
25
+ detected_map = cv2.Canny(input_image, low_threshold, high_threshold)
26
+ detected_map = HWC3(detected_map)
27
+
28
+ img = resize_image(input_image, image_resolution)
29
+ H, W, C = img.shape
30
+
31
+ detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
32
+
33
+ if output_type == "pil":
34
+ detected_map = Image.fromarray(detected_map)
35
+
36
+ return detected_map
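A minimal usage sketch for the detector above (the image path is illustrative); with a PIL input it returns a PIL edge map by default:
# Sketch only: run the CannyDetector defined above on a single image.
from PIL import Image
from src.controlnet_aux import CannyDetector  # assumes the repo root is on sys.path

canny = CannyDetector()
edges = canny(Image.open("example.png"), low_threshold=100, high_threshold=200)
edges.save("example_canny.png")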
src/controlnet_aux/canny/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (2.11 kB). View file
 
src/controlnet_aux/dwpose/__init__.py ADDED
@@ -0,0 +1,92 @@
1
+ # Openpose
2
+ # Original from CMU https://github.com/CMU-Perceptual-Computing-Lab/openpose
3
+ # 2nd Edited by https://github.com/Hzzone/pytorch-openpose
4
+ # 3rd Edited by ControlNet
5
+ # 4th Edited by ControlNet (added face and correct hands)
6
+
7
+ import os
8
+ os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
9
+
10
+ import cv2
11
+ import torch
12
+ import numpy as np
13
+ from PIL import Image
14
+
15
+ from ..util import HWC3, resize_image
16
+ from . import util
17
+
18
+
19
+ def draw_pose(pose, H, W):
20
+ bodies = pose['bodies']
21
+ faces = pose['faces']
22
+ hands = pose['hands']
23
+ candidate = bodies['candidate']
24
+ subset = bodies['subset']
25
+
26
+ canvas = np.zeros(shape=(H, W, 3), dtype=np.uint8)
27
+ canvas = util.draw_bodypose(canvas, candidate, subset)
28
+ canvas = util.draw_handpose(canvas, hands)
29
+ # canvas = util.draw_facepose(canvas, faces)
30
+
31
+ return canvas
32
+
33
+ class DWposeDetector:
34
+ def __init__(self, det_config=None, det_ckpt=None, pose_config=None, pose_ckpt=None, device="cpu"):
35
+ from .wholebody import Wholebody
36
+
37
+ self.pose_estimation = Wholebody(det_config, det_ckpt, pose_config, pose_ckpt, device)
38
+
39
+ def to(self, device):
40
+ self.pose_estimation.to(device)
41
+ return self
42
+
43
+ def __call__(self, input_image, detect_resolution=512, image_resolution=512, output_type="pil", **kwargs):
44
+
45
+ input_image = cv2.cvtColor(np.array(input_image, dtype=np.uint8), cv2.COLOR_RGB2BGR)
46
+
47
+ input_image = HWC3(input_image)
48
+ input_image = resize_image(input_image, detect_resolution)
49
+
50
+ H, W, C = input_image.shape
51
+
52
+ with torch.no_grad():
53
+ candidate, subset = self.pose_estimation(input_image)
54
+ nums, keys, locs = candidate.shape
55
+ candidate[..., 0] /= float(W)
56
+ candidate[..., 1] /= float(H)
57
+ body = candidate[:,:18].copy()
58
+ body = body.reshape(nums*18, locs)
59
+ score = subset[:,:18]
60
+
61
+ for i in range(len(score)):
62
+ for j in range(len(score[i])):
63
+ if score[i][j] > 0.3:
64
+ score[i][j] = int(18*i+j)
65
+ else:
66
+ score[i][j] = -1
67
+
68
+ un_visible = subset<0.3
69
+ candidate[un_visible] = -1
70
+
71
+ foot = candidate[:,18:24]
72
+
73
+ faces = candidate[:,24:92]
74
+
75
+ hands = candidate[:,92:113]
76
+ hands = np.vstack([hands, candidate[:,113:]])
77
+
78
+ bodies = dict(candidate=body, subset=score)
79
+ pose = dict(bodies=bodies, hands=hands, faces=faces)
80
+
81
+ detected_map = draw_pose(pose, H, W)
82
+ detected_map = HWC3(detected_map)
83
+
84
+ img = resize_image(input_image, image_resolution)
85
+ H, W, C = img.shape
86
+
87
+ detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
88
+
89
+ if output_type == "pil":
90
+ detected_map = Image.fromarray(detected_map)
91
+
92
+ return detected_map
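A minimal usage sketch for DWposeDetector (device and file names are illustrative; mmcv/mmdet/mmpose must be installed, and the default configs/checkpoints are resolved inside Wholebody):
# Sketch only: extract a pose map from one frame with the detector above.
from PIL import Image
from src.controlnet_aux import DWposeDetector  # assumes the repo root is on sys.path

detector = DWposeDetector(device="cuda")  # falls back to the default det/pose configs and checkpoints
pose_map = detector(Image.open("frame.png"), output_type="pil")
pose_map.save("frame_pose.png")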
src/controlnet_aux/dwpose/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (4.84 kB). View file
 
src/controlnet_aux/dwpose/__pycache__/util.cpython-311.pyc ADDED
Binary file (16.5 kB). View file
 
src/controlnet_aux/dwpose/__pycache__/wholebody.cpython-311.pyc ADDED
Binary file (6.2 kB). View file
 
src/controlnet_aux/dwpose/dwpose_config/dwpose-l_384x288.py ADDED
@@ -0,0 +1,257 @@
1
+ # runtime
2
+ max_epochs = 270
3
+ stage2_num_epochs = 30
4
+ base_lr = 4e-3
5
+
6
+ train_cfg = dict(max_epochs=max_epochs, val_interval=10)
7
+ randomness = dict(seed=21)
8
+
9
+ # optimizer
10
+ optim_wrapper = dict(
11
+ type='OptimWrapper',
12
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
13
+ paramwise_cfg=dict(
14
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
15
+
16
+ # learning rate
17
+ param_scheduler = [
18
+ dict(
19
+ type='LinearLR',
20
+ start_factor=1.0e-5,
21
+ by_epoch=False,
22
+ begin=0,
23
+ end=1000),
24
+ dict(
25
+ # use cosine lr from 150 to 300 epoch
26
+ type='CosineAnnealingLR',
27
+ eta_min=base_lr * 0.05,
28
+ begin=max_epochs // 2,
29
+ end=max_epochs,
30
+ T_max=max_epochs // 2,
31
+ by_epoch=True,
32
+ convert_to_iter_based=True),
33
+ ]
34
+
35
+ # automatically scaling LR based on the actual training batch size
36
+ auto_scale_lr = dict(base_batch_size=512)
37
+
38
+ # codec settings
39
+ codec = dict(
40
+ type='SimCCLabel',
41
+ input_size=(288, 384),
42
+ sigma=(6., 6.93),
43
+ simcc_split_ratio=2.0,
44
+ normalize=False,
45
+ use_dark=False)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='TopdownPoseEstimator',
50
+ data_preprocessor=dict(
51
+ type='PoseDataPreprocessor',
52
+ mean=[123.675, 116.28, 103.53],
53
+ std=[58.395, 57.12, 57.375],
54
+ bgr_to_rgb=True),
55
+ backbone=dict(
56
+ _scope_='mmdet',
57
+ type='CSPNeXt',
58
+ arch='P5',
59
+ expand_ratio=0.5,
60
+ deepen_factor=1.,
61
+ widen_factor=1.,
62
+ out_indices=(4, ),
63
+ channel_attention=True,
64
+ norm_cfg=dict(type='SyncBN'),
65
+ act_cfg=dict(type='SiLU'),
66
+ init_cfg=dict(
67
+ type='Pretrained',
68
+ prefix='backbone.',
69
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
70
+ 'rtmpose/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa
71
+ )),
72
+ head=dict(
73
+ type='RTMCCHead',
74
+ in_channels=1024,
75
+ out_channels=133,
76
+ input_size=codec['input_size'],
77
+ in_featuremap_size=(9, 12),
78
+ simcc_split_ratio=codec['simcc_split_ratio'],
79
+ final_layer_kernel_size=7,
80
+ gau_cfg=dict(
81
+ hidden_dims=256,
82
+ s=128,
83
+ expansion_factor=2,
84
+ dropout_rate=0.,
85
+ drop_path=0.,
86
+ act_fn='SiLU',
87
+ use_rel_bias=False,
88
+ pos_enc=False),
89
+ loss=dict(
90
+ type='KLDiscretLoss',
91
+ use_target_weight=True,
92
+ beta=10.,
93
+ label_softmax=True),
94
+ decoder=codec),
95
+ test_cfg=dict(flip_test=True, ))
96
+
97
+ # base dataset settings
98
+ dataset_type = 'CocoWholeBodyDataset'
99
+ data_mode = 'topdown'
100
+ data_root = '/data/'
101
+
102
+ backend_args = dict(backend='local')
103
+ # backend_args = dict(
104
+ # backend='petrel',
105
+ # path_mapping=dict({
106
+ # f'{data_root}': 's3://openmmlab/datasets/detection/coco/',
107
+ # f'{data_root}': 's3://openmmlab/datasets/detection/coco/'
108
+ # }))
109
+
110
+ # pipelines
111
+ train_pipeline = [
112
+ dict(type='LoadImage', backend_args=backend_args),
113
+ dict(type='GetBBoxCenterScale'),
114
+ dict(type='RandomFlip', direction='horizontal'),
115
+ dict(type='RandomHalfBody'),
116
+ dict(
117
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
118
+ dict(type='TopdownAffine', input_size=codec['input_size']),
119
+ dict(type='mmdet.YOLOXHSVRandomAug'),
120
+ dict(
121
+ type='Albumentation',
122
+ transforms=[
123
+ dict(type='Blur', p=0.1),
124
+ dict(type='MedianBlur', p=0.1),
125
+ dict(
126
+ type='CoarseDropout',
127
+ max_holes=1,
128
+ max_height=0.4,
129
+ max_width=0.4,
130
+ min_holes=1,
131
+ min_height=0.2,
132
+ min_width=0.2,
133
+ p=1.0),
134
+ ]),
135
+ dict(type='GenerateTarget', encoder=codec),
136
+ dict(type='PackPoseInputs')
137
+ ]
138
+ val_pipeline = [
139
+ dict(type='LoadImage', backend_args=backend_args),
140
+ dict(type='GetBBoxCenterScale'),
141
+ dict(type='TopdownAffine', input_size=codec['input_size']),
142
+ dict(type='PackPoseInputs')
143
+ ]
144
+
145
+ train_pipeline_stage2 = [
146
+ dict(type='LoadImage', backend_args=backend_args),
147
+ dict(type='GetBBoxCenterScale'),
148
+ dict(type='RandomFlip', direction='horizontal'),
149
+ dict(type='RandomHalfBody'),
150
+ dict(
151
+ type='RandomBBoxTransform',
152
+ shift_factor=0.,
153
+ scale_factor=[0.75, 1.25],
154
+ rotate_factor=60),
155
+ dict(type='TopdownAffine', input_size=codec['input_size']),
156
+ dict(type='mmdet.YOLOXHSVRandomAug'),
157
+ dict(
158
+ type='Albumentation',
159
+ transforms=[
160
+ dict(type='Blur', p=0.1),
161
+ dict(type='MedianBlur', p=0.1),
162
+ dict(
163
+ type='CoarseDropout',
164
+ max_holes=1,
165
+ max_height=0.4,
166
+ max_width=0.4,
167
+ min_holes=1,
168
+ min_height=0.2,
169
+ min_width=0.2,
170
+ p=0.5),
171
+ ]),
172
+ dict(type='GenerateTarget', encoder=codec),
173
+ dict(type='PackPoseInputs')
174
+ ]
175
+
176
+ datasets = []
177
+ dataset_coco=dict(
178
+ type=dataset_type,
179
+ data_root=data_root,
180
+ data_mode=data_mode,
181
+ ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
182
+ data_prefix=dict(img='coco/train2017/'),
183
+ pipeline=[],
184
+ )
185
+ datasets.append(dataset_coco)
186
+
187
+ scene = ['Magic_show', 'Entertainment', 'ConductMusic', 'Online_class',
188
+ 'TalkShow', 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow',
189
+ 'Singing', 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference']
190
+
191
+ for i in range(len(scene)):
192
+ datasets.append(
193
+ dict(
194
+ type=dataset_type,
195
+ data_root=data_root,
196
+ data_mode=data_mode,
197
+ ann_file='UBody/annotations/'+scene[i]+'/keypoint_annotation.json',
198
+ data_prefix=dict(img='UBody/images/'+scene[i]+'/'),
199
+ pipeline=[],
200
+ )
201
+ )
202
+
203
+ # data loaders
204
+ train_dataloader = dict(
205
+ batch_size=32,
206
+ num_workers=10,
207
+ persistent_workers=True,
208
+ sampler=dict(type='DefaultSampler', shuffle=True),
209
+ dataset=dict(
210
+ type='CombinedDataset',
211
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
212
+ datasets=datasets,
213
+ pipeline=train_pipeline,
214
+ test_mode=False,
215
+ ))
216
+ val_dataloader = dict(
217
+ batch_size=32,
218
+ num_workers=10,
219
+ persistent_workers=True,
220
+ drop_last=False,
221
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
222
+ dataset=dict(
223
+ type=dataset_type,
224
+ data_root=data_root,
225
+ data_mode=data_mode,
226
+ ann_file='coco/annotations/coco_wholebody_val_v1.0.json',
227
+ bbox_file=f'{data_root}coco/person_detection_results/'
228
+ 'COCO_val2017_detections_AP_H_56_person.json',
229
+ data_prefix=dict(img='coco/val2017/'),
230
+ test_mode=True,
231
+ pipeline=val_pipeline,
232
+ ))
233
+ test_dataloader = val_dataloader
234
+
235
+ # hooks
236
+ default_hooks = dict(
237
+ checkpoint=dict(
238
+ save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1))
239
+
240
+ custom_hooks = [
241
+ dict(
242
+ type='EMAHook',
243
+ ema_type='ExpMomentumEMA',
244
+ momentum=0.0002,
245
+ update_buffers=True,
246
+ priority=49),
247
+ dict(
248
+ type='mmdet.PipelineSwitchHook',
249
+ switch_epoch=max_epochs - stage2_num_epochs,
250
+ switch_pipeline=train_pipeline_stage2)
251
+ ]
252
+
253
+ # evaluators
254
+ val_evaluator = dict(
255
+ type='CocoWholeBodyMetric',
256
+ ann_file=data_root + 'coco/annotations/coco_wholebody_val_v1.0.json')
257
+ test_evaluator = val_evaluator
src/controlnet_aux/dwpose/util.py ADDED
@@ -0,0 +1,303 @@
1
+ import math
2
+ import numpy as np
3
+ import cv2
4
+
5
+ eps = 0.01
6
+
7
+
8
+ def smart_resize(x, s):
9
+ Ht, Wt = s
10
+ if x.ndim == 2:
11
+ Ho, Wo = x.shape
12
+ Co = 1
13
+ else:
14
+ Ho, Wo, Co = x.shape
15
+ if Co == 3 or Co == 1:
16
+ k = float(Ht + Wt) / float(Ho + Wo)
17
+ return cv2.resize(x, (int(Wt), int(Ht)), interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4)
18
+ else:
19
+ return np.stack([smart_resize(x[:, :, i], s) for i in range(Co)], axis=2)
20
+
21
+
22
+ def smart_resize_k(x, fx, fy):
23
+ if x.ndim == 2:
24
+ Ho, Wo = x.shape
25
+ Co = 1
26
+ else:
27
+ Ho, Wo, Co = x.shape
28
+ Ht, Wt = Ho * fy, Wo * fx
29
+ if Co == 3 or Co == 1:
30
+ k = float(Ht + Wt) / float(Ho + Wo)
31
+ return cv2.resize(x, (int(Wt), int(Ht)), interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4)
32
+ else:
33
+ return np.stack([smart_resize_k(x[:, :, i], fx, fy) for i in range(Co)], axis=2)
34
+
35
+
36
+ def padRightDownCorner(img, stride, padValue):
37
+ h = img.shape[0]
38
+ w = img.shape[1]
39
+
40
+ pad = 4 * [None]
41
+ pad[0] = 0 # up
42
+ pad[1] = 0 # left
43
+ pad[2] = 0 if (h % stride == 0) else stride - (h % stride) # down
44
+ pad[3] = 0 if (w % stride == 0) else stride - (w % stride) # right
45
+
46
+ img_padded = img
47
+ pad_up = np.tile(img_padded[0:1, :, :] * 0 + padValue, (pad[0], 1, 1))
48
+ img_padded = np.concatenate((pad_up, img_padded), axis=0)
49
+ pad_left = np.tile(img_padded[:, 0:1, :] * 0 + padValue, (1, pad[1], 1))
50
+ img_padded = np.concatenate((pad_left, img_padded), axis=1)
51
+ pad_down = np.tile(img_padded[-2:-1, :, :] * 0 + padValue, (pad[2], 1, 1))
52
+ img_padded = np.concatenate((img_padded, pad_down), axis=0)
53
+ pad_right = np.tile(img_padded[:, -2:-1, :] * 0 + padValue, (1, pad[3], 1))
54
+ img_padded = np.concatenate((img_padded, pad_right), axis=1)
55
+
56
+ return img_padded, pad
57
+
58
+
59
+ def transfer(model, model_weights):
60
+     transferred_model_weights = {}
61
+ for weights_name in model.state_dict().keys():
62
+         transferred_model_weights[weights_name] = model_weights['.'.join(weights_name.split('.')[1:])]
63
+     return transferred_model_weights
64
+
65
+
66
+ def draw_bodypose(canvas, candidate, subset):
67
+ H, W, C = canvas.shape
68
+ candidate = np.array(candidate)
69
+ subset = np.array(subset)
70
+
71
+ stickwidth = 4
72
+
73
+ limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], \
74
+ [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], \
75
+ [1, 16], [16, 18], [3, 17], [6, 18]]
76
+
77
+ colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \
78
+ [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \
79
+ [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]]
80
+
81
+ for i in range(17):
82
+ for n in range(len(subset)):
83
+ index = subset[n][np.array(limbSeq[i]) - 1]
84
+ if -1 in index:
85
+ continue
86
+ Y = candidate[index.astype(int), 0] * float(W)
87
+ X = candidate[index.astype(int), 1] * float(H)
88
+ mX = np.mean(X)
89
+ mY = np.mean(Y)
90
+ length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5
91
+ angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1]))
92
+ polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1)
93
+ cv2.fillConvexPoly(canvas, polygon, colors[i])
94
+
95
+ canvas = (canvas * 0.6).astype(np.uint8)
96
+
97
+ for i in range(18):
98
+ for n in range(len(subset)):
99
+ index = int(subset[n][i])
100
+ if index == -1:
101
+ continue
102
+ x, y = candidate[index][0:2]
103
+ x = int(x * W)
104
+ y = int(y * H)
105
+ cv2.circle(canvas, (int(x), int(y)), 4, colors[i], thickness=-1)
106
+
107
+ return canvas
108
+
109
+
110
+ def draw_handpose(canvas, all_hand_peaks):
111
+ import matplotlib
112
+
113
+ H, W, C = canvas.shape
114
+
115
+ edges = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], [7, 8], [0, 9], [9, 10], \
116
+ [10, 11], [11, 12], [0, 13], [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], [18, 19], [19, 20]]
117
+
118
+ # (person_number*2, 21, 2)
119
+ for i in range(len(all_hand_peaks)):
120
+ peaks = all_hand_peaks[i]
121
+ peaks = np.array(peaks)
122
+
123
+ for ie, e in enumerate(edges):
124
+
125
+ x1, y1 = peaks[e[0]]
126
+ x2, y2 = peaks[e[1]]
127
+
128
+ x1 = int(x1 * W)
129
+ y1 = int(y1 * H)
130
+ x2 = int(x2 * W)
131
+ y2 = int(y2 * H)
132
+ if x1 > eps and y1 > eps and x2 > eps and y2 > eps:
133
+ cv2.line(canvas, (x1, y1), (x2, y2),
134
+ matplotlib.colors.hsv_to_rgb([ie / float(len(edges)), 1.0, 1.0]) * 255, thickness=1)
135
+
136
+         for _, keypoint in enumerate(peaks):
137
+             x, y = keypoint
138
+
139
+ x = int(x * W)
140
+ y = int(y * H)
141
+ if x > eps and y > eps:
142
+ cv2.circle(canvas, (x, y), 1, (0, 0, 255), thickness=-1)
143
+ return canvas
144
+
145
+
146
+ def draw_facepose(canvas, all_lmks):
147
+ H, W, C = canvas.shape
148
+ for lmks in all_lmks:
149
+ lmks = np.array(lmks)
150
+ for lmk in lmks:
151
+ x, y = lmk
152
+ x = int(x * W)
153
+ y = int(y * H)
154
+ if x > eps and y > eps:
155
+ cv2.circle(canvas, (x, y), 3, (255, 255, 255), thickness=-1)
156
+ return canvas
157
+
158
+
159
+ # detect hand according to body pose keypoints
160
+ # please refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/hand/handDetector.cpp
161
+ def handDetect(candidate, subset, oriImg):
162
+ # right hand: wrist 4, elbow 3, shoulder 2
163
+ # left hand: wrist 7, elbow 6, shoulder 5
164
+ ratioWristElbow = 0.33
165
+ detect_result = []
166
+ image_height, image_width = oriImg.shape[0:2]
167
+ for person in subset.astype(int):
168
+ # if any of three not detected
169
+ has_left = np.sum(person[[5, 6, 7]] == -1) == 0
170
+ has_right = np.sum(person[[2, 3, 4]] == -1) == 0
171
+ if not (has_left or has_right):
172
+ continue
173
+ hands = []
174
+ # left hand
175
+ if has_left:
176
+ left_shoulder_index, left_elbow_index, left_wrist_index = person[[5, 6, 7]]
177
+ x1, y1 = candidate[left_shoulder_index][:2]
178
+ x2, y2 = candidate[left_elbow_index][:2]
179
+ x3, y3 = candidate[left_wrist_index][:2]
180
+ hands.append([x1, y1, x2, y2, x3, y3, True])
181
+ # right hand
182
+ if has_right:
183
+ right_shoulder_index, right_elbow_index, right_wrist_index = person[[2, 3, 4]]
184
+ x1, y1 = candidate[right_shoulder_index][:2]
185
+ x2, y2 = candidate[right_elbow_index][:2]
186
+ x3, y3 = candidate[right_wrist_index][:2]
187
+ hands.append([x1, y1, x2, y2, x3, y3, False])
188
+
189
+ for x1, y1, x2, y2, x3, y3, is_left in hands:
190
+ # pos_hand = pos_wrist + ratio * (pos_wrist - pos_elbox) = (1 + ratio) * pos_wrist - ratio * pos_elbox
191
+ # handRectangle.x = posePtr[wrist*3] + ratioWristElbow * (posePtr[wrist*3] - posePtr[elbow*3]);
192
+ # handRectangle.y = posePtr[wrist*3+1] + ratioWristElbow * (posePtr[wrist*3+1] - posePtr[elbow*3+1]);
193
+ # const auto distanceWristElbow = getDistance(poseKeypoints, person, wrist, elbow);
194
+ # const auto distanceElbowShoulder = getDistance(poseKeypoints, person, elbow, shoulder);
195
+ # handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder);
196
+ x = x3 + ratioWristElbow * (x3 - x2)
197
+ y = y3 + ratioWristElbow * (y3 - y2)
198
+ distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2)
199
+ distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
200
+ width = 1.5 * max(distanceWristElbow, 0.9 * distanceElbowShoulder)
201
+ # x-y refers to the center --> offset to topLeft point
202
+ # handRectangle.x -= handRectangle.width / 2.f;
203
+ # handRectangle.y -= handRectangle.height / 2.f;
204
+ x -= width / 2
205
+ y -= width / 2 # width = height
206
+ # overflow the image
207
+ if x < 0: x = 0
208
+ if y < 0: y = 0
209
+ width1 = width
210
+ width2 = width
211
+ if x + width > image_width: width1 = image_width - x
212
+ if y + width > image_height: width2 = image_height - y
213
+ width = min(width1, width2)
214
+             # keep only hand boxes that are at least 20 pixels wide
215
+ if width >= 20:
216
+ detect_result.append([int(x), int(y), int(width), is_left])
217
+
218
+ '''
219
+ return value: [[x, y, w, True if left hand else False]].
220
+ width=height since the network require squared input.
221
+ x, y is the coordinate of top left
222
+ '''
223
+ return detect_result
224
+
225
+
226
+ # Written by Lvmin
227
+ def faceDetect(candidate, subset, oriImg):
228
+ # left right eye ear 14 15 16 17
229
+ detect_result = []
230
+ image_height, image_width = oriImg.shape[0:2]
231
+ for person in subset.astype(int):
232
+ has_head = person[0] > -1
233
+ if not has_head:
234
+ continue
235
+
236
+ has_left_eye = person[14] > -1
237
+ has_right_eye = person[15] > -1
238
+ has_left_ear = person[16] > -1
239
+ has_right_ear = person[17] > -1
240
+
241
+ if not (has_left_eye or has_right_eye or has_left_ear or has_right_ear):
242
+ continue
243
+
244
+ head, left_eye, right_eye, left_ear, right_ear = person[[0, 14, 15, 16, 17]]
245
+
246
+ width = 0.0
247
+ x0, y0 = candidate[head][:2]
248
+
249
+ if has_left_eye:
250
+ x1, y1 = candidate[left_eye][:2]
251
+ d = max(abs(x0 - x1), abs(y0 - y1))
252
+ width = max(width, d * 3.0)
253
+
254
+ if has_right_eye:
255
+ x1, y1 = candidate[right_eye][:2]
256
+ d = max(abs(x0 - x1), abs(y0 - y1))
257
+ width = max(width, d * 3.0)
258
+
259
+ if has_left_ear:
260
+ x1, y1 = candidate[left_ear][:2]
261
+ d = max(abs(x0 - x1), abs(y0 - y1))
262
+ width = max(width, d * 1.5)
263
+
264
+ if has_right_ear:
265
+ x1, y1 = candidate[right_ear][:2]
266
+ d = max(abs(x0 - x1), abs(y0 - y1))
267
+ width = max(width, d * 1.5)
268
+
269
+ x, y = x0, y0
270
+
271
+ x -= width
272
+ y -= width
273
+
274
+ if x < 0:
275
+ x = 0
276
+
277
+ if y < 0:
278
+ y = 0
279
+
280
+ width1 = width * 2
281
+ width2 = width * 2
282
+
283
+ if x + width > image_width:
284
+ width1 = image_width - x
285
+
286
+ if y + width > image_height:
287
+ width2 = image_height - y
288
+
289
+ width = min(width1, width2)
290
+
291
+ if width >= 20:
292
+ detect_result.append([int(x), int(y), int(width)])
293
+
294
+ return detect_result
295
+
296
+
297
+ # get max index of 2d array
298
+ def npmax(array):
299
+ arrayindex = array.argmax(1)
300
+ arrayvalue = array.max(1)
301
+ i = arrayvalue.argmax()
302
+ j = arrayindex[i]
303
+ return i, j
src/controlnet_aux/dwpose/wholebody.py ADDED
@@ -0,0 +1,121 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import os
3
+ import numpy as np
4
+ import warnings
5
+
6
+ try:
7
+ import mmcv
8
+ except ImportError:
9
+ warnings.warn(
10
+ "The module 'mmcv' is not installed. The package will have limited functionality. Please install it using the command: mim install 'mmcv>=2.0.1'"
11
+ )
12
+
13
+ try:
14
+ from mmpose.apis import inference_topdown
15
+ from mmpose.apis import init_model as init_pose_estimator
16
+ from mmpose.evaluation.functional import nms
17
+ from mmpose.utils import adapt_mmdet_pipeline
18
+ from mmpose.structures import merge_data_samples
19
+ except ImportError:
20
+ warnings.warn(
21
+ "The module 'mmpose' is not installed. The package will have limited functionality. Please install it using the command: mim install 'mmpose>=1.1.0'"
22
+ )
23
+
24
+ try:
25
+ from mmdet.apis import inference_detector, init_detector
26
+ except ImportError:
27
+ warnings.warn(
28
+ "The module 'mmdet' is not installed. The package will have limited functionality. Please install it using the command: mim install 'mmdet>=3.1.0'"
29
+ )
30
+
31
+
32
+ class Wholebody:
33
+ def __init__(self,
34
+ det_config=None, det_ckpt=None,
35
+ pose_config=None, pose_ckpt=None,
36
+ device="cpu"):
37
+
38
+ if det_config is None:
39
+ det_config = os.path.join(os.path.dirname(__file__), "yolox_config/yolox_l_8xb8-300e_coco.py")
40
+
41
+ if pose_config is None:
42
+ pose_config = os.path.join(os.path.dirname(__file__), "dwpose_config/dwpose-l_384x288.py")
43
+
44
+ if det_ckpt is None:
45
+ det_ckpt = 'https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco_20211126_140236-d3bd2b23.pth'
46
+
47
+ if pose_ckpt is None:
48
+ pose_ckpt = "https://huggingface.co/wanghaofan/dw-ll_ucoco_384/resolve/main/dw-ll_ucoco_384.pth"
49
+
50
+ # build detector
51
+ self.detector = init_detector(det_config, det_ckpt, device=device)
52
+ self.detector.cfg = adapt_mmdet_pipeline(self.detector.cfg)
53
+
54
+ # build pose estimator
55
+ self.pose_estimator = init_pose_estimator(
56
+ pose_config,
57
+ pose_ckpt,
58
+ device=device)
59
+
60
+ def to(self, device):
61
+ self.detector.to(device)
62
+ self.pose_estimator.to(device)
63
+ return self
64
+
65
+ def __call__(self, oriImg):
66
+ # predict bbox
67
+ det_result = inference_detector(self.detector, oriImg)
68
+ pred_instance = det_result.pred_instances.cpu().numpy()
69
+ bboxes = np.concatenate(
70
+ (pred_instance.bboxes, pred_instance.scores[:, None]), axis=1)
71
+ bboxes = bboxes[np.logical_and(pred_instance.labels == 0,
72
+ pred_instance.scores > 0.5)]
73
+
74
+ # set NMS threshold
75
+ bboxes = bboxes[nms(bboxes, 0.7), :4]
76
+
77
+ # predict keypoints
78
+ if len(bboxes) == 0:
79
+ pose_results = inference_topdown(self.pose_estimator, oriImg)
80
+ else:
81
+ pose_results = inference_topdown(self.pose_estimator, oriImg, bboxes)
82
+ preds = merge_data_samples(pose_results)
83
+ preds = preds.pred_instances
84
+
85
+ # preds = pose_results[0].pred_instances
86
+ keypoints = preds.get('transformed_keypoints',
87
+ preds.keypoints)
88
+ if 'keypoint_scores' in preds:
89
+ scores = preds.keypoint_scores
90
+ else:
91
+ scores = np.ones(keypoints.shape[:-1])
92
+
93
+ if 'keypoints_visible' in preds:
94
+ visible = preds.keypoints_visible
95
+ else:
96
+ visible = np.ones(keypoints.shape[:-1])
97
+ keypoints_info = np.concatenate(
98
+ (keypoints, scores[..., None], visible[..., None]),
99
+ axis=-1)
100
+ # compute neck joint
101
+ neck = np.mean(keypoints_info[:, [5, 6]], axis=1)
102
+ # neck score when visualizing pred
103
+ neck[:, 2:4] = np.logical_and(
104
+ keypoints_info[:, 5, 2:4] > 0.3,
105
+ keypoints_info[:, 6, 2:4] > 0.3).astype(int)
106
+ new_keypoints_info = np.insert(
107
+ keypoints_info, 17, neck, axis=1)
108
+ mmpose_idx = [
109
+ 17, 6, 8, 10, 7, 9, 12, 14, 16, 13, 15, 2, 1, 4, 3
110
+ ]
111
+ openpose_idx = [
112
+ 1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17
113
+ ]
114
+ new_keypoints_info[:, openpose_idx] = \
115
+ new_keypoints_info[:, mmpose_idx]
116
+ keypoints_info = new_keypoints_info
117
+
118
+ keypoints, scores, visible = keypoints_info[
119
+ ..., :2], keypoints_info[..., 2], keypoints_info[..., 3]
120
+
121
+ return keypoints, scores
src/controlnet_aux/dwpose/yolox_config/yolox_l_8xb8-300e_coco.py ADDED
@@ -0,0 +1,245 @@
1
+ img_scale = (640, 640) # width, height
2
+
3
+ # model settings
4
+ model = dict(
5
+ type='YOLOX',
6
+ data_preprocessor=dict(
7
+ type='DetDataPreprocessor',
8
+ pad_size_divisor=32,
9
+ batch_augments=[
10
+ dict(
11
+ type='BatchSyncRandomResize',
12
+ random_size_range=(480, 800),
13
+ size_divisor=32,
14
+ interval=10)
15
+ ]),
16
+ backbone=dict(
17
+ type='CSPDarknet',
18
+ deepen_factor=1.0,
19
+ widen_factor=1.0,
20
+ out_indices=(2, 3, 4),
21
+ use_depthwise=False,
22
+ spp_kernal_sizes=(5, 9, 13),
23
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
24
+ act_cfg=dict(type='Swish'),
25
+ ),
26
+ neck=dict(
27
+ type='YOLOXPAFPN',
28
+ in_channels=[256, 512, 1024],
29
+ out_channels=256,
30
+ num_csp_blocks=3,
31
+ use_depthwise=False,
32
+ upsample_cfg=dict(scale_factor=2, mode='nearest'),
33
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
34
+ act_cfg=dict(type='Swish')),
35
+ bbox_head=dict(
36
+ type='YOLOXHead',
37
+ num_classes=80,
38
+ in_channels=256,
39
+ feat_channels=256,
40
+ stacked_convs=2,
41
+ strides=(8, 16, 32),
42
+ use_depthwise=False,
43
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
44
+ act_cfg=dict(type='Swish'),
45
+ loss_cls=dict(
46
+ type='CrossEntropyLoss',
47
+ use_sigmoid=True,
48
+ reduction='sum',
49
+ loss_weight=1.0),
50
+ loss_bbox=dict(
51
+ type='IoULoss',
52
+ mode='square',
53
+ eps=1e-16,
54
+ reduction='sum',
55
+ loss_weight=5.0),
56
+ loss_obj=dict(
57
+ type='CrossEntropyLoss',
58
+ use_sigmoid=True,
59
+ reduction='sum',
60
+ loss_weight=1.0),
61
+ loss_l1=dict(type='L1Loss', reduction='sum', loss_weight=1.0)),
62
+ train_cfg=dict(assigner=dict(type='SimOTAAssigner', center_radius=2.5)),
63
+ # In order to align the source code, the threshold of the val phase is
64
+ # 0.01, and the threshold of the test phase is 0.001.
65
+ test_cfg=dict(score_thr=0.01, nms=dict(type='nms', iou_threshold=0.65)))
66
+
67
+ # dataset settings
68
+ data_root = 'data/coco/'
69
+ dataset_type = 'CocoDataset'
70
+
71
+ # Example to use different file client
72
+ # Method 1: simply set the data root and let the file I/O module
73
+ # automatically infer from prefix (not support LMDB and Memcache yet)
74
+
75
+ # data_root = 's3://openmmlab/datasets/detection/coco/'
76
+
77
+ # Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
78
+ # backend_args = dict(
79
+ # backend='petrel',
80
+ # path_mapping=dict({
81
+ # './data/': 's3://openmmlab/datasets/detection/',
82
+ # 'data/': 's3://openmmlab/datasets/detection/'
83
+ # }))
84
+ backend_args = None
85
+
86
+ train_pipeline = [
87
+ dict(type='Mosaic', img_scale=img_scale, pad_val=114.0),
88
+ dict(
89
+ type='RandomAffine',
90
+ scaling_ratio_range=(0.1, 2),
91
+ # img_scale is (width, height)
92
+ border=(-img_scale[0] // 2, -img_scale[1] // 2)),
93
+ dict(
94
+ type='MixUp',
95
+ img_scale=img_scale,
96
+ ratio_range=(0.8, 1.6),
97
+ pad_val=114.0),
98
+ dict(type='YOLOXHSVRandomAug'),
99
+ dict(type='RandomFlip', prob=0.5),
100
+ # According to the official implementation, multi-scale
101
+ # training is not considered here but in the
102
+ # 'mmdet/models/detectors/yolox.py'.
103
+ # Resize and Pad are for the last 15 epochs when Mosaic,
104
+ # RandomAffine, and MixUp are closed by YOLOXModeSwitchHook.
105
+ dict(type='Resize', scale=img_scale, keep_ratio=True),
106
+ dict(
107
+ type='Pad',
108
+ pad_to_square=True,
109
+ # If the image is three-channel, the pad value needs
110
+ # to be set separately for each channel.
111
+ pad_val=dict(img=(114.0, 114.0, 114.0))),
112
+ dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False),
113
+ dict(type='PackDetInputs')
114
+ ]
115
+
116
+ train_dataset = dict(
117
+ # use MultiImageMixDataset wrapper to support mosaic and mixup
118
+ type='MultiImageMixDataset',
119
+ dataset=dict(
120
+ type=dataset_type,
121
+ data_root=data_root,
122
+ ann_file='annotations/instances_train2017.json',
123
+ data_prefix=dict(img='train2017/'),
124
+ pipeline=[
125
+ dict(type='LoadImageFromFile', backend_args=backend_args),
126
+ dict(type='LoadAnnotations', with_bbox=True)
127
+ ],
128
+ filter_cfg=dict(filter_empty_gt=False, min_size=32),
129
+ backend_args=backend_args),
130
+ pipeline=train_pipeline)
131
+
132
+ test_pipeline = [
133
+ dict(type='LoadImageFromFile', backend_args=backend_args),
134
+ dict(type='Resize', scale=img_scale, keep_ratio=True),
135
+ dict(
136
+ type='Pad',
137
+ pad_to_square=True,
138
+ pad_val=dict(img=(114.0, 114.0, 114.0))),
139
+ dict(type='LoadAnnotations', with_bbox=True),
140
+ dict(
141
+ type='PackDetInputs',
142
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
143
+ 'scale_factor'))
144
+ ]
145
+
146
+ train_dataloader = dict(
147
+ batch_size=8,
148
+ num_workers=4,
149
+ persistent_workers=True,
150
+ sampler=dict(type='DefaultSampler', shuffle=True),
151
+ dataset=train_dataset)
152
+ val_dataloader = dict(
153
+ batch_size=8,
154
+ num_workers=4,
155
+ persistent_workers=True,
156
+ drop_last=False,
157
+ sampler=dict(type='DefaultSampler', shuffle=False),
158
+ dataset=dict(
159
+ type=dataset_type,
160
+ data_root=data_root,
161
+ ann_file='annotations/instances_val2017.json',
162
+ data_prefix=dict(img='val2017/'),
163
+ test_mode=True,
164
+ pipeline=test_pipeline,
165
+ backend_args=backend_args))
166
+ test_dataloader = val_dataloader
167
+
168
+ val_evaluator = dict(
169
+ type='CocoMetric',
170
+ ann_file=data_root + 'annotations/instances_val2017.json',
171
+ metric='bbox',
172
+ backend_args=backend_args)
173
+ test_evaluator = val_evaluator
174
+
175
+ # training settings
176
+ max_epochs = 300
177
+ num_last_epochs = 15
178
+ interval = 10
179
+
180
+ train_cfg = dict(max_epochs=max_epochs, val_interval=interval)
181
+
182
+ # optimizer
183
+ # default 8 gpu
184
+ base_lr = 0.01
185
+ optim_wrapper = dict(
186
+ type='OptimWrapper',
187
+ optimizer=dict(
188
+ type='SGD', lr=base_lr, momentum=0.9, weight_decay=5e-4,
189
+ nesterov=True),
190
+ paramwise_cfg=dict(norm_decay_mult=0., bias_decay_mult=0.))
191
+
192
+ # learning rate
193
+ param_scheduler = [
194
+ dict(
195
+ # use quadratic formula to warm up 5 epochs
196
+ # and lr is updated by iteration
197
+ # TODO: fix default scope in get function
198
+ type='mmdet.QuadraticWarmupLR',
199
+ by_epoch=True,
200
+ begin=0,
201
+ end=5,
202
+ convert_to_iter_based=True),
203
+ dict(
204
+ # use cosine lr from 5 to 285 epoch
205
+ type='CosineAnnealingLR',
206
+ eta_min=base_lr * 0.05,
207
+ begin=5,
208
+ T_max=max_epochs - num_last_epochs,
209
+ end=max_epochs - num_last_epochs,
210
+ by_epoch=True,
211
+ convert_to_iter_based=True),
212
+ dict(
213
+ # use fixed lr during last 15 epochs
214
+ type='ConstantLR',
215
+ by_epoch=True,
216
+ factor=1,
217
+ begin=max_epochs - num_last_epochs,
218
+ end=max_epochs,
219
+ )
220
+ ]
221
+
222
+ default_hooks = dict(
223
+ checkpoint=dict(
224
+ interval=interval,
225
+ max_keep_ckpts=3 # only keep latest 3 checkpoints
226
+ ))
227
+
228
+ custom_hooks = [
229
+ dict(
230
+ type='YOLOXModeSwitchHook',
231
+ num_last_epochs=num_last_epochs,
232
+ priority=48),
233
+ dict(type='SyncNormHook', priority=48),
234
+ dict(
235
+ type='EMAHook',
236
+ ema_type='ExpMomentumEMA',
237
+ momentum=0.0001,
238
+ update_buffers=True,
239
+ priority=49)
240
+ ]
241
+
242
+ # NOTE: `auto_scale_lr` is for automatically scaling LR,
243
+ # USER SHOULD NOT CHANGE ITS VALUES.
244
+ # base_batch_size = (8 GPUs) x (8 samples per GPU)
245
+ auto_scale_lr = dict(base_batch_size=64)
src/controlnet_aux/hed/__init__.py ADDED
@@ -0,0 +1,129 @@
1
+ # This is an improved version and model of HED edge detection with Apache License, Version 2.0.
2
+ # Please use this implementation in your products
3
+ # This implementation may produce slightly different results from Saining Xie's official implementations,
4
+ # but it generates smoother edges and is more suitable for ControlNet as well as other image-to-image translations.
5
+ # Different from official models and other implementations, this is an RGB-input model (rather than BGR)
6
+ # and in this way it works better for gradio's RGB protocol
7
+
8
+ import os
9
+ import warnings
10
+
11
+ import cv2
12
+ import numpy as np
13
+ import torch
14
+ from einops import rearrange
15
+ from huggingface_hub import hf_hub_download
16
+ from PIL import Image
17
+
18
+ from ..util import HWC3, nms, resize_image, safe_step
19
+
20
+
21
+ class DoubleConvBlock(torch.nn.Module):
22
+ def __init__(self, input_channel, output_channel, layer_number):
23
+ super().__init__()
24
+ self.convs = torch.nn.Sequential()
25
+ self.convs.append(torch.nn.Conv2d(in_channels=input_channel, out_channels=output_channel, kernel_size=(3, 3), stride=(1, 1), padding=1))
26
+ for i in range(1, layer_number):
27
+ self.convs.append(torch.nn.Conv2d(in_channels=output_channel, out_channels=output_channel, kernel_size=(3, 3), stride=(1, 1), padding=1))
28
+ self.projection = torch.nn.Conv2d(in_channels=output_channel, out_channels=1, kernel_size=(1, 1), stride=(1, 1), padding=0)
29
+
30
+ def __call__(self, x, down_sampling=False):
31
+ h = x
32
+ if down_sampling:
33
+ h = torch.nn.functional.max_pool2d(h, kernel_size=(2, 2), stride=(2, 2))
34
+ for conv in self.convs:
35
+ h = conv(h)
36
+ h = torch.nn.functional.relu(h)
37
+ return h, self.projection(h)
38
+
39
+
40
+ class ControlNetHED_Apache2(torch.nn.Module):
41
+ def __init__(self):
42
+ super().__init__()
43
+ self.norm = torch.nn.Parameter(torch.zeros(size=(1, 3, 1, 1)))
44
+ self.block1 = DoubleConvBlock(input_channel=3, output_channel=64, layer_number=2)
45
+ self.block2 = DoubleConvBlock(input_channel=64, output_channel=128, layer_number=2)
46
+ self.block3 = DoubleConvBlock(input_channel=128, output_channel=256, layer_number=3)
47
+ self.block4 = DoubleConvBlock(input_channel=256, output_channel=512, layer_number=3)
48
+ self.block5 = DoubleConvBlock(input_channel=512, output_channel=512, layer_number=3)
49
+
50
+ def __call__(self, x):
51
+ h = x - self.norm
52
+ h, projection1 = self.block1(h)
53
+ h, projection2 = self.block2(h, down_sampling=True)
54
+ h, projection3 = self.block3(h, down_sampling=True)
55
+ h, projection4 = self.block4(h, down_sampling=True)
56
+ h, projection5 = self.block5(h, down_sampling=True)
57
+ return projection1, projection2, projection3, projection4, projection5
58
+
59
+ class HEDdetector:
60
+ def __init__(self, netNetwork):
61
+ self.netNetwork = netNetwork
62
+
63
+ @classmethod
64
+ def from_pretrained(cls, pretrained_model_or_path, filename=None, cache_dir=None):
65
+ filename = filename or "ControlNetHED.pth"
66
+
67
+ if os.path.isdir(pretrained_model_or_path):
68
+ model_path = os.path.join(pretrained_model_or_path, filename)
69
+ else:
70
+ model_path = hf_hub_download(pretrained_model_or_path, filename, cache_dir=cache_dir)
71
+
72
+ netNetwork = ControlNetHED_Apache2()
73
+ netNetwork.load_state_dict(torch.load(model_path, map_location='cpu'))
74
+ netNetwork.float().eval()
75
+
76
+ return cls(netNetwork)
77
+
78
+ def to(self, device):
79
+ self.netNetwork.to(device)
80
+ return self
81
+
82
+ def __call__(self, input_image, detect_resolution=512, image_resolution=512, safe=False, output_type="pil", scribble=False, **kwargs):
83
+ if "return_pil" in kwargs:
84
+ warnings.warn("return_pil is deprecated. Use output_type instead.", DeprecationWarning)
85
+ output_type = "pil" if kwargs["return_pil"] else "np"
86
+ if type(output_type) is bool:
87
+ warnings.warn("Passing `True` or `False` to `output_type` is deprecated and will raise an error in future versions")
88
+ if output_type:
89
+ output_type = "pil"
90
+
91
+ device = next(iter(self.netNetwork.parameters())).device
92
+ if not isinstance(input_image, np.ndarray):
93
+ input_image = np.array(input_image, dtype=np.uint8)
94
+
95
+ input_image = HWC3(input_image)
96
+ input_image = resize_image(input_image, detect_resolution)
97
+
98
+ assert input_image.ndim == 3
99
+ H, W, C = input_image.shape
100
+ with torch.no_grad():
101
+ image_hed = torch.from_numpy(input_image.copy()).float().to(device)
102
+ image_hed = rearrange(image_hed, 'h w c -> 1 c h w')
103
+ edges = self.netNetwork(image_hed)
104
+ edges = [e.detach().cpu().numpy().astype(np.float32)[0, 0] for e in edges]
105
+ edges = [cv2.resize(e, (W, H), interpolation=cv2.INTER_LINEAR) for e in edges]
106
+ edges = np.stack(edges, axis=2)
107
+ edge = 1 / (1 + np.exp(-np.mean(edges, axis=2).astype(np.float64)))
108
+ if safe:
109
+ edge = safe_step(edge)
110
+ edge = (edge * 255.0).clip(0, 255).astype(np.uint8)
111
+
112
+ detected_map = edge
113
+ detected_map = HWC3(detected_map)
114
+
115
+ img = resize_image(input_image, image_resolution)
116
+ H, W, C = img.shape
117
+
118
+ detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
119
+
120
+ if scribble:
121
+ detected_map = nms(detected_map, 127, 3.0)
122
+ detected_map = cv2.GaussianBlur(detected_map, (0, 0), 3.0)
123
+ detected_map[detected_map > 4] = 255
124
+ detected_map[detected_map < 255] = 0
125
+
126
+ if output_type == "pil":
127
+ detected_map = Image.fromarray(detected_map)
128
+
129
+ return detected_map
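A minimal usage sketch for HEDdetector; the hub repo id below is an assumption (any local directory or repo containing ControlNetHED.pth works the same way):
# Sketch only: load HED weights and produce a soft-edge map.
from PIL import Image
from src.controlnet_aux import HEDdetector  # assumes the repo root is on sys.path

# "lllyasviel/Annotators" is an assumed checkpoint repo hosting ControlNetHED.pth.
hed = HEDdetector.from_pretrained("lllyasviel/Annotators").to("cuda")
soft_edges = hed(Image.open("example.png"))
soft_edges.save("example_hed.png")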
src/controlnet_aux/hed/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (10.1 kB). View file
 
src/controlnet_aux/leres/__init__.py ADDED
@@ -0,0 +1,118 @@
1
+ import os
2
+
3
+ import cv2
4
+ import numpy as np
5
+ import torch
6
+ from huggingface_hub import hf_hub_download
7
+ from PIL import Image
8
+
9
+ from ..util import HWC3, resize_image
10
+ from .leres.depthmap import estimateboost, estimateleres
11
+ from .leres.multi_depth_model_woauxi import RelDepthModel
12
+ from .leres.net_tools import strip_prefix_if_present
13
+ from .pix2pix.models.pix2pix4depth_model import Pix2Pix4DepthModel
14
+ from .pix2pix.options.test_options import TestOptions
15
+
16
+
17
+ class LeresDetector:
18
+ def __init__(self, model, pix2pixmodel):
19
+ self.model = model
20
+ self.pix2pixmodel = pix2pixmodel
21
+
22
+ @classmethod
23
+ def from_pretrained(cls, pretrained_model_or_path, filename=None, pix2pix_filename=None, cache_dir=None):
24
+ filename = filename or "res101.pth"
25
+ pix2pix_filename = pix2pix_filename or "latest_net_G.pth"
26
+
27
+ if os.path.isdir(pretrained_model_or_path):
28
+ model_path = os.path.join(pretrained_model_or_path, filename)
29
+ else:
30
+ model_path = hf_hub_download(pretrained_model_or_path, filename, cache_dir=cache_dir)
31
+
32
+ checkpoint = torch.load(model_path, map_location=torch.device('cpu'))
33
+
34
+ model = RelDepthModel(backbone='resnext101')
35
+ model.load_state_dict(strip_prefix_if_present(checkpoint['depth_model'], "module."), strict=True)
36
+ del checkpoint
37
+
38
+ if os.path.isdir(pretrained_model_or_path):
39
+ model_path = os.path.join(pretrained_model_or_path, pix2pix_filename)
40
+ else:
41
+ model_path = hf_hub_download(pretrained_model_or_path, pix2pix_filename, cache_dir=cache_dir)
42
+
43
+ opt = TestOptions().parse()
44
+ if not torch.cuda.is_available():
45
+ opt.gpu_ids = [] # cpu mode
46
+ pix2pixmodel = Pix2Pix4DepthModel(opt)
47
+ pix2pixmodel.save_dir = os.path.dirname(model_path)
48
+ pix2pixmodel.load_networks('latest')
49
+ pix2pixmodel.eval()
50
+
51
+ return cls(model, pix2pixmodel)
52
+
53
+ def to(self, device):
54
+ self.model.to(device)
55
+ # TODO - refactor pix2pix implementation to support device migration
56
+ # self.pix2pixmodel.to(device)
57
+ return self
58
+
59
+ def __call__(self, input_image, thr_a=0, thr_b=0, boost=False, detect_resolution=512, image_resolution=512, output_type="pil"):
60
+ device = next(iter(self.model.parameters())).device
61
+ if not isinstance(input_image, np.ndarray):
62
+ input_image = np.array(input_image, dtype=np.uint8)
63
+
64
+ input_image = HWC3(input_image)
65
+ input_image = resize_image(input_image, detect_resolution)
66
+
67
+ assert input_image.ndim == 3
68
+ height, width, dim = input_image.shape
69
+
70
+ with torch.no_grad():
71
+
72
+ if boost:
73
+ depth = estimateboost(input_image, self.model, 0, self.pix2pixmodel, max(width, height))
74
+ else:
75
+ depth = estimateleres(input_image, self.model, width, height)
76
+
77
+ numbytes=2
78
+ depth_min = depth.min()
79
+ depth_max = depth.max()
80
+ max_val = (2**(8*numbytes))-1
81
+
82
+ # check output before normalizing and mapping to 16 bit
83
+ if depth_max - depth_min > np.finfo("float").eps:
84
+ out = max_val * (depth - depth_min) / (depth_max - depth_min)
85
+ else:
86
+ out = np.zeros(depth.shape)
87
+
88
+ # single channel, 16 bit image
89
+ depth_image = out.astype("uint16")
90
+
91
+ # convert to uint8
92
+ depth_image = cv2.convertScaleAbs(depth_image, alpha=(255.0/65535.0))
93
+
94
+ # remove near
95
+ if thr_a != 0:
96
+ thr_a = ((thr_a/100)*255)
97
+ depth_image = cv2.threshold(depth_image, thr_a, 255, cv2.THRESH_TOZERO)[1]
98
+
99
+ # invert image
100
+ depth_image = cv2.bitwise_not(depth_image)
101
+
102
+ # remove bg
103
+ if thr_b != 0:
104
+ thr_b = ((thr_b/100)*255)
105
+ depth_image = cv2.threshold(depth_image, thr_b, 255, cv2.THRESH_TOZERO)[1]
106
+
107
+ detected_map = depth_image
108
+ detected_map = HWC3(detected_map)
109
+
110
+ img = resize_image(input_image, image_resolution)
111
+ H, W, C = img.shape
112
+
113
+ detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
114
+
115
+ if output_type == "pil":
116
+ detected_map = Image.fromarray(detected_map)
117
+
118
+ return detected_map
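A matching usage sketch for `LeresDetector`. The weight filenames (`res101.pth`, `latest_net_G.pth`) are the defaults shown in `from_pretrained` above; the Hub repo id is an assumption.

```python
# Hedged sketch: the Hub repo id is an assumption; filenames are the defaults above.
from PIL import Image
from src.controlnet_aux.leres import LeresDetector

leres = LeresDetector.from_pretrained("lllyasviel/Annotators").to("cuda")
image = Image.open("input.png")

# thr_a removes near depth, thr_b removes background (both as percentages);
# boost=True runs the slower pix2pix-merged estimate.
depth = leres(image, thr_a=0, thr_b=0, boost=False, output_type="pil")
depth.save("leres_depth.png")
```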
src/controlnet_aux/leres/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (6.37 kB).
 
src/controlnet_aux/leres/leres/LICENSE ADDED
@@ -0,0 +1,23 @@
1
+ https://github.com/thygate/stable-diffusion-webui-depthmap-script
2
+
3
+ MIT License
4
+
5
+ Copyright (c) 2023 Bob Thiry
6
+
7
+ Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ of this software and associated documentation files (the "Software"), to deal
9
+ in the Software without restriction, including without limitation the rights
10
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ copies of the Software, and to permit persons to whom the Software is
12
+ furnished to do so, subject to the following conditions:
13
+
14
+ The above copyright notice and this permission notice shall be included in all
15
+ copies or substantial portions of the Software.
16
+
17
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ SOFTWARE.
src/controlnet_aux/leres/leres/Resnet.py ADDED
@@ -0,0 +1,199 @@
1
+ import torch.nn as nn
2
+ import torch.nn as NN
3
+
4
+ __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
5
+ 'resnet152']
6
+
7
+
8
+ model_urls = {
9
+ 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
10
+ 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
11
+ 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
12
+ 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
13
+ 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
14
+ }
15
+
16
+
17
+ def conv3x3(in_planes, out_planes, stride=1):
18
+ """3x3 convolution with padding"""
19
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
20
+ padding=1, bias=False)
21
+
22
+
23
+ class BasicBlock(nn.Module):
24
+ expansion = 1
25
+
26
+ def __init__(self, inplanes, planes, stride=1, downsample=None):
27
+ super(BasicBlock, self).__init__()
28
+ self.conv1 = conv3x3(inplanes, planes, stride)
29
+ self.bn1 = NN.BatchNorm2d(planes) #NN.BatchNorm2d
30
+ self.relu = nn.ReLU(inplace=True)
31
+ self.conv2 = conv3x3(planes, planes)
32
+ self.bn2 = NN.BatchNorm2d(planes) #NN.BatchNorm2d
33
+ self.downsample = downsample
34
+ self.stride = stride
35
+
36
+ def forward(self, x):
37
+ residual = x
38
+
39
+ out = self.conv1(x)
40
+ out = self.bn1(out)
41
+ out = self.relu(out)
42
+
43
+ out = self.conv2(out)
44
+ out = self.bn2(out)
45
+
46
+ if self.downsample is not None:
47
+ residual = self.downsample(x)
48
+
49
+ out += residual
50
+ out = self.relu(out)
51
+
52
+ return out
53
+
54
+
55
+ class Bottleneck(nn.Module):
56
+ expansion = 4
57
+
58
+ def __init__(self, inplanes, planes, stride=1, downsample=None):
59
+ super(Bottleneck, self).__init__()
60
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
61
+ self.bn1 = NN.BatchNorm2d(planes) #NN.BatchNorm2d
62
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
63
+ padding=1, bias=False)
64
+ self.bn2 = NN.BatchNorm2d(planes) #NN.BatchNorm2d
65
+ self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False)
66
+ self.bn3 = NN.BatchNorm2d(planes * self.expansion) #NN.BatchNorm2d
67
+ self.relu = nn.ReLU(inplace=True)
68
+ self.downsample = downsample
69
+ self.stride = stride
70
+
71
+ def forward(self, x):
72
+ residual = x
73
+
74
+ out = self.conv1(x)
75
+ out = self.bn1(out)
76
+ out = self.relu(out)
77
+
78
+ out = self.conv2(out)
79
+ out = self.bn2(out)
80
+ out = self.relu(out)
81
+
82
+ out = self.conv3(out)
83
+ out = self.bn3(out)
84
+
85
+ if self.downsample is not None:
86
+ residual = self.downsample(x)
87
+
88
+ out += residual
89
+ out = self.relu(out)
90
+
91
+ return out
92
+
93
+
94
+ class ResNet(nn.Module):
95
+
96
+ def __init__(self, block, layers, num_classes=1000):
97
+ self.inplanes = 64
98
+ super(ResNet, self).__init__()
99
+ self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
100
+ bias=False)
101
+ self.bn1 = NN.BatchNorm2d(64) #NN.BatchNorm2d
102
+ self.relu = nn.ReLU(inplace=True)
103
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
104
+ self.layer1 = self._make_layer(block, 64, layers[0])
105
+ self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
106
+ self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
107
+ self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
108
+ #self.avgpool = nn.AvgPool2d(7, stride=1)
109
+ #self.fc = nn.Linear(512 * block.expansion, num_classes)
110
+
111
+ for m in self.modules():
112
+ if isinstance(m, nn.Conv2d):
113
+ nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
114
+ elif isinstance(m, nn.BatchNorm2d):
115
+ nn.init.constant_(m.weight, 1)
116
+ nn.init.constant_(m.bias, 0)
117
+
118
+ def _make_layer(self, block, planes, blocks, stride=1):
119
+ downsample = None
120
+ if stride != 1 or self.inplanes != planes * block.expansion:
121
+ downsample = nn.Sequential(
122
+ nn.Conv2d(self.inplanes, planes * block.expansion,
123
+ kernel_size=1, stride=stride, bias=False),
124
+ NN.BatchNorm2d(planes * block.expansion), #NN.BatchNorm2d
125
+ )
126
+
127
+ layers = []
128
+ layers.append(block(self.inplanes, planes, stride, downsample))
129
+ self.inplanes = planes * block.expansion
130
+ for i in range(1, blocks):
131
+ layers.append(block(self.inplanes, planes))
132
+
133
+ return nn.Sequential(*layers)
134
+
135
+ def forward(self, x):
136
+ features = []
137
+
138
+ x = self.conv1(x)
139
+ x = self.bn1(x)
140
+ x = self.relu(x)
141
+ x = self.maxpool(x)
142
+
143
+ x = self.layer1(x)
144
+ features.append(x)
145
+ x = self.layer2(x)
146
+ features.append(x)
147
+ x = self.layer3(x)
148
+ features.append(x)
149
+ x = self.layer4(x)
150
+ features.append(x)
151
+
152
+ return features
153
+
154
+
155
+ def resnet18(pretrained=True, **kwargs):
156
+ """Constructs a ResNet-18 model.
157
+ Args:
158
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
159
+ """
160
+ model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
161
+ return model
162
+
163
+
164
+ def resnet34(pretrained=True, **kwargs):
165
+ """Constructs a ResNet-34 model.
166
+ Args:
167
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
168
+ """
169
+ model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
170
+ return model
171
+
172
+
173
+ def resnet50(pretrained=True, **kwargs):
174
+ """Constructs a ResNet-50 model.
175
+ Args:
176
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
177
+ """
178
+ model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
179
+
180
+ return model
181
+
182
+
183
+ def resnet101(pretrained=True, **kwargs):
184
+ """Constructs a ResNet-101 model.
185
+ Args:
186
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
187
+ """
188
+ model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
189
+
190
+ return model
191
+
192
+
193
+ def resnet152(pretrained=True, **kwargs):
194
+ """Constructs a ResNet-152 model.
195
+ Args:
196
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
197
+ """
198
+ model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
199
+ return model
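This vendored copy keeps `model_urls` but never downloads them, so the `pretrained` flag is effectively ignored, and `forward` returns the four stage feature maps rather than logits (the avgpool/fc head is commented out). A small sanity-check sketch, assuming the module is importable from the repo root:

```python
# Sanity check: resnet50 here is a feature extractor, not a classifier.
import torch
from src.controlnet_aux.leres.leres.Resnet import resnet50

model = resnet50().eval()
with torch.no_grad():
    feats = model(torch.randn(1, 3, 224, 224))

print([tuple(f.shape) for f in feats])
# Expected with Bottleneck (expansion=4):
# [(1, 256, 56, 56), (1, 512, 28, 28), (1, 1024, 14, 14), (1, 2048, 7, 7)]
```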
src/controlnet_aux/leres/leres/Resnext_torch.py ADDED
@@ -0,0 +1,237 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ import torch.nn as nn
4
+
5
+ try:
6
+ from urllib import urlretrieve
7
+ except ImportError:
8
+ from urllib.request import urlretrieve
9
+
10
+ __all__ = ['resnext101_32x8d']
11
+
12
+
13
+ model_urls = {
14
+ 'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth',
15
+ 'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth',
16
+ }
17
+
18
+
19
+ def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
20
+ """3x3 convolution with padding"""
21
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
22
+ padding=dilation, groups=groups, bias=False, dilation=dilation)
23
+
24
+
25
+ def conv1x1(in_planes, out_planes, stride=1):
26
+ """1x1 convolution"""
27
+ return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
28
+
29
+
30
+ class BasicBlock(nn.Module):
31
+ expansion = 1
32
+
33
+ def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
34
+ base_width=64, dilation=1, norm_layer=None):
35
+ super(BasicBlock, self).__init__()
36
+ if norm_layer is None:
37
+ norm_layer = nn.BatchNorm2d
38
+ if groups != 1 or base_width != 64:
39
+ raise ValueError('BasicBlock only supports groups=1 and base_width=64')
40
+ if dilation > 1:
41
+ raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
42
+ # Both self.conv1 and self.downsample layers downsample the input when stride != 1
43
+ self.conv1 = conv3x3(inplanes, planes, stride)
44
+ self.bn1 = norm_layer(planes)
45
+ self.relu = nn.ReLU(inplace=True)
46
+ self.conv2 = conv3x3(planes, planes)
47
+ self.bn2 = norm_layer(planes)
48
+ self.downsample = downsample
49
+ self.stride = stride
50
+
51
+ def forward(self, x):
52
+ identity = x
53
+
54
+ out = self.conv1(x)
55
+ out = self.bn1(out)
56
+ out = self.relu(out)
57
+
58
+ out = self.conv2(out)
59
+ out = self.bn2(out)
60
+
61
+ if self.downsample is not None:
62
+ identity = self.downsample(x)
63
+
64
+ out += identity
65
+ out = self.relu(out)
66
+
67
+ return out
68
+
69
+
70
+ class Bottleneck(nn.Module):
71
+ # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
72
+ # while original implementation places the stride at the first 1x1 convolution(self.conv1)
73
+ # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
74
+ # This variant is also known as ResNet V1.5 and improves accuracy according to
75
+ # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
76
+
77
+ expansion = 4
78
+
79
+ def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
80
+ base_width=64, dilation=1, norm_layer=None):
81
+ super(Bottleneck, self).__init__()
82
+ if norm_layer is None:
83
+ norm_layer = nn.BatchNorm2d
84
+ width = int(planes * (base_width / 64.)) * groups
85
+ # Both self.conv2 and self.downsample layers downsample the input when stride != 1
86
+ self.conv1 = conv1x1(inplanes, width)
87
+ self.bn1 = norm_layer(width)
88
+ self.conv2 = conv3x3(width, width, stride, groups, dilation)
89
+ self.bn2 = norm_layer(width)
90
+ self.conv3 = conv1x1(width, planes * self.expansion)
91
+ self.bn3 = norm_layer(planes * self.expansion)
92
+ self.relu = nn.ReLU(inplace=True)
93
+ self.downsample = downsample
94
+ self.stride = stride
95
+
96
+ def forward(self, x):
97
+ identity = x
98
+
99
+ out = self.conv1(x)
100
+ out = self.bn1(out)
101
+ out = self.relu(out)
102
+
103
+ out = self.conv2(out)
104
+ out = self.bn2(out)
105
+ out = self.relu(out)
106
+
107
+ out = self.conv3(out)
108
+ out = self.bn3(out)
109
+
110
+ if self.downsample is not None:
111
+ identity = self.downsample(x)
112
+
113
+ out += identity
114
+ out = self.relu(out)
115
+
116
+ return out
117
+
118
+
119
+ class ResNet(nn.Module):
120
+
121
+ def __init__(self, block, layers, num_classes=1000, zero_init_residual=False,
122
+ groups=1, width_per_group=64, replace_stride_with_dilation=None,
123
+ norm_layer=None):
124
+ super(ResNet, self).__init__()
125
+ if norm_layer is None:
126
+ norm_layer = nn.BatchNorm2d
127
+ self._norm_layer = norm_layer
128
+
129
+ self.inplanes = 64
130
+ self.dilation = 1
131
+ if replace_stride_with_dilation is None:
132
+ # each element in the tuple indicates if we should replace
133
+ # the 2x2 stride with a dilated convolution instead
134
+ replace_stride_with_dilation = [False, False, False]
135
+ if len(replace_stride_with_dilation) != 3:
136
+ raise ValueError("replace_stride_with_dilation should be None "
137
+ "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
138
+ self.groups = groups
139
+ self.base_width = width_per_group
140
+ self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
141
+ bias=False)
142
+ self.bn1 = norm_layer(self.inplanes)
143
+ self.relu = nn.ReLU(inplace=True)
144
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
145
+ self.layer1 = self._make_layer(block, 64, layers[0])
146
+ self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
147
+ dilate=replace_stride_with_dilation[0])
148
+ self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
149
+ dilate=replace_stride_with_dilation[1])
150
+ self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
151
+ dilate=replace_stride_with_dilation[2])
152
+ #self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
153
+ #self.fc = nn.Linear(512 * block.expansion, num_classes)
154
+
155
+ for m in self.modules():
156
+ if isinstance(m, nn.Conv2d):
157
+ nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
158
+ elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
159
+ nn.init.constant_(m.weight, 1)
160
+ nn.init.constant_(m.bias, 0)
161
+
162
+ # Zero-initialize the last BN in each residual branch,
163
+ # so that the residual branch starts with zeros, and each residual block behaves like an identity.
164
+ # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
165
+ if zero_init_residual:
166
+ for m in self.modules():
167
+ if isinstance(m, Bottleneck):
168
+ nn.init.constant_(m.bn3.weight, 0)
169
+ elif isinstance(m, BasicBlock):
170
+ nn.init.constant_(m.bn2.weight, 0)
171
+
172
+ def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
173
+ norm_layer = self._norm_layer
174
+ downsample = None
175
+ previous_dilation = self.dilation
176
+ if dilate:
177
+ self.dilation *= stride
178
+ stride = 1
179
+ if stride != 1 or self.inplanes != planes * block.expansion:
180
+ downsample = nn.Sequential(
181
+ conv1x1(self.inplanes, planes * block.expansion, stride),
182
+ norm_layer(planes * block.expansion),
183
+ )
184
+
185
+ layers = []
186
+ layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
187
+ self.base_width, previous_dilation, norm_layer))
188
+ self.inplanes = planes * block.expansion
189
+ for _ in range(1, blocks):
190
+ layers.append(block(self.inplanes, planes, groups=self.groups,
191
+ base_width=self.base_width, dilation=self.dilation,
192
+ norm_layer=norm_layer))
193
+
194
+ return nn.Sequential(*layers)
195
+
196
+ def _forward_impl(self, x):
197
+ # See note [TorchScript super()]
198
+ features = []
199
+ x = self.conv1(x)
200
+ x = self.bn1(x)
201
+ x = self.relu(x)
202
+ x = self.maxpool(x)
203
+
204
+ x = self.layer1(x)
205
+ features.append(x)
206
+
207
+ x = self.layer2(x)
208
+ features.append(x)
209
+
210
+ x = self.layer3(x)
211
+ features.append(x)
212
+
213
+ x = self.layer4(x)
214
+ features.append(x)
215
+
216
+ #x = self.avgpool(x)
217
+ #x = torch.flatten(x, 1)
218
+ #x = self.fc(x)
219
+
220
+ return features
221
+
222
+ def forward(self, x):
223
+ return self._forward_impl(x)
224
+
225
+
226
+
227
+ def resnext101_32x8d(pretrained=True, **kwargs):
228
+ """Constructs a ResNeXt-101 32x8d model.
229
+ Args:
230
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
231
+ """
232
+ kwargs['groups'] = 32
233
+ kwargs['width_per_group'] = 8
234
+
235
+ model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
236
+ return model
237
+
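The factory pins `groups=32` and `width_per_group=8`, so each Bottleneck's grouped 3x3 conv runs at width `planes * (8 / 64) * 32`, while the stage output channels match the plain ResNet-101 above. A brief sketch under the same import-path assumption:

```python
# Hedged sketch: import path assumes the repo root is on sys.path.
import torch
from src.controlnet_aux.leres.leres.Resnext_torch import resnext101_32x8d

backbone = resnext101_32x8d().eval()
with torch.no_grad():
    feats = backbone(torch.randn(1, 3, 224, 224))

print([f.shape[1] for f in feats])  # [256, 512, 1024, 2048]
```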