Build error
init
This view is limited to 50 files because it contains too many changes. See raw diff.
- .gitattributes +53 -0
- README.md +5 -6
- __init__.py +0 -0
- anchorcrafter/__init__.py +0 -0
- anchorcrafter/dwpose/__init__.py +0 -0
- anchorcrafter/dwpose/dwpose_detector.py +71 -0
- anchorcrafter/dwpose/onnxdet.py +145 -0
- anchorcrafter/dwpose/onnxpose.py +375 -0
- anchorcrafter/dwpose/preprocess.py +85 -0
- anchorcrafter/dwpose/util.py +133 -0
- anchorcrafter/dwpose/wholebody.py +60 -0
- anchorcrafter/modules/__init__.py +0 -0
- anchorcrafter/modules/attention_processor.py +466 -0
- anchorcrafter/modules/obj_attn_net.py +47 -0
- anchorcrafter/modules/obj_proj_net.py +33 -0
- anchorcrafter/modules/pose_net.py +88 -0
- anchorcrafter/modules/track_net.py +76 -0
- anchorcrafter/modules/unet.py +509 -0
- anchorcrafter/pipelines/pipeline.py +739 -0
- anchorcrafter/utils/__init__.py +0 -0
- anchorcrafter/utils/geglu_patch.py +10 -0
- anchorcrafter/utils/loader.py +45 -0
- anchorcrafter/utils/utils.py +51 -0
- app.py +332 -0
- config/test.yaml +17 -0
- constants.py +4 -0
- data/anchor/1.jpg +0 -0
- data/anchor/2.jpg +0 -0
- data/anchor/3.jpg +3 -0
- data/anchor/4.jpg +3 -0
- data/anchor/5.jpg +3 -0
- data/depth_cut/cheese_1.mp4 +3 -0
- data/depth_cut/cheese_2.mp4 +3 -0
- data/depth_cut/cup_1.mp4 +3 -0
- data/depth_cut/cup_2.mp4 +3 -0
- data/depth_cut/earphone_1.mp4 +3 -0
- data/depth_cut/earphone_2.mp4 +3 -0
- data/depth_cut/hmbb_1.mp4 +3 -0
- data/depth_cut/hmbb_2.mp4 +3 -0
- data/depth_cut/mouse_1.mp4 +3 -0
- data/depth_cut/mouse_2.mp4 +3 -0
- data/hand_cut/cheese_1.mp4 +3 -0
- data/hand_cut/cheese_2.mp4 +3 -0
- data/hand_cut/cup_1.mp4 +3 -0
- data/hand_cut/cup_2.mp4 +3 -0
- data/hand_cut/earphone_1.mp4 +3 -0
- data/hand_cut/earphone_2.mp4 +3 -0
- data/hand_cut/hmbb_1.mp4 +3 -0
- data/hand_cut/hmbb_2.mp4 +3 -0
- data/hand_cut/mouse_1.mp4 +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,56 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/anchor/3.jpg filter=lfs diff=lfs merge=lfs -text
+data/anchor/4.jpg filter=lfs diff=lfs merge=lfs -text
+data/anchor/5.jpg filter=lfs diff=lfs merge=lfs -text
+data/depth_cut/cheese_1.mp4 filter=lfs diff=lfs merge=lfs -text
+data/depth_cut/cheese_2.mp4 filter=lfs diff=lfs merge=lfs -text
+data/depth_cut/cup_1.mp4 filter=lfs diff=lfs merge=lfs -text
+data/depth_cut/cup_2.mp4 filter=lfs diff=lfs merge=lfs -text
+data/depth_cut/earphone_1.mp4 filter=lfs diff=lfs merge=lfs -text
+data/depth_cut/earphone_2.mp4 filter=lfs diff=lfs merge=lfs -text
+data/depth_cut/hmbb_1.mp4 filter=lfs diff=lfs merge=lfs -text
+data/depth_cut/hmbb_2.mp4 filter=lfs diff=lfs merge=lfs -text
+data/depth_cut/mouse_1.mp4 filter=lfs diff=lfs merge=lfs -text
+data/depth_cut/mouse_2.mp4 filter=lfs diff=lfs merge=lfs -text
+data/hand_cut/cheese_1.mp4 filter=lfs diff=lfs merge=lfs -text
+data/hand_cut/cheese_2.mp4 filter=lfs diff=lfs merge=lfs -text
+data/hand_cut/cup_1.mp4 filter=lfs diff=lfs merge=lfs -text
+data/hand_cut/cup_2.mp4 filter=lfs diff=lfs merge=lfs -text
+data/hand_cut/earphone_1.mp4 filter=lfs diff=lfs merge=lfs -text
+data/hand_cut/earphone_2.mp4 filter=lfs diff=lfs merge=lfs -text
+data/hand_cut/hmbb_1.mp4 filter=lfs diff=lfs merge=lfs -text
+data/hand_cut/hmbb_2.mp4 filter=lfs diff=lfs merge=lfs -text
+data/hand_cut/mouse_1.mp4 filter=lfs diff=lfs merge=lfs -text
+data/hand_cut/mouse_2.mp4 filter=lfs diff=lfs merge=lfs -text
+data/object/cheese_0.jpg filter=lfs diff=lfs merge=lfs -text
+data/object/cheese_1.jpg filter=lfs diff=lfs merge=lfs -text
+data/object/cheese_2.jpg filter=lfs diff=lfs merge=lfs -text
+data/object/cup_0.jpg filter=lfs diff=lfs merge=lfs -text
+data/object/cup_1.jpg filter=lfs diff=lfs merge=lfs -text
+data/object/cup_2.jpg filter=lfs diff=lfs merge=lfs -text
+data/object/earphone_0.jpg filter=lfs diff=lfs merge=lfs -text
+data/object/earphone_1.jpg filter=lfs diff=lfs merge=lfs -text
+data/object/earphone_2.jpg filter=lfs diff=lfs merge=lfs -text
+data/object/hmbb_0.jpg filter=lfs diff=lfs merge=lfs -text
+data/object/hmbb_1.jpg filter=lfs diff=lfs merge=lfs -text
+data/object/hmbb_2.jpg filter=lfs diff=lfs merge=lfs -text
+data/object/mouse_0.jpg filter=lfs diff=lfs merge=lfs -text
+data/object/mouse_1.jpg filter=lfs diff=lfs merge=lfs -text
+data/object/mouse_2.jpg filter=lfs diff=lfs merge=lfs -text
+data/out/cheese.mp4 filter=lfs diff=lfs merge=lfs -text
+data/out/cup.mp4 filter=lfs diff=lfs merge=lfs -text
+data/out/ear.mp4 filter=lfs diff=lfs merge=lfs -text
+data/out/hmbb.mp4 filter=lfs diff=lfs merge=lfs -text
+data/out/mouse.mp4 filter=lfs diff=lfs merge=lfs -text
+data/video/cheese_1.mp4 filter=lfs diff=lfs merge=lfs -text
+data/video/cheese_2.mp4 filter=lfs diff=lfs merge=lfs -text
+data/video/cup_1.mp4 filter=lfs diff=lfs merge=lfs -text
+data/video/cup_2.mp4 filter=lfs diff=lfs merge=lfs -text
+data/video/earphone_1.mp4 filter=lfs diff=lfs merge=lfs -text
+data/video/earphone_2.mp4 filter=lfs diff=lfs merge=lfs -text
+data/video/hmbb_1.mp4 filter=lfs diff=lfs merge=lfs -text
+data/video/hmbb_2.mp4 filter=lfs diff=lfs merge=lfs -text
+data/video/mouse_1.mp4 filter=lfs diff=lfs merge=lfs -text
+data/video/mouse_2.mp4 filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,13 +1,12 @@
 ---
-title: Test
-emoji:
-colorFrom:
-colorTo:
+title: New Test
+emoji: 📚
+colorFrom: yellow
+colorTo: indigo
 sdk: gradio
-sdk_version: 5.
+sdk_version: 5.24.0
 app_file: app.py
 pinned: false
-license: apache-2.0
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__init__.py
ADDED
File without changes
anchorcrafter/__init__.py
ADDED
File without changes
anchorcrafter/dwpose/__init__.py
ADDED
File without changes
anchorcrafter/dwpose/dwpose_detector.py
ADDED
@@ -0,0 +1,71 @@
import os

import numpy as np
import torch

from .wholebody import Wholebody
from huggingface_hub import hf_hub_download
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class DWposeDetector:
    """
    A pose detect method for image-like data.

    Parameters:
        model_det: (str) serialized ONNX format model path,
            such as https://huggingface.co/yzd-v/DWPose/blob/main/yolox_l.onnx
        model_pose: (str) serialized ONNX format model path,
            such as https://huggingface.co/yzd-v/DWPose/blob/main/dw-ll_ucoco_384.onnx
        device: (str) 'cpu' or 'cuda:{device_id}'
    """
    def __init__(self, model_det, model_pose, device='cpu'):
        self.args = model_det, model_pose, device
        pose_estimation = Wholebody(*self.args)
        self.pose_estimation = pose_estimation

    def release_memory(self):
        if hasattr(self, 'pose_estimation'):
            del self.pose_estimation
            import gc; gc.collect()

    def __call__(self, oriImg):
        oriImg = oriImg.copy()
        H, W, C = oriImg.shape
        with torch.no_grad():
            candidate, score = self.pose_estimation(oriImg)
            nums, _, locs = candidate.shape
            candidate[..., 0] /= float(W)
            candidate[..., 1] /= float(H)
            body = candidate[:, :18].copy()
            body = body.reshape(nums * 18, locs)
            subset = score[:, :18].copy()
            for i in range(len(subset)):
                for j in range(len(subset[i])):
                    if subset[i][j] > 0.3:
                        subset[i][j] = int(18 * i + j)
                    else:
                        subset[i][j] = -1

            faces = candidate[:, 24:92]

            hands = candidate[:, 92:113]
            hands = np.vstack([hands, candidate[:, 113:]])

            faces_score = score[:, 24:92]
            hands_score = np.vstack([score[:, 92:113], score[:, 113:]])

            bodies = dict(candidate=body, subset=subset, score=score[:, :18])
            pose = dict(bodies=bodies, hands=hands, hands_score=hands_score, faces=faces, faces_score=faces_score)

            return pose


model_det_path = hf_hub_download(repo_id="yzd-v/DWPose", filename="yolox_l.onnx")
model_pose_path = hf_hub_download(repo_id="yzd-v/DWPose", filename="dw-ll_ucoco_384.onnx")

dwpose_detector = DWposeDetector(
    model_det=model_det_path,
    model_pose=model_pose_path,
    device=device)
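A minimal usage sketch for this module (assuming an RGB frame loaded with OpenCV and one of the anchor images added in this commit); dwpose_detector is the module-level instance created at import time above:

import cv2
from anchorcrafter.dwpose.dwpose_detector import dwpose_detector

# DWposeDetector expects an H x W x 3 image array.
frame = cv2.cvtColor(cv2.imread("data/anchor/1.jpg"), cv2.COLOR_BGR2RGB)
pose = dwpose_detector(frame)

# 'bodies' holds normalized (x, y) body keypoints plus a subset index table;
# 'hands', 'faces' and the matching *_score arrays follow the same layout.
print(pose["bodies"]["candidate"].shape, pose["hands"].shape)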
anchorcrafter/dwpose/onnxdet.py
ADDED
@@ -0,0 +1,145 @@
import cv2
import numpy as np


def nms(boxes, scores, nms_thr):
    """Single class NMS implemented in Numpy.

    Args:
        boxes (np.ndarray): shape=(N,4); N is number of boxes
        scores (np.ndarray): the score of bboxes
        nms_thr (float): the threshold in NMS

    Returns:
        List[int]: output bbox ids
    """
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]

    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]

    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])

        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        ovr = inter / (areas[i] + areas[order[1:]] - inter)

        inds = np.where(ovr <= nms_thr)[0]
        order = order[inds + 1]

    return keep

def multiclass_nms(boxes, scores, nms_thr, score_thr):
    """Multiclass NMS implemented in Numpy. Class-aware version.

    Args:
        boxes (np.ndarray): shape=(N,4); N is number of boxes
        scores (np.ndarray): the score of bboxes
        nms_thr (float): the threshold in NMS
        score_thr (float): the threshold of cls score

    Returns:
        np.ndarray: outputs bboxes coordinate
    """
    final_dets = []
    num_classes = scores.shape[1]
    for cls_ind in range(num_classes):
        cls_scores = scores[:, cls_ind]
        valid_score_mask = cls_scores > score_thr
        if valid_score_mask.sum() == 0:
            continue
        else:
            valid_scores = cls_scores[valid_score_mask]
            valid_boxes = boxes[valid_score_mask]
            keep = nms(valid_boxes, valid_scores, nms_thr)
            if len(keep) > 0:
                cls_inds = np.ones((len(keep), 1)) * cls_ind
                dets = np.concatenate(
                    [valid_boxes[keep], valid_scores[keep, None], cls_inds], 1
                )
                final_dets.append(dets)
    if len(final_dets) == 0:
        return None
    return np.concatenate(final_dets, 0)

def demo_postprocess(outputs, img_size, p6=False):
    grids = []
    expanded_strides = []
    strides = [8, 16, 32] if not p6 else [8, 16, 32, 64]

    hsizes = [img_size[0] // stride for stride in strides]
    wsizes = [img_size[1] // stride for stride in strides]

    for hsize, wsize, stride in zip(hsizes, wsizes, strides):
        xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize))
        grid = np.stack((xv, yv), 2).reshape(1, -1, 2)
        grids.append(grid)
        shape = grid.shape[:2]
        expanded_strides.append(np.full((*shape, 1), stride))

    grids = np.concatenate(grids, 1)
    expanded_strides = np.concatenate(expanded_strides, 1)
    outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides
    outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides

    return outputs

def preprocess(img, input_size, swap=(2, 0, 1)):
    if len(img.shape) == 3:
        padded_img = np.ones((input_size[0], input_size[1], 3), dtype=np.uint8) * 114
    else:
        padded_img = np.ones(input_size, dtype=np.uint8) * 114

    r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
    resized_img = cv2.resize(
        img,
        (int(img.shape[1] * r), int(img.shape[0] * r)),
        interpolation=cv2.INTER_LINEAR,
    ).astype(np.uint8)
    padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img

    padded_img = padded_img.transpose(swap)
    padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
    return padded_img, r

def inference_detector(session, oriImg):
    """run anchor detect
    """
    input_shape = (640,640)
    img, ratio = preprocess(oriImg, input_shape)

    ort_inputs = {session.get_inputs()[0].name: img[None, :, :, :]}
    output = session.run(None, ort_inputs)
    predictions = demo_postprocess(output[0], input_shape)[0]

    boxes = predictions[:, :4]
    scores = predictions[:, 4:5] * predictions[:, 5:]

    boxes_xyxy = np.ones_like(boxes)
    boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2]/2.
    boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3]/2.
    boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2]/2.
    boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3]/2.
    boxes_xyxy /= ratio
    dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.45, score_thr=0.1)
    if dets is not None:
        final_boxes, final_scores, final_cls_inds = dets[:, :4], dets[:, 4], dets[:, 5]
        isscore = final_scores>0.3
        iscat = final_cls_inds == 0
        isbbox = [ i and j for (i, j) in zip(isscore, iscat)]
        final_boxes = final_boxes[isbbox]
    else:
        final_boxes = np.array([])

    return final_boxes
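A short sketch of driving inference_detector directly, assuming the yolox_l.onnx weights referenced in dwpose_detector.py are available locally:

import cv2
import onnxruntime as ort
from anchorcrafter.dwpose.onnxdet import inference_detector

session = ort.InferenceSession("yolox_l.onnx", providers=["CPUExecutionProvider"])
img = cv2.cvtColor(cv2.imread("data/anchor/1.jpg"), cv2.COLOR_BGR2RGB)

# Person (class 0) boxes as (x1, y1, x2, y2) in original image coordinates,
# after NMS at IoU 0.45 and a 0.3 score cutoff.
boxes = inference_detector(session, img)
print(boxes.shape)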
anchorcrafter/dwpose/onnxpose.py
ADDED
@@ -0,0 +1,375 @@
from typing import List, Tuple

import cv2
import numpy as np
import onnxruntime as ort

def preprocess(
    img: np.ndarray, out_bbox, input_size: Tuple[int, int] = (192, 256)
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Do preprocessing for RTMPose model inference.

    Args:
        img (np.ndarray): Input image in shape.
        input_size (tuple): Input image size in shape (w, h).

    Returns:
        tuple:
        - resized_img (np.ndarray): Preprocessed image.
        - center (np.ndarray): Center of image.
        - scale (np.ndarray): Scale of image.
    """
    # get shape of image
    img_shape = img.shape[:2]
    out_img, out_center, out_scale = [], [], []
    if len(out_bbox) == 0:
        out_bbox = [[0, 0, img_shape[1], img_shape[0]]]
    for i in range(len(out_bbox)):
        x0 = out_bbox[i][0]
        y0 = out_bbox[i][1]
        x1 = out_bbox[i][2]
        y1 = out_bbox[i][3]
        bbox = np.array([x0, y0, x1, y1])

        # get center and scale
        center, scale = bbox_xyxy2cs(bbox, padding=1.25)

        # do affine transformation
        resized_img, scale = top_down_affine(input_size, scale, center, img)

        # normalize image
        mean = np.array([123.675, 116.28, 103.53])
        std = np.array([58.395, 57.12, 57.375])
        resized_img = (resized_img - mean) / std

        out_img.append(resized_img)
        out_center.append(center)
        out_scale.append(scale)

    return out_img, out_center, out_scale


def inference(sess: ort.InferenceSession, img: np.ndarray) -> np.ndarray:
    """Inference RTMPose model.

    Args:
        sess (ort.InferenceSession): ONNXRuntime session.
        img (np.ndarray): Input image in shape.

    Returns:
        outputs (np.ndarray): Output of RTMPose model.
    """
    all_out = []
    # build input
    for i in range(len(img)):
        input = [img[i].transpose(2, 0, 1)]

        # build output
        sess_input = {sess.get_inputs()[0].name: input}
        sess_output = []
        for out in sess.get_outputs():
            sess_output.append(out.name)

        # run model
        outputs = sess.run(sess_output, sess_input)
        all_out.append(outputs)

    return all_out


def postprocess(outputs: List[np.ndarray],
                model_input_size: Tuple[int, int],
                center: Tuple[int, int],
                scale: Tuple[int, int],
                simcc_split_ratio: float = 2.0
                ) -> Tuple[np.ndarray, np.ndarray]:
    """Postprocess for RTMPose model output.

    Args:
        outputs (np.ndarray): Output of RTMPose model.
        model_input_size (tuple): RTMPose model Input image size.
        center (tuple): Center of bbox in shape (x, y).
        scale (tuple): Scale of bbox in shape (w, h).
        simcc_split_ratio (float): Split ratio of simcc.

    Returns:
        tuple:
        - keypoints (np.ndarray): Rescaled keypoints.
        - scores (np.ndarray): Model predict scores.
    """
    all_key = []
    all_score = []
    for i in range(len(outputs)):
        # use simcc to decode
        simcc_x, simcc_y = outputs[i]
        keypoints, scores = decode(simcc_x, simcc_y, simcc_split_ratio)

        # rescale keypoints
        keypoints = keypoints / model_input_size * scale[i] + center[i] - scale[i] / 2
        all_key.append(keypoints[0])
        all_score.append(scores[0])

    return np.array(all_key), np.array(all_score)


def bbox_xyxy2cs(bbox: np.ndarray,
                 padding: float = 1.) -> Tuple[np.ndarray, np.ndarray]:
    """Transform the bbox format from (x,y,w,h) into (center, scale)

    Args:
        bbox (ndarray): Bounding box(es) in shape (4,) or (n, 4), formatted
            as (left, top, right, bottom)
        padding (float): BBox padding factor that will be multilied to scale.
            Default: 1.0

    Returns:
        tuple: A tuple containing center and scale.
        - np.ndarray[float32]: Center (x, y) of the bbox in shape (2,) or
            (n, 2)
        - np.ndarray[float32]: Scale (w, h) of the bbox in shape (2,) or
            (n, 2)
    """
    # convert single bbox from (4, ) to (1, 4)
    dim = bbox.ndim
    if dim == 1:
        bbox = bbox[None, :]

    # get bbox center and scale
    x1, y1, x2, y2 = np.hsplit(bbox, [1, 2, 3])
    center = np.hstack([x1 + x2, y1 + y2]) * 0.5
    scale = np.hstack([x2 - x1, y2 - y1]) * padding

    if dim == 1:
        center = center[0]
        scale = scale[0]

    return center, scale


def _fix_aspect_ratio(bbox_scale: np.ndarray,
                      aspect_ratio: float) -> np.ndarray:
    """Extend the scale to match the given aspect ratio.

    Args:
        scale (np.ndarray): The image scale (w, h) in shape (2, )
        aspect_ratio (float): The ratio of ``w/h``

    Returns:
        np.ndarray: The reshaped image scale in (2, )
    """
    w, h = np.hsplit(bbox_scale, [1])
    bbox_scale = np.where(w > h * aspect_ratio,
                          np.hstack([w, w / aspect_ratio]),
                          np.hstack([h * aspect_ratio, h]))
    return bbox_scale


def _rotate_point(pt: np.ndarray, angle_rad: float) -> np.ndarray:
    """Rotate a point by an angle.

    Args:
        pt (np.ndarray): 2D point coordinates (x, y) in shape (2, )
        angle_rad (float): rotation angle in radian

    Returns:
        np.ndarray: Rotated point in shape (2, )
    """
    sn, cs = np.sin(angle_rad), np.cos(angle_rad)
    rot_mat = np.array([[cs, -sn], [sn, cs]])
    return rot_mat @ pt


def _get_3rd_point(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """To calculate the affine matrix, three pairs of points are required. This
    function is used to get the 3rd point, given 2D points a & b.

    The 3rd point is defined by rotating vector `a - b` by 90 degrees
    anticlockwise, using b as the rotation center.

    Args:
        a (np.ndarray): The 1st point (x,y) in shape (2, )
        b (np.ndarray): The 2nd point (x,y) in shape (2, )

    Returns:
        np.ndarray: The 3rd point.
    """
    direction = a - b
    c = b + np.r_[-direction[1], direction[0]]
    return c


def get_warp_matrix(center: np.ndarray,
                    scale: np.ndarray,
                    rot: float,
                    output_size: Tuple[int, int],
                    shift: Tuple[float, float] = (0., 0.),
                    inv: bool = False) -> np.ndarray:
    """Calculate the affine transformation matrix that can warp the bbox area
    in the input image to the output size.

    Args:
        center (np.ndarray[2, ]): Center of the bounding box (x, y).
        scale (np.ndarray[2, ]): Scale of the bounding box
            wrt [width, height].
        rot (float): Rotation angle (degree).
        output_size (np.ndarray[2, ] | list(2,)): Size of the
            destination heatmaps.
        shift (0-100%): Shift translation ratio wrt the width/height.
            Default (0., 0.).
        inv (bool): Option to inverse the affine transform direction.
            (inv=False: src->dst or inv=True: dst->src)

    Returns:
        np.ndarray: A 2x3 transformation matrix
    """
    shift = np.array(shift)
    src_w = scale[0]
    dst_w = output_size[0]
    dst_h = output_size[1]

    # compute transformation matrix
    rot_rad = np.deg2rad(rot)
    src_dir = _rotate_point(np.array([0., src_w * -0.5]), rot_rad)
    dst_dir = np.array([0., dst_w * -0.5])

    # get four corners of the src rectangle in the original image
    src = np.zeros((3, 2), dtype=np.float32)
    src[0, :] = center + scale * shift
    src[1, :] = center + src_dir + scale * shift
    src[2, :] = _get_3rd_point(src[0, :], src[1, :])

    # get four corners of the dst rectangle in the input image
    dst = np.zeros((3, 2), dtype=np.float32)
    dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
    dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir
    dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :])

    if inv:
        warp_mat = cv2.getAffineTransform(np.float32(dst), np.float32(src))
    else:
        warp_mat = cv2.getAffineTransform(np.float32(src), np.float32(dst))

    return warp_mat


def top_down_affine(input_size: dict, bbox_scale: dict, bbox_center: dict,
                    img: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Get the bbox image as the model input by affine transform.

    Args:
        input_size (dict): The input size of the model.
        bbox_scale (dict): The bbox scale of the img.
        bbox_center (dict): The bbox center of the img.
        img (np.ndarray): The original image.

    Returns:
        tuple: A tuple containing center and scale.
        - np.ndarray[float32]: img after affine transform.
        - np.ndarray[float32]: bbox scale after affine transform.
    """
    w, h = input_size
    warp_size = (int(w), int(h))

    # reshape bbox to fixed aspect ratio
    bbox_scale = _fix_aspect_ratio(bbox_scale, aspect_ratio=w / h)

    # get the affine matrix
    center = bbox_center
    scale = bbox_scale
    rot = 0
    warp_mat = get_warp_matrix(center, scale, rot, output_size=(w, h))

    # do affine transform
    img = cv2.warpAffine(img, warp_mat, warp_size, flags=cv2.INTER_LINEAR)

    return img, bbox_scale


def get_simcc_maximum(simcc_x: np.ndarray,
                      simcc_y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Get maximum response location and value from simcc representations.

    Note:
        instance number: N
        num_keypoints: K
        heatmap height: H
        heatmap width: W

    Args:
        simcc_x (np.ndarray): x-axis SimCC in shape (K, Wx) or (N, K, Wx)
        simcc_y (np.ndarray): y-axis SimCC in shape (K, Wy) or (N, K, Wy)

    Returns:
        tuple:
        - locs (np.ndarray): locations of maximum heatmap responses in shape
            (K, 2) or (N, K, 2)
        - vals (np.ndarray): values of maximum heatmap responses in shape
            (K,) or (N, K)
    """
    N, K, Wx = simcc_x.shape
    simcc_x = simcc_x.reshape(N * K, -1)
    simcc_y = simcc_y.reshape(N * K, -1)

    # get maximum value locations
    x_locs = np.argmax(simcc_x, axis=1)
    y_locs = np.argmax(simcc_y, axis=1)
    locs = np.stack((x_locs, y_locs), axis=-1).astype(np.float32)
    max_val_x = np.amax(simcc_x, axis=1)
    max_val_y = np.amax(simcc_y, axis=1)

    # get maximum value across x and y axis
    mask = max_val_x > max_val_y
    max_val_x[mask] = max_val_y[mask]
    vals = max_val_x
    locs[vals <= 0.] = -1

    # reshape
    locs = locs.reshape(N, K, 2)
    vals = vals.reshape(N, K)

    return locs, vals


def decode(simcc_x: np.ndarray, simcc_y: np.ndarray,
           simcc_split_ratio) -> Tuple[np.ndarray, np.ndarray]:
    """Modulate simcc distribution with Gaussian.

    Args:
        simcc_x (np.ndarray[K, Wx]): model predicted simcc in x.
        simcc_y (np.ndarray[K, Wy]): model predicted simcc in y.
        simcc_split_ratio (int): The split ratio of simcc.

    Returns:
        tuple: A tuple containing center and scale.
        - np.ndarray[float32]: keypoints in shape (K, 2) or (n, K, 2)
        - np.ndarray[float32]: scores in shape (K,) or (n, K)
    """
    keypoints, scores = get_simcc_maximum(simcc_x, simcc_y)
    keypoints /= simcc_split_ratio

    return keypoints, scores


def inference_pose(session, out_bbox, oriImg):
    """run pose detect

    Args:
        session (ort.InferenceSession): ONNXRuntime session.
        out_bbox (np.ndarray): bbox list
        oriImg (np.ndarray): Input image in shape.

    Returns:
        tuple:
        - keypoints (np.ndarray): Rescaled keypoints.
        - scores (np.ndarray): Model predict scores.
    """
    h, w = session.get_inputs()[0].shape[2:]
    model_input_size = (w, h)
    # preprocess for rtm-pose model inference.
    resized_img, center, scale = preprocess(oriImg, out_bbox, model_input_size)
    # run pose estimation for processed img
    outputs = inference(session, resized_img)
    # postprocess for rtm-pose model output.
    keypoints, scores = postprocess(outputs, model_input_size, center, scale)

    return keypoints, scores
anchorcrafter/dwpose/preprocess.py
ADDED
@@ -0,0 +1,85 @@
from tqdm import tqdm
import decord
import numpy as np

from .util import draw_pose
from .dwpose_detector import dwpose_detector as dwprocessor
import pickle
import os

def get_video_pose(
        video_path: str,
        ref_image: np.ndarray,
        sample_stride: int=1,
        total_frames: int=28,
):
    """preprocess ref image pose and video pose

    Args:
        video_path (str): video pose path
        ref_image (np.ndarray): reference image
        sample_stride (int, optional): Defaults to 1.
        total_frames(int): Defaults to 28.
    Returns:
        np.ndarray: sequence of video pose
    """
    # select ref-keypoint from reference pose for pose rescale
    ref_pose = dwprocessor(ref_image)
    ref_keypoint_id = [0, 1, 2, 5, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
    ref_keypoint_id = [i for i in ref_keypoint_id \
        if len(ref_pose['bodies']['subset']) > 0 and ref_pose['bodies']['subset'][0][i] >= .0]
    ref_body = ref_pose['bodies']['candidate'][ref_keypoint_id]

    height, width, _ = ref_image.shape
    print(f'h,w: {height}, {width}')

    # read input video
    vr = decord.VideoReader(video_path, ctx=decord.cpu(0))
    pkl_path = "data/pose_pkl/" + video_path.split("/")[-1].split(".")[0] + ".pkl"
    print("total frames:", total_frames)
    if os.path.exists(pkl_path):  # read pose from file
        with open(pkl_path, "rb") as f:
            poses_frames = pickle.load(f)
        detected_poses = [poses_frames[frm] for frm in range(0, len(poses_frames), sample_stride)]
        detected_poses = detected_poses[:total_frames]
    else:  # calculate pose
        frames = vr.get_batch(list(range(0, len(vr), sample_stride))).asnumpy()
        frames = frames[:total_frames]
        detected_poses = [dwprocessor(frm) for frm in tqdm(frames, desc="DWPose")]

    detected_bodies = np.stack(
        [p['bodies']['candidate'] for p in detected_poses if p['bodies']['candidate'].shape[0] == 18])[:,
                      ref_keypoint_id]
    # compute linear-rescale params
    ay, by = np.polyfit(detected_bodies[:, :, 1].flatten(), np.tile(ref_body[:, 1], len(detected_bodies)), 1)
    fh, fw, _ = vr[0].shape
    ax = ay / (fh / fw / height * width)
    bx = np.mean(np.tile(ref_body[:, 0], len(detected_bodies)) - detected_bodies[:, :, 0].flatten() * ax)
    a = np.array([ax, ay])
    b = np.array([bx, by])
    output_pose = []
    # pose rescale
    for detected_pose in detected_poses:
        detected_pose['bodies']['candidate'] = detected_pose['bodies']['candidate'] * a + b
        detected_pose['faces'] = detected_pose['faces'] * a + b
        detected_pose['hands'] = detected_pose['hands'] * a + b
        im = draw_pose(detected_pose, height, width)
        output_pose.append(np.array(im))

    return np.stack(output_pose), a, b



def get_image_pose(ref_image):
    """process image pose

    Args:
        ref_image (np.ndarray): reference image pixel value

    Returns:
        np.ndarray: pose visual image in RGB-mode
    """
    height, width, _ = ref_image.shape
    ref_pose = dwprocessor(ref_image)
    pose_img = draw_pose(ref_pose, height, width)
    return np.array(pose_img)
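A sketch of the intended call pattern for these helpers; the paths below are files added in this commit, and the pickle cache under data/pose_pkl/ is optional:

import cv2
from anchorcrafter.dwpose.preprocess import get_image_pose, get_video_pose

ref_image = cv2.cvtColor(cv2.imread("data/anchor/1.jpg"), cv2.COLOR_BGR2RGB)

# Pose visualization of the reference image (C x H x W, RGB).
image_pose = get_image_pose(ref_image)

# Per-frame pose maps of a driving video, linearly rescaled to the reference
# body, plus the rescale parameters (a, b).
video_pose, a, b = get_video_pose("data/video/cup_1.mp4", ref_image,
                                  sample_stride=1, total_frames=28)
print(image_pose.shape, video_pose.shape)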
anchorcrafter/dwpose/util.py
ADDED
@@ -0,0 +1,133 @@
import math
import numpy as np
import matplotlib
import cv2


eps = 0.01

def alpha_blend_color(color, alpha):
    """blend color according to point conf
    """
    return [int(c * alpha) for c in color]

def draw_bodypose(canvas, candidate, subset, score):
    H, W, C = canvas.shape
    candidate = np.array(candidate)
    subset = np.array(subset)

    stickwidth = 4

    limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], \
               [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], \
               [1, 16], [16, 18], [3, 17], [6, 18]]

    colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \
              [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \
              [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]]

    for i in range(17):
        for n in range(len(subset)):
            index = subset[n][np.array(limbSeq[i]) - 1]
            conf = score[n][np.array(limbSeq[i]) - 1]
            if conf[0] < 0.3 or conf[1] < 0.3:
                continue
            Y = candidate[index.astype(int), 0] * float(W)
            X = candidate[index.astype(int), 1] * float(H)
            mX = np.mean(X)
            mY = np.mean(Y)
            length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5
            angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1]))
            polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1)
            cv2.fillConvexPoly(canvas, polygon, alpha_blend_color(colors[i], conf[0] * conf[1]))

    canvas = (canvas * 0.6).astype(np.uint8)

    for i in range(18):
        for n in range(len(subset)):
            index = int(subset[n][i])
            if index == -1:
                continue
            x, y = candidate[index][0:2]
            conf = score[n][i]
            x = int(x * W)
            y = int(y * H)
            cv2.circle(canvas, (int(x), int(y)), 4, alpha_blend_color(colors[i], conf), thickness=-1)

    return canvas

def draw_handpose(canvas, all_hand_peaks, all_hand_scores):
    H, W, C = canvas.shape

    edges = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], [7, 8], [0, 9], [9, 10], \
             [10, 11], [11, 12], [0, 13], [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], [18, 19], [19, 20]]

    for peaks, scores in zip(all_hand_peaks, all_hand_scores):

        for ie, e in enumerate(edges):
            x1, y1 = peaks[e[0]]
            x2, y2 = peaks[e[1]]
            x1 = int(x1 * W)
            y1 = int(y1 * H)
            x2 = int(x2 * W)
            y2 = int(y2 * H)
            score = int(scores[e[0]] * scores[e[1]] * 255)
            if x1 > eps and y1 > eps and x2 > eps and y2 > eps:
                cv2.line(canvas, (x1, y1), (x2, y2),
                         matplotlib.colors.hsv_to_rgb([ie / float(len(edges)), 1.0, 1.0]) * score, thickness=2)

        for i, keyponit in enumerate(peaks):
            x, y = keyponit
            x = int(x * W)
            y = int(y * H)
            score = int(scores[i] * 255)
            if x > eps and y > eps:
                cv2.circle(canvas, (x, y), 4, (0, 0, score), thickness=-1)
    return canvas

def draw_facepose(canvas, all_lmks, all_scores):
    H, W, C = canvas.shape
    for lmks, scores in zip(all_lmks, all_scores):
        for lmk, score in zip(lmks, scores):
            x, y = lmk
            x = int(x * W)
            y = int(y * H)
            conf = int(score * 255)
            if x > eps and y > eps:
                cv2.circle(canvas, (x, y), 3, (conf, conf, conf), thickness=-1)
    return canvas

def draw_pose(pose, H, W, ref_w=2160):
    """vis dwpose outputs

    Args:
        pose (List): DWposeDetector outputs in dwpose_detector.py
        H (int): height
        W (int): width
        ref_w (int, optional) Defaults to 2160.

    Returns:
        np.ndarray: image pixel value in RGB mode
    """
    bodies = pose['bodies']
    faces = pose['faces']
    hands = pose['hands']
    candidate = bodies['candidate']
    subset = bodies['subset']

    sz = min(H, W)
    sr = (ref_w / sz) if sz != ref_w else 1

    ########################################## create zero canvas ##################################################
    canvas = np.zeros(shape=(int(H*sr), int(W*sr), 3), dtype=np.uint8)

    ########################################### draw body pose #####################################################
    canvas = draw_bodypose(canvas, candidate, subset, score=bodies['score'])

    ########################################### draw hand pose #####################################################
    canvas = draw_handpose(canvas, hands, pose['hands_score'])

    ########################################### draw face pose #####################################################
    canvas = draw_facepose(canvas, faces, pose['faces_score'])

    return cv2.cvtColor(cv2.resize(canvas, (W, H)), cv2.COLOR_BGR2RGB).transpose(2, 0, 1)
anchorcrafter/dwpose/wholebody.py
ADDED
@@ -0,0 +1,60 @@
import numpy as np
import onnxruntime as ort

from .onnxdet import inference_detector
from .onnxpose import inference_pose

import os

class Wholebody:
    """detect anchor pose by dwpose
    """
    def __init__(self, model_det, model_pose, device="cpu"):
        #print('wholebody init')
        providers = ['CPUExecutionProvider'] if device == 'cpu' else ['CUDAExecutionProvider']
        provider_options = None if device == 'cpu' else [{'device_id': 3}]
        #print('session create')
        self.session_det = ort.InferenceSession(
            path_or_bytes=model_det, providers=providers, provider_options=provider_options
        )
        #print('session_pose create')
        self.session_pose = ort.InferenceSession(
            path_or_bytes=model_pose, providers=providers, provider_options=provider_options
        )

    def __call__(self, oriImg):
        """call to process dwpose-detect

        Args:
            oriImg (np.ndarray): detected image

        """
        det_result = inference_detector(self.session_det, oriImg)
        keypoints, scores = inference_pose(self.session_pose, det_result, oriImg)

        keypoints_info = np.concatenate(
            (keypoints, scores[..., None]), axis=-1)
        # compute neck joint
        neck = np.mean(keypoints_info[:, [5, 6]], axis=1)
        # neck score when visualizing pred
        neck[:, 2:4] = np.logical_and(
            keypoints_info[:, 5, 2:4] > 0.3,
            keypoints_info[:, 6, 2:4] > 0.3).astype(int)
        new_keypoints_info = np.insert(
            keypoints_info, 17, neck, axis=1)
        mmpose_idx = [
            17, 6, 8, 10, 7, 9, 12, 14, 16, 13, 15, 2, 1, 4, 3
        ]
        openpose_idx = [
            1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17
        ]
        new_keypoints_info[:, openpose_idx] = \
            new_keypoints_info[:, mmpose_idx]
        keypoints_info = new_keypoints_info

        keypoints, scores = keypoints_info[
            ..., :2], keypoints_info[..., 2]

        return keypoints, scores
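A brief sketch of using Wholebody on its own, outside the DWposeDetector wrapper; the ONNX paths are assumed to be the files that dwpose_detector.py downloads from yzd-v/DWPose:

import cv2
from anchorcrafter.dwpose.wholebody import Wholebody

model = Wholebody("yolox_l.onnx", "dw-ll_ucoco_384.onnx", device="cpu")
img = cv2.cvtColor(cv2.imread("data/anchor/1.jpg"), cv2.COLOR_BGR2RGB)

# keypoints: (num_people, 134, 2) pixel coordinates (133 whole-body points
# plus the inserted neck joint), with the body joints remapped to OpenPose
# ordering; scores: matching confidence values.
keypoints, scores = model(img)
print(keypoints.shape, scores.shape)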
anchorcrafter/modules/__init__.py
ADDED
File without changes
anchorcrafter/modules/attention_processor.py
ADDED
@@ -0,0 +1,466 @@
# modified from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt

try:
    import xformers
    import xformers.ops
    xformers_available = True
except Exception as e:
    xformers_available = False

class RegionControler(object):
    def __init__(self) -> None:
        self.prompt_image_conditioning = []
region_control = RegionControler()

class AttnProcessor(nn.Module):
    r"""
    Default processor for performing attention-related computations.
    """
    def __init__(
        self,
        hidden_size=None,
        cross_attention_dim=None,
    ):
        super().__init__()

    def forward(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
    ):
        residual = hidden_states

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        batch_size, sequence_length, _ = (
            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        )
        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        query = attn.head_to_batch_dim(query)
        key = attn.head_to_batch_dim(key)
        value = attn.head_to_batch_dim(value)

        attention_probs = attn.get_attention_scores(query, key, attention_mask)
        hidden_states = torch.bmm(attention_probs, value)
        hidden_states = attn.batch_to_head_dim(hidden_states)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states


class IPAttnProcessor(nn.Module):
    r"""
    Attention processor for IP-Adapter.
    Args:
        hidden_size (`int`):
            The hidden size of the attention layer.
        cross_attention_dim (`int`):
            The number of channels in the `encoder_hidden_states`.
        scale (`float`, defaults to 1.0):
            the weight scale of image prompt.
        num_tokens (`int`, defaults to 4 when do ip_adapter_plus it should be 16):
            The context length of the image features.
    """

    def __init__(self, hidden_size, cross_attention_dim=None, scale=1.0, num_tokens=4):
        super().__init__()

        self.hidden_size = hidden_size
        self.cross_attention_dim = cross_attention_dim
        self.scale = scale
        self.num_tokens = num_tokens

        self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
        self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)

    def forward(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
        attn_bias=None,
    ):
        hidden_states=hidden_states.to(torch.float16)
        encoder_hidden_states=encoder_hidden_states.to(torch.float16)
        residual = hidden_states

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        batch_size, sequence_length, _ = (
            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        )
        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        else:
            # get encoder_hidden_states, ip_hidden_states
            end_pos = encoder_hidden_states.shape[1] - self.num_tokens
            encoder_hidden_states, ip_hidden_states = encoder_hidden_states[:, :end_pos, :], encoder_hidden_states[:, end_pos:, :]
            if attn.norm_cross:
                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        query = attn.head_to_batch_dim(query)
        key = attn.head_to_batch_dim(key)
        value = attn.head_to_batch_dim(value)

        if xformers_available:
            hidden_states = self._memory_efficient_attention_xformers(query, key, value, attention_mask)
        else:
            attention_probs = attn.get_attention_scores(query, key, attention_mask)
            hidden_states = torch.bmm(attention_probs, value)

        if attn_bias is not None:
            # print(f'ipadapter attn_bias, shape: {attn_bias.shape} sum: {attn_bias.sum()}')
            # coefficient is 1 for attention output inside the target region, 0 elsewhere
            mask = attn_bias.repeat(1, 1, hidden_states.shape[2]).to(hidden_states.dtype)
            hidden_states = hidden_states * (1 - mask)

        hidden_states = attn.batch_to_head_dim(hidden_states)

        # for ip-adapter
        ip_hidden_states=ip_hidden_states.to(torch.float16)
        ip_key = self.to_k_ip(ip_hidden_states)
        ip_value = self.to_v_ip(ip_hidden_states)

        ip_key = attn.head_to_batch_dim(ip_key)
        ip_value = attn.head_to_batch_dim(ip_value)

        if xformers_available:
            ip_hidden_states = self._memory_efficient_attention_xformers(query, ip_key, ip_value, attention_mask=None)
        else:
            ip_attention_probs = attn.get_attention_scores(query, ip_key, attention_mask=None)
            ip_hidden_states = torch.bmm(ip_attention_probs, ip_value)

        if attn_bias is not None:
            # print(f'ipadapter attn_bias, shape: {attn_bias.shape} sum: {attn_bias.sum()}')
            # coefficient is 1 for attention output inside the target region, 0 elsewhere
            mask = attn_bias.repeat(1, 1, ip_hidden_states.shape[2]).to(ip_hidden_states.dtype)
            ip_hidden_states = ip_hidden_states * mask

        ip_hidden_states = attn.batch_to_head_dim(ip_hidden_states)

        # region control
        if len(region_control.prompt_image_conditioning) == 1:
            region_mask = region_control.prompt_image_conditioning[0].get('region_mask', None)
            if region_mask is not None:
                h, w = region_mask.shape[:2]
                ratio = (h * w / query.shape[1]) ** 0.5
                mask = F.interpolate(region_mask[None, None], scale_factor=1/ratio, mode='nearest').reshape([1, -1, 1])
            else:
                mask = torch.ones_like(ip_hidden_states)
            ip_hidden_states = ip_hidden_states * mask

        hidden_states = hidden_states + self.scale * ip_hidden_states

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states


    def _memory_efficient_attention_xformers(self, query, key, value, attention_mask):
        # TODO attention_mask
        query = query.contiguous()
        key = key.contiguous()
        value = value.contiguous()
        hidden_states = xformers.ops.memory_efficient_attention(query, key, value, attn_bias=attention_mask)
|
| 235 |
+
# hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
|
| 236 |
+
return hidden_states
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
class AttnProcessor2_0(torch.nn.Module):
|
| 240 |
+
r"""
|
| 241 |
+
Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
|
| 242 |
+
"""
|
| 243 |
+
def __init__(
|
| 244 |
+
self,
|
| 245 |
+
hidden_size=None,
|
| 246 |
+
cross_attention_dim=None,
|
| 247 |
+
):
|
| 248 |
+
super().__init__()
|
| 249 |
+
if not hasattr(F, "scaled_dot_product_attention"):
|
| 250 |
+
raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
|
| 251 |
+
|
| 252 |
+
def forward(
|
| 253 |
+
self,
|
| 254 |
+
attn,
|
| 255 |
+
hidden_states,
|
| 256 |
+
encoder_hidden_states=None,
|
| 257 |
+
attention_mask=None,
|
| 258 |
+
temb=None,
|
| 259 |
+
):
|
| 260 |
+
residual = hidden_states
|
| 261 |
+
|
| 262 |
+
if attn.spatial_norm is not None:
|
| 263 |
+
hidden_states = attn.spatial_norm(hidden_states, temb)
|
| 264 |
+
|
| 265 |
+
input_ndim = hidden_states.ndim
|
| 266 |
+
|
| 267 |
+
if input_ndim == 4:
|
| 268 |
+
batch_size, channel, height, width = hidden_states.shape
|
| 269 |
+
hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
|
| 270 |
+
|
| 271 |
+
batch_size, sequence_length, _ = (
|
| 272 |
+
hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
|
| 273 |
+
)
|
| 274 |
+
|
| 275 |
+
if attention_mask is not None:
|
| 276 |
+
attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
|
| 277 |
+
# scaled_dot_product_attention expects attention_mask shape to be
|
| 278 |
+
# (batch, heads, source_length, target_length)
|
| 279 |
+
attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
|
| 280 |
+
|
| 281 |
+
if attn.group_norm is not None:
|
| 282 |
+
hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
|
| 283 |
+
|
| 284 |
+
query = attn.to_q(hidden_states)
|
| 285 |
+
|
| 286 |
+
if encoder_hidden_states is None:
|
| 287 |
+
encoder_hidden_states = hidden_states
|
| 288 |
+
elif attn.norm_cross:
|
| 289 |
+
encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
|
| 290 |
+
|
| 291 |
+
key = attn.to_k(encoder_hidden_states)
|
| 292 |
+
value = attn.to_v(encoder_hidden_states)
|
| 293 |
+
|
| 294 |
+
inner_dim = key.shape[-1]
|
| 295 |
+
head_dim = inner_dim // attn.heads
|
| 296 |
+
|
| 297 |
+
query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
| 298 |
+
|
| 299 |
+
key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
| 300 |
+
value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
| 301 |
+
|
| 302 |
+
# the output of sdp = (batch, num_heads, seq_len, head_dim)
|
| 303 |
+
# TODO: add support for attn.scale when we move to Torch 2.1
|
| 304 |
+
hidden_states = F.scaled_dot_product_attention(
|
| 305 |
+
query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
|
| 306 |
+
)
|
| 307 |
+
|
| 308 |
+
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
|
| 309 |
+
hidden_states = hidden_states.to(query.dtype)
|
| 310 |
+
|
| 311 |
+
# linear proj
|
| 312 |
+
hidden_states = attn.to_out[0](hidden_states)
|
| 313 |
+
# dropout
|
| 314 |
+
hidden_states = attn.to_out[1](hidden_states)
|
| 315 |
+
|
| 316 |
+
if input_ndim == 4:
|
| 317 |
+
hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
|
| 318 |
+
|
| 319 |
+
if attn.residual_connection:
|
| 320 |
+
hidden_states = hidden_states + residual
|
| 321 |
+
|
| 322 |
+
hidden_states = hidden_states / attn.rescale_output_factor
|
| 323 |
+
|
| 324 |
+
return hidden_states
|
| 325 |
+
|
| 326 |
+
class IPAttnProcessor2_0(torch.nn.Module):
|
| 327 |
+
r"""
|
| 328 |
+
Attention processor for IP-Adapater for PyTorch 2.0.
|
| 329 |
+
Args:
|
| 330 |
+
hidden_size (`int`):
|
| 331 |
+
The hidden size of the attention layer.
|
| 332 |
+
cross_attention_dim (`int`):
|
| 333 |
+
The number of channels in the `encoder_hidden_states`.
|
| 334 |
+
scale (`float`, defaults to 1.0):
|
| 335 |
+
the weight scale of image prompt.
|
| 336 |
+
num_tokens (`int`, defaults to 4 when do ip_adapter_plus it should be 16):
|
| 337 |
+
The context length of the image features.
|
| 338 |
+
"""
|
| 339 |
+
|
| 340 |
+
def __init__(self, hidden_size, cross_attention_dim=None, scale=1.0, num_tokens=4):
|
| 341 |
+
super().__init__()
|
| 342 |
+
|
| 343 |
+
if not hasattr(F, "scaled_dot_product_attention"):
|
| 344 |
+
raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
|
| 345 |
+
|
| 346 |
+
self.hidden_size = hidden_size
|
| 347 |
+
self.cross_attention_dim = cross_attention_dim
|
| 348 |
+
self.scale = scale
|
| 349 |
+
self.num_tokens = num_tokens
|
| 350 |
+
|
| 351 |
+
self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
|
| 352 |
+
self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
|
| 353 |
+
|
| 354 |
+
def forward(
|
| 355 |
+
self,
|
| 356 |
+
attn,
|
| 357 |
+
hidden_states,
|
| 358 |
+
encoder_hidden_states=None,
|
| 359 |
+
attention_mask=None,
|
| 360 |
+
temb=None,
|
| 361 |
+
):
|
| 362 |
+
residual = hidden_states
|
| 363 |
+
|
| 364 |
+
if attn.spatial_norm is not None:
|
| 365 |
+
hidden_states = attn.spatial_norm(hidden_states, temb)
|
| 366 |
+
|
| 367 |
+
input_ndim = hidden_states.ndim
|
| 368 |
+
|
| 369 |
+
if input_ndim == 4:
|
| 370 |
+
batch_size, channel, height, width = hidden_states.shape
|
| 371 |
+
hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
|
| 372 |
+
|
| 373 |
+
batch_size, sequence_length, _ = (
|
| 374 |
+
hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
|
| 375 |
+
)
|
| 376 |
+
|
| 377 |
+
if attention_mask is not None:
|
| 378 |
+
attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
|
| 379 |
+
# scaled_dot_product_attention expects attention_mask shape to be
|
| 380 |
+
# (batch, heads, source_length, target_length)
|
| 381 |
+
attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
|
| 382 |
+
|
| 383 |
+
if attn.group_norm is not None:
|
| 384 |
+
hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
|
| 385 |
+
|
| 386 |
+
query = attn.to_q(hidden_states)
|
| 387 |
+
|
| 388 |
+
if encoder_hidden_states is None:
|
| 389 |
+
encoder_hidden_states = hidden_states
|
| 390 |
+
else:
|
| 391 |
+
# get encoder_hidden_states, ip_hidden_states
|
| 392 |
+
end_pos = encoder_hidden_states.shape[1] - self.num_tokens
|
| 393 |
+
encoder_hidden_states, ip_hidden_states = (
|
| 394 |
+
encoder_hidden_states[:, :end_pos, :],
|
| 395 |
+
encoder_hidden_states[:, end_pos:, :],
|
| 396 |
+
)
|
| 397 |
+
if attn.norm_cross:
|
| 398 |
+
encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
|
| 399 |
+
|
| 400 |
+
key = attn.to_k(encoder_hidden_states)
|
| 401 |
+
value = attn.to_v(encoder_hidden_states)
|
| 402 |
+
|
| 403 |
+
inner_dim = key.shape[-1]
|
| 404 |
+
head_dim = inner_dim // attn.heads
|
| 405 |
+
|
| 406 |
+
query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
| 407 |
+
|
| 408 |
+
key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
| 409 |
+
value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
| 410 |
+
|
| 411 |
+
# the output of sdp = (batch, num_heads, seq_len, head_dim)
|
| 412 |
+
# TODO: add support for attn.scale when we move to Torch 2.1
|
| 413 |
+
hidden_states = F.scaled_dot_product_attention(
|
| 414 |
+
query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
|
| 415 |
+
)
|
| 416 |
+
|
| 417 |
+
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
|
| 418 |
+
hidden_states = hidden_states.to(query.dtype)
|
| 419 |
+
|
| 420 |
+
# for ip-adapter
|
| 421 |
+
ip_key = self.to_k_ip(ip_hidden_states)
|
| 422 |
+
ip_value = self.to_v_ip(ip_hidden_states)
|
| 423 |
+
|
| 424 |
+
ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
| 425 |
+
ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
| 426 |
+
|
| 427 |
+
# the output of sdp = (batch, num_heads, seq_len, head_dim)
|
| 428 |
+
# TODO: add support for attn.scale when we move to Torch 2.1
|
| 429 |
+
ip_hidden_states = F.scaled_dot_product_attention(
|
| 430 |
+
query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
|
| 431 |
+
)
|
| 432 |
+
with torch.no_grad():
|
| 433 |
+
self.attn_map = query @ ip_key.transpose(-2, -1).softmax(dim=-1)
|
| 434 |
+
#print(self.attn_map.shape)
|
| 435 |
+
|
| 436 |
+
ip_hidden_states = ip_hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
|
| 437 |
+
ip_hidden_states = ip_hidden_states.to(query.dtype)
|
| 438 |
+
|
| 439 |
+
# region control
|
| 440 |
+
if len(region_control.prompt_image_conditioning) == 1:
|
| 441 |
+
region_mask = region_control.prompt_image_conditioning[0].get('region_mask', None)
|
| 442 |
+
if region_mask is not None:
|
| 443 |
+
query = query.reshape([-1, query.shape[-2], query.shape[-1]])
|
| 444 |
+
h, w = region_mask.shape[:2]
|
| 445 |
+
ratio = (h * w / query.shape[1]) ** 0.5
|
| 446 |
+
mask = F.interpolate(region_mask[None, None], scale_factor=1/ratio, mode='nearest').reshape([1, -1, 1])
|
| 447 |
+
else:
|
| 448 |
+
mask = torch.ones_like(ip_hidden_states)
|
| 449 |
+
ip_hidden_states = ip_hidden_states * mask
|
| 450 |
+
|
| 451 |
+
hidden_states = hidden_states + self.scale * ip_hidden_states
|
| 452 |
+
|
| 453 |
+
# linear proj
|
| 454 |
+
hidden_states = attn.to_out[0](hidden_states)
|
| 455 |
+
# dropout
|
| 456 |
+
hidden_states = attn.to_out[1](hidden_states)
|
| 457 |
+
|
| 458 |
+
if input_ndim == 4:
|
| 459 |
+
hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
|
| 460 |
+
|
| 461 |
+
if attn.residual_connection:
|
| 462 |
+
hidden_states = hidden_states + residual
|
| 463 |
+
|
| 464 |
+
hidden_states = hidden_states / attn.rescale_output_factor
|
| 465 |
+
|
| 466 |
+
return hidden_states
|
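A brief usage sketch, not part of the committed files: IP-Adapter-style processors such as IPAttnProcessor2_0 are normally installed per attention layer through the model's `set_attn_processor` API, with the extra image tokens appended to the sequence passed as `encoder_hidden_states` so each processor can split off its trailing `num_tokens` tokens. The loop below is illustrative; in real code `hidden_size` and `cross_attention_dim` are derived per layer from the model config rather than hard-coded.

from anchorcrafter.modules.attention_processor import AttnProcessor2_0, IPAttnProcessor2_0

def install_ip_adapter_processors(unet, hidden_size=1024, cross_attention_dim=1024, num_tokens=4):
    attn_procs = {}
    for name in unet.attn_processors.keys():
        if name.endswith("attn1.processor"):
            # self-attention layers keep the plain SDPA processor
            attn_procs[name] = AttnProcessor2_0()
        else:
            # cross-attention layers get the IP-Adapter variant with extra K/V projections
            attn_procs[name] = IPAttnProcessor2_0(
                hidden_size=hidden_size,
                cross_attention_dim=cross_attention_dim,
                num_tokens=num_tokens,
            )
    unet.set_attn_processor(attn_procs)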
anchorcrafter/modules/obj_attn_net.py
ADDED
@@ -0,0 +1,47 @@
import os
from pathlib import Path

import torch
import torch.nn as nn
from diffusers.models.attention import BasicTransformerBlock


class ObjAttnNet(nn.Module):
    """Object-centric attention network with dual transformer blocks

    Args:
        inner_dim (int): Dimension of internal representations (default: 1024)
        num_heads (int): Number of attention heads (default: 32)
        out_dim (int): Output dimension (default: 1024)
        embedding_size (int): Base embedding size (default: 1370)
    """
    def __init__(self, inner_dim=1024, num_heads=32, out_dim=1024, embedding_size=1370):
        super().__init__()
        self.embedding_size = embedding_size
        # Transformer blocks configuration
        transformer_config = {
            "dim": inner_dim,
            "num_attention_heads": num_heads,
            "attention_head_dim": inner_dim // num_heads
        }
        # Network components
        self.space_transformer_1 = BasicTransformerBlock(**transformer_config)
        self.space_transformer_2 = BasicTransformerBlock(**transformer_config)
        self.proj_out = nn.Linear(inner_dim, out_dim)
        self.norm = nn.LayerNorm(out_dim)

    def forward(self, embeddings):  # [b, n, c]
        # First transformer processing
        x = self.space_transformer_1(embeddings)

        # Select middle embeddings segment
        x = x[:, self.embedding_size: self.embedding_size * 2, :]

        # Second transformer processing
        x = self.space_transformer_2(x)

        # Select final output tokens
        x = x[:, :12, :]

        # Project and normalize
        return self.norm(self.proj_out(x))
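A small shape-check sketch (illustrative, not part of the repository files): ObjAttnNet expects a token sequence containing at least two `embedding_size` segments, keeps the second segment, and finally returns 12 tokens projected to `out_dim`. The default 1370 is consistent with a 37x37 ViT patch grid plus a CLS token (e.g. DINOv2 at 518x518 input), though that reading and the three-segment input below are assumptions made for the example.

import torch
from anchorcrafter.modules.obj_attn_net import ObjAttnNet

net = ObjAttnNet(inner_dim=1024, num_heads=32, out_dim=1024, embedding_size=1370)
dummy = torch.randn(2, 3 * 1370, 1024)   # batch of 2, three 1370-token segments
out = net(dummy)
print(out.shape)                          # torch.Size([2, 12, 1024])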
anchorcrafter/modules/obj_proj_net.py
ADDED
@@ -0,0 +1,33 @@
import os
from pathlib import Path

import torch
import torch.nn as nn


class ObjProjNet(nn.Module):
    """Projection network for CLIP embeddings to cross-attention space

    Args:
        cross_attention_dim (int): Dimension of cross-attention features (default: 1024)
        clip_embeddings_dim (int): Dimension of input CLIP embeddings (default: 3072)
        context_tokens (int): Number of additional context tokens (default: 4)
        inner_dim (int): Intermediate projection dimension (default: 1024)
    """

    def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=3072,
                 context_tokens=4, inner_dim=1024):
        super().__init__()

        self.cross_attention_dim = cross_attention_dim
        self.context_tokens = context_tokens
        self.proj_in = nn.Linear(clip_embeddings_dim, inner_dim)
        self.proj_out = nn.Linear(inner_dim, self.context_tokens * cross_attention_dim)
        self.norm = nn.LayerNorm(cross_attention_dim)

    def forward(self, image_embeds):
        x = self.proj_in(image_embeds)
        x = self.proj_out(x).reshape(
            -1, self.context_tokens, self.cross_attention_dim
        )
        return self.norm(x)
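ObjProjNet maps one pooled image embedding per sample into `context_tokens` cross-attention tokens, which is the shape the IP-Adapter-style processors above consume. A minimal sketch with illustrative values (the 3072-dim input is assumed to be a pooled or concatenated vision-encoder embedding):

import torch
from anchorcrafter.modules.obj_proj_net import ObjProjNet

proj = ObjProjNet(cross_attention_dim=1024, clip_embeddings_dim=3072, context_tokens=4)
image_embeds = torch.randn(2, 3072)   # one pooled embedding per sample
tokens = proj(image_embeds)
print(tokens.shape)                   # torch.Size([2, 4, 1024])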
anchorcrafter/modules/pose_net.py
ADDED
@@ -0,0 +1,88 @@
import os
from pathlib import Path

import einops
import numpy as np
import torch
import torch.nn as nn
import torch.nn.init as init

from diffusers.utils.constants import SAFETENSORS_WEIGHTS_NAME, WEIGHTS_NAME

from typing import Union, Optional


def _add_variant(weights_name: str, variant: Optional[str] = None) -> str:
    if variant is not None:
        splits = weights_name.split(".")
        splits = splits[:-1] + [variant] + splits[-1:]
        weights_name = ".".join(splits)

    return weights_name


class PoseNet(nn.Module):
    """Convolutional network for processing pose sequence conditioning

    Args:
        latent_channels (int): Number of output latent channels (default: 320)
        input_channels (int): Number of input pose channels (default: 6)
        scale_factor (float): Initial output scaling factor (default: 2.0)
    """
    def __init__(
        self,
        latent_channels: int = 320,
        input_channels: int = 6,
        scale_factor: float = 2.0
    ):
        super().__init__()
        # multiple convolution layers
        self.conv_layers = nn.Sequential(
            nn.Conv2d(input_channels, 6, kernel_size=3, padding=1),
            nn.SiLU(),
            nn.Conv2d(6, 16, kernel_size=4, stride=2, padding=1),
            nn.SiLU(),

            nn.Conv2d(16, 16, kernel_size=3, padding=1),
            nn.SiLU(),
            nn.Conv2d(16, 32, kernel_size=4, stride=2, padding=1),
            nn.SiLU(),

            nn.Conv2d(32, 32, kernel_size=3, padding=1),
            nn.SiLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2, padding=1),
            nn.SiLU(),

            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.SiLU(),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.SiLU()
        )

        # Final projection layer
        self.final_proj = nn.Conv2d(128, latent_channels, kernel_size=1)

        # Initialize layers
        self._initialize_weights()

        self.scale = nn.Parameter(torch.tensor(scale_factor, dtype=torch.float16))

    def _initialize_weights(self):
        """Initialize weights with He initialization and zero out the biases
        """
        for m in self.conv_layers:
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.in_channels
                init.normal_(m.weight, mean=0.0, std=np.sqrt(2. / n))
                if m.bias is not None:
                    init.zeros_(m.bias)
        init.zeros_(self.final_proj.weight)
        if self.final_proj.bias is not None:
            init.zeros_(self.final_proj.bias)

    def forward(self, x):
        if x.ndim == 5:
            x = einops.rearrange(x, "b f c h w -> (b f) c h w")
        x = self.conv_layers(x)

        return self.final_proj(x) * self.scale
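PoseNet downsamples the stacked pose maps by a factor of 8 (three stride-2 convolutions) and projects them to the UNet's first latent channel count, so a pose video of shape [b, f, 6, H, W] becomes [b*f, 320, H/8, W/8] that can be added onto the `conv_in` activations. A rough sketch with hypothetical sizes (the zero-initialized `final_proj` means the output starts at zero; the float16 `scale` parameter promotes to float32 in the final multiply):

import torch
from anchorcrafter.modules.pose_net import PoseNet

pose_net = PoseNet(latent_channels=320, input_channels=6)
poses = torch.randn(1, 16, 6, 512, 512)   # [b, f, c, h, w]
latents = pose_net(poses)
print(latents.shape)                      # torch.Size([16, 320, 64, 64])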
anchorcrafter/modules/track_net.py
ADDED
@@ -0,0 +1,76 @@
import os
from pathlib import Path

import einops
import numpy as np
import torch
import torch.nn as nn
import torch.nn.init as init

from typing import Optional


class TrackNet(nn.Module):
    """Convolutional network for processing tracking sequence conditioning

    Args:
        latent_channels (int): Number of output latent channels (default: 320)
        input_channels (int): Number of input tracking channels (default: 3)
        scale_factor (float): Initial output scaling factor (default: 2.0)
    """
    def __init__(
        self,
        latent_channels=320,
        input_channels: int = 3,
        scale_factor: float = 2.0
    ):
        super().__init__()
        # multiple convolution layers
        self.conv_layers = nn.Sequential(
            nn.Conv2d(input_channels, 3, kernel_size=3, padding=1),
            nn.SiLU(),
            nn.Conv2d(3, 16, kernel_size=4, stride=2, padding=1),
            nn.SiLU(),

            nn.Conv2d(16, 16, kernel_size=3, padding=1),
            nn.SiLU(),
            nn.Conv2d(16, 32, kernel_size=4, stride=2, padding=1),
            nn.SiLU(),

            nn.Conv2d(32, 32, kernel_size=3, padding=1),
            nn.SiLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2, padding=1),
            nn.SiLU(),

            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.SiLU(),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.SiLU()
        )

        # Final projection layer
        self.final_proj = nn.Conv2d(in_channels=128, out_channels=latent_channels, kernel_size=1)

        # Initialize layers
        self._initialize_weights()

        self.scale = nn.Parameter(torch.tensor(scale_factor, dtype=torch.float16))

    def _initialize_weights(self):
        """Initialize weights with He initialization and zero out the biases
        """
        for m in self.conv_layers:
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.in_channels
                init.normal_(m.weight, mean=0.0, std=np.sqrt(2. / n))
                if m.bias is not None:
                    init.zeros_(m.bias)
        init.zeros_(self.final_proj.weight)
        if self.final_proj.bias is not None:
            init.zeros_(self.final_proj.bias)

    def forward(self, x):
        if x.ndim == 5:
            x = einops.rearrange(x, "b f c h w -> (b f) c h w")
        x = self.conv_layers(x)
        return self.final_proj(x) * self.scale
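TrackNet is structurally the same encoder as PoseNet, but consumes 3-channel object-conditioning maps instead of 6-channel pose maps, so the same 1/8 spatial shape rule applies. A tiny sketch with hypothetical sizes:

import torch
from anchorcrafter.modules.track_net import TrackNet

track_net = TrackNet(latent_channels=320, input_channels=3)
tracks = torch.randn(1, 16, 3, 512, 512)   # [b, f, c, h, w]
print(track_net(tracks).shape)             # torch.Size([16, 320, 64, 64])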
anchorcrafter/modules/unet.py
ADDED
@@ -0,0 +1,509 @@
from dataclasses import dataclass
from typing import Dict, Optional, Tuple, Union

import torch
import torch.nn as nn
from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.loaders import UNet2DConditionLoadersMixin
from diffusers.models.attention_processor import CROSS_ATTENTION_PROCESSORS, AttentionProcessor, AttnProcessor
from diffusers.models.embeddings import TimestepEmbedding, Timesteps
from diffusers.models.modeling_utils import ModelMixin
from diffusers.utils import BaseOutput, logging

from diffusers.models.unets.unet_3d_blocks import get_down_block, get_up_block, UNetMidBlockSpatioTemporal

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


@dataclass
class UNetSpatioTemporalConditionOutput(BaseOutput):
    """
    The output of [`UNetSpatioTemporalConditionModel`].

    Args:
        sample (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
            The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
    """

    sample: torch.FloatTensor = None


class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
    r"""
    A conditional spatio-temporal UNet model that takes noisy video frames, a conditional state,
    and a timestep, and returns a sample-shaped output.

    This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
    for all models (such as downloading or saving).

    Parameters:
        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
            Height and width of input/output sample.
        in_channels (`int`, *optional*, defaults to 8): Number of channels in the input sample.
        out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlockSpatioTemporal",
            "CrossAttnDownBlockSpatioTemporal", "CrossAttnDownBlockSpatioTemporal", "DownBlockSpatioTemporal")`):
            The tuple of downsample blocks to use.
        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlockSpatioTemporal",
            "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal")`):
            The tuple of upsample blocks to use.
        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
            The tuple of output channels for each block.
        addition_time_embed_dim: (`int`, defaults to 256):
            Dimension to encode the additional time ids.
        projection_class_embeddings_input_dim (`int`, defaults to 768):
            The dimension of the projection of encoded `added_time_ids`.
        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
        cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
            The dimension of the cross attention features.
        transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]`, *optional*, defaults to 1):
            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
            [`~models.unet_3d_blocks.CrossAttnDownBlockSpatioTemporal`],
            [`~models.unet_3d_blocks.CrossAttnUpBlockSpatioTemporal`],
            [`~models.unet_3d_blocks.UNetMidBlockSpatioTemporal`].
        num_attention_heads (`int`, `Tuple[int]`, defaults to `(5, 10, 10, 20)`):
            The number of attention heads.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
    """

    _supports_gradient_checkpointing = True

    @register_to_config
    def __init__(
        self,
        sample_size: Optional[int] = None,
        in_channels: int = 8,
        out_channels: int = 4,
        down_block_types: Tuple[str] = (
            "CrossAttnDownBlockSpatioTemporal",
            "CrossAttnDownBlockSpatioTemporal",
            "CrossAttnDownBlockSpatioTemporal",
            "DownBlockSpatioTemporal",
        ),
        up_block_types: Tuple[str] = (
            "UpBlockSpatioTemporal",
            "CrossAttnUpBlockSpatioTemporal",
            "CrossAttnUpBlockSpatioTemporal",
            "CrossAttnUpBlockSpatioTemporal",
        ),
        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
        addition_time_embed_dim: int = 256,
        projection_class_embeddings_input_dim: int = 768,
        layers_per_block: Union[int, Tuple[int]] = 2,
        cross_attention_dim: Union[int, Tuple[int]] = 2048,
        transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
        num_attention_heads: Union[int, Tuple[int]] = (5, 10, 10, 20),
        num_frames: int = 25,
    ):
        super().__init__()

        self.sample_size = sample_size

        # Check inputs
        if len(down_block_types) != len(up_block_types):
            raise ValueError(
                f"Must provide the same number of `down_block_types` as `up_block_types`. "
                f"`down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
            )

        if len(block_out_channels) != len(down_block_types):
            raise ValueError(
                f"Must provide the same number of `block_out_channels` as `down_block_types`. "
                f"`block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
            )

        if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
            raise ValueError(
                f"Must provide the same number of `num_attention_heads` as `down_block_types`. "
                f"`num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
            )

        if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
            raise ValueError(
                f"Must provide the same number of `cross_attention_dim` as `down_block_types`. "
                f"`cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
            )

        if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
            raise ValueError(
                f"Must provide the same number of `layers_per_block` as `down_block_types`. "
                f"`layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
            )

        # input
        self.conv_in = nn.Conv2d(
            in_channels,
            block_out_channels[0],
            kernel_size=3,
            padding=1,
        )

        # time
        time_embed_dim = block_out_channels[0] * 4

        self.time_proj = Timesteps(block_out_channels[0], True, downscale_freq_shift=0)
        timestep_input_dim = block_out_channels[0]

        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)

        self.add_time_proj = Timesteps(addition_time_embed_dim, True, downscale_freq_shift=0)
        self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)

        self.down_blocks = nn.ModuleList([])
        self.up_blocks = nn.ModuleList([])

        if isinstance(num_attention_heads, int):
            num_attention_heads = (num_attention_heads,) * len(down_block_types)

        if isinstance(cross_attention_dim, int):
            cross_attention_dim = (cross_attention_dim,) * len(down_block_types)

        if isinstance(layers_per_block, int):
            layers_per_block = [layers_per_block] * len(down_block_types)

        if isinstance(transformer_layers_per_block, int):
            transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)

        blocks_time_embed_dim = time_embed_dim

        # down
        output_channel = block_out_channels[0]
        for i, down_block_type in enumerate(down_block_types):
            input_channel = output_channel
            output_channel = block_out_channels[i]
            is_final_block = i == len(block_out_channels) - 1

            down_block = get_down_block(
                down_block_type,
                num_layers=layers_per_block[i],
                transformer_layers_per_block=transformer_layers_per_block[i],
                in_channels=input_channel,
                out_channels=output_channel,
                temb_channels=blocks_time_embed_dim,
                add_downsample=not is_final_block,
                resnet_eps=1e-5,
                cross_attention_dim=cross_attention_dim[i],
                num_attention_heads=num_attention_heads[i],
                resnet_act_fn="silu",
            )
            self.down_blocks.append(down_block)

        # mid
        self.mid_block = UNetMidBlockSpatioTemporal(
            block_out_channels[-1],
            temb_channels=blocks_time_embed_dim,
            transformer_layers_per_block=transformer_layers_per_block[-1],
            cross_attention_dim=cross_attention_dim[-1],
            num_attention_heads=num_attention_heads[-1],
        )

        # count how many layers upsample the images
        self.num_upsamplers = 0

        # up
        reversed_block_out_channels = list(reversed(block_out_channels))
        reversed_num_attention_heads = list(reversed(num_attention_heads))
        reversed_layers_per_block = list(reversed(layers_per_block))
        reversed_cross_attention_dim = list(reversed(cross_attention_dim))
        reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block))

        output_channel = reversed_block_out_channels[0]
        for i, up_block_type in enumerate(up_block_types):
            is_final_block = i == len(block_out_channels) - 1

            prev_output_channel = output_channel
            output_channel = reversed_block_out_channels[i]
            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]

            # add upsample block for all BUT final layer
            if not is_final_block:
                add_upsample = True
                self.num_upsamplers += 1
            else:
                add_upsample = False

            up_block = get_up_block(
                up_block_type,
                num_layers=reversed_layers_per_block[i] + 1,
                transformer_layers_per_block=reversed_transformer_layers_per_block[i],
                in_channels=input_channel,
                out_channels=output_channel,
                prev_output_channel=prev_output_channel,
                temb_channels=blocks_time_embed_dim,
                add_upsample=add_upsample,
                resnet_eps=1e-5,
                resolution_idx=i,
                cross_attention_dim=reversed_cross_attention_dim[i],
                num_attention_heads=reversed_num_attention_heads[i],
                resnet_act_fn="silu",
            )
            self.up_blocks.append(up_block)
            prev_output_channel = output_channel

        # out
        self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=32, eps=1e-5)
        self.conv_act = nn.SiLU()

        self.conv_out = nn.Conv2d(
            block_out_channels[0],
            out_channels,
            kernel_size=3,
            padding=1,
        )

    @property
    def attn_processors(self) -> Dict[str, AttentionProcessor]:
        r"""
        Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the model,
            indexed by their weight name.
        """
        # set recursively
        processors = {}

        def fn_recursive_add_processors(
            name: str,
            module: torch.nn.Module,
            processors: Dict[str, AttentionProcessor],
        ):
            if hasattr(module, "get_processor"):
                processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)

            for sub_name, child in module.named_children():
                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)

            return processors

        for name, module in self.named_children():
            fn_recursive_add_processors(name, module, processors)

        return processors

    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
        r"""
        Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                The instantiated processor class or a dictionary of processor classes that will be set as the processor
                for **all** `Attention` layers.

                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                processor. This is strongly recommended when setting trainable attention processors.

        """
        count = len(self.attn_processors.keys())

        if isinstance(processor, dict) and len(processor) != count:
            raise ValueError(
                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
            )

        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
            if hasattr(module, "set_processor"):
                if not isinstance(processor, dict):
                    module.set_processor(processor)
                else:
                    module.set_processor(processor.pop(f"{name}.processor"))

            for sub_name, child in module.named_children():
                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)

        for name, module in self.named_children():
            fn_recursive_attn_processor(name, module, processor)

    def set_default_attn_processor(self):
        """
        Disables custom attention processors and sets the default attention implementation.
        """
        if all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
            processor = AttnProcessor()
        else:
            raise ValueError(
                f"Cannot call `set_default_attn_processor` "
                f"when attention processors are of type {next(iter(self.attn_processors.values()))}"
            )

        self.set_attn_processor(processor)

    def _set_gradient_checkpointing(self, module, value=False):
        if hasattr(module, "gradient_checkpointing"):
            module.gradient_checkpointing = value

    # Copied from diffusers.models.unets.unet_3d_condition.UNet3DConditionModel.enable_forward_chunking
    def enable_forward_chunking(self, chunk_size: Optional[int] = None, dim: int = 0) -> None:
        """
        Sets the attention processor to use [feed forward
        chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).

        Parameters:
            chunk_size (`int`, *optional*):
                The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
                over each tensor of dim=`dim`.
            dim (`int`, *optional*, defaults to `0`):
                The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
                or dim=1 (sequence length).
        """
        if dim not in [0, 1]:
            raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}")

        # By default chunk size is 1
        chunk_size = chunk_size or 1

        def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int):
            if hasattr(module, "set_chunk_feed_forward"):
                module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)

            for child in module.children():
                fn_recursive_feed_forward(child, chunk_size, dim)

        for module in self.children():
            fn_recursive_feed_forward(module, chunk_size, dim)

    def forward(
        self,
        sample: torch.FloatTensor,
        timestep: Union[torch.Tensor, float, int],
        encoder_hidden_states: torch.Tensor,
        added_time_ids: torch.Tensor,
        pose_latents: torch.Tensor = None,
        image_only_indicator: bool = False,
        return_dict: bool = True,
        obj_track_latents: torch.Tensor = None,
    ) -> Union[UNetSpatioTemporalConditionOutput, Tuple]:
        r"""
        The [`UNetSpatioTemporalConditionModel`] forward method.

        Args:
            sample (`torch.FloatTensor`):
                The noisy input tensor with the following shape `(batch, num_frames, channel, height, width)`.
            timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
            encoder_hidden_states (`torch.FloatTensor`):
                The encoder hidden states with shape `(batch, sequence_length, cross_attention_dim)`.
            added_time_ids: (`torch.FloatTensor`):
                The additional time ids with shape `(batch, num_additional_ids)`. These are encoded with sinusoidal
                embeddings and added to the time embeddings.
            pose_latents: (`torch.FloatTensor`):
                The additional latents for pose sequences.
            image_only_indicator (`bool`, *optional*, defaults to `False`):
                Whether or not training with all images.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`]
                instead of a plain tuple.
        Returns:
            [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] or `tuple`:
                If `return_dict` is True,
                an [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] is returned,
                otherwise a `tuple` is returned where the first element is the sample tensor.
        """
        # 1. time
        timesteps = timestep
        if not torch.is_tensor(timesteps):
            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
            # This would be a good case for the `match` statement (Python 3.10+)
            is_mps = sample.device.type == "mps"
            if isinstance(timestep, float):
                dtype = torch.float32 if is_mps else torch.float64
            else:
                dtype = torch.int32 if is_mps else torch.int64
            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
        elif len(timesteps.shape) == 0:
            timesteps = timesteps[None].to(sample.device)

        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
        batch_size, num_frames = sample.shape[:2]
        timesteps = timesteps.expand(batch_size)

        t_emb = self.time_proj(timesteps)

        # `Timesteps` does not contain any weights and will always return f32 tensors
        # but time_embedding might actually be running in fp16. so we need to cast here.
        # there might be better ways to encapsulate this.
        t_emb = t_emb.to(dtype=torch.float16)

        emb = self.time_embedding(t_emb)

        time_embeds = self.add_time_proj(added_time_ids.flatten())
        time_embeds = time_embeds.reshape((batch_size, -1))
        time_embeds = time_embeds.to(emb.dtype)
        aug_emb = self.add_embedding(time_embeds)
        emb = emb + aug_emb

        # Flatten the batch and frames dimensions
        # sample: [batch, frames, channels, height, width] -> [batch * frames, channels, height, width]
        sample = sample.flatten(0, 1)
        # Repeat the embeddings num_video_frames times
        # emb: [batch, channels] -> [batch * frames, channels]
        emb = emb.repeat_interleave(num_frames, dim=0)
        # encoder_hidden_states: [batch, 1, channels] -> [batch * frames, 1, channels]
        encoder_hidden_states = encoder_hidden_states.repeat_interleave(num_frames, dim=0)

        # 2. pre-process
        sample = self.conv_in(sample)
        if pose_latents is not None:
            sample = sample + pose_latents
        if obj_track_latents is not None:
            sample = sample + obj_track_latents
        image_only_indicator = torch.ones(batch_size, num_frames, dtype=sample.dtype, device=sample.device) \
            if image_only_indicator else torch.zeros(batch_size, num_frames, dtype=sample.dtype, device=sample.device)

        down_block_res_samples = (sample,)
        for downsample_block in self.down_blocks:
            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
                sample, res_samples = downsample_block(
                    hidden_states=sample,
                    temb=emb,
                    encoder_hidden_states=encoder_hidden_states,
                    image_only_indicator=image_only_indicator,
                )
            else:
                sample, res_samples = downsample_block(
                    hidden_states=sample,
                    temb=emb,
                    image_only_indicator=image_only_indicator,
                )

            down_block_res_samples += res_samples

        # 4. mid
        sample = self.mid_block(
            hidden_states=sample,
            temb=emb,
            encoder_hidden_states=encoder_hidden_states,
            image_only_indicator=image_only_indicator,
        )

        # 5. up
        for i, upsample_block in enumerate(self.up_blocks):
            res_samples = down_block_res_samples[-len(upsample_block.resnets):]
            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]

            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
                sample = upsample_block(
                    hidden_states=sample,
                    temb=emb,
                    res_hidden_states_tuple=res_samples,
                    encoder_hidden_states=encoder_hidden_states,
                    image_only_indicator=image_only_indicator,
                )
            else:
                sample = upsample_block(
                    hidden_states=sample,
                    temb=emb,
                    res_hidden_states_tuple=res_samples,
                    image_only_indicator=image_only_indicator,
                )

        # 6. post-process
        sample = self.conv_norm_out(sample)
        sample = self.conv_act(sample)
        sample = self.conv_out(sample)

        # 7. Reshape back to original shape
        sample = sample.reshape(batch_size, num_frames, *sample.shape[1:])

        if not return_dict:
            return (sample,)

        return UNetSpatioTemporalConditionOutput(sample=sample)
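The forward pass adds `pose_latents` and `obj_track_latents` directly onto the `conv_in` activations, so both must already be in the flattened [batch*frames, 320, h, w] layout produced by PoseNet/TrackNet at the latent resolution. A rough interface sketch follows; all sizes, the semantics of the three `added_time_ids`, and the fp16/CUDA placement are illustrative assumptions, not values taken from this repo's config:

import torch
from anchorcrafter.modules.unet import UNetSpatioTemporalConditionModel

# Assumed: a model constructed or loaded elsewhere and moved to GPU in half precision, e.g.
# unet = UNetSpatioTemporalConditionModel.from_pretrained("...").to("cuda", torch.float16)
def denoise_step(unet, sample, encoder_hidden_states, added_time_ids, pose_latents, obj_track_latents):
    # sample:                [batch, frames, 8, h, w]  noisy latents + image-condition latents
    # encoder_hidden_states: [batch, tokens, cross_attention_dim]
    # added_time_ids:        [batch, 3]                three ids * 256-dim embedding = 768 projection input
    # pose_latents / obj_track_latents: [batch*frames, 320, h, w], added right after conv_in
    return unet(
        sample,
        timestep=torch.tensor([10], device=sample.device),
        encoder_hidden_states=encoder_hidden_states,
        added_time_ids=added_time_ids,
        pose_latents=pose_latents,
        obj_track_latents=obj_track_latents,
    ).sample   # [batch, frames, 4, h, w]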
anchorcrafter/pipelines/pipeline.py
ADDED
|
@@ -0,0 +1,739 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import inspect
|
| 2 |
+
import math
|
| 3 |
+
import os.path
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
from typing import Callable, Dict, List, Optional, Union
|
| 6 |
+
|
| 7 |
+
import PIL.Image
|
| 8 |
+
import einops
|
| 9 |
+
import numpy as np
|
| 10 |
+
import torch
|
| 11 |
+
from diffusers.image_processor import VaeImageProcessor, PipelineImageInput
|
| 12 |
+
from diffusers.models import AutoencoderKLTemporalDecoder, UNetSpatioTemporalConditionModel
|
| 13 |
+
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
|
| 14 |
+
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import retrieve_timesteps
|
| 15 |
+
from diffusers.pipelines.stable_video_diffusion.pipeline_stable_video_diffusion \
|
| 16 |
+
import _resize_with_antialiasing, _append_dims
|
| 17 |
+
from diffusers.schedulers import EulerDiscreteScheduler
|
| 18 |
+
from diffusers.utils import BaseOutput, logging
|
| 19 |
+
from diffusers.utils.torch_utils import is_compiled_module, randn_tensor
|
| 20 |
+
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
|
| 21 |
+
from anchorcrafter.modules.track_net import TrackNet
|
| 22 |
+
import torch.nn as nn
|
| 23 |
+
from transformers import AutoImageProcessor, AutoModel
|
| 24 |
+
import torch.nn.functional as F
|
| 25 |
+
from torchvision.transforms.functional import pil_to_tensor
|
| 26 |
+
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
| 27 |
+
import constants
|
| 28 |
+
|
| 29 |
+
from ..modules.obj_proj_net import ObjProjNet
|
| 30 |
+
from ..modules.obj_attn_net import ObjAttnNet
|
| 31 |
+
from ..modules.pose_net import PoseNet
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def _append_dims(x, target_dims):
|
| 35 |
+
"""Appends dimensions to the end of a tensor until it has target_dims dimensions."""
|
| 36 |
+
dims_to_append = target_dims - x.ndim
|
| 37 |
+
if dims_to_append < 0:
|
| 38 |
+
raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less")
|
| 39 |
+
return x[(...,) + (None,) * dims_to_append]
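# A minimal sketch of what this helper does (shapes are illustrative assumptions, not values from this file):
#   scale = torch.rand(2, 16)                 # e.g. per-frame guidance weights, (batch, frames)
#   lat   = torch.rand(2, 16, 4, 72, 128)     # latents, ndim = 5
#   scale = _append_dims(scale, lat.ndim)     # -> shape (2, 16, 1, 1, 1), broadcastable against lat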
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid
|
| 43 |
+
def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"):
|
| 44 |
+
batch_size, channels, num_frames, height, width = video.shape
|
| 45 |
+
outputs = []
|
| 46 |
+
for batch_idx in range(batch_size):
|
| 47 |
+
batch_vid = video[batch_idx].permute(1, 0, 2, 3)
|
| 48 |
+
batch_output = processor.postprocess(batch_vid, output_type)
|
| 49 |
+
|
| 50 |
+
outputs.append(batch_output)
|
| 51 |
+
|
| 52 |
+
if output_type == "np":
|
| 53 |
+
outputs = np.stack(outputs)
|
| 54 |
+
|
| 55 |
+
elif output_type == "pt":
|
| 56 |
+
outputs = torch.stack(outputs)
|
| 57 |
+
|
| 58 |
+
elif not output_type == "pil":
|
| 59 |
+
raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil]")
|
| 60 |
+
|
| 61 |
+
return outputs
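# Shape summary for the conversion above (a sketch; layouts follow the usual VaeImageProcessor behavior):
#   video:  (batch, channels, num_frames, height, width) tensor from the VAE decoder
#   "np"  -> np.ndarray of shape (batch, num_frames, height, width, channels)
#   "pt"  -> torch.Tensor of shape (batch, num_frames, channels, height, width)
#   "pil" -> list (length batch) of lists of PIL.Image.Image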
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
@dataclass
|
| 65 |
+
class AnchorCrafterPipelineOutput(BaseOutput):
|
| 66 |
+
r"""
|
| 67 |
+
Output class for the AnchorCrafter pipeline.
|
| 68 |
+
|
| 69 |
+
Args:
|
| 70 |
+
frames (`[List[List[PIL.Image.Image]]`, `np.ndarray`, `torch.Tensor`]):
|
| 71 |
+
List of denoised PIL images of length `batch_size` or numpy array or torch tensor of shape `(batch_size,
|
| 72 |
+
num_frames, height, width, num_channels)`.
|
| 73 |
+
"""
|
| 74 |
+
|
| 75 |
+
frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.Tensor]
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
class AnchorCrafterPipeline(DiffusionPipeline):
|
| 79 |
+
r"""
|
| 80 |
+
Pipeline to generate video from an input image using Stable Video Diffusion.
|
| 81 |
+
|
| 82 |
+
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
|
| 83 |
+
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
|
| 84 |
+
|
| 85 |
+
Args:
|
| 86 |
+
vae ([`AutoencoderKLTemporalDecoder`]):
|
| 87 |
+
Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
|
| 88 |
+
image_encoder ([`~transformers.CLIPVisionModelWithProjection`]):
|
| 89 |
+
Frozen CLIP image-encoder ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K]
|
| 90 |
+
(https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)).
|
| 91 |
+
unet ([`UNetSpatioTemporalConditionModel`]):
|
| 92 |
+
A `UNetSpatioTemporalConditionModel` to denoise the encoded image latents.
|
| 93 |
+
scheduler ([`EulerDiscreteScheduler`]):
|
| 94 |
+
A scheduler to be used in combination with `unet` to denoise the encoded image latents.
|
| 95 |
+
feature_extractor ([`~transformers.CLIPImageProcessor`]):
|
| 96 |
+
A `CLIPImageProcessor` to extract features from generated images.
|
| 97 |
+
dino_feature_extractor(['AutoImageProcessor']):
|
| 98 |
+
An `AutoImageProcessor` to extract DINOv2 features from object reference images.
|
| 99 |
+
pose_net ([`PoseNet`]):
|
| 100 |
+
A network that injects human pose signals into the UNet.
|
| 101 |
+
track_net (['TrackNet']):
|
| 102 |
+
A network that injects object trajectory (tracking) signals into the UNet.
|
| 103 |
+
obj_proj_net (['ObjProjNet']):
|
| 104 |
+
A projection network with linear layers to extract object class-token features.
|
| 105 |
+
obj_attn_net (['ObjAttnNet']):
|
| 106 |
+
A self-attention network to extract object patch features.
|
| 107 |
+
|
| 108 |
+
"""
|
| 109 |
+
|
| 110 |
+
model_cpu_offload_seq = "image_encoder->unet->vae"
|
| 111 |
+
_callback_tensor_inputs = ["latents"]
|
| 112 |
+
|
| 113 |
+
def __init__(
|
| 114 |
+
self,
|
| 115 |
+
vae: AutoencoderKLTemporalDecoder,
|
| 116 |
+
image_encoder: CLIPVisionModelWithProjection,
|
| 117 |
+
obj_image_encoder: AutoModel,
|
| 118 |
+
unet: UNetSpatioTemporalConditionModel,
|
| 119 |
+
scheduler: EulerDiscreteScheduler,
|
| 120 |
+
feature_extractor: CLIPImageProcessor,
|
| 121 |
+
dino_feature_extractor: AutoImageProcessor,
|
| 122 |
+
pose_net: PoseNet,
|
| 123 |
+
track_net: TrackNet,
|
| 124 |
+
obj_proj_net: ObjProjNet,
|
| 125 |
+
obj_attn_net: ObjAttnNet
|
| 126 |
+
):
|
| 127 |
+
super().__init__()
|
| 128 |
+
|
| 129 |
+
self.register_modules(
|
| 130 |
+
vae=vae,
|
| 131 |
+
image_encoder=image_encoder,
|
| 132 |
+
obj_image_encoder=obj_image_encoder,
|
| 133 |
+
unet=unet,
|
| 134 |
+
scheduler=scheduler,
|
| 135 |
+
feature_extractor=feature_extractor,
|
| 136 |
+
dino_feature_extractor=dino_feature_extractor,
|
| 137 |
+
pose_net=pose_net,
|
| 138 |
+
track_net=track_net,
|
| 139 |
+
obj_proj_net=obj_proj_net,
|
| 140 |
+
obj_attn_net=obj_attn_net
|
| 141 |
+
)
|
| 142 |
+
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
|
| 143 |
+
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
|
| 144 |
+
|
| 145 |
+
def _encode_image(
|
| 146 |
+
self,
|
| 147 |
+
image: PipelineImageInput,
|
| 148 |
+
obj_pixels: PipelineImageInput,
|
| 149 |
+
device: Union[str, torch.device],
|
| 150 |
+
num_videos_per_prompt: int,
|
| 151 |
+
do_classifier_free_guidance: bool):
|
| 152 |
+
dtype = next(self.image_encoder.parameters()).dtype
|
| 153 |
+
# print(image)
|
| 154 |
+
# print(obj_pixels)
|
| 155 |
+
if not isinstance(image, torch.Tensor):
|
| 156 |
+
image = self.image_processor.pil_to_numpy(image)
|
| 157 |
+
image = self.image_processor.numpy_to_pt(image)
|
| 158 |
+
|
| 159 |
+
# We normalize the image before resizing to match with the original implementation.
|
| 160 |
+
# Then we unnormalize it after resizing.
|
| 161 |
+
image = image * 2.0 - 1.0
|
| 162 |
+
image = _resize_with_antialiasing(image, (224, 224))
|
| 163 |
+
image = (image + 1.0) / 2.0
|
| 164 |
+
|
| 165 |
+
# Normalize the image for CLIP input
|
| 166 |
+
image = self.feature_extractor(
|
| 167 |
+
images=image,
|
| 168 |
+
do_normalize=True,
|
| 169 |
+
do_center_crop=False,
|
| 170 |
+
do_resize=False,
|
| 171 |
+
do_rescale=False,
|
| 172 |
+
return_tensors="pt",
|
| 173 |
+
).pixel_values
|
| 174 |
+
|
| 175 |
+
image = image.to(device=device, dtype=dtype)
|
| 176 |
+
image=image.to(dtype=torch.float16)
|
| 177 |
+
|
| 178 |
+
image_embeddings = self.image_encoder(image).image_embeds
|
| 179 |
+
|
| 180 |
+
obj_all_embeddings = None
|
| 181 |
+
for obj in obj_pixels:
|
| 182 |
+
if not isinstance(obj, torch.Tensor):
|
| 183 |
+
obj = self.image_processor.pil_to_numpy(obj)
|
| 184 |
+
obj = self.image_processor.numpy_to_pt(obj)
|
| 185 |
+
|
| 186 |
+
# We normalize the image before resizing to match with the original implementation.
|
| 187 |
+
# Then we unnormalize it after resizing.
|
| 188 |
+
obj = obj * 2.0 - 1.0
|
| 189 |
+
obj = _resize_with_antialiasing(obj, (518, 518))
|
| 190 |
+
obj = (obj + 1.0) / 2.0
|
| 191 |
+
|
| 192 |
+
# Normalize the object image for DINOv2 input
|
| 193 |
+
obj = self.dino_feature_extractor(
|
| 194 |
+
images=obj,
|
| 195 |
+
do_normalize=True,
|
| 196 |
+
do_center_crop=False,
|
| 197 |
+
do_resize=False,
|
| 198 |
+
do_rescale=False,
|
| 199 |
+
return_tensors="pt",
|
| 200 |
+
).pixel_values
|
| 201 |
+
|
| 202 |
+
obj = obj.to(device=device, dtype=self.obj_image_encoder.dtype)
|
| 203 |
+
print("[dino feature extractor] output obj image:", obj.shape) # torch.Size([1, 3, 518, 518])
|
| 204 |
+
|
| 205 |
+
obj_pixels_embeddings = self.obj_image_encoder(obj).last_hidden_state # torch.Size([1, 257, 768])
|
| 206 |
+
#obj_pixels_embeddings = obj_pixels_embeddings[:, 0, :] # 1,768
|
| 207 |
+
|
| 208 |
+
if obj_all_embeddings is None:
|
| 209 |
+
obj_all_embeddings = obj_pixels_embeddings
|
| 210 |
+
else:
|
| 211 |
+
obj_all_embeddings = torch.concat((obj_all_embeddings, obj_pixels_embeddings), dim=1)
|
| 212 |
+
image_embeddings = image_embeddings.unsqueeze(1)
|
| 213 |
+
|
| 214 |
+
# duplicate image embeddings for each generation per prompt, using mps friendly method
|
| 215 |
+
bs_embed, seq_len, _ = image_embeddings.shape
|
| 216 |
+
image_embeddings = image_embeddings.repeat(1, num_videos_per_prompt, 1)
|
| 217 |
+
image_embeddings = image_embeddings.view(bs_embed * num_videos_per_prompt, seq_len, -1)
|
| 218 |
+
print("obj_all_embeddings", obj_all_embeddings)
|
| 219 |
+
return image_embeddings, obj_all_embeddings
|
| 220 |
+
|
| 221 |
+
def _encode_vae_image(
|
| 222 |
+
self,
|
| 223 |
+
image: torch.Tensor,
|
| 224 |
+
device: Union[str, torch.device],
|
| 225 |
+
num_videos_per_prompt: int,
|
| 226 |
+
do_classifier_free_guidance: bool,
|
| 227 |
+
):
|
| 228 |
+
image = image.to(device=device, dtype=self.vae.dtype)
|
| 229 |
+
|
| 230 |
+
# image_latents = torch.zeros((image.shape[0], 4, 96, 64)).to(device=device, dtype=self.vae.dtype)
|
| 231 |
+
image_latents = torch.zeros((image.shape[0], 4, 128, 72)).to(device=device, dtype=self.vae.dtype)
|
| 232 |
+
for i in range(0, image.shape[0], 16):
|
| 233 |
+
if i + 16 > image.shape[0]:
|
| 234 |
+
image_latents[i:] = self.vae.encode(image[i:]).latent_dist.mode()
|
| 235 |
+
else:
|
| 236 |
+
image_latents[i:i + 16] = self.vae.encode(image[i:i + 16]).latent_dist.mode()
|
| 237 |
+
|
| 238 |
+
if do_classifier_free_guidance:
|
| 239 |
+
negative_image_latents = torch.zeros_like(image_latents)
|
| 240 |
+
|
| 241 |
+
# For classifier free guidance, we need to do two forward passes.
|
| 242 |
+
# Here we concatenate the unconditional and text embeddings into a single batch
|
| 243 |
+
# to avoid doing two forward passes
|
| 244 |
+
image_latents = torch.cat([negative_image_latents, image_latents])
|
| 245 |
+
|
| 246 |
+
# duplicate image_latents for each generation per prompt, using mps friendly method
|
| 247 |
+
image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1)
|
| 248 |
+
|
| 249 |
+
return image_latents
|
| 250 |
+
|
| 251 |
+
def _get_add_time_ids(
|
| 252 |
+
self,
|
| 253 |
+
fps: int,
|
| 254 |
+
motion_bucket_id: int,
|
| 255 |
+
noise_aug_strength: float,
|
| 256 |
+
dtype: torch.dtype,
|
| 257 |
+
batch_size: int,
|
| 258 |
+
num_videos_per_prompt: int,
|
| 259 |
+
do_classifier_free_guidance: bool,
|
| 260 |
+
):
|
| 261 |
+
add_time_ids = [fps, motion_bucket_id, noise_aug_strength]
|
| 262 |
+
|
| 263 |
+
passed_add_embed_dim = self.unet.config.addition_time_embed_dim * len(add_time_ids)
|
| 264 |
+
expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
|
| 265 |
+
|
| 266 |
+
if expected_add_embed_dim != passed_add_embed_dim:
|
| 267 |
+
raise ValueError(
|
| 268 |
+
f"Model expects an added time embedding vector of length {expected_add_embed_dim}, " \
|
| 269 |
+
f"but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. " \
|
| 270 |
+
f"Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
|
| 271 |
+
)
|
| 272 |
+
|
| 273 |
+
add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
|
| 274 |
+
add_time_ids = add_time_ids.repeat(batch_size * num_videos_per_prompt, 1)
|
| 275 |
+
|
| 276 |
+
if do_classifier_free_guidance:
|
| 277 |
+
add_time_ids = torch.cat([add_time_ids, add_time_ids])
|
| 278 |
+
|
| 279 |
+
return add_time_ids
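# Illustrative values using this pipeline's defaults (fps is already reduced by 1 in __call__):
#   add_time_ids = self._get_add_time_ids(6, 127, 0.02, torch.float16, 1, 1, True)
#   -> tensor of shape (2, 3): [fps, motion_bucket_id, noise_aug_strength], duplicated along dim 0 for CFG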
|
| 280 |
+
|
| 281 |
+
def decode_latents(
|
| 282 |
+
self,
|
| 283 |
+
latents: torch.Tensor,
|
| 284 |
+
num_frames: int,
|
| 285 |
+
decode_chunk_size: int = 8):
|
| 286 |
+
# [batch, frames, channels, height, width] -> [batch*frames, channels, height, width]
|
| 287 |
+
latents = latents.flatten(0, 1)
|
| 288 |
+
|
| 289 |
+
latents = 1 / self.vae.config.scaling_factor * latents
|
| 290 |
+
|
| 291 |
+
forward_vae_fn = self.vae._orig_mod.forward if is_compiled_module(self.vae) else self.vae.forward
|
| 292 |
+
accepts_num_frames = "num_frames" in set(inspect.signature(forward_vae_fn).parameters.keys())
|
| 293 |
+
|
| 294 |
+
# decode decode_chunk_size frames at a time to avoid OOM
|
| 295 |
+
frames = []
|
| 296 |
+
for i in range(0, latents.shape[0], decode_chunk_size):
|
| 297 |
+
num_frames_in = latents[i: i + decode_chunk_size].shape[0]
|
| 298 |
+
decode_kwargs = {}
|
| 299 |
+
if accepts_num_frames:
|
| 300 |
+
# we only pass num_frames_in if it's expected
|
| 301 |
+
decode_kwargs["num_frames"] = num_frames_in
|
| 302 |
+
|
| 303 |
+
frame = self.vae.decode(latents[i: i + decode_chunk_size], **decode_kwargs).sample
|
| 304 |
+
frames.append(frame.cpu())
|
| 305 |
+
frames = torch.cat(frames, dim=0)
|
| 306 |
+
|
| 307 |
+
# [batch*frames, channels, height, width] -> [batch, channels, frames, height, width]
|
| 308 |
+
frames = frames.reshape(-1, num_frames, *frames.shape[1:]).permute(0, 2, 1, 3, 4)
|
| 309 |
+
|
| 310 |
+
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
|
| 311 |
+
frames = frames.float()
|
| 312 |
+
return frames
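# Chunking sketch: with decode_chunk_size=8 and 16 frames of latents, the VAE decoder is called
# twice (frames 0-7, then 8-15), which bounds peak memory during decoding.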
|
| 313 |
+
|
| 314 |
+
def check_inputs(self, image, height, width):
|
| 315 |
+
if (
|
| 316 |
+
not isinstance(image, torch.Tensor)
|
| 317 |
+
and not isinstance(image, PIL.Image.Image)
|
| 318 |
+
and not isinstance(image, list)
|
| 319 |
+
):
|
| 320 |
+
raise ValueError(
|
| 321 |
+
"`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
|
| 322 |
+
f" {type(image)}"
|
| 323 |
+
)
|
| 324 |
+
|
| 325 |
+
if height % 8 != 0 or width % 8 != 0:
|
| 326 |
+
raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
|
| 327 |
+
|
| 328 |
+
def prepare_latents(
|
| 329 |
+
self,
|
| 330 |
+
batch_size: int,
|
| 331 |
+
num_frames: int,
|
| 332 |
+
num_channels_noise_latents: int,
|
| 333 |
+
height: int,
|
| 334 |
+
width: int,
|
| 335 |
+
dtype: torch.dtype,
|
| 336 |
+
device: Union[str, torch.device],
|
| 337 |
+
generator: torch.Generator,
|
| 338 |
+
latents: Optional[torch.Tensor] = None,
|
| 339 |
+
):
|
| 340 |
+
shape = (
|
| 341 |
+
batch_size,
|
| 342 |
+
num_frames,
|
| 343 |
+
num_channels_noise_latents,
|
| 344 |
+
height // self.vae_scale_factor,
|
| 345 |
+
width // self.vae_scale_factor,
|
| 346 |
+
)
|
| 347 |
+
if isinstance(generator, list) and len(generator) != batch_size:
|
| 348 |
+
raise ValueError(
|
| 349 |
+
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
| 350 |
+
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
| 351 |
+
)
|
| 352 |
+
|
| 353 |
+
if latents is None:
|
| 354 |
+
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
|
| 355 |
+
else:
|
| 356 |
+
latents = latents.to(device)
|
| 357 |
+
|
| 358 |
+
# scale the initial noise by the standard deviation required by the scheduler
|
| 359 |
+
latents = latents * self.scheduler.init_noise_sigma
|
| 360 |
+
return latents
|
| 361 |
+
|
| 362 |
+
@property
|
| 363 |
+
def guidance_scale(self):
|
| 364 |
+
return self._guidance_scale
|
| 365 |
+
|
| 366 |
+
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
| 367 |
+
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
|
| 368 |
+
# corresponds to doing no classifier free guidance.
|
| 369 |
+
@property
|
| 370 |
+
def do_classifier_free_guidance(self):
|
| 371 |
+
if isinstance(self.guidance_scale, (int, float)):
|
| 372 |
+
return self.guidance_scale > 1
|
| 373 |
+
return self.guidance_scale.max() > 1
|
| 374 |
+
|
| 375 |
+
@property
|
| 376 |
+
def num_timesteps(self):
|
| 377 |
+
return self._num_timesteps
|
| 378 |
+
|
| 379 |
+
def prepare_extra_step_kwargs(self, generator, eta):
|
| 380 |
+
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
|
| 381 |
+
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
|
| 382 |
+
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
|
| 383 |
+
# and should be between [0, 1]
|
| 384 |
+
|
| 385 |
+
accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
| 386 |
+
extra_step_kwargs = {}
|
| 387 |
+
if accepts_eta:
|
| 388 |
+
extra_step_kwargs["eta"] = eta
|
| 389 |
+
|
| 390 |
+
# check if the scheduler accepts generator
|
| 391 |
+
accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
| 392 |
+
if accepts_generator:
|
| 393 |
+
extra_step_kwargs["generator"] = generator
|
| 394 |
+
return extra_step_kwargs
|
| 395 |
+
|
| 396 |
+
@torch.no_grad()
|
| 397 |
+
def __call__(
|
| 398 |
+
self,
|
| 399 |
+
image_pixels: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor],
|
| 400 |
+
pose_pixels: Union[torch.FloatTensor],
|
| 401 |
+
obj_pixels: Union[torch.FloatTensor],
|
| 402 |
+
obj_track_pixels: Union[torch.FloatTensor],
|
| 403 |
+
hand_pixels: Union[torch.FloatTensor],
|
| 404 |
+
height: int = 576,
|
| 405 |
+
width: int = 1024,
|
| 406 |
+
num_frames: Optional[int] = None,
|
| 407 |
+
tile_size: Optional[int] = 16,
|
| 408 |
+
tile_overlap: Optional[int] = 4,
|
| 409 |
+
num_inference_steps: int = 25,
|
| 410 |
+
min_guidance_scale: float = 1.0,
|
| 411 |
+
max_guidance_scale: float = 3.0,
|
| 412 |
+
fps: int = 7,
|
| 413 |
+
motion_bucket_id: int = 127,
|
| 414 |
+
noise_aug_strength: float = 0.02,
|
| 415 |
+
image_only_indicator: bool = False,
|
| 416 |
+
decode_chunk_size: Optional[int] = None,
|
| 417 |
+
num_videos_per_prompt: Optional[int] = 1,
|
| 418 |
+
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
|
| 419 |
+
latents: Optional[torch.FloatTensor] = None,
|
| 420 |
+
output_type: Optional[str] = "pil",
|
| 421 |
+
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
|
| 422 |
+
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
| 423 |
+
return_dict: bool = True,
|
| 424 |
+
device: Union[str, torch.device] = None,
|
| 425 |
+
visual_output: bool = False,
|
| 426 |
+
):
|
| 427 |
+
r"""
|
| 428 |
+
Pipeline execution function for video generation.
|
| 429 |
+
|
| 430 |
+
Args:
|
| 431 |
+
image_pixels: Input image(s) for guidance
|
| 432 |
+
pose_pixels: Pose data tensor
|
| 433 |
+
obj_pixels: Object reference tensor
|
| 434 |
+
obj_track_pixels: Object tracking data tensor
|
| 435 |
+
hand_pixels: Hand tracking data tensor
|
| 436 |
+
height: Output video height
|
| 437 |
+
width: Output video width
|
| 438 |
+
num_frames: Number of frames to generate
|
| 439 |
+
tile_size: Processing tile size
|
| 440 |
+
tile_overlap: Tile overlap size
|
| 441 |
+
num_inference_steps: Number of denoising steps
|
| 442 |
+
min_guidance_scale: Minimum CFG scale
|
| 443 |
+
max_guidance_scale: Maximum CFG scale
|
| 444 |
+
fps: Frames per second
|
| 445 |
+
motion_bucket_id: Motion control parameter
|
| 446 |
+
noise_aug_strength: Noise augmentation strength
|
| 447 |
+
image_only_indicator: Image-only processing flag
|
| 448 |
+
decode_chunk_size: Frame decoding chunk size
|
| 449 |
+
num_videos_per_prompt: Videos per prompt
|
| 450 |
+
generator: Random number generator
|
| 451 |
+
latents: Initial latent vectors
|
| 452 |
+
output_type: Output format
|
| 453 |
+
callback_on_step_end: Callback function
|
| 454 |
+
callback_on_step_end_tensor_inputs: Callback inputs
|
| 455 |
+
return_dict: Return type flag
|
| 456 |
+
device: Computation device
|
| 457 |
+
visual_output: Visualization flag
|
| 458 |
+
|
| 459 |
+
Returns:
|
| 460 |
+
Generated video output
|
| 461 |
+
"""
|
| 462 |
+
pose_pixels = torch.cat([pose_pixels, hand_pixels], dim=1)
|
| 463 |
+
|
| 464 |
+
# 0. Default height and width to unet
|
| 465 |
+
height = height or self.unet.config.sample_size * self.vae_scale_factor
|
| 466 |
+
width = width or self.unet.config.sample_size * self.vae_scale_factor
|
| 467 |
+
|
| 468 |
+
num_frames = num_frames if num_frames is not None else self.unet.config.num_frames
|
| 469 |
+
decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else num_frames
|
| 470 |
+
|
| 471 |
+
# 1. Check inputs. Raise error if not correct
|
| 472 |
+
self.check_inputs(image_pixels, height, width)
|
| 473 |
+
|
| 474 |
+
# 2. Define call parameters
|
| 475 |
+
if isinstance(image_pixels, PIL.Image.Image):
|
| 476 |
+
batch_size = 1
|
| 477 |
+
elif isinstance(image_pixels, list):
|
| 478 |
+
batch_size = len(image_pixels)
|
| 479 |
+
else:
|
| 480 |
+
batch_size = image_pixels.shape[0]
|
| 481 |
+
device = device if device is not None else self._execution_device
|
| 482 |
+
|
| 483 |
+
|
| 484 |
+
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
| 485 |
+
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
|
| 486 |
+
# corresponds to doing no classifier free guidance.
|
| 487 |
+
self._guidance_scale = max_guidance_scale
|
| 488 |
+
|
| 489 |
+
# 3. Encode input image
|
| 490 |
+
self.image_encoder.to(device)
|
| 491 |
+
self.obj_image_encoder.to(device)
|
| 492 |
+
|
| 493 |
+
encoder_hidden_states, obj_embeddings = self._encode_image(image_pixels, obj_pixels, device, num_videos_per_prompt,
|
| 494 |
+
self.do_classifier_free_guidance)
|
| 495 |
+
obj_embeddings = obj_embeddings.to(encoder_hidden_states.dtype)
|
| 496 |
+
# self.image_encoder.cpu()
|
| 497 |
+
|
| 498 |
+
self.image_encoder.cpu()
|
| 499 |
+
self.obj_image_encoder.cpu()
|
| 500 |
+
# NOTE: Stable Video Diffusion was conditioned on fps - 1, which
|
| 501 |
+
# is why it is reduced here.
|
| 502 |
+
fps = fps - 1
|
| 503 |
+
|
| 504 |
+
# 4. Encode input image using VAE
|
| 505 |
+
image_pixels = self.image_processor.preprocess(image_pixels, height=height, width=width).to(device)
|
| 506 |
+
obj_image = pil_to_tensor(obj_pixels[1])
|
| 507 |
+
h_pad = (image_pixels.shape[-2] - obj_image.shape[-2]) // 2
|
| 508 |
+
w_pad = (image_pixels.shape[-1] - obj_image.shape[-1]) // 2
|
| 509 |
+
obj_image = F.pad(obj_image, (w_pad, w_pad, h_pad, h_pad), mode='constant', value=0)
|
| 510 |
+
print(f'obj_image before process: {obj_image.shape}')
|
| 511 |
+
obj_image = self.image_processor.preprocess(obj_image, height=height, width=width).to(device)
|
| 512 |
+
print(f'obj_image after process: {obj_image.shape}')
|
| 513 |
+
|
| 514 |
+
noise = randn_tensor(image_pixels.shape, generator=generator, device=device, dtype=image_pixels.dtype)
|
| 515 |
+
image_pixels = image_pixels + noise_aug_strength * noise
|
| 516 |
+
obj_image = obj_image + noise_aug_strength * noise
|
| 517 |
+
|
| 518 |
+
self.vae.to(device)
|
| 519 |
+
image_latents = self._encode_vae_image(
|
| 520 |
+
image_pixels,
|
| 521 |
+
device=device,
|
| 522 |
+
num_videos_per_prompt=num_videos_per_prompt,
|
| 523 |
+
do_classifier_free_guidance=self.do_classifier_free_guidance,
|
| 524 |
+
)
|
| 525 |
+
image_latents = image_latents.to(encoder_hidden_states.dtype)
|
| 526 |
+
obj_image_latents = self._encode_vae_image(
|
| 527 |
+
obj_image,
|
| 528 |
+
device=device,
|
| 529 |
+
num_videos_per_prompt=num_videos_per_prompt,
|
| 530 |
+
do_classifier_free_guidance=self.do_classifier_free_guidance,
|
| 531 |
+
)
|
| 532 |
+
obj_image_latents = obj_image_latents.to(encoder_hidden_states.dtype)
|
| 533 |
+
#print(f'image_latents: {image_latents}')
|
| 534 |
+
self.vae.cpu()
|
| 535 |
+
|
| 536 |
+
# Repeat the image latents for each frame so we can concatenate them with the noise
|
| 537 |
+
# image_latents [batch, channels, height, width] ->[batch, num_frames, channels, height, width]
|
| 538 |
+
image_latents = image_latents.unsqueeze(1).repeat(1, num_frames, 1, 1, 1)
|
| 539 |
+
obj_image_latents = obj_image_latents.unsqueeze(1).repeat(1, num_frames, 1, 1, 1)
|
| 540 |
+
# 5. Get Added Time IDs
|
| 541 |
+
added_time_ids = self._get_add_time_ids(
|
| 542 |
+
fps,
|
| 543 |
+
motion_bucket_id,
|
| 544 |
+
noise_aug_strength,
|
| 545 |
+
encoder_hidden_states.dtype,
|
| 546 |
+
batch_size,
|
| 547 |
+
num_videos_per_prompt,
|
| 548 |
+
self.do_classifier_free_guidance,
|
| 549 |
+
)
|
| 550 |
+
added_time_ids = added_time_ids.to(device)
|
| 551 |
+
|
| 552 |
+
# 4. Prepare timesteps
|
| 553 |
+
timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, None)
|
| 554 |
+
|
| 555 |
+
# 5. Prepare latent variables
|
| 556 |
+
# num_channels_latents = self.unet.config.in_channels
|
| 557 |
+
# print("latents",latents)
|
| 558 |
+
latents = self.prepare_latents(
|
| 559 |
+
batch_size * num_videos_per_prompt,
|
| 560 |
+
tile_size,
|
| 561 |
+
4,
|
| 562 |
+
height,
|
| 563 |
+
width,
|
| 564 |
+
encoder_hidden_states.dtype,
|
| 565 |
+
device,
|
| 566 |
+
generator,
|
| 567 |
+
latents,
|
| 568 |
+
)
|
| 569 |
+
latents = latents.repeat(1, num_frames // tile_size + 1, 1, 1, 1)[:, :num_frames]
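# Tiling sketch: the initial noise only covers tile_size frames; repeating it and slicing to
# num_frames (e.g. tile_size=16, num_frames=31 -> repeat twice, keep the first 31) lets the
# overlapping tiles in the denoising loop below start from shared noise and be averaged per frame.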
|
| 570 |
+
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
|
| 571 |
+
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, 0.0)
|
| 572 |
+
|
| 573 |
+
# 7. Prepare guidance scale
|
| 574 |
+
guidance_scale = torch.linspace(min_guidance_scale, max_guidance_scale, num_frames).unsqueeze(0)
|
| 575 |
+
guidance_scale = guidance_scale.to(device, latents.dtype)
|
| 576 |
+
guidance_scale = guidance_scale.repeat(batch_size * num_videos_per_prompt, 1)
|
| 577 |
+
guidance_scale = _append_dims(guidance_scale, latents.ndim)
|
| 578 |
+
|
| 579 |
+
self._guidance_scale = guidance_scale
|
| 580 |
+
|
| 581 |
+
# 8. Denoising loop
|
| 582 |
+
self._num_timesteps = len(timesteps)
|
| 583 |
+
|
| 584 |
+
self.pose_net.to(device)
|
| 585 |
+
self.track_net.to(device)
|
| 586 |
+
self.unet.to(device)
|
| 587 |
+
self.obj_proj_net.to(device)
|
| 588 |
+
self.obj_attn_net.to(device)
|
| 589 |
+
|
| 590 |
+
with torch.cuda.device(device):
|
| 591 |
+
torch.cuda.empty_cache()
|
| 592 |
+
|
| 593 |
+
obj_cls_emb = torch.cat([
|
| 594 |
+
obj_embeddings[:, 0, :], obj_embeddings[:, 1370, :], obj_embeddings[:, 1370*2, :]
|
| 595 |
+
], dim=1).to(torch.float16)
|
| 596 |
+
obj_cls_embeddings = self.obj_proj_net(obj_cls_emb)
|
| 597 |
+
obj_embeddings = obj_embeddings.to(torch.device('cuda'))
|
| 598 |
+
obj_attn_embeddings = self.obj_attn_net(obj_embeddings)
|
| 599 |
+
encoder_hidden_states = torch.concat([
|
| 600 |
+
encoder_hidden_states, obj_cls_embeddings, obj_attn_embeddings
|
| 601 |
+
], dim=1)
|
| 602 |
+
|
| 603 |
+
if self.do_classifier_free_guidance:
|
| 604 |
+
negative_image_embeddings = torch.zeros_like(encoder_hidden_states)
|
| 605 |
+
|
| 606 |
+
# For classifier free guidance, we need to do two forward passes.
|
| 607 |
+
# Here we concatenate the unconditional and text embeddings into a single batch
|
| 608 |
+
# to avoid doing two forward passes
|
| 609 |
+
encoder_hidden_states = torch.cat([negative_image_embeddings, encoder_hidden_states])
|
| 610 |
+
|
| 611 |
+
def hook_function(module, inputdata, output):
|
| 612 |
+
if isinstance(output, tuple):
|
| 613 |
+
print(f"Module name: {module.__class__.__name__} Output shape: {output}")
|
| 614 |
+
else:
|
| 615 |
+
print(f"Module name: {module.__class__.__name__} Output shape: {output.shape}")
|
| 616 |
+
print("Output stats - mean: {}, std: {}, min: {}, max: {}".format(output.mean().item(), output.std().item(),
|
| 617 |
+
output.min().item(), output.max().item()))
|
| 618 |
+
if torch.isnan(output).any():
|
| 619 |
+
print(f"!!!!!!!!!!!!!!!!!!!!NaN detected after layer: {module.__class__.__name__}!!!!!!!!!!!!!!!!!!!!")
|
| 620 |
+
hooks = []
|
| 621 |
+
def register_hooks():
|
| 622 |
+
for name, module in self.unet.named_modules():
|
| 623 |
+
if isinstance(module, nn.Module):
|
| 624 |
+
hooks.append(module.register_forward_hook(hook_function))
|
| 625 |
+
|
| 626 |
+
bias_start = 1
|
| 627 |
+
bias_step = 4
|
| 628 |
+
with self.progress_bar(total=len(timesteps) * math.ceil((num_frames - 1) / (tile_size - 1))) as progress_bar:
|
| 629 |
+
for i, t in enumerate(timesteps):
|
| 630 |
+
# expand the latents if we are doing classifier free guidance
|
| 631 |
+
latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
|
| 632 |
+
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
| 633 |
+
# Concatenate image_latents over channels dimension
|
| 634 |
+
print(f"{latent_model_input.shape} {image_latents.shape} {obj_image_latents.shape}")
|
| 635 |
+
latent_model_input = torch.cat([latent_model_input, image_latents, obj_image_latents], dim=2)
|
| 636 |
+
|
| 637 |
+
# predict the noise residual
|
| 638 |
+
noise_pred = torch.zeros_like(image_latents)
|
| 639 |
+
noise_pred_cnt = torch.zeros_like(image_latents)
|
| 640 |
+
weight = torch.ones_like(image_latents)
|
| 641 |
+
|
| 642 |
+
bias_start = (bias_start - 1) % (num_frames - 1) + 1
|
| 643 |
+
start_cur = bias_start
|
| 644 |
+
finished_len = 1
|
| 645 |
+
print(f'start_cur {start_cur}')
|
| 646 |
+
while finished_len < num_frames:
|
| 647 |
+
start_cur = (start_cur - 1) % (num_frames - 1) + 1
|
| 648 |
+
end_cur = start_cur + tile_size - 1
|
| 649 |
+
|
| 650 |
+
idx = [0, ]
|
| 651 |
+
idx.extend([(ii - 1) % (num_frames - 1) + 1 for ii in range(start_cur, end_cur)])
|
| 652 |
+
print(idx)
|
| 653 |
+
# classification-free inference
|
| 654 |
+
pose_latents = self.pose_net(pose_pixels[idx].to(dtype=torch.float16).to(device))
|
| 655 |
+
|
| 656 |
+
track_latents = self.track_net(obj_track_pixels[idx].to(dtype=torch.float16).to(device))
|
| 657 |
+
|
| 658 |
+
if visual_output:
|
| 659 |
+
os.makedirs('./visual_spatio_attn', exist_ok=True)
|
| 660 |
+
for name, module in self.unet.named_modules():
|
| 661 |
+
if '.transformer_blocks.' in name and name.endswith('.attn2'):
|
| 662 |
+
module.visual_path = None
|
| 663 |
+
|
| 664 |
+
latent_model_input=latent_model_input.to(dtype=torch.float16)
|
| 665 |
+
encoder_hidden_states=encoder_hidden_states.to(dtype=torch.float16)
|
| 666 |
+
t=t.to(dtype=torch.float16)
|
| 667 |
+
_noise_pred = self.unet(
|
| 668 |
+
latent_model_input[:1, idx],
|
| 669 |
+
t,
|
| 670 |
+
encoder_hidden_states=encoder_hidden_states[:1],
|
| 671 |
+
added_time_ids=added_time_ids[:1],
|
| 672 |
+
pose_latents=None,
|
| 673 |
+
image_only_indicator=image_only_indicator,
|
| 674 |
+
return_dict=False,
|
| 675 |
+
obj_track_latents=None,
|
| 676 |
+
)[0]
|
| 677 |
+
noise_pred[:1, idx] += _noise_pred
|
| 678 |
+
|
| 679 |
+
# normal inference
|
| 680 |
+
|
| 681 |
+
if visual_output:
|
| 682 |
+
os.makedirs('./visual_spatio_attn', exist_ok=True)
|
| 683 |
+
for name, module in self.unet.named_modules():
|
| 684 |
+
if '.transformer_blocks.' in name and name.endswith('.attn2'):
|
| 685 |
+
module.visual_path = os.path.join('./visual_spatio_attn', name[:-6] + '.png')
|
| 686 |
+
|
| 687 |
+
_noise_pred = self.unet(
|
| 688 |
+
latent_model_input[1:, idx],
|
| 689 |
+
t,
|
| 690 |
+
encoder_hidden_states=encoder_hidden_states[1:],
|
| 691 |
+
added_time_ids=added_time_ids[1:],
|
| 692 |
+
pose_latents=pose_latents,
|
| 693 |
+
image_only_indicator=image_only_indicator,
|
| 694 |
+
return_dict=False,
|
| 695 |
+
obj_track_latents= track_latents,
|
| 696 |
+
)[0]
|
| 697 |
+
noise_pred[1:, idx] += _noise_pred
|
| 698 |
+
|
| 699 |
+
noise_pred_cnt[:, idx] += weight[:, idx]
|
| 700 |
+
finished_len += tile_size - 1
|
| 701 |
+
start_cur += tile_size - 1
|
| 702 |
+
progress_bar.update()
|
| 703 |
+
|
| 704 |
+
bias_start += bias_step
|
| 705 |
+
noise_pred = noise_pred.div_(noise_pred_cnt)
|
| 706 |
+
|
| 707 |
+
# perform guidance
|
| 708 |
+
if self.do_classifier_free_guidance:
|
| 709 |
+
noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
|
| 710 |
+
noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)
|
| 711 |
+
|
| 712 |
+
# compute the previous noisy sample x_t -> x_t-1
|
| 713 |
+
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
|
| 714 |
+
|
| 715 |
+
if callback_on_step_end is not None:
|
| 716 |
+
callback_kwargs = {}
|
| 717 |
+
for k in callback_on_step_end_tensor_inputs:
|
| 718 |
+
callback_kwargs[k] = locals()[k]
|
| 719 |
+
callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
|
| 720 |
+
|
| 721 |
+
latents = callback_outputs.pop("latents", latents)
|
| 722 |
+
|
| 723 |
+
self.pose_net.cpu()
|
| 724 |
+
self.unet.cpu()
|
| 725 |
+
self.track_net.cpu()
|
| 726 |
+
self.obj_proj_net.cpu()
|
| 727 |
+
if not output_type == "latent":
|
| 728 |
+
self.vae.decoder.to(device)
|
| 729 |
+
frames = self.decode_latents(latents, num_frames, decode_chunk_size)
|
| 730 |
+
frames = tensor2vid(frames, self.image_processor, output_type=output_type)
|
| 731 |
+
else:
|
| 732 |
+
frames = latents
|
| 733 |
+
|
| 734 |
+
self.maybe_free_model_hooks()
|
| 735 |
+
|
| 736 |
+
if not return_dict:
|
| 737 |
+
return frames
|
| 738 |
+
|
| 739 |
+
return AnchorCrafterPipelineOutput(frames=frames)
|
anchorcrafter/utils/__init__.py
ADDED
|
File without changes
|
anchorcrafter/utils/geglu_patch.py
ADDED
|
@@ -0,0 +1,10 @@
+import diffusers.models.activations
+
+
+def patch_geglu_inplace():
+    """Patch GEGLU with inplace multiplication to save GPU memory."""
+    def forward(self, hidden_states):
+        hidden_states, gate = self.proj(hidden_states).chunk(2, dim=-1)
+        hidden_states = hidden_states.clone()
+        return hidden_states.mul_(self.gelu(gate))
+    diffusers.models.activations.GEGLU.forward = forward
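A minimal usage sketch (the call site is an assumption, not an entry point shown in this commit): the patch only needs to run once, before any UNet forward pass.

from anchorcrafter.utils.geglu_patch import patch_geglu_inplace

patch_geglu_inplace()  # monkey-patches diffusers.models.activations.GEGLU.forward process-wide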
anchorcrafter/utils/loader.py
ADDED
|
@@ -0,0 +1,45 @@
+import logging
+
+import torch
+import torch.utils.checkpoint
+from diffusers.models import AutoencoderKLTemporalDecoder
+from diffusers.schedulers import EulerDiscreteScheduler
+from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
+from transformers import AutoImageProcessor, AutoModel
+
+from ..modules.unet import UNetSpatioTemporalConditionModel
+from ..modules.track_net import TrackNet
+from ..modules.obj_proj_net import ObjProjNet
+from ..modules.obj_attn_net import ObjAttnNet
+from ..modules.pose_net import PoseNet
+
+logger = logging.getLogger(__name__)
+
+
+class AnchorCrafter(torch.nn.Module):
+    def __init__(self, base_model_path, dino_path):
+        """Construct the base model components and load the pretrained SVD model (except the newly added pose/track/object nets).
+        Args:
+            base_model_path (str): pretrained SVD model path
+            dino_path (str): pretrained DINOv2 model path
+        """
+        super().__init__()
+        unet_config = UNetSpatioTemporalConditionModel.load_config(base_model_path, subfolder="unet")
+        unet_config["in_channels"] = 12
+        self.unet = UNetSpatioTemporalConditionModel.from_config(unet_config).to(torch.float16)
+        self.vae = AutoencoderKLTemporalDecoder.from_pretrained(
+            base_model_path, subfolder="vae", torch_dtype=torch.float16, variant="fp16")
+        self.obj_image_encoder = AutoModel.from_pretrained(dino_path).to(torch.float16)
+        self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+            base_model_path, subfolder="image_encoder", torch_dtype=torch.float16, variant="fp16")
+        self.noise_scheduler = EulerDiscreteScheduler.from_pretrained(
+            base_model_path, subfolder="scheduler")
+        self.feature_extractor = CLIPImageProcessor.from_pretrained(
+            base_model_path, subfolder="feature_extractor")
+        self.dino_feature_extractor = AutoImageProcessor.from_pretrained(dino_path)
+
+        # pose_net
+        self.pose_net = PoseNet(latent_channels=self.unet.config.block_out_channels[0]).to(dtype=torch.float16)
+        # track_net
+        self.track_net = TrackNet(latent_channels=self.unet.config.block_out_channels[0]).to(dtype=torch.float16)
+        self.obj_proj_net = ObjProjNet(context_tokens=3).to(dtype=torch.float16)
+        self.obj_attn_net = ObjAttnNet().to(dtype=torch.float16)
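A hedged construction sketch (paths taken from config/test.yaml, dtype handling mirroring app.py). The UNet is rebuilt with in_channels=12 because the pipeline concatenates noise latents, reference-image latents, and object-image latents (4 + 4 + 4 channels) before each denoising step.

import torch
from anchorcrafter.utils.loader import AnchorCrafter

models = AnchorCrafter(
    base_model_path="stabilityai/stable-video-diffusion-img2vid-xt",
    dino_path="facebook/dinov2-large",
).to(torch.float16)
# The new pose/track/object modules are randomly initialized here; app.py later loads the
# released checkpoint with load_state_dict(strict=False).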
anchorcrafter/utils/utils.py
ADDED
|
@@ -0,0 +1,51 @@
+import gradio as gr
+import numpy as np
+import cv2
+import tempfile
+import torch
+from typing import List, Union
+
+
+def save_video_with_cv2(frames: Union[torch.Tensor, List[np.ndarray]], output_path: str, fps: int = 24):
+    """Save video using OpenCV (supports PyTorch tensors or numpy arrays input)"""
+    if isinstance(frames, torch.Tensor):
+        frames = frames.detach().cpu().numpy()
+
+    # Ensure data is uint8 type in 0-255 range
+    processed_frames = []
+    for frame in frames:
+        # Convert float types (assuming 0-1 range) to 0-255
+        if frame.dtype == np.float32 or frame.dtype == np.float64:
+            frame = (frame * 255).clip(0, 255).astype(np.uint8)
+        elif frame.dtype != np.uint8:
+            frame = frame.astype(np.uint8)
+
+        # Convert color channel order to BGR (OpenCV requirement)
+        if frame.ndim == 3 and frame.shape[2] == 3:  # If RGB format
+            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
+
+        processed_frames.append(frame)
+
+    if not processed_frames:
+        raise ValueError("No valid video frames to save")
+
+    height, width = processed_frames[0].shape[:2]
+    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+    writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
+
+    try:
+        for frame in processed_frames:
+            writer.write(frame)
+    finally:
+        writer.release()
+
+
+def save_to_mp4(frames: Union[torch.Tensor, List[np.ndarray]], fps: int = 7) -> str:
+    """Save to MP4 and return temporary file path"""
+    # Adjust dimensions if input is PyTorch tensor (f, c, h, w) -> (f, h, w, c)
+    if isinstance(frames, torch.Tensor):
+        frames = frames.permute(0, 2, 3, 1)
+
+    temp_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
+    save_video_with_cv2(frames, temp_path, fps)
+    return temp_path
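A short usage sketch for the helpers above (the frame tensor shape is an illustrative assumption in the (f, c, h, w) layout the docstring mentions):

import torch
from anchorcrafter.utils.utils import save_to_mp4

frames = torch.rand(28, 3, 1024, 576)       # 28 RGB frames, values in [0, 1]
output_path = save_to_mp4(frames, fps=11)   # fps value taken from config/test.yaml
print(output_path)                          # path of the temporary .mp4 written with OpenCV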
app.py
ADDED
|
@@ -0,0 +1,332 @@
| 1 |
+
import logging
|
| 2 |
+
import spaces
|
| 3 |
+
import os
|
| 4 |
+
import gradio as gr
|
| 5 |
+
import numpy as np
|
| 6 |
+
from PIL import Image
|
| 7 |
+
from inference import process_inputs,run_pipeline
|
| 8 |
+
|
| 9 |
+
import torch
|
| 10 |
+
from omegaconf import OmegaConf
|
| 11 |
+
from anchorcrafter.utils.utils import save_to_mp4
|
| 12 |
+
from threading import Thread
|
| 13 |
+
from anchorcrafter.utils.loader import AnchorCrafter
|
| 14 |
+
from huggingface_hub import hf_hub_download
|
| 15 |
+
from diffusers.utils.import_utils import is_xformers_available
|
| 16 |
+
from diffusers.models.attention_processor import XFormersAttnProcessor
|
| 17 |
+
from anchorcrafter.modules.attention_processor import IPAttnProcessor
|
| 18 |
+
from anchorcrafter.pipelines.pipeline import AnchorCrafterPipeline
|
| 19 |
+
from packaging import version
|
| 20 |
+
import logging
|
| 21 |
+
logger = logging.getLogger(__name__)
|
| 22 |
+
css='''
|
| 23 |
+
.text-container {
|
| 24 |
+
background-color: #f0faff;
|
| 25 |
+
border: 1px solid #b3d8ff;
|
| 26 |
+
border-radius: 6px;
|
| 27 |
+
padding: 5px;
|
| 28 |
+
margin: 5px auto;
|
| 29 |
+
width: fit-content;
|
| 30 |
+
box-shadow: 2px 2px 6px rgba(0, 0, 0, 0.1);
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
.text-container h2 {
|
| 34 |
+
font-family: Arial, sans-serif;
|
| 35 |
+
color: #000000;
|
| 36 |
+
font-size: 18px;
|
| 37 |
+
font-weight: bold;
|
| 38 |
+
margin-bottom: 5px;
|
| 39 |
+
margin-top: 5px;
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
.text-container p {
|
| 43 |
+
font-family: Arial, sans-serif;
|
| 44 |
+
color: #444444;
|
| 45 |
+
font-size: 18px;
|
| 46 |
+
line-height: 1.5;
|
| 47 |
+
margin-top: 5px;
|
| 48 |
+
}
|
| 49 |
+
'''
|
| 50 |
+
global pipeline, infer_config, model_path, anchorcrafter_models
|
| 51 |
+
# Path mappings
|
| 52 |
+
IMAGE_VIDEO_MAP = {
|
| 53 |
+
0: ["data/video/hmbb_1.mp4", "data/video/hmbb_2.mp4"],
|
| 54 |
+
1: ["data/video/cheese_1.mp4", "data/video/cheese_2.mp4"],
|
| 55 |
+
2: ["data/video/earphone_1.mp4", "data/video/earphone_2.mp4"],
|
| 56 |
+
3: ["data/video/mouse_1.mp4", "data/video/mouse_2.mp4"],
|
| 57 |
+
4: ["data/video/cup_1.mp4", "data/video/cup_2.mp4"],}
|
| 58 |
+
|
| 59 |
+
OBJECT_INDEX_MAP ={
|
| 60 |
+
"hmbb":0,"cheese":1,"earphone":2,"mouse":3,"cup":4
|
| 61 |
+
}
|
| 62 |
+
OUTPUT_PATH_MAP={
|
| 63 |
+
"hmbb":"data/out/hmbb.mp4","earphone":"data/out/earphone.mp4","cup":"data/out/cup.mp4","mouse":"data/out/mouse.mp4","cheese":"data/out/cheese.mp4"
|
| 64 |
+
}
|
| 65 |
+
POSE_TRACK_MAP = {
|
| 66 |
+
0: [["data/depth_cut/hmbb_1.mp4", "data/hand_cut/hmbb_1.mp4"],
|
| 67 |
+
["data/depth_cut/hmbb_2.mp4", "data/hand_cut/hmbb_2.mp4"]],
|
| 68 |
+
1: [["data/depth_cut/cheese_1.mp4", "data/hand_cut/cheese_1.mp4"],
|
| 69 |
+
["data/depth_cut/cheese_2.mp4", "data/hand_cut/cheese_2.mp4"]],
|
| 70 |
+
2: [["data/depth_cut/earphone_1.mp4", "data/hand_cut/earphone_1.mp4"],
|
| 71 |
+
["data/depth_cut/earphone_2.mp4", "data/hand_cut/earphone_2.mp4"]],
|
| 72 |
+
3: [["data/depth_cut/mouse_1.mp4", "data/hand_cut/mouse_1.mp4"],
|
| 73 |
+
["data/depth_cut/mouse_2.mp4", "data/hand_cut/mouse_2.mp4"]],
|
| 74 |
+
4: [["data/depth_cut/cup_1.mp4", "data/hand_cut/cup_1.mp4"],
|
| 75 |
+
["data/depth_cut/cup_2.mp4", "data/hand_cut/cup_2.mp4"]]}
|
| 76 |
+
|
| 77 |
+
EXAMPLE_IMAGES = [
|
| 78 |
+
"data/object/hmbb_1.jpg",
|
| 79 |
+
"data/object/cheese_1.jpg",
|
| 80 |
+
"data/object/earphone_1.jpg",
|
| 81 |
+
"data/object/mouse_1.jpg",
|
| 82 |
+
"data/object/cup_1.jpg",
|
| 83 |
+
]
|
| 84 |
+
|
| 85 |
+
def update_video_choices(evt: gr.SelectData, selected_state):
|
| 86 |
+
"""Update video choices based on gallery selection"""
|
| 87 |
+
selected_state = evt.index
|
| 88 |
+
video1, video2 = IMAGE_VIDEO_MAP[selected_state]
|
| 89 |
+
return (
|
| 90 |
+
gr.update(value=video1, visible=True),
|
| 91 |
+
gr.update(value=video2, visible=True),
|
| 92 |
+
selected_state
|
| 93 |
+
)
|
| 94 |
+
|
| 95 |
+
def clear_anchor():
|
| 96 |
+
"""Clear anchor image input"""
|
| 97 |
+
return gr.update(value=None)
|
| 98 |
+
|
| 99 |
+
def select_button1(selected_state, video_state):
|
| 100 |
+
"""Handle first video selection"""
|
| 101 |
+
return (
|
| 102 |
+
gr.update(variant="primary"),
|
| 103 |
+
gr.update(variant="secondary"),
|
| 104 |
+
0,
|
| 105 |
+
selected_state
|
| 106 |
+
)
|
| 107 |
+
|
| 108 |
+
def select_button2(selected_state, video_state):
|
| 109 |
+
"""Handle second video selection"""
|
| 110 |
+
return (
|
| 111 |
+
gr.update(variant="secondary"),
|
| 112 |
+
gr.update(variant="primary"),
|
| 113 |
+
1,
|
| 114 |
+
selected_state
|
| 115 |
+
)
|
| 116 |
+
|
| 117 |
+
def load_model():
|
| 118 |
+
"""Initialize model components"""
|
| 119 |
+
global pipeline, infer_config, model_path, anchorcrafter_models
|
| 120 |
+
infer_config = OmegaConf.load("config/test.yaml")
|
| 121 |
+
anchorcrafter_models = AnchorCrafter(infer_config.base_model_path, infer_config.dino_path)
|
| 122 |
+
# Download model weights
|
| 123 |
+
model_path = hf_hub_download(
|
| 124 |
+
repo_id=infer_config.anchorcrafter_path,
|
| 125 |
+
filename="pytorch_model.bin"
|
| 126 |
+
)
|
| 127 |
+
|
| 128 |
+
@spaces.GPU
|
| 129 |
+
def run(infer_config,image_pixels, pose_pixels, obj_pixels, obj_track_pixels,hand_pixels):
|
| 130 |
+
"""Execute the generation pipeline"""
|
| 131 |
+
global anchorcrafter_models
|
| 132 |
+
device=torch.device('cuda')
|
| 133 |
+
if is_xformers_available():
|
| 134 |
+
import xformers
|
| 135 |
+
xformers_version = version.parse(xformers.__version__)
|
| 136 |
+
if xformers_version == version.parse("0.0.16"):
|
| 137 |
+
logger.warn(
|
| 138 |
+
"xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
|
| 139 |
+
)
|
| 140 |
+
anchorcrafter_models.unet.enable_xformers_memory_efficient_attention()
|
| 141 |
+
else:
|
| 142 |
+
raise ValueError(
|
| 143 |
+
"xformers is not available. Make sure it is installed correctly")
|
| 144 |
+
|
| 145 |
+
# Configure attention processors
|
| 146 |
+
attn_procs = {}
|
| 147 |
+
for name in anchorcrafter_models.unet.attn_processors.keys():
|
| 148 |
+
cross_attention_dim = None if name.endswith(
|
| 149 |
+
"attn1.processor") else anchorcrafter_models.unet.config.cross_attention_dim
|
| 150 |
+
hidden_size = None
|
| 151 |
+
if name.startswith("mid_block"):
|
| 152 |
+
hidden_size = anchorcrafter_models.unet.config.block_out_channels[-1]
|
| 153 |
+
elif name.startswith("up_blocks"):
|
| 154 |
+
block_id = int(name[len("up_blocks.")])
|
| 155 |
+
hidden_size = list(reversed(anchorcrafter_models.unet.config.block_out_channels))[block_id]
|
| 156 |
+
elif name.startswith("down_blocks"):
|
| 157 |
+
block_id = int(name[len("down_blocks.")])
|
| 158 |
+
hidden_size = anchorcrafter_models.unet.config.block_out_channels[block_id]
|
| 159 |
+
if cross_attention_dim is None:
|
| 160 |
+
attn_procs[name] = XFormersAttnProcessor()
|
| 161 |
+
else:
|
| 162 |
+
attn_procs[name] = IPAttnProcessor(
|
| 163 |
+
hidden_size=hidden_size,
|
| 164 |
+
cross_attention_dim=cross_attention_dim,
|
| 165 |
+
scale=1.0,
|
| 166 |
+
num_tokens=15
|
| 167 |
+
)
|
| 168 |
+
anchorcrafter_models.unet.set_attn_processor(attn_procs)
|
| 169 |
+
anchorcrafter_models=anchorcrafter_models.to(torch.float16)
|
| 170 |
+
# Load model weights
|
| 171 |
+
model_weights = torch.load(model_path)
|
| 172 |
+
missing, unexpected = anchorcrafter_models.load_state_dict(model_weights, strict=False)
|
| 173 |
+
logger.info(f"Missing keys: {len(missing)}, Unexpected keys: {len(unexpected)}")
|
| 174 |
+
# Initialize pipeline
|
| 175 |
+
pipeline = AnchorCrafterPipeline(
|
| 176 |
+
vae=anchorcrafter_models.vae,
|
| 177 |
+
image_encoder=anchorcrafter_models.image_encoder,
|
| 178 |
+
obj_image_encoder=anchorcrafter_models.obj_image_encoder,
|
| 179 |
+
unet=anchorcrafter_models.unet,
|
| 180 |
+
scheduler=anchorcrafter_models.noise_scheduler,
|
| 181 |
+
feature_extractor=anchorcrafter_models.feature_extractor,
|
| 182 |
+
dino_feature_extractor=anchorcrafter_models.dino_feature_extractor,
|
| 183 |
+
pose_net=anchorcrafter_models.pose_net,
|
| 184 |
+
track_net=anchorcrafter_models.track_net,
|
| 185 |
+
obj_proj_net=anchorcrafter_models.obj_proj_net,
|
| 186 |
+
obj_attn_net=anchorcrafter_models.obj_attn_net
|
| 187 |
+
)
|
| 188 |
+
|
| 189 |
+
for task in infer_config.test_case:
|
| 190 |
+
_video_frames = run_pipeline(
|
| 191 |
+
pipeline,
|
| 192 |
+
image_pixels, pose_pixels, obj_pixels, obj_track_pixels,
|
| 193 |
+
hand_pixels=hand_pixels, total_frames=infer_config.total_frames,
|
| 194 |
+
device=device, task_config=task)
|
| 195 |
+
|
| 196 |
+
return _video_frames
|
| 197 |
+
|
| 198 |
+
def pre(selected_state, video_state, anchor_image):
|
| 199 |
+
"""Process user inputs and generate video"""
|
| 200 |
+
if anchor_image is None:
|
| 201 |
+
raise gr.Error("Please upload an anchor image first!")
|
| 202 |
+
# Convert PIL Image to numpy array
|
| 203 |
+
if isinstance(anchor_image, Image.Image):
|
| 204 |
+
anchor_image = np.array(anchor_image)
|
| 205 |
+
logger.debug(f"Converted image shape: {anchor_image.shape}")
|
| 206 |
+
|
| 207 |
+
# Get resource paths
|
| 208 |
+
video_path = IMAGE_VIDEO_MAP[selected_state][video_state]
|
| 209 |
+
obj_path = EXAMPLE_IMAGES[selected_state]
|
| 210 |
+
obj_track_path = POSE_TRACK_MAP[selected_state][video_state][0]
|
| 211 |
+
hand_path = POSE_TRACK_MAP[selected_state][video_state][1]
|
| 212 |
+
|
| 213 |
+
# Preprocess inputs
|
| 214 |
+
pose_pixels, image_pixels, obj_pixels, obj_track_pixels, hand_pixels = process_inputs(
|
| 215 |
+
video_path=video_path,
|
| 216 |
+
image_pixels=anchor_image,
|
| 217 |
+
obj_path=obj_path,
|
| 218 |
+
obj_track_path=obj_track_path,
|
| 219 |
+
hand_path=hand_path,
|
| 220 |
+
total_frames=infer_config.total_frames,
|
| 221 |
+
)
|
| 222 |
+
# Generate video
|
| 223 |
+
_video_frames = run(infer_config, image_pixels, pose_pixels, obj_pixels, obj_track_pixels, hand_pixels)
|
| 224 |
+
temp_path = save_to_mp4(_video_frames, fps=infer_config.fps)
|
| 225 |
+
return temp_path
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
def find_file_index(target_file):
|
| 229 |
+
for category_id, group in IMAGE_VIDEO_MAP.items():
|
| 230 |
+
for file_idx, file_path in enumerate(group):
|
| 231 |
+
if target_file in file_path:
|
| 232 |
+
return category_id, file_idx
|
| 233 |
+
return None
|
| 234 |
+
|
+
+def exam_result(anchor, object_exam, video_exam):
+    logging.info("Function entered")
+
+    filename = os.path.splitext(os.path.basename(video_exam))[0]  # e.g. "hmbb_2"
+    prefix = filename.rsplit("_", 1)[0]  # "hmbb"
+
+    file_idx = int(filename.split("_")[-1]) - 1  # 1 for "hmbb_2" (zero-based)
+    selected_state = OBJECT_INDEX_MAP[prefix]
+
+    video1 = IMAGE_VIDEO_MAP[selected_state][0]
+    video2 = IMAGE_VIDEO_MAP[selected_state][1]
+    out = OUTPUT_PATH_MAP[prefix]
+
+    return (
+        gr.update(value=video1),  # video_preview1
+        gr.update(value=video2),  # video_preview2
+        gr.update(variant="primary" if file_idx == 0 else "secondary"),  # btn1
+        gr.update(variant="secondary" if file_idx == 0 else "primary"),  # btn2
+        selected_state,
+        file_idx,
+        out
+    )
+
+# Create Gradio interface
+with gr.Blocks(title="AnchorCrafter", theme=gr.themes.Soft(), css=css) as demo:
+    selected_state = gr.State(0)
+    video_state = gr.State(0)
+    gr.Markdown("# AnchorCrafter: Animate Cyber-Anchors Selling Your Products via Human-Object Interacting Video Generation")
+    top_description = gr.HTML(f'''
+        <div class="text-container">
+            <h2>To reduce inference time, we set the generated video to 28 frames, which takes approximately 5 minutes on an Nvidia L4.</h2>
+            <p>If you need longer videos, please duplicate or download this Space, run it on a private GPU, and modify the config/test.yaml file accordingly.</p>
+        </div>
+
+    ''', elem_id="top_description")
+    with gr.Row():
+        with gr.Column(scale=2):
+            gr.Markdown("## 1. Choose Object")
+            gallery = gr.Gallery(value=EXAMPLE_IMAGES, label="objects", columns=3, height=320, object_fit="contain")
+            gr.Markdown("## 3. Anchor Image")
+            anchor = gr.Image(label="anchor", image_mode="RGB", height=380, width=250, sources="upload")
+            with gr.Row():
+                clear_btn3 = gr.Button("🧹 Clear")
+                run_btn4 = gr.Button("🚀 Run")
+        with gr.Column(scale=3):
+            gr.Markdown("## 2. Control Video")
+            with gr.Row():
+                video_preview1 = gr.Video(label="video 1", height=260)
+                video_preview2 = gr.Video(label="video 2", height=260)
+            with gr.Row():
+                btn1 = gr.Button("choose video 1", variant="secondary")
+                btn2 = gr.Button("choose video 2", variant="secondary")
+            gr.Markdown("## 4. Results")
+            video_display = gr.Video(label="results", height=380)
+    video_exam = gr.Video(label="Control Video", visible=False)
+    object_exam = gr.Image(label="Object", visible=False)
+    examples = gr.Examples(
+        examples=[
+            ["data/anchor/1.jpg", "data/object/hmbb_1.jpg", "data/video/hmbb_2.mp4"],
+            ["data/anchor/2.jpg", "data/object/earphone_1.jpg", "data/video/earphone_1.mp4"],
+            ["data/anchor/3.jpg", "data/object/cup_1.jpg", "data/video/cup_2.mp4"],
+            ["data/anchor/4.jpg", "data/object/mouse_1.jpg", "data/video/mouse_1.mp4"],
+            ["data/anchor/5.jpg", "data/object/cheese_1.jpg", "data/video/cheese_2.mp4"],
+        ],
+        fn=exam_result,
+        run_on_click=True,
+        cache_examples=False,
+        inputs=[anchor, object_exam, video_exam],
+        outputs=[video_preview1, video_preview2, btn1, btn2, selected_state, video_state, video_display])
+
+    gallery.select(
+        update_video_choices,
+        inputs=[selected_state],
+        outputs=[video_preview1, video_preview2, selected_state]
+    )
+
+    btn1.click(
+        select_button1,
+        inputs=[selected_state, video_state],
+        outputs=[btn1, btn2, video_state, selected_state]
+    )
+
+    btn2.click(
+        select_button2,
+        inputs=[selected_state, video_state],
+        outputs=[btn1, btn2, video_state, selected_state]
+    )
+
+    clear_btn3.click(clear_anchor, outputs=[anchor])
+    run_btn4.click(
+        pre,
+        inputs=[selected_state, video_state, anchor],
+        outputs=[video_display]
+    )
+
+# Initialize the model in the background
+Thread(target=load_model, daemon=True).start()
+
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)
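Note on the filename convention used by exam_result above: the example video path encodes both the object type and which of the two control videos is preselected. A minimal sketch of the parsing, using a hypothetical OBJECT_INDEX_MAP (the real maps are defined earlier in app.py and are not shown here):

    import os

    # Hypothetical stand-in for the mapping defined earlier in app.py.
    OBJECT_INDEX_MAP = {"hmbb": 0, "earphone": 1, "cup": 2, "mouse": 3, "cheese": 4}

    video_exam = "data/video/hmbb_2.mp4"
    filename = os.path.splitext(os.path.basename(video_exam))[0]  # "hmbb_2"
    prefix = filename.rsplit("_", 1)[0]                           # "hmbb"  -> object type
    file_idx = int(filename.split("_")[-1]) - 1                   # 1       -> second control video
    selected_state = OBJECT_INDEX_MAP[prefix]                     # 0       -> gallery index (per the stand-in map)
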
config/test.yaml
ADDED
@@ -0,0 +1,17 @@
+# base svd model path
+base_model_path: stabilityai/stable-video-diffusion-img2vid-xt
+dino_path: facebook/dinov2-large
+anchorcrafter_path: cangcz/test
+fps: 11
+total_frames: 28  # the final length of the generated video
+test_case:
+  - num_frames: 15
+    resolution: 576
+    frames_overlap: 5
+    num_inference_steps: 30
+    noise_aug_strength: 0
+    guidance_scale: 4.0
+    sample_stride: 4
+    seed: 42
+
+
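The test_case values presumably feed an overlapping-window generation loop: each pass denoises num_frames frames and shares frames_overlap frames with the previous pass. A rough sanity check of how many passes cover total_frames is sketched below; OmegaConf and the window formula are assumptions, and the actual logic lives in anchorcrafter/pipelines/pipeline.py:

    import math
    from omegaconf import OmegaConf  # assumption: any YAML loader would do

    cfg = OmegaConf.load("config/test.yaml")
    case = cfg.test_case[0]

    # Each new window contributes num_frames - frames_overlap fresh frames.
    stride = case.num_frames - case.frames_overlap                           # 15 - 5 = 10
    windows = math.ceil((cfg.total_frames - case.num_frames) / stride) + 1   # 3 windows for 28 frames
    print(windows)
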
constants.py
ADDED
@@ -0,0 +1,4 @@
+# w/h aspect ratio
+# ASPECT_RATIO = 2 / 3  # 512*768
+ASPECT_RATIO = 9 / 16  # 576*1024
+
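ASPECT_RATIO is the width/height ratio of the generated frames; the sizes in the comments can be checked directly. A sketch only (how app.py actually derives the frame size from this constant is not shown in this file):

    ASPECT_RATIO = 9 / 16                      # w/h
    height = 1024
    width = round(height * ASPECT_RATIO)       # round() avoids float truncation
    assert (width, height) == (576, 1024)

    # The commented-out alternative setting:
    assert round(768 * (2 / 3)) == 512         # 512*768
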
data/anchor/1.jpg
ADDED

data/anchor/2.jpg
ADDED

data/anchor/3.jpg
ADDED
Git LFS Details

data/anchor/4.jpg
ADDED
Git LFS Details

data/anchor/5.jpg
ADDED
Git LFS Details

data/depth_cut/cheese_1.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ef63934b250bbb51d83a241ddc8cd03d2f3dd61b06a53cf5c3eec7d28bfea393
+size 1832420

data/depth_cut/cheese_2.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:173c9cea9609806de733c000405d8380cb99dcf8ca0c0b25726ae308d81bd8e7
+size 1718113

data/depth_cut/cup_1.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ad2e78eb58bcc21e6ee7c74d272c6053641269ea88e046f6e0c8fee63dee1ff
+size 1802707

data/depth_cut/cup_2.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:03c2c1206c62381f4fa09e635f5d63bb834a9c007d07f79825b567da46e786f5
+size 2013536

data/depth_cut/earphone_1.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fe45d51fe485ff8402bb37fb62e4c32fc892644b7828e70056d92336e9400de3
+size 1075588

data/depth_cut/earphone_2.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de88bc15419cf5813eec169fcd950a70637b85c6d7616e3ff5f1d1797f3c9681
+size 1528559

data/depth_cut/hmbb_1.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:00ccdb08c58da95ef5aa8e1ad97583f6e2daec32a13921ce7664b6e1e44aa49e
+size 2361117

data/depth_cut/hmbb_2.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bbc4db207fb5c163ab99594eb93499a70e1858c5e4f7d7133c663ac5b05c0704
+size 4774735

data/depth_cut/mouse_1.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b1cac2f4ee52e73fafbf14cc9411be564750978343ecfa175b2c59d70f592905
+size 1618354

data/depth_cut/mouse_2.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5df30727ec4b8d1d0ea8532e95a72807ef2174107026e6f063182941acadc44b
+size 1384678

data/hand_cut/cheese_1.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e502009545f2ed2fdf0815cd96a6c4354aa8100e5ac5cfe84323c3b56533f661
+size 2962933

data/hand_cut/cheese_2.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:60b16aa66a5b6d79ad5722e6570aa5379c2be54cdb47c5cec264568931b94a25
+size 2668692

data/hand_cut/cup_1.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:729a51c840377d3634e008946eca0213d88c5ce388b76b7863cc5e6f6594946a
+size 2412734

data/hand_cut/cup_2.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9cb95d1d1001237da04eac68c5439a5a00da548097016f975536271028f10dc7
+size 2707637

data/hand_cut/earphone_1.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d576867e34f29a55a90ed9aa0e7027530ec6c6b39aed28e681f719ad62169df9
+size 1969306

data/hand_cut/earphone_2.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a97a4b66693b5562d99b2d9dec078aad7e362b6b92bd9623f898b787547b51bb
+size 3502771

data/hand_cut/hmbb_1.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2d5d40a92e5bb62ea8a3240bbb2e17b01e23b63b5047891f14ba729eeeb28b9b
+size 1875012

data/hand_cut/hmbb_2.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:84150bd20ec8f19a31cde8b4c814a938aaf48140635ae32496fc7f27be2a3834
+size 3162317

data/hand_cut/mouse_1.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:816f0525711efe17bced6554efb32ebc28924cf1710546e2af39382fe3d915e8
+size 2183209