|
|
import os |
|
|
import io |
|
|
import re |
|
|
import math |
|
|
import tempfile |
|
|
import imageio |
|
|
import random |
|
|
from tqdm import tqdm |
|
|
import subprocess |
|
|
|
|
|
import cv2 |
|
|
import numpy as np |
|
|
from decord import VideoReader |
|
|
from PIL import Image |
|
|
from moviepy.editor import AudioFileClip, VideoClip |
|
|
|
|
|
|
|
|
import torch |
|
|
from torchvision.io import write_video |
|
|
from torchvision.utils import save_image |
|
|
import torchvision.transforms as transforms |
|
|
|
|
|
import binascii |
|
|
import torchvision |
|
|
import imageio |
|
|
import os.path as osp |
|
|
|
|
|
|
|
|
def infinite_iterator(iter):
    """Cycle endlessly over *iter*, restarting it each time it is exhausted.

    Note: the argument must be re-iterable (e.g. a list or a DataLoader);
    a one-shot generator would make this loop busy-spin after the first pass.
    """
    while True:
        yield from iter
|
|
|
|
|
|
|
|
def save_sample(x, fps=8, save_path=None, normalize=True, value_range=(-1, 1)):
    """Save a video (or single-frame image) tensor to disk.

    Args:
        x (Tensor): shape [C, T, H, W].
        fps (int): frame rate used when writing a video.
        save_path (str): output path WITHOUT extension; ".png" or ".mp4"
            is appended depending on the number of frames.
        normalize (bool): if True, map values from ``value_range`` into [0, 1].
        value_range (tuple): (low, high) range used for normalization.

    Returns:
        Tensor: the saved frames reshaped to [T, H, W, C] (uint8 for videos).
    """
    assert x.ndim == 4

    # os.makedirs('') raises FileNotFoundError when save_path has no directory
    # component; fall back to the current directory in that case.
    os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)

    if x.shape[1] == 1:
        # Single frame -> save as a still image.
        save_path += ".png"
        x = x.squeeze(1)
        save_image([x], save_path, normalize=normalize, value_range=value_range)
        x = x.unsqueeze(0)          # [C, H, W] -> [1, C, H, W]
        x = x.permute(0, 2, 3, 1)   # -> [1, H, W, C]
    else:
        save_path += ".mp4"
        if normalize:
            low, high = value_range
            x = x.clamp(min=low, max=high)
            x = x.sub(low).div(max(high - low, 1e-5))

        # [C, T, H, W] -> [T, H, W, C] uint8 frames as required by write_video.
        x = x.mul(255).add(0.5).clamp(0, 255).permute(1, 2, 3, 0).to("cpu", torch.uint8)
        write_video(save_path, x, fps=fps, video_codec="h264")
    print(f"Saved to {save_path}")
    return x
|
|
|
|
|
|
|
|
def video_reader_from_data_meta(datameta, use_tempfile, num_threads_decord):
    """Get a decord ``VideoReader`` from *datameta*; the meta must be a video.

    The encoded bytes are located in this order:
      1. ``datameta.raw_frames`` (already-loaded encoded bytes),
      2. a tar record (``tar_dir`` / ``tar_filename`` / ``tar_key`` present),
      3. a plain file at ``datameta.filename``.

    Args:
        datameta: project data-meta object (supports ``in`` checks and the
            attributes used below).
        use_tempfile (bool): decode via a named temp file instead of an
            in-memory BytesIO buffer.
        num_threads_decord (int): decord decoding thread count.

    Raises:
        NotImplementedError: if the meta is not a video or no source is found.
    """
    if not datameta.is_video:
        raise NotImplementedError('Unknown data type.')

    def _open(raw_data):
        # Build the reader from raw encoded bytes, either through a temp file
        # on disk or through an in-memory buffer.
        if use_tempfile:
            with tempfile.NamedTemporaryFile() as temp:
                temp.write(raw_data)
                # Flush buffered bytes so decord sees the complete file when
                # opening it by name (the original skipped this).
                temp.flush()
                return VideoReader(temp.name, num_threads=num_threads_decord)
        return VideoReader(io.BytesIO(raw_data), num_threads=num_threads_decord)

    if 'raw_frames' in datameta:
        return _open(datameta.raw_frames)
    if "tar_dir" in datameta and "tar_filename" in datameta and "tar_key" in datameta:
        return _open(datameta.load_tar_videodata())
    if os.path.exists(datameta.filename):
        return VideoReader(datameta.filename, num_threads=num_threads_decord)
    raise NotImplementedError('Not supported data format. rawframes or filename is needed.')
|
|
|
|
|
|
|
|
def cap_from_data_meta(datameta):
    """Get a ``cv2.VideoCapture`` from *datameta*; the meta must be a video.

    The video bytes are located in the same order as
    ``video_reader_from_data_meta``: raw_frames, then a tar record, then a
    plain file path.

    Raises:
        NotImplementedError: if the meta is not a video or no source is found.
    """
    if not datameta.is_video:
        raise NotImplementedError('Unknown data type.')

    def _open(raw_data):
        with tempfile.NamedTemporaryFile() as temp:
            temp.write(raw_data)
            # Flush buffered bytes so OpenCV sees the complete file
            # (the original skipped this).
            temp.flush()
            # NOTE(review): the temp file is deleted when this `with` exits,
            # while the returned capture is read afterwards; this relies on
            # the OS keeping the open fd alive — confirm on all platforms.
            return cv2.VideoCapture(temp.name)

    if 'raw_frames' in datameta:
        return _open(datameta.raw_frames)
    if "tar_dir" in datameta and "tar_filename" in datameta and "tar_key" in datameta:
        return _open(datameta.load_tar_videodata())
    if os.path.exists(datameta.filename):
        return cv2.VideoCapture(datameta.filename)
    raise NotImplementedError('Not supported data format. rawframes or filename is needed.')
|
|
|
|
|
|
|
|
def none_node_splitter(src, group=None):
    """Pass-through splitter: yields every sample from *src* unchanged.

    ``group`` is accepted for interface compatibility and ignored.
    """
    for sample in src:
        yield sample
|
|
|
|
|
|
|
|
def resize_and_covert_to_gray(np_frames, pixel_value=16, interpolation=cv2.INTER_LINEAR, resize_only=False):
    """Resize frames so the shorter side equals *pixel_value* (aspect kept),
    optionally converting each frame from BGR to grayscale.

    Args:
        np_frames: sequence of HxWxC frames; all assumed the same size as
            the first one.
        pixel_value: target length of the shorter side, in pixels.
        interpolation: OpenCV interpolation flag for ``cv2.resize``.
        resize_only: if True, skip the grayscale conversion.

    Returns:
        np.ndarray: stacked transformed frames.
    """
    height, width, *_ = np_frames[0].shape

    # Scale so the shorter side becomes pixel_value; the longer side keeps ratio.
    if width < height:
        new_width = pixel_value
        new_height = int((new_width / width) * height)
    else:
        new_height = pixel_value
        new_width = int((new_height / height) * width)

    transformed = []
    for frame in np_frames:
        resized = cv2.resize(frame, (new_width, new_height), interpolation=interpolation)
        if not resize_only:
            resized = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
        transformed.append(resized)

    return np.stack(transformed)
|
|
|
|
|
def get_top_m_percent(arr, m_percent):
    """Mean of the top *m_percent* largest values of each frame, ignoring NaNs.

    Args:
        arr (np.ndarray): shape [B, H, W]; may contain NaNs.
        m_percent (float): percentage (0-100] of pixels to keep per frame.

    Returns:
        np.ndarray: shape [B]; per-frame mean of the retained top values
        (NaN for a frame with no valid values).
    """
    B, H, W = arr.shape
    # Keep at least one element: with N == 0 the original `[-0:]` slice
    # selected the WHOLE frame and the assignment below crashed.
    N = max(int(H * W * m_percent / 100), 1)
    # NaN-padded accumulator so nanmean ignores missing slots.
    result = np.full((B, N), np.nan)
    for i in range(B):
        valid = arr[i].flatten()
        valid = valid[~np.isnan(valid)]
        # A frame may have fewer than N valid values after NaN removal;
        # np.partition with an out-of-range kth would raise.
        k = min(N, valid.size)
        if k > 0:
            # Partial sort: the last k entries are the k largest values.
            result[i, :k] = np.partition(valid, -k)[-k:]
    return np.nanmean(result, axis=1)
|
|
|
|
|
def compute_optical_flow_score(np_frames, pixel_value=16):
    """Per-pixel optical-flow magnitudes between consecutive frames.

    Args:
        np_frames (np.ndarray): [T, H, W] single-channel frames.
        pixel_value: unused; kept for backward compatibility with callers.

    Returns:
        np.ndarray: [T-1, H, W] Farneback flow magnitudes.
    """
    video_length = np_frames.shape[0]

    flow_scores = []
    for i in range(1, video_length):
        flow = cv2.calcOpticalFlowFarneback(np_frames[i - 1], np_frames[i], None, 0.5, 3, 15, 3, 5, 1.2, 0)
        # Only the magnitude is needed; the angle channel is discarded
        # (the original bound it to an unused local).
        magnitude = cv2.cartToPolar(flow[..., 0], flow[..., 1])[0]
        flow_scores.append(magnitude)

    return np.array(flow_scores)
|
|
|
|
|
def get_first_frame_from_video_path(video_path):
    """Read the first frame of a video as an RGB ``PIL.Image``.

    Args:
        video_path: path to the video file.

    Returns:
        PIL.Image, or ``None`` if the frame could not be read.
    """
    cap = cv2.VideoCapture(video_path)
    cap.set(cv2.CAP_PROP_POS_FRAMES, 0)

    ret, frame = cap.read()
    # Release unconditionally: the original leaked the capture handle
    # when the read failed (it returned before cap.release()).
    cap.release()
    if not ret:
        return None

    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    return Image.fromarray(frame)
|
|
|
|
|
def get_first_clip_from_video(video_path, clip_len=1):
    """Read the first *clip_len* frames of a video (default: 1 frame).

    Args:
        video_path: path to the video file.
        clip_len: number of frames to read from the start; ``None`` reads
            the whole video.

    Returns:
        list: RGB frames (np.ndarray); an empty list signals a read failure.
    """
    frames = []
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return frames

    if clip_len is None:
        clip_len = 100000000  # effectively "read all frames"

    for _ in range(clip_len):
        # Frames are consumed sequentially from the start, so the original's
        # per-frame CAP_PROP_POS_FRAMES seek was redundant (and slow).
        ret, frame = cap.read()
        if not ret:
            break
        frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    cap.release()
    return frames
|
|
|
|
|
def get_last_clip_from_video(video_path, clip_len=1):
    """Read the last *clip_len* frames of a video.

    Args:
        video_path: path to the video file.
        clip_len: number of frames to read from the end.

    Returns:
        list: RGB frames (np.ndarray); an empty list signals a read failure.
    """
    collected = []
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return collected

    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # Seek to where the tail clip begins (clamped to the first frame).
    cap.set(cv2.CAP_PROP_POS_FRAMES, max(0, frame_count - clip_len))

    while len(collected) < clip_len:
        ok, frame = cap.read()
        if not ok:
            break
        collected.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    cap.release()

    # Guard against decoders that report an inaccurate frame count.
    return collected[-clip_len:] if len(collected) >= clip_len else collected
|
|
|
|
|
|
|
|
def pad_to_square_ndarray(image, pad_value=255):
    """Center *image* on a square canvas filled with *pad_value*.

    Args:
        image (np.ndarray): HxWxC array.
        pad_value: fill value for the padded border.

    Returns:
        np.ndarray: (S, S, C) array, S = max(H, W), same dtype as input.
    """
    h, w, c = image.shape
    side = max(h, w)

    canvas = np.full((side, side, c), pad_value, dtype=image.dtype)

    # Offsets that center the original image on the canvas.
    y0 = (side - h) // 2
    x0 = (side - w) // 2
    canvas[y0:y0 + h, x0:x0 + w, :] = image

    return canvas
|
|
|
|
|
def pad_to_square_pil(image, pad_value=255):
    """Center a PIL image on a square RGB canvas filled with *pad_value*.

    Args:
        image: PIL image.
        pad_value: gray level used for all three border channels.

    Returns:
        PIL.Image: square RGB image of side max(width, height).
    """
    w, h = image.size
    side = max(w, h)

    canvas = Image.new("RGB", (side, side), (pad_value, pad_value, pad_value))
    canvas.paste(image, ((side - w) // 2, (side - h) // 2))

    return canvas
|
|
|
|
|
def separate_connected_components(mask):
    """Split a binary mask into per-component masks and bounding boxes.

    Args:
        mask (np.ndarray): 2D binary mask.

    Returns:
        tuple: (list of uint8 component masks, list of bounding boxes as
        (xmin, ymin, xmax, ymax) with xmax/ymax exclusive).
    """
    # The original referenced `label`/`find_objects` without importing them
    # anywhere in the file (NameError at call time); they come from scipy.
    from scipy.ndimage import label, find_objects

    labeled_array, num_features = label(mask)

    separate_masks = []
    bboxes = []

    slices = find_objects(labeled_array)

    for i in range(1, num_features + 1):
        component_mask = (labeled_array == i).astype(np.uint8)
        separate_masks.append(component_mask)

        slice_ = slices[i - 1]
        # find_objects returns (row_slice, col_slice); convert to x/y order.
        bbox = (slice_[1].start, slice_[0].start, slice_[1].stop, slice_[0].stop)
        bboxes.append(bbox)

    return separate_masks, bboxes
|
|
|
|
|
def bbox_random_crop(bbox):
    """Pick a random square crop fully inside *bbox*.

    The square's side equals the shorter bbox edge; its position along the
    longer edge is chosen uniformly at random.

    Args:
        bbox: (xmin, ymin, xmax, ymax).

    Returns:
        tuple: square crop (xmin, ymin, xmax, ymax).
    """
    xmin, ymin, xmax, ymax = bbox
    box_w = xmax - xmin
    box_h = ymax - ymin

    if box_h > box_w:
        # Tall box: slide a width-sized square vertically.
        side = box_w
        y0 = random.randint(ymin, ymax - side)
        return (xmin, y0, xmin + side, y0 + side)

    # Wide (or square) box: slide a height-sized square horizontally.
    side = box_h
    x0 = random.randint(xmin, xmax - side)
    return (x0, ymin, x0 + side, ymin + side)
|
|
|
|
|
def inflate_bbox(bbox, d):
    """Return a square box centered on *bbox* with side ``d * bbox_width``.

    Note: the side length is derived from the box WIDTH only; the original
    height only influences where the center sits.

    Args:
        bbox: (x_min, y_min, x_max, y_max).
        d: width multiplier for the new square's side.

    Returns:
        tuple of ints: (x_min, y_min, x_max, y_max); coordinates may be
        negative or exceed the image bounds — caller must clamp.
    """
    x_min, y_min, x_max, y_max = bbox

    side = d * (x_max - x_min)
    half = side / 2

    cx = (x_min + x_max) / 2
    cy = (y_min + y_max) / 2

    return (int(cx - half), int(cy - half), int(cx + half), int(cy + half))
|
|
|
|
|
def get_frame_by_idx(cap, frame_idxs):
    """Fetch RGB frame(s) from an opened ``cv2.VideoCapture`` by index.

    Args:
        cap: opened ``cv2.VideoCapture``.
        frame_idxs: a single index, or a list/ndarray of indices.

    Returns:
        A single RGB frame, or a list of RGB frames (matching the input form).
    """
    def _read_one(idx):
        # Seek then decode one frame; a failed read is a hard error here.
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ok, frame = cap.read()
        assert ok
        return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    if isinstance(frame_idxs, (np.ndarray, list)):
        return [_read_one(idx) for idx in frame_idxs]
    return _read_one(frame_idxs)
|
|
|
|
|
|
|
|
def recover_mask(array, shape):
    """Inverse of ``np.packbits``: unpack *array* into a uint8 mask of *shape*.

    Args:
        array (np.ndarray): bit-packed uint8 buffer.
        shape: target mask shape; trailing pad bits are discarded.
    """
    n_elems = np.prod(shape)
    bits = np.unpackbits(array)[:n_elems]
    return bits.reshape(shape).astype(np.uint8)
|
|
|
|
|
|
|
|
def calculate_iou(box1, box2):
    """Intersection-over-union of two (xmin, ymin, xmax, ymax) boxes.

    Returns:
        float: IoU in [0, 1]; 0 when the union area is zero.
    """
    ax1, ay1, ax2, ay2 = box1
    bx1, by1, bx2, by2 = box2

    # Overlap rectangle (may be empty).
    ox1 = max(ax1, bx1)
    oy1 = max(ay1, by1)
    ox2 = min(ax2, bx2)
    oy2 = min(ay2, by2)

    inter = (ox2 - ox1) * (oy2 - oy1) if (ox2 > ox1 and oy2 > oy1) else 0

    union = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - inter
    return inter / union if union != 0 else 0
|
|
|
|
|
def extract_number_from_suffix(s):
    """Parse a trailing ``_[<number>]`` suffix of *s* as a float (0 if absent)."""
    found = re.search(r'_\[([\d.]+)\]$', s)
    return float(found.group(1)) if found else 0
|
|
|
|
|
def tensor_to_video(tensor, output_video_path, input_audio_path, fps=30, dynamic_fps=True, audio_range=None, video_length=None):
    """
    Converts a Tensor with shape [c, f, h, w] into a video and adds an audio track from the specified audio file.

    Args:
        tensor (Tensor): The Tensor to be converted, shaped [c, f, h, w]; values assumed in [0, 1] — TODO confirm.
        output_video_path (str): Path WITHOUT extension; ".png" or ".mp4" is appended.
        input_audio_path (str): The path to the audio file (WAV file) that contains the audio track to be added.
        fps (int): The frame rate of the output video (recomputed when dynamic_fps is True). Default is 30 fps.
        dynamic_fps (bool): If True, derive fps so the frames exactly span the selected audio segment.
        audio_range (tuple): (start_frame, end_frame) selecting the audio segment; required when dynamic_fps is True.
        video_length (int): Total frame count used to normalize audio_range; required when dynamic_fps is True.
    """
    # NOTE(review): a single-frame tensor gets a ".png" extension but is still
    # written via write_videofile below — confirm this path is ever exercised.
    if tensor.shape[1] == 1:
        output_video_path += '.png'
    else:
        output_video_path += '.mp4'

    os.makedirs(os.path.dirname(output_video_path), exist_ok=True)

    # [c, f, h, w] -> [f, h, w, c] uint8 frames.
    tensor = tensor.permute(1, 2, 3, 0).cpu().numpy()
    tensor = np.clip(tensor * 255, 0, 255).astype(np.uint8)

    def make_frame(t):
        # Map a timestamp to a frame index, clamped to the last frame.
        # `fps` is read late-bound, so the dynamic-fps reassignment below applies.
        frame_index = min(int(t * fps), tensor.shape[0] - 1)
        return tensor[frame_index]

    if not dynamic_fps:
        video_duration = tensor.shape[0] / fps

    audio_clip = AudioFileClip(input_audio_path)
    audio_duration = audio_clip.duration

    if not dynamic_fps:
        # Fixed fps: truncate to the shorter of video/audio duration.
        final_duration = min(video_duration, audio_duration)
        audio_clip = audio_clip.subclip(0, final_duration)
    else:
        # Dynamic fps: take the audio segment proportional to the frame range,
        # then stretch fps so all frames fit that segment exactly.
        select_start, select_end = audio_range[0] / video_length, audio_range[1] / video_length
        audio_clip = audio_clip.subclip(select_start * audio_duration, select_end * audio_duration)
        final_duration = (select_end - select_start) * audio_duration
        fps = tensor.shape[0] / final_duration

    new_video_clip = VideoClip(make_frame, duration=final_duration)
    new_video_clip = new_video_clip.set_audio(audio_clip)
    print(f"video save fps is: {fps}")
    new_video_clip.write_videofile(output_video_path, fps=fps, audio_codec="aac")
|
|
|
|
|
def resize_and_centercrop(cond_image, target_size):
    """Resize a PIL image to cover *target_size*, then center-crop to it.

    Args:
        cond_image: PIL image.
        target_size: (height, width) of the output.

    Returns:
        Tensor: uint8 tensor of shape [1, C, 1, H, W].
    """
    src_h, src_w = cond_image.height, cond_image.width
    target_h, target_w = target_size

    # Scale so BOTH sides reach at least the target size (cover, not fit).
    scale = max(target_h / src_h, target_w / src_w)
    new_h = math.ceil(scale * src_h)
    new_w = math.ceil(scale * src_w)

    resized = np.array(cond_image.resize((new_w, new_h), resample=Image.BILINEAR))

    # HWC -> [1, C, H, W] tensor, crop the center region, add a frame axis.
    as_tensor = torch.from_numpy(resized)[None, ...].permute(0, 3, 1, 2).contiguous()
    cropped = transforms.functional.center_crop(as_tensor, target_size)
    return cropped[:, :, None, :, :]
|
|
|
|
|
|
|
|
def compute_face_to_front_angle(rvec):
    """Score how frontal a face rotation is, in degrees.

    Computes the geodesic angle theta between *rvec* and the zero (frontal)
    rotation and returns ``180 - theta``, so larger values mean more frontal.

    Args:
        rvec: Rodrigues rotation vector of the face pose.
    """
    front_rvec = np.zeros((3, 1), dtype=np.float32)

    R_front, _ = cv2.Rodrigues(front_rvec)
    R_face, _ = cv2.Rodrigues(rvec)

    # Relative rotation between face pose and frontal reference.
    relative = R_face @ R_front.T

    # trace(R) = 1 + 2*cos(theta); clip guards against float drift.
    cos_theta = np.clip((np.trace(relative) - 1) / 2, -1.0, 1.0)
    theta_rad = np.arccos(cos_theta)

    return 180 - theta_rad * 180 / np.pi
|
|
|
|
|
|
|
|
|
|
|
def rotation_vector_to_euler_angles(rvec):
    """Convert a Rodrigues rotation vector to Euler angles.

    Args:
        rvec: Rodrigues rotation vector.

    Returns:
        tuple: (yaw, pitch, roll) in degrees.
    """
    R, _ = cv2.Rodrigues(rvec)

    sy = np.sqrt(R[0, 0] * R[0, 0] + R[1, 0] * R[1, 0])

    if sy < 1e-6:
        # Gimbal lock: roll is indeterminate, fix it at zero.
        pitch = np.arctan2(-R[1, 2], R[1, 1])
        yaw = np.arctan2(-R[2, 0], sy)
        roll = 0
    else:
        pitch = np.arctan2(R[2, 1], R[2, 2])
        yaw = np.arctan2(-R[2, 0], sy)
        roll = np.arctan2(R[1, 0], R[0, 0])

    return np.degrees(yaw), np.degrees(pitch), np.degrees(roll)
|
|
|
|
|
|
|
|
def head_pose_calculation(face_landmarks, image_size=(720, 480)):
    """Estimate head-pose magnitude from 5 facial landmarks via PnP.

    Args:
        face_landmarks: 5x2 array of 2D points; assumed ordered as
            (left eye, right eye, nose tip, left mouth corner, right mouth
            corner) to match the model points below — TODO confirm caller.
        image_size: (width, height) of the source image.

    Returns:
        tuple: (|yaw|, |pitch|) in degrees.
    """
    # Generic 3D face model matching the 5 landmarks (nose tip at origin).
    model_points = np.array([
        [-30.0, 35.0, 0.0],
        [30.0, 35.0, 0.0],
        [0.0, 0.0, 0.0],
        [-25.0, -35.0, 0.0],
        [25.0, -35.0, 0.0],
    ])

    # Pinhole camera approximation: focal length ~ image width, principal
    # point at the image center.
    focal_length = image_size[0]
    center = (image_size[0] / 2, image_size[1] / 2)
    camera_matrix = np.array([
        [focal_length, 0, center[0]],
        [0, focal_length, center[1]],
        [0, 0, 1]
    ], dtype=np.float32)
    dist_coeffs = np.zeros((4, 1))  # assume no lens distortion

    # NOTE(review): `success` is never checked; if solvePnP fails, rvec is
    # unreliable and the returned angles are meaningless.
    success, rvec, tvec = cv2.solvePnP(
        model_points, face_landmarks,
        camera_matrix, dist_coeffs,
        flags=cv2.SOLVEPNP_ITERATIVE
    )

    yaw, pitch, roll = rotation_vector_to_euler_angles(rvec)

    return abs(yaw), abs(pitch)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def rand_name(length=8, suffix=''):
    """Random hex file name (``2 * length`` characters) plus optional suffix.

    Args:
        length: number of random bytes; the hex string is twice as long.
        suffix: extension to append; a leading dot is added when missing.
    """
    stem = binascii.b2a_hex(os.urandom(length)).decode('utf-8')
    if not suffix:
        return stem
    return stem + (suffix if suffix.startswith('.') else '.' + suffix)
|
|
|
|
|
|
|
|
|
|
|
def cache_video(tensor,
                save_file=None,
                fps=30,
                suffix='.mp4',
                nrow=8,
                normalize=True,
                value_range=(-1, 1),
                retry=5):
    """Write a video tensor to disk as H.264, retrying on write failures.

    Args:
        tensor (Tensor): [B, C, T, H, W]; each time step is tiled into a
            grid of `nrow` columns.
        save_file (str): output path; a random /tmp name is used when None.
        fps (int): output frame rate.
        suffix (str): extension used for the random /tmp name.
        nrow (int): grid columns per frame.
        normalize (bool): passed to make_grid.
        value_range (tuple): clamp/normalization range.
        retry (int): number of write attempts before giving up.

    Returns:
        str: path of the written file.

    Raises:
        Exception: the last write error, if every attempt fails.
    """
    cache_file = osp.join('/tmp', rand_name(
        suffix=suffix)) if save_file is None else save_file

    # Preprocess ONCE, outside the retry loop: the original re-clamped and
    # re-scaled inside the loop, which would corrupt frames on a second pass.
    frames = tensor.clamp(min(value_range), max(value_range))
    frames = torch.stack([
        torchvision.utils.make_grid(
            u, nrow=nrow, normalize=normalize, value_range=value_range)
        for u in frames.unbind(2)
    ], dim=1).permute(1, 2, 3, 0)
    frames = (frames * 255).type(torch.uint8).cpu()

    # The original's retry loop was dead code: no try/except, an unconditional
    # return on the first iteration, and an unused `error` variable.
    error = None
    for _ in range(retry):
        try:
            writer = imageio.get_writer(cache_file, fps=fps, codec='libx264', quality=10, ffmpeg_params=["-crf", "10"])
            for frame in frames.numpy():
                writer.append_data(frame)
            writer.close()
            return cache_file
        except Exception as e:  # transient I/O / encoder failure: retry
            error = e
    raise error
|
|
|
|
|
def save_silent_video(gen_video_samples, save_path, fps=25, quality=10, high_quality_save=True):
    """
    Save a silent (audio-free) video; supports appending frames to an existing file.

    Args:
        gen_video_samples: generated video tensor [B, C, T, H, W], values in [-1, 1].
        save_path: output path WITHOUT extension (".mp4" is appended).
        fps: frame rate; must match the existing file's fps when appending.
        quality: imageio video quality (0-10).
        high_quality_save: if True, encode losslessly (CRF 0, veryslow preset).
    """
    # Only the first sample of the batch is saved.
    gen_video_samples = gen_video_samples[0]

    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    final_save_path = f"{save_path}.mp4"

    # [-1, 1] -> [0, 1] -> uint8 frames in [T, H, W, C] layout.
    video_frames = (gen_video_samples + 1) / 2
    video_frames = video_frames.permute(1, 2, 3, 0).cpu().numpy()
    video_frames = np.clip(video_frames * 255, 0, 255).astype(np.uint8)

    all_frames = []
    existing_fps = fps
    if os.path.exists(final_save_path):
        # Append mode: load the existing frames and validate compatibility.
        with imageio.get_reader(final_save_path) as reader:
            meta_data = reader.get_meta_data()
            existing_fps = meta_data['fps']
            existing_frames = [frame for frame in reader]

        if existing_fps != fps:
            raise ValueError(f"Existing video fps {existing_fps} conflicts with new fps {fps}")
        if existing_frames[0].shape != video_frames[0].shape:
            raise ValueError("Frame resolution mismatch between existing and new video")

        all_frames.extend(existing_frames)

    all_frames.extend(video_frames)

    if high_quality_save:
        # Lossless H.264.
        ffmpeg_params = [
            '-c:v', 'libx264',
            '-crf', '0',
            '-preset', 'veryslow'
        ]
    else:
        ffmpeg_params = [
            '-c:v', 'libx264',
            '-crf', '23',
            '-preset', 'medium'
        ]

    with imageio.get_writer(
        final_save_path,
        fps=existing_fps,
        codec='libx264',
        quality=quality,
        ffmpeg_params=ffmpeg_params
    ) as writer:
        for frame in all_frames:
            writer.append_data(frame)

    print(f"Silent video saved to: {final_save_path}")
|
|
|
|
|
def save_silent_video_overwrite(gen_video_samples, save_path, fps=25, quality=5, high_quality_save=False):
    """
    Save a silent (audio-free) video, OVERWRITING any existing file.

    Unlike ``save_silent_video``, this never appends to an existing video.

    Args:
        gen_video_samples: generated video tensor [B, C, T, H, W], values in [-1, 1].
        save_path: output path WITHOUT extension (".mp4" is appended).
        fps: video frame rate.
        quality: imageio video quality (0-10).
        high_quality_save: if True, encode losslessly (CRF 0, veryslow preset).
    """
    # Only the first sample of the batch is saved.
    gen_video_samples = gen_video_samples[0]

    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    final_save_path = f"{save_path}.mp4"

    # [-1, 1] -> [0, 1] -> uint8 frames in [T, H, W, C] layout.
    video_frames = (gen_video_samples + 1) / 2
    video_frames = video_frames.permute(1, 2, 3, 0).cpu().numpy()
    video_frames = np.clip(video_frames * 255, 0, 255).astype(np.uint8)

    all_frames = []

    all_frames.extend(video_frames)

    if high_quality_save:
        # Lossless H.264.
        ffmpeg_params = [
            '-c:v', 'libx264',
            '-crf', '0',
            '-preset', 'veryslow'
        ]
    else:
        ffmpeg_params = [
            '-c:v', 'libx264',
            '-crf', '23',
            '-preset', 'medium'
        ]

    with imageio.get_writer(
        final_save_path,
        fps=fps,
        codec='libx264',
        quality=quality,
        ffmpeg_params=ffmpeg_params
    ) as writer:
        for frame in all_frames:
            writer.append_data(frame)

    print(f"Silent video saved to: {final_save_path}")
|
|
|
|
|
def save_video_ffmpeg(gen_video_samples, save_path, vocal_audio_list, fps=25, quality=5, high_quality_save=False):
    """Save a video tensor and mux in an audio track via ffmpeg.

    Args:
        gen_video_samples: generated video tensor [B, C, T, H, W], values in [-1, 1].
        save_path: output path WITHOUT extension (".mp4" is appended).
        vocal_audio_list: audio file paths; only the first entry is used.
        fps: output frame rate.
        quality: imageio quality (0-10), used when high_quality_save is False.
        high_quality_save: if True, encode losslessly via cache_video + CRF 0.
    """
    # Only the first sample of the batch is saved.
    gen_video_samples = gen_video_samples[0]

    def save_video(frames, save_path, fps, quality=9, ffmpeg_params=None):
        # Write raw uint8 frames to an mp4 with imageio.
        writer = imageio.get_writer(
            save_path, fps=fps, quality=quality, ffmpeg_params=ffmpeg_params
        )
        for frame in tqdm(frames, desc="Saving video"):
            frame = np.array(frame)
            writer.append_data(frame)
        writer.close()
    save_path_tmp = save_path + "-temp.mp4"

    os.makedirs(os.path.dirname(save_path_tmp), exist_ok=True)

    if high_quality_save:
        # Lossless intermediate via cache_video.
        cache_video(
            tensor=gen_video_samples.unsqueeze(0),
            save_file=save_path_tmp,
            fps=fps,
            nrow=1,
            normalize=True,
            value_range=(-1, 1)
        )
    else:
        # [-1, 1] -> uint8 [T, H, W, C] frames.
        video_audio = (gen_video_samples+1)/2
        video_audio = video_audio.permute(1, 2, 3, 0).cpu().numpy()
        video_audio = np.clip(video_audio * 255, 0, 255).astype(np.uint8)
        save_video(video_audio, save_path_tmp, fps=fps, quality=quality)

    # Crop the audio track to exactly the video duration.
    _, T, _, _ = gen_video_samples.shape
    duration = T / fps
    save_path_crop_audio = save_path + "-cropaudio.wav"
    # NOTE(review): the ffmpeg binary path is hard-coded to a cluster-specific
    # conda environment; consider making it configurable.
    final_command = [
        "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/gaofeng49/conda/memo/bin/ffmpeg",
        "-i",
        vocal_audio_list[0],
        "-t",
        f'{duration}',
        save_path_crop_audio,
    ]
    subprocess.run(final_command, check=True)

    save_path = save_path + ".mp4"
    if high_quality_save:
        # Lossless mux of the video and the cropped audio.
        final_command = [
            "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/gaofeng49/conda/memo/bin/ffmpeg",
            "-y",
            "-i", save_path_tmp,
            "-i", save_path_crop_audio,
            "-c:v", "libx264",
            "-crf", "0",
            "-preset", "veryslow",
            "-c:a", "aac",
            "-shortest",
            save_path,
        ]
        subprocess.run(final_command, check=True)
        os.remove(save_path_tmp)
        os.remove(save_path_crop_audio)
    else:
        # Default-quality mux of the video and the cropped audio.
        final_command = [
            "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/gaofeng49/conda/memo/bin/ffmpeg",
            "-y",
            "-i",
            save_path_tmp,
            "-i",
            save_path_crop_audio,
            "-c:v",
            "libx264",
            "-c:a",
            "aac",
            "-shortest",
            save_path,
        ]
        subprocess.run(final_command, check=True)
        os.remove(save_path_tmp)
        os.remove(save_path_crop_audio)
|
|
|
|
|
def audio_move_from_hdfs(src_path):
    """Rewrite legacy dolphinfs audio paths to their HDFS mirror locations.

    Any known dolphinfs prefix found inside *src_path* is substituted with
    the corresponding /mnt/hdfs prefix; unknown paths pass through unchanged.
    """
    map_dict = {
        "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/data_digitalhuman/talkingbody/yt_runway_sub/singlehuman_lipsync/yt_runway_0808_35w_merge/tar_record_caption_qwen2vlm_pose_audioemb_lipsync_camera_face_chinese":
        "/mnt/hdfs/user/hadoop-vision-data/llm/dataset/videogen_dataset/data/digital_human_video/talkingbody/runway_chinese/singlehuman_lipsync/yt_runway_0808_35w_merge/tar_record_caption_qwen2vlm_pose_audioemb_lipsync_camera_face_chinese",

        "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/data_digitalhuman/talkingbody/yt_runway_sub/singlehuman_lipsync/yt_runway_0829_52w_merge/tar_record_caption_qwen2vlm_pose_audioemb_part2_lipsync_camera_face_chinese":
        "/mnt/hdfs/user/hadoop-vision-data/llm/dataset/videogen_dataset/data/digital_human_video/talkingbody/runway_chinese/singlehuman_lipsync/yt_runway_0829_52w_merge/tar_record_caption_qwen2vlm_pose_audioemb_part2_lipsync_camera_face_chinese",

        "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/data_digitalhuman/talkingbody/yt_runway_sub/singlehuman_lipsync/yt_runway_0912_28w_merge/tar_record_caption_qwen2vlm_pose_audioemb_lipsync_camera_face_chinese":
        "/mnt/hdfs/user/hadoop-vision-data/llm/dataset/videogen_dataset/data/digital_human_video/talkingbody/runway_chinese/singlehuman_lipsync/yt_runway_0912_28w_merge/tar_record_caption_qwen2vlm_pose_audioemb_lipsync_camera_face_chinese",

        "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/data_digitalhuman/talkingbody/yt_runway_sub/singlehuman_lipsync/yt_runway_0926_105w_merge/tar_record_caption_qwen2vlm_pose_audioemb_lipsync_camera_face_chinese":
        "/mnt/hdfs/user/hadoop-vision-data/llm/dataset/videogen_dataset/data/digital_human_video/talkingbody/runway_chinese/singlehuman_lipsync/yt_runway_0926_105w_merge/tar_record_caption_qwen2vlm_pose_audioemb_lipsync_camera_face_chinese",

        "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/data_digitalhuman/talkingbody/yt_runway_sub/singlehuman_lipsync/yt_runway_1129_65w_part1/tar_record_caption_qwen2vlm_pose_audioemb_lipsync_camera_face_facecropcaption_chinese":
        "/mnt/hdfs/user/hadoop-vision-data/llm/dataset/videogen_dataset/data/digital_human_video/talkingbody/runway_chinese/singlehuman_lipsync/yt_runway_1129_65w_part1/tar_record_caption_qwen2vlm_pose_audioemb_lipsync_camera_face_facecropcaption_chinese",

        "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/data_digitalhuman/talkingbody/yt_runway_sub/singlehuman_lipsync/yt_runway_1129_65w_part2/tar_record_caption_qwen2vlm_pose_audioemb_lipsync_camera_face_facecropcaption_chinese":
        "/mnt/hdfs/user/hadoop-vision-data/llm/dataset/videogen_dataset/data/digital_human_video/talkingbody/runway_chinese/singlehuman_lipsync/yt_runway_1129_65w_part2/tar_record_caption_qwen2vlm_pose_audioemb_lipsync_camera_face_facecropcaption_chinese"
    }

    for old_prefix, new_prefix in map_dict.items():
        if old_prefix in src_path:
            src_path = src_path.replace(old_prefix, new_prefix)

    return src_path