| import os |
| from multiprocessing import Pool |
|
|
| from mmengine.logging import MMLogger |
| from scenedetect import ContentDetector, detect |
| from tqdm import tqdm |
|
|
| from opensora.utils.misc import get_timestamp |
|
|
| from .utils import check_mp4_integrity, clone_folder_structure, iterate_files, split_video |
|
|
| |
# Clip-extraction settings. `cfg` is unpacked as keyword arguments into
# split_video() by process_folder().
target_fps = 30
shorter_size = 512
min_seconds, max_seconds = 1, 5
assert min_seconds < max_seconds
cfg = {
    "target_fps": target_fps,
    "min_seconds": min_seconds,
    "max_seconds": max_seconds,
    "shorter_size": shorter_size,
}
|
|
|
|
def process_folder(root_src, root_dst):
    """Detect scenes in every ``.mp4`` found under ``root_src`` and cut them into clips.

    For each source video that passes ``check_mp4_integrity``, scenes are
    detected with PySceneDetect's ``ContentDetector`` and handed to
    ``split_video``, which is asked to save into the folder under ``root_dst``
    that mirrors the video's location relative to ``root_src``.  Every path
    returned by ``split_video`` is then passed through ``check_mp4_integrity``
    as well.

    A log file named ``<basename(root_dst)>_<timestamp>.log`` is written to the
    parent directory of ``root_dst``.

    Args:
        root_src: Directory that is searched for ``.mp4`` files
            (case-sensitive suffix match).
        root_dst: Directory that receives the clips.
    """
    # Logger: the log file lives next to root_dst and is named after it.
    log_dir = os.path.dirname(root_dst)
    log_name = os.path.basename(root_dst)
    timestamp = get_timestamp()
    log_path = os.path.join(log_dir, f"{log_name}_{timestamp}.log")
    # The parent of root_dst may not exist yet (e.g. ".../clips" when root_dst
    # is ".../clips/split_0"), and the log file is opened before the
    # destination tree is created below.  os.makedirs("") raises, hence the guard.
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
    logger = MMLogger.get_instance(log_name, log_file=log_path)

    # Prepare the destination folder structure.
    clone_folder_structure(root_src, root_dst)

    # Collect the source videos in a deterministic order.
    mp4_list = sorted(x for x in iterate_files(root_src) if x.endswith(".mp4"))

    # Pass the list itself to tqdm so it can read len() and show total/ETA.
    for sample_path in tqdm(mp4_list):
        folder_src = os.path.dirname(sample_path)
        folder_dst = os.path.join(root_dst, os.path.relpath(folder_src, root_src))

        # Skip source videos that fail the integrity check.
        if not check_mp4_integrity(sample_path, logger=logger):
            continue

        # start_in_scene=True treats the video as starting inside a scene, so a
        # video without any detected cut should still yield one scene
        # (per PySceneDetect docs -- verify for the installed version).
        scene_list = detect(sample_path, ContentDetector(), start_in_scene=True)

        # Cut the video into clips according to the detected scenes.
        save_path_list = split_video(sample_path, scene_list, save_dir=folder_dst, **cfg, logger=logger)

        # Check the integrity of every generated clip.
        for clip_path in save_path_list:
            check_mp4_integrity(clip_path, logger=logger)
|
|
|
|
def scene_detect(root_src="./data/your_dataset/raw_videos", root_dst="./data/your_dataset/clips"):
    """Detect and cut scenes using a single process.

    Args:
        root_src: Directory holding the raw videos.
        root_dst: Directory that receives the clips.

    Expected dataset structure:
        data/
            your_dataset/
                raw_videos/
                    xxx.mp4
                    yyy.mp4

    This function results in:
        data/
            your_dataset/
                raw_videos/
                    xxx.mp4
                    yyy.mp4
                clips/
                    xxx_scene-0.mp4
                    yyy_scene-0.mp4
                    yyy_scene-1.mp4
    """
    process_folder(root_src, root_dst)
|
|
|
|
def scene_detect_mp(
    root_src="./data/your_dataset/raw_videos",
    root_dst="./data/your_dataset/clips",
    splits=("split_0", "split_1"),
):
    """Detect and cut scenes using multiple processes, one per split.

    Each split is a sub-directory of ``root_src``; it is processed by its own
    worker process and its clips go to the same-named sub-directory of
    ``root_dst``.

    Args:
        root_src: Directory holding the split sub-directories of raw videos.
        root_dst: Directory that receives the per-split clip folders.
        splits: Names of the sub-directories to process in parallel.

    Expected dataset structure:
        data/
            your_dataset/
                raw_videos/
                    split_0/
                        xxx.mp4
                        yyy.mp4
                    split_1/
                        xxx.mp4
                        yyy.mp4

    This function results in:
        data/
            your_dataset/
                raw_videos/
                    split_0/
                        xxx.mp4
                        yyy.mp4
                    split_1/
                        xxx.mp4
                        yyy.mp4
                clips/
                    split_0/
                        xxx_scene-0.mp4
                        yyy_scene-0.mp4
                    split_1/
                        xxx_scene-0.mp4
                        yyy_scene-0.mp4
                        yyy_scene-1.mp4
    """
    splits = list(splits)
    if not splits:
        # Nothing to process; Pool(processes=0) would raise ValueError.
        return

    # One (source, destination) pair per split.
    root_src_list = [os.path.join(root_src, name) for name in splits]
    root_dst_list = [os.path.join(root_dst, name) for name in splits]

    # One worker per split; starmap blocks until every split has finished.
    with Pool(processes=len(splits)) as pool:
        pool.starmap(process_folder, zip(root_src_list, root_dst_list))
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the single-process scene detection.
    scene_detect()
| |
|
|