File size: 7,223 Bytes

7daf628

"""
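Faster clip-cutting script (generated by Claude).

Cuts the segments listed in a CSV file out of their source videos in parallel
and re-encodes each clip at a height of 480 pixels.

Example usage (shell):
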
S=/datasets/EpicKitchens-100/
D=/work/piyush/from_nfs2/datasets/EPIC-Kitchens-100/cut_clips
csv=$D/../epic-kitchens-100-annotations/EPIC_100_train_with_id.csv
python shared/scripts/cut_clips_fast.py --csv $csv --video_id_key path_id --start_time_key start_sec --end_time_key stop_sec --video_dir $S/ --cut_dir $D/ --ext MP4 --max_workers 4

"""
import os
from os.path import join, exists
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np
import pandas as pd
from tqdm import tqdm
from moviepy.editor import VideoFileClip
from moviepy.video.fx.resize import resize

def time_float_to_str(time_in_seconds):
    """Format a duration given in seconds as an ``H:MM:SS[.ffffff]`` string.

    The value is rounded to the nearest millisecond. Truncating instead would
    let floating-point error drop a millisecond (e.g. ``0.57 * 1000`` evaluates
    to ``569.99...``).

    Args:
        time_in_seconds: Duration in seconds (int or float).

    Returns:
        ``str(datetime.timedelta)`` of the duration, e.g. ``"1:01:01.500000"``.
    """
    import datetime

    total_milliseconds = round(time_in_seconds * 1000)
    # timedelta normalises milliseconds into hours/minutes/seconds itself.
    return str(datetime.timedelta(milliseconds=total_milliseconds))

def process_video(row, args):
    """Cut one clip out of its source video and write it under ``args.cut_dir``.

    Args:
        row: Mapping-like record (e.g. a pandas row) holding ``"video_path"``
            plus the columns named by ``args.video_id_key``,
            ``args.start_time_key`` and ``args.end_time_key``.
        args: Namespace providing ``video_id_key``, ``start_time_key``,
            ``end_time_key``, ``no_round_times``, ``ext``, ``cut_dir``,
            ``overwrite`` and ``verbose``.

    Returns:
        The path of the written clip, or ``None`` when the clip already exists
        (and ``args.overwrite`` is off) or when processing failed.
    """
    video_id = None
    clip_filepath = None
    started_writing = False
    try:
        source_path = row["video_path"]
        video_id = row[args.video_id_key]
        start = float(row[args.start_time_key])
        end = float(row[args.end_time_key])

        # Clip name is "<id>_<start>_<end>.<ext>"; times are rounded to one
        # decimal place unless --no_round_times is given.
        if args.no_round_times:
            clip_filename = f"{video_id}_{start}_{end}.{args.ext}"
        else:
            clip_filename = f"{video_id}_{np.round(start, 1)}_{np.round(end, 1)}.{args.ext}"

        clip_filepath = join(args.cut_dir, clip_filename)
        # The video id may contain sub-directories, so create them on demand.
        os.makedirs(os.path.dirname(clip_filepath), exist_ok=True)

        if os.path.exists(clip_filepath) and not args.overwrite:
            return None

        # Load video and extract clip
        with VideoFileClip(source_path) as video:
            # Target size: height 480, width following the source aspect ratio.
            aspect_ratio = video.w / video.h
            target_height = 480
            target_width = int(target_height * aspect_ratio)

            clip = video.subclip(start, end)
            # NOTE(review): moviepy 1.x's resize appears to give `height`
            # priority when both are passed - confirm `width` has any effect.
            clip = clip.resize(width=target_width, height=target_height)

            # From here on the output file may exist in a partial state.
            started_writing = True
            clip.write_videofile(
                clip_filepath,
                codec='libx264',
                audio_codec='aac',
                preset='faster',  # Faster encoding
                threads=2,  # Use multiple threads for encoding
                # Show moviepy's progress bar only in verbose mode.
                logger='bar' if args.verbose else None,
            )

        return clip_filepath
    except Exception as err:
        # Remove a half-written clip so that a later run without --overwrite
        # does not mistake it for a finished one.
        if started_writing and clip_filepath is not None and os.path.exists(clip_filepath):
            try:
                os.remove(clip_filepath)
            except OSError:
                pass
        if args.verbose:
            print(f"Error processing {video_id}: {str(err)}")
        return None

if __name__ == "__main__":
    # Command-line entry point: read the clip table, keep the rows whose source
    # video exists, then cut every clip (in parallel unless --debug is given).
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--csv", type=str, required=True,
        help="Path to CSV file containing video IDs and timestamps",
    )
    parser.add_argument(
        "--video_id_key", type=str, default="video_id",
    )
    parser.add_argument(
        "--start_time_key", type=str, default="start_time",
    )
    parser.add_argument(
        "--end_time_key", type=str, default="end_time",
    )
    parser.add_argument(
        "--video_dir", type=str, required=True,
        help="Path to directory containing downloaded videos",
    )
    parser.add_argument(
        "--cut_dir", type=str, required=True,
        help="Path to directory where cut videos will be saved",
    )
    parser.add_argument(
        "--overwrite", action="store_true",
        help="Whether to overwrite existing cut videos",
    )
    parser.add_argument(
        "--verbose", action="store_true",
    )
    parser.add_argument(
        "--no_round_times", action="store_true",
        help="Use the exact start and end times in clip filenames instead of "
             "rounding them to one decimal place",
    )
    parser.add_argument(
        "--debug", action="store_true",
    )
    parser.add_argument(
        "--ext", type=str, default="mp4",
    )
    parser.add_argument(
        "--si", type=int, default=0,
    )
    parser.add_argument(
        "--ei", type=int, default=None,
    )
    parser.add_argument(
        "--filter_csv", type=str, default=None, required=False,
    )
    parser.add_argument(
        "--filter_key", type=str, default=None, required=False,
    )
    parser.add_argument(
        "--max_workers", type=int, default=4,
        help="Number of parallel workers for processing videos",
    )
    args = parser.parse_args()

    # Make cut_dir
    os.makedirs(args.cut_dir, exist_ok=True)

    # Load the CSV. Inputs are validated explicitly because `assert` is
    # stripped when Python runs with -O.
    if not os.path.exists(args.csv):
        parser.error(f"CSV file {args.csv} does not exist.")
    df = pd.read_csv(args.csv)
    print(">>> Loaded CSV file with shape", df.shape)
    missing_columns = {args.video_id_key, args.start_time_key, args.end_time_key} - set(df.columns)
    if missing_columns:
        parser.error(f"CSV file {args.csv} is missing column(s): {sorted(missing_columns)}")

    # Optionally keep only the rows whose `filter_key` value occurs in filter_csv.
    if args.filter_csv is not None:
        path = args.filter_csv
        key = args.filter_key
        if key is None:
            parser.error("--filter_key is required when --filter_csv is given.")
        if not os.path.exists(path):
            parser.error(f"CSV file {path} does not exist.")
        df_filter = pd.read_csv(path)
        if key not in df_filter.columns:
            parser.error(f"CSV file must contain column {key}.")
        if key not in df.columns:
            parser.error(f"CSV file {args.csv} must contain column {key}.")
        keep_values = df_filter[key].unique()
        df = df[df[key].isin(keep_values)]
        print(">>> Filtered CSV file with shape", df.shape)

    # Apply index slicing
    si = args.si
    ei = args.ei if args.ei is not None else len(df)
    df = df.iloc[si:ei]
    print("Start index:", si, "End index:", ei)

    # Resolve each distinct video once, so the filesystem is checked per video
    # rather than per row, and keep only rows whose source video exists.
    print(">>> Adding video paths to dataframe")
    existing_paths = {}
    for video_id in df[args.video_id_key].unique():
        candidate = join(args.video_dir, f"{video_id}.{args.ext}")
        if exists(candidate):
            existing_paths[video_id] = candidate
    # .copy() detaches the filtered frame so adding a column does not trigger
    # pandas' chained-assignment warning.
    df = df[df[args.video_id_key].isin(list(existing_paths))].copy()
    df["video_path"] = df[args.video_id_key].map(existing_paths)
    print(">>> Found videos for", df.shape[0], "rows.")

    if df.empty:
        print(">>> No videos to cut.")
        raise SystemExit(0)

    if args.debug:
        args.verbose = True
        # Process only one video in debug mode
        process_video(df.iloc[0], args)
    else:
        # Process videos in parallel
        num_written = 0
        with ThreadPoolExecutor(max_workers=args.max_workers) as executor:
            futures = [
                executor.submit(process_video, row, args)
                for _, row in df.iterrows()
            ]

            # Show progress bar
            with tqdm(total=len(futures), desc="Cutting clips") as pbar:
                for future in as_completed(futures):
                    # process_video returns the clip path only when it wrote one.
                    if future.result() is not None:
                        num_written += 1
                    pbar.update(1)
        print(
            ">>> Wrote", num_written, "clips;",
            len(futures) - num_written, "skipped or failed.",
        )

    # Count files recursively: clips may live in sub-directories of cut_dir.
    num_cut_files = sum(len(files) for _, _, files in os.walk(args.cut_dir))
    print(">>> Number of cut files:", num_cut_files)