""" Faster clip cutting script generated by Claude. S=/datasets/EpicKitchens-100/ D=/work/piyush/from_nfs2/datasets/EPIC-Kitchens-100/cut_clips csv=$D/../epic-kitchens-100-annotations/EPIC_100_train_with_id.csv python shared/scripts/cut_clips_fast.py --csv $csv --video_id_key path_id --start_time_key start_sec --end_time_key stop_sec --video_dir $S/ --cut_dir $D/ --ext MP4 --max_workers 4 """ import os from os.path import join, exists import time from concurrent.futures import ThreadPoolExecutor, as_completed import numpy as np import pandas as pd from tqdm import tqdm from moviepy.editor import VideoFileClip from moviepy.video.fx.resize import resize def time_float_to_str(time_in_seconds): import datetime hours, remainder = divmod(time_in_seconds, 3600) minutes, seconds_with_ms = divmod(remainder, 60) seconds, milliseconds = divmod(int(seconds_with_ms * 1000), 1000) time_delta = datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds, milliseconds=milliseconds) return str(time_delta) def process_video(row, args): """Process a single video clip""" try: f = row["video_path"] v, s, e = row[args.video_id_key], float(row[args.start_time_key]), float(row[args.end_time_key]) if args.no_round_times: clip_filename = f"{v}_{s}_{e}.{args.ext}" else: clip_filename = f"{v}_{np.round(s, 1)}_{np.round(e, 1)}.{args.ext}" clip_filepath = join(args.cut_dir, clip_filename) os.makedirs(os.path.dirname(clip_filepath), exist_ok=True) if os.path.exists(clip_filepath) and not args.overwrite: return None # Load video and extract clip with VideoFileClip(f) as video: # Calculate target width maintaining aspect ratio with max height 480 aspect_ratio = video.w / video.h target_height = 480 target_width = int(target_height * aspect_ratio) # Extract and resize clip clip = video.subclip(s, e) clip = clip.resize(width=target_width, height=target_height) # Write clip with optimized settings clip.write_videofile( clip_filepath, codec='libx264', audio_codec='aac', preset='faster', # Faster encoding threads=2, # Use multiple threads for encoding logger=None if not args.verbose else None ) return clip_filepath except Exception as e: if args.verbose: print(f"Error processing {row[args.video_id_key]}: {str(e)}") return None if __name__ == "__main__": import argparse parser = argparse.ArgumentParser() parser.add_argument( "--csv", type=str, required=True, help="Path to CSV file containing video IDs and timestamps", ) parser.add_argument( "--video_id_key", type=str, default="video_id", ) parser.add_argument( "--start_time_key", type=str, default="start_time", ) parser.add_argument( "--end_time_key", type=str, default="end_time", ) parser.add_argument( "--video_dir", type=str, required=True, help="Path to directory containing downloaded videos", ) parser.add_argument( "--cut_dir", type=str, required=True, help="Path to directory where cut videos will be saved", ) parser.add_argument( "--overwrite", action="store_true", help="Whether to overwrite existing cut videos", ) parser.add_argument( "--verbose", action="store_true", ) parser.add_argument( "--no_round_times", action="store_true", help="Whether to round start and end times to nearest second in filenames", ) parser.add_argument( "--debug", action="store_true", ) parser.add_argument( "--ext", type=str, default="mp4", ) parser.add_argument( "--si", type=int, default=0, ) parser.add_argument( "--ei", type=int, default=None, ) parser.add_argument( "--filter_csv", type=str, default=None, required=False, ) parser.add_argument( "--filter_key", type=str, default=None, required=False, ) parser.add_argument( "--max_workers", type=int, default=4, help="Number of parallel workers for processing videos", ) args = parser.parse_args() # Make cut_dir os.makedirs(args.cut_dir, exist_ok=True) # Load and filter CSV assert os.path.exists(args.csv), f"CSV file {args.csv} does not exist." df = pd.read_csv(args.csv) print(">>> Loaded CSV file with shape", df.shape) assert {args.video_id_key, args.start_time_key, args.end_time_key}.issubset(df.columns) # Filter CSV if needed if args.filter_csv is not None: path = args.filter_csv assert os.path.exists(path), f"CSV file {path} does not exist." key = args.filter_key df_filter = pd.read_csv(path) assert key in df_filter.columns, f"CSV file must contain column {key}." keep_values = df_filter[key].unique() df = df[df[key].isin(keep_values)] print(">>> Filtered CSV file with shape", df.shape) # Apply index slicing si = args.si ei = args.ei if args.ei is not None else len(df) df = df.iloc[si:ei] print("Start index:", si, "End index:", ei) # More efficient way to add video path print(">>> Adding video paths to dataframe") video_ids = df[args.video_id_key].unique() video_paths = [join(args.video_dir, f"{video_id}.{args.ext}") for video_id in video_ids] video_id_to_path = {video_id: path for video_id, path in zip(video_ids, video_paths)} df["video_path"] = df[args.video_id_key].map(video_id_to_path) # df = df[df["video_path"].apply(exists)] df['check_video'] = df['video_path'].apply(exists) df = df[df['check_video']] del df['check_video'] print(">>> Found videos for", df.shape[0], "rows.") # # Filter out videos that don't exist # df["video_path"] = df[args.video_id_key].apply( # lambda video_id: join(args.video_dir, f"{video_id}.{args.ext}"), # ) # df["check_video"] = df["video_path"].apply(exists) # df = df[df["check_video"]] # del df["check_video"] # print(">>> Found videos for", df.shape[0], "rows.") if len(df) == 0: print(">>> No videos to cut.") exit() if args.debug: args.verbose = True # Process only one video in debug mode process_video(df.iloc[0], args) else: # Process videos in parallel with ThreadPoolExecutor(max_workers=args.max_workers) as executor: futures = [executor.submit(process_video, row, args) for _, row in df.iterrows()] # Show progress bar with tqdm(total=len(futures), desc="Cutting clips") as pbar: for future in as_completed(futures): result = future.result() pbar.update(1) print(">>> Number of cut files:", len(os.listdir(args.cut_dir)))