Safetensors
English
llava
video-retrieval
text-to-video-search
multimodal-embedding
TARA / shared /scripts /check_cut_files.py
bpiyush's picture
Update TARA to latest Tarsier2 checkpoint and runnable demo.
7daf628
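"""Checks cut files.

Decodes one frame of every cut clip with decord so that files which fail
to load can be found and removed.
"""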
import os
import sys
from glob import glob
from tqdm import tqdm
from joblib import Parallel, delayed
import decord
import numpy as np
import pandas as pd
def check_file(f):
    """Try to open video ``f`` with decord and decode its last frame.

    Args:
        f: Path of the video file to check.

    Returns:
        ``None`` if the frame was decoded, otherwise ``f`` itself so that the
        caller can collect the paths that failed.
    """
    try:
        vr = decord.VideoReader(f, ctx=decord.cpu(), num_threads=1)
        num_frames = len(vr)
        if num_frames == 0:
            # A clip without frames cannot be decoded; report it as failed.
            return f
        # NOTE(review): the last frame is presumably used because a truncated
        # clip breaks at its end -- confirm this is the intended check.
        vr.get_batch([num_frames - 1]).asnumpy()
    except Exception:
        # Deliberately broad: any error while opening or decoding the file
        # means the clip is unusable.
        return f
    return None


if __name__ == "__main__":
    video_dir = "/work/piyush/from_nfs2/datasets/EPIC-Kitchens-100/cut_clips"
    files = glob(os.path.join(video_dir, "*/*/*.MP4"))
    print("Total files:", len(files))

    # Set to False to check the files one by one in this process.
    parallel = True
    iterator = tqdm(files, desc="Checking files")
    if parallel:
        status = Parallel(n_jobs=24)(delayed(check_file)(f) for f in iterator)
    else:
        status = [check_file(f) for f in iterator]

    failed = [f for f in status if f is not None]
    print("Number of files on which loading failed:", len(failed))

    if failed:
        # Show exactly what would be deleted, then require explicit consent:
        # the removal below cannot be undone.
        for f in failed:
            print(f)
        try:
            answer = input("Delete these {} files? [y/N] ".format(len(failed)))
        except EOFError:
            # No interactive stdin: keep the files.
            answer = ""
        if answer.strip().lower() in ("y", "yes"):
            for f in failed:
                os.remove(f)