| from scipy.misc import imsave | |
| import dlib | |
| import os | |
| import glob | |
| import numpy as np | |
| import cv2 | |
| from multiprocessing import Pool | |
| import pdb | |
| from torch.utils.data import DataLoader, Dataset | |
| import time | |
class MyDataset(Dataset):
    """Dataset whose item access converts one listed media file with ffmpeg.

    The file list is read from ``GRID_files.txt`` (one path per line).
    Fetching item ``idx`` extracts JPEG frames (25 fps) and a mono 16 kHz
    PCM wav from ``self.files[idx]`` and returns the frame directory, so
    iterating a ``DataLoader`` over this dataset performs the conversion
    in its worker processes.
    """

    def __init__(self):
        """Load the file list and create the wav output directories."""
        self.IN = 'GRID/'        # path fragment identifying the input tree
        self.OUT = 'GRID_imgs/'  # replaces IN in frame output paths
        self.wav = 'GRID_wavs/'  # replaces IN in wav output paths

        with open('GRID_files.txt', 'r') as f:
            # Blank lines are skipped: an empty path would otherwise reach
            # os.makedirs('') below and raise.
            files = [line.strip() for line in f if line.strip()]

        self.files = []
        for file in files:
            _, ext = os.path.splitext(file)
            if ext == '.XML':
                continue
            self.files.append(file)
            print(file)
            wav_dir = os.path.dirname(self._wav_path(file))
            if wav_dir:
                # exist_ok avoids the check-then-create race.
                os.makedirs(wav_dir, exist_ok=True)

    def __len__(self):
        """Return the number of files that will be converted."""
        return len(self.files)

    def __getitem__(self, idx):
        """Convert ``self.files[idx]`` and return its frame directory.

        ffmpeg failures are reported on stderr but never raised, so one bad
        file does not abort the whole run.
        """
        file = self.files[idx]

        dst = self._frames_dir(file)
        os.makedirs(dst, exist_ok=True)
        self._run_ffmpeg([
            '-i', file, '-qscale:v', '2', '-r', '25',
            '{}/%d.jpg'.format(dst),
        ])

        wav = self._wav_path(file)
        self._run_ffmpeg([
            '-y', '-i', file, '-async', '1', '-ac', '1', '-vn',
            '-acodec', 'pcm_s16le', '-ar', '16000', wav,
        ])
        return dst

    def _frames_dir(self, file):
        """Return the frame output directory for *file* (extension removed)."""
        # Strip only the trailing extension; str.replace(ext, '') would also
        # rewrite any earlier occurrence of the same text in the path.
        root, _ = os.path.splitext(file)
        return root.replace(self.IN, self.OUT)

    def _wav_path(self, file):
        """Return the wav output path for *file*."""
        # Built from the splitext root so a file without an extension still
        # yields '<root>.wav' ('.replace("", ".wav")' would insert '.wav'
        # between every character).
        root, _ = os.path.splitext(file)
        return root.replace(self.IN, self.wav) + '.wav'

    @staticmethod
    def _run_ffmpeg(args):
        """Run ``ffmpeg`` with *args* without a shell.

        Returns the exit status, or ``None`` if ffmpeg could not be started.
        """
        import subprocess
        import sys

        cmd = ['ffmpeg'] + list(args)
        try:
            # An argument list needs no quoting, so any file name is safe.
            # stdin is detached so ffmpeg can never wait on interactive
            # input inside a worker process.
            status = subprocess.run(cmd, stdin=subprocess.DEVNULL).returncode
        except OSError as err:
            print('ffmpeg could not be started: {}'.format(err),
                  file=sys.stderr)
            return None
        if status != 0:
            print('ffmpeg exited with status {}: {}'.format(
                status, ' '.join(cmd)), file=sys.stderr)
        return status
if __name__ == '__main__':
    # Iterating the loader is what does the work: each item fetch runs the
    # dataset's __getitem__ in one of the worker processes. The batches
    # themselves are not used.
    dataset = MyDataset()
    loader = DataLoader(
        dataset, num_workers=32, batch_size=128,
        shuffle=False, drop_last=False,
    )
    total = len(loader)
    tic = time.time()
    for i, _batch in enumerate(loader):
        # Batch i has just finished, so i + 1 batches are done; the
        # remaining count must exclude it (the ETA reaches 0 on the last one).
        done = i + 1
        elapsed = time.time() - tic
        eta = elapsed / done * (total - done)
        # ETA is printed in hours.
        print('eta:{}'.format(eta / 3600.0))