Spaces:
Runtime error
Runtime error
| import json | |
| import os | |
| import traceback | |
| from tqdm import tqdm | |
| from multiprocessing import Pool | |
| ROOT_FROM = 'XXX' # the path of laion-ocr-zip | |
| ROOT_TO = 'XXX' # the path for saving dataset | |
| MULTIPROCESSING_NUM = 64 | |
| DOWNLOAD_IMAGES = False # whether to download images from urls | |
| def unzip_file(idx): | |
| if not os.path.exists(f'{ROOT_FROM}/{idx}.zip') or os.path.exists(f'{ROOT_TO}/{idx}'): | |
| return | |
| cmd = f'unzip -q {ROOT_FROM}/{idx}.zip -d {ROOT_TO}' | |
| os.system(cmd) | |
| def multiprocess_unzip_file(idxs): | |
| os.makedirs(ROOT_TO, exist_ok=True) | |
| with Pool(processes=MULTIPROCESSING_NUM) as p: | |
| with tqdm(total=len(idxs), desc='total') as pbar: | |
| for i, _ in enumerate(p.imap_unordered(unzip_file, idxs)): | |
| pbar.update() | |
| print("multiprocess_unzip_file done!") | |
| if __name__ == '__main__': | |
| files = os.listdir(ROOT_FROM) | |
| idxs = [str(idx[:-4]).zfill(5) for idx in files] | |
| multiprocess_unzip_file(idxs) | |
| print("Finished!") | |