In [None]:
import os
import zipfile
import zlib
import shutil
from pathlib import Path
from PIL import Image
from tqdm import tqdm 

# Longest allowed image side, in pixels. copy_images_with_crc32 re-encodes and
# downscales any image whose larger dimension exceeds this value.
maxsize = 1280

def unzip_files_in_directory(directory):
    """Extract every zip archive found under *directory*, then delete it.

    Each archive ``<name>.zip`` is extracted into a sibling folder ``<name>``
    located next to the archive. The extension match is case-insensitive.

    An archive is deleted only after it was extracted successfully. Archives
    that are corrupted or unreadable are reported on stdout and left on disk,
    so the original data is never lost because of a failed extraction.

    Args:
        directory: Root folder that is walked recursively.
    """
    for root, _, files in os.walk(directory):
        for file in files:
            if not file.lower().endswith('.zip'):
                continue

            zip_path = os.path.join(root, file)
            # Create a new subdirectory for the extracted files
            extract_dir = os.path.join(root, os.path.splitext(file)[0])
            os.makedirs(extract_dir, exist_ok=True)
            try:
                with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                    zip_ref.extractall(extract_dir)
            except (zipfile.BadZipFile, OSError) as e:
                # FileNotFoundError is a subclass of OSError, so it is covered.
                print(f"Skipping corrupted or unreadable file: {zip_path}. Error: {e}")
                # Keep the archive: deleting it here would destroy the only copy.
                continue

            # Extraction succeeded, so the archive is no longer needed.
            os.remove(zip_path)

def list_image_files(directory, image_extensions=('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff')):
    """Collect the paths of all image files below *directory*.

    Args:
        directory: Root folder that is walked recursively.
        image_extensions: Tuple of lowercase suffixes that count as images;
            file names are lowercased before the comparison.

    Returns:
        A list of paths (folder joined with file name) in ``os.walk`` order.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
        if name.lower().endswith(image_extensions)
    ]

def compute_crc32(file_path):
    """Return the CRC32 of a file as an 8-character lowercase hex string.

    The file is read in 4096-byte pieces and the checksum is updated
    incrementally, so the whole file is never held in memory at once.

    Args:
        file_path: Path of the file to checksum.

    Returns:
        The zero-padded hexadecimal CRC32, e.g. ``"cbf43926"``.
    """
    checksum = 0
    with open(file_path, 'rb') as stream:
        while True:
            block = stream.read(4096)
            if not block:
                break
            checksum = zlib.crc32(block, checksum)
    # Mask to 32 bits so the value is always formatted as unsigned.
    return f"{checksum & 0xFFFFFFFF:08x}"


def _flatten_to_rgb(img):
    """Return *img* as an RGB image, compositing transparency onto white."""
    if img.mode == "RGB":
        return img
    if img.mode in ("RGBA", "LA", "P"):
        # Normalise every alpha-capable mode to RGBA first so that "LA" and
        # palette images are composited the same way as "RGBA" ones.
        rgba = img if img.mode == "RGBA" else img.convert("RGBA")
        background = Image.new("RGB", rgba.size, (255, 255, 255))
        background.paste(rgba, mask=rgba.split()[-1])
        return background
    return img.convert("RGB")


def copy_images_with_crc32(image_files, output_directory, base_folder_name):
    """Store images under CRC32-based names in ``output_directory/base_folder_name``.

    For every image the CRC32 of the source file is computed and used in the
    output name ``<base_folder_name>_<crc>``:

    * An image is re-encoded as PNG when its extension is not ``.jpg`` (this
      includes ``.jpeg``), its mode is not RGB, or its longest side exceeds
      the module-level ``maxsize``. Transparency is flattened onto a white
      background and oversized images are downscaled to fit ``maxsize``.
      The source file is left in place.
    * Any other image is moved unchanged, keeping its extension.

    A ``.txt`` file with the same stem as the image, if present, is moved
    next to the output image under the matching name.

    Failures on a single image are printed and that image is skipped; the
    loop continues with the next one.

    Args:
        image_files: Iterable of image paths to process.
        output_directory: Parent folder for the output.
        base_folder_name: Name of the output subfolder and file-name prefix.
    """
    output_path = Path(output_directory) / base_folder_name
    output_path.mkdir(parents=True, exist_ok=True)

    for image_file in tqdm(image_files, desc="Processing images"):
        try:
            path = Path(image_file)
            ext = path.suffix.lower()
            crc = compute_crc32(image_file)

            with Image.open(image_file) as img:
                needs_processing = (
                    ext != ".jpg"
                    or img.mode != "RGB"
                    or max(img.size) > maxsize
                )

                if needs_processing:
                    # Convert to RGB (handle transparency)
                    rgb = _flatten_to_rgb(img)

                    # Resize if too big
                    if max(rgb.size) > maxsize:
                        rgb.thumbnail((maxsize, maxsize), Image.BICUBIC)

                    # Save as PNG
                    out_name = f"{base_folder_name}_{crc}.png"
                    rgb.save(output_path / out_name, "PNG")

            if not needs_processing:
                # Move as-is. This happens after the image handle is closed,
                # because some platforms refuse to move a file that is open.
                out_name = f"{base_folder_name}_{crc}{ext}"
                shutil.move(image_file, output_path / out_name)

            # Move the sidecar .txt if it exists
            txt_path = path.with_suffix(".txt")
            if txt_path.exists():
                shutil.move(txt_path, output_path / Path(out_name).with_suffix(".txt"))

        except Exception as e:
            # Deliberate best-effort: one bad file must not abort the batch.
            print(f"Skipping {image_file}: {e}")

def main(directory, output_directory):
    """Run the whole pipeline for one source folder.

    Extracts the zip archives under *directory*, lists the image files found
    there, and hands them to ``copy_images_with_crc32``. The output subfolder
    and file-name prefix are the last path component of *directory*.

    Args:
        directory: Source folder containing archives and/or images.
        output_directory: Parent folder that receives the output subfolder.
    """
    # Drop trailing forward slashes, then trailing backslashes, so basename
    # yields the folder name rather than an empty string.
    trimmed = directory.rstrip('/')
    trimmed = trimmed.rstrip('\\')
    dataset_name = os.path.basename(trimmed)

    # Step 1: unpack every archive in place.
    unzip_files_in_directory(directory)
    print('unziped')

    # Step 2: gather the images, including freshly extracted ones.
    found_images = list_image_files(directory)
    image_count = len(found_images)
    print(f"Total number of images: {image_count}")

    # Step 3: write them out under CRC32-based names.
    print('copy from', dataset_name)
    copy_images_with_crc32(found_images, output_directory, dataset_name)
    print('ok')

# Example usage
# NOTE: these statements run as soon as the cell/module is executed. main()
# extracts and deletes the .zip archives under directory_path and writes the
# images to output_directory_path/<last component of directory_path>.
directory_path = '/workspace/ds/eshooshoo2019'
output_directory_path = '/workspace/ds/2015-19'
main(directory_path, output_directory_path)

unziped
Total number of images: 30340
copy from eshooshoo2019


Processing images: 1%| | 320/30340 [01:38<2:31:15, 3.31it/s]