{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "574d024a-3ffc-40d7-98f4-e74744e65435",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "unziped\n",
      "Total number of images: 30340\n",
      "copy from eshooshoo2019\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Processing images: 1%| | 320/30340 [01:38<2:31:15, 3.31it/s]"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import zipfile\n",
    "import zlib\n",
    "import shutil\n",
    "from pathlib import Path\n",
    "from PIL import Image\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Longest allowed image side in pixels; larger images are downscaled.\n",
    "maxsize = 1280\n",
    "\n",
    "\n",
    "def unzip_files_in_directory(directory, remove_after=True):\n",
    "    \"\"\"Extract every .zip archive found under `directory`.\n",
    "\n",
    "    Each archive is unpacked into a folder next to it, named after the\n",
    "    archive (`foo.zip` -> `foo/`).\n",
    "\n",
    "    Args:\n",
    "        directory: Root folder, walked recursively.\n",
    "        remove_after: Delete an archive after it was extracted successfully.\n",
    "            An archive that fails to extract is never deleted.\n",
    "    \"\"\"\n",
    "    for root, _, files in os.walk(directory):\n",
    "        for file in files:\n",
    "            if not file.lower().endswith('.zip'):\n",
    "                continue\n",
    "            zip_path = os.path.join(root, file)\n",
    "            # Create a new subdirectory for the extracted files\n",
    "            extract_dir = os.path.join(root, os.path.splitext(file)[0])\n",
    "            try:\n",
    "                os.makedirs(extract_dir, exist_ok=True)\n",
    "                with zipfile.ZipFile(zip_path, 'r') as zip_ref:\n",
    "                    zip_ref.extractall(extract_dir)\n",
    "            except (zipfile.BadZipFile, OSError) as e:\n",
    "                print(f\"Skipping corrupted or unreadable file: {zip_path}. Error: {e}\")\n",
    "                # Keep the archive: deleting it here could destroy the only copy.\n",
    "                continue\n",
    "            if remove_after:\n",
    "                os.remove(zip_path)\n",
    "\n",
    "\n",
    "def list_image_files(directory, image_extensions=('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff')):\n",
    "    \"\"\"Return the paths of all image files found recursively under `directory`.\n",
    "\n",
    "    Args:\n",
    "        directory: Root folder, walked recursively.\n",
    "        image_extensions: Tuple of lowercase suffixes that count as images;\n",
    "            file names are lowercased before the comparison.\n",
    "\n",
    "    Returns:\n",
    "        A list of paths, each joined from the walked folder and the file name.\n",
    "    \"\"\"\n",
    "    return [\n",
    "        os.path.join(root, file)\n",
    "        for root, _, files in os.walk(directory)\n",
    "        for file in files\n",
    "        if file.lower().endswith(image_extensions)\n",
    "    ]\n",
    "\n",
    "\n",
    "def compute_crc32(file_path, chunk_size=1 << 16):\n",
    "    \"\"\"Return the CRC32 of a file as an 8-digit lowercase hex string.\n",
    "\n",
    "    The file is read in chunks of `chunk_size` bytes, so it is never held\n",
    "    in memory as a whole.\n",
    "    \"\"\"\n",
    "    crc_value = 0\n",
    "    with open(file_path, 'rb') as file:\n",
    "        for chunk in iter(lambda: file.read(chunk_size), b''):\n",
    "            crc_value = zlib.crc32(chunk, crc_value)\n",
    "    return format(crc_value & 0xFFFFFFFF, '08x')\n",
    "\n",
    "\n",
    "def _flatten_to_rgb(img):\n",
    "    \"\"\"Return an RGB version of `img`, compositing transparency onto white.\"\"\"\n",
    "    if img.mode in (\"LA\", \"P\"):\n",
    "        # Go through RGBA so that an LA alpha channel or a palette\n",
    "        # transparency entry is available for the compositing step below.\n",
    "        img = img.convert(\"RGBA\")\n",
    "    if img.mode == \"RGBA\":\n",
    "        bg = Image.new(\"RGB\", img.size, (255, 255, 255))\n",
    "        bg.paste(img, mask=img.getchannel(\"A\"))\n",
    "        return bg\n",
    "    if img.mode != \"RGB\":\n",
    "        return img.convert(\"RGB\")\n",
    "    return img\n",
    "\n",
    "\n",
    "def copy_images_with_crc32(image_files, output_directory, base_folder_name, max_side=None):\n",
    "    \"\"\"Collect images into `output_directory/base_folder_name` under CRC32-based names.\n",
    "\n",
    "    An image that is already an RGB `.jpg` no larger than `max_side` is moved\n",
    "    unchanged.  Every other image is flattened to RGB (transparency is\n",
    "    composited onto white), downscaled if needed and written as PNG; its\n",
    "    source file is left where it is.  A `.txt` file with the same stem as the\n",
    "    image is moved along and renamed to match.\n",
    "\n",
    "    Output names are derived from the CRC32 of the file content, so files\n",
    "    with the same checksum can map to the same name; the later one then\n",
    "    replaces the earlier one.  An image that fails is reported and skipped.\n",
    "\n",
    "    Args:\n",
    "        image_files: Paths of the images to process.\n",
    "        output_directory: Parent folder of the destination folder.\n",
    "        base_folder_name: Destination folder name and output file name prefix.\n",
    "        max_side: Longest allowed side in pixels; defaults to the module-level\n",
    "            `maxsize`.\n",
    "    \"\"\"\n",
    "    if max_side is None:\n",
    "        max_side = maxsize\n",
    "    output_path = Path(output_directory) / base_folder_name\n",
    "    output_path.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "    for image_file in tqdm(image_files, desc=\"Processing images\"):\n",
    "        try:\n",
    "            path = Path(image_file)\n",
    "            ext = path.suffix.lower()\n",
    "            crc = compute_crc32(image_file)\n",
    "\n",
    "            with Image.open(image_file) as img:\n",
    "                needs_processing = (\n",
    "                    ext != \".jpg\"\n",
    "                    or img.mode != \"RGB\"\n",
    "                    or max(img.size) > max_side\n",
    "                )\n",
    "                if needs_processing:\n",
    "                    # Convert to RGB (handle transparency)\n",
    "                    rgb = _flatten_to_rgb(img)\n",
    "\n",
    "                    # Resize if too big\n",
    "                    if max(rgb.size) > max_side:\n",
    "                        rgb.thumbnail((max_side, max_side), Image.BICUBIC)\n",
    "\n",
    "                    # Save as PNG\n",
    "                    out_name = f\"{base_folder_name}_{crc}.png\"\n",
    "                    rgb.save(output_path / out_name, \"PNG\")\n",
    "\n",
    "            if not needs_processing:\n",
    "                # Move as-is, only after the image file handle has been closed\n",
    "                out_name = f\"{base_folder_name}_{crc}{ext}\"\n",
    "                shutil.move(image_file, output_path / out_name)\n",
    "\n",
    "            # Move the .txt file that sits next to the image, if there is one\n",
    "            txt_path = path.with_suffix(\".txt\")\n",
    "            if txt_path.exists():\n",
    "                shutil.move(txt_path, output_path / Path(out_name).with_suffix(\".txt\"))\n",
    "\n",
    "        except Exception as e:\n",
    "            # tqdm.write keeps the message from garbling the progress bar\n",
    "            tqdm.write(f\"Skipping {image_file}: {e}\")\n",
    "\n",
    "\n",
    "def main(directory, output_directory):\n",
    "    \"\"\"Unzip the archives under `directory`, then collect its images.\n",
    "\n",
    "    The images end up in `output_directory/<last component of directory>`.\n",
    "\n",
    "    Raises:\n",
    "        NotADirectoryError: If `directory` is not an existing folder.\n",
    "    \"\"\"\n",
    "    if not os.path.isdir(directory):\n",
    "        # os.walk yields nothing for a missing path, which would make a\n",
    "        # mistyped path look like a successful run over zero images.\n",
    "        raise NotADirectoryError(f\"Input directory not found: {directory}\")\n",
    "\n",
    "    base_folder_name = os.path.basename(directory.rstrip('/\\\\'))\n",
    "\n",
    "    # Unzip all zip files in the directory\n",
    "    unzip_files_in_directory(directory)\n",
    "    print('unziped')\n",
    "\n",
    "    # List all image files\n",
    "    image_files = list_image_files(directory)\n",
    "    print(f\"Total number of images: {len(image_files)}\")\n",
    "\n",
    "    # Move or convert the images, naming them by their CRC32 hash\n",
    "    print('copy from',base_folder_name)\n",
    "    copy_images_with_crc32(image_files, output_directory, base_folder_name)\n",
    "    print('ok')\n",
    "\n",
    "\n",
    "# Example usage\n",
    "directory_path = '/workspace/ds/eshooshoo2019'\n",
    "output_directory_path = '/workspace/ds/2015-19'\n",
    "main(directory_path, output_directory_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e047aa2a-f7cc-4332-91cf-17c373f61ea2",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}