File size: 6,413 Bytes
3a840b2 bcd912d 3a840b2 bcd912d 3a840b2 41b440f 3a840b2 41b440f 3a840b2 41b440f 3a840b2 41b440f 3a840b2 41b440f 3a840b2 41b440f bcd912d 3a840b2 41b440f bcd912d 41b440f bcd912d 41b440f 3a840b2 41b440f 3a840b2 bcd912d 3a840b2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 |
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "574d024a-3ffc-40d7-98f4-e74744e65435",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"unziped\n",
"Total number of images: 30340\n",
"copy from eshooshoo2019\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processing images: 1%| | 320/30340 [01:38<2:31:15, 3.31it/s]"
]
}
],
"source": [
"import os\n",
"import zipfile\n",
"import zlib\n",
"import shutil\n",
"from pathlib import Path\n",
"from PIL import Image\n",
"from tqdm import tqdm \n",
"\n",
"maxsize = 1280\n",
"\n",
"def unzip_files_in_directory(directory):\n",
"    \"\"\"Extract every zip archive found under ``directory`` next to itself.\n",
"\n",
"    An archive ``name.zip`` is unpacked into a sibling folder ``name`` and is\n",
"    deleted only after the extraction succeeded. Archives that cannot be read\n",
"    are reported on stdout and left on disk.\n",
"\n",
"    Args:\n",
"        directory: Root folder, walked recursively with ``os.walk``.\n",
"    \"\"\"\n",
"    for root, _, files in os.walk(directory):\n",
"        for file in files:\n",
"            # Case-insensitive match, consistent with list_image_files.\n",
"            if not file.lower().endswith('.zip'):\n",
"                continue\n",
"            zip_path = os.path.join(root, file)\n",
"            # Create a new subdirectory for the extracted files\n",
"            extract_dir = os.path.join(root, os.path.splitext(file)[0])\n",
"            os.makedirs(extract_dir, exist_ok=True)\n",
"            try:\n",
"                with zipfile.ZipFile(zip_path, 'r') as zip_ref:\n",
"                    zip_ref.extractall(extract_dir)\n",
"            except (zipfile.BadZipFile, FileNotFoundError, OSError) as e:\n",
"                print(f\"Skipping corrupted or unreadable file: {zip_path}. Error: {e}\")\n",
"            else:\n",
"                # Remove the archive only after a successful extraction; removing\n",
"                # it after a failure would discard data that was never unpacked.\n",
"                os.remove(zip_path)\n",
"\n",
"def list_image_files(directory, image_extensions=('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff')):\n",
"    \"\"\"Return the paths of all image files found below ``directory``.\n",
"\n",
"    Args:\n",
"        directory: Root folder, walked recursively with ``os.walk``.\n",
"        image_extensions: Tuple of lowercase suffixes that count as images;\n",
"            file names are lowercased before the comparison.\n",
"\n",
"    Returns:\n",
"        List of paths (folder joined with file name) in walk order.\n",
"    \"\"\"\n",
"    return [\n",
"        os.path.join(folder, name)\n",
"        for folder, _, names in os.walk(directory)\n",
"        for name in names\n",
"        if name.lower().endswith(image_extensions)\n",
"    ]\n",
"\n",
"def compute_crc32(file_path):\n",
"    \"\"\"Return the CRC32 checksum of a file as eight lowercase hex digits.\n",
"\n",
"    The file is read in 4096-byte blocks and the checksum is updated\n",
"    incrementally, so the whole file is never held in memory at once.\n",
"\n",
"    Args:\n",
"        file_path: Path of the file to checksum.\n",
"\n",
"    Returns:\n",
"        Zero-padded hexadecimal string such as ``'0d4a1185'``.\n",
"    \"\"\"\n",
"    checksum = 0\n",
"    with open(file_path, 'rb') as stream:\n",
"        while True:\n",
"            block = stream.read(4096)\n",
"            if not block:\n",
"                break\n",
"            checksum = zlib.crc32(block, checksum)\n",
"    return f\"{checksum & 0xFFFFFFFF:08x}\"\n",
"\n",
"\n",
"def _flatten_to_rgb(img):\n",
"    \"\"\"Return ``img`` as an RGB image, compositing transparency onto white.\"\"\"\n",
"    if img.mode in (\"RGBA\", \"LA\", \"P\"):\n",
"        # Normalise LA and P to RGBA so every one of these modes is pasted\n",
"        # through its alpha band. Before this, only P was converted, so LA\n",
"        # images skipped the paste and came out as a blank white canvas.\n",
"        rgba = img if img.mode == \"RGBA\" else img.convert(\"RGBA\")\n",
"        bg = Image.new(\"RGB\", rgba.size, (255, 255, 255))\n",
"        bg.paste(rgba, mask=rgba.split()[-1])\n",
"        return bg\n",
"    if img.mode != \"RGB\":\n",
"        return img.convert(\"RGB\")\n",
"    return img\n",
"\n",
"\n",
"def copy_images_with_crc32(image_files, output_directory, base_folder_name):\n",
"    \"\"\"Normalise images into ``output_directory/base_folder_name``, named by CRC32.\n",
"\n",
"    Every image is stored as ``<base_folder_name>_<crc32><ext>``. Files that\n",
"    are already RGB ``.jpg`` and no larger than the module-level ``maxsize``\n",
"    are moved unchanged. Everything else is flattened onto a white\n",
"    background, converted to RGB, shrunk to fit ``maxsize`` and written as\n",
"    PNG; in that case the source image is left where it was. A ``.txt`` file\n",
"    with the same stem as the image, if present, is moved next to the result\n",
"    under the matching new name.\n",
"\n",
"    A failure on one image is printed and does not stop the batch.\n",
"\n",
"    Args:\n",
"        image_files: Iterable of image paths to process.\n",
"        output_directory: Parent folder of the destination folder.\n",
"        base_folder_name: Destination sub-folder name and file-name prefix.\n",
"    \"\"\"\n",
"    output_path = Path(output_directory) / base_folder_name\n",
"    output_path.mkdir(parents=True, exist_ok=True)\n",
"\n",
"    for image_file in tqdm(image_files, desc=\"Processing images\"):\n",
"        try:\n",
"            path = Path(image_file)\n",
"            ext = path.suffix.lower()\n",
"            crc = compute_crc32(image_file)\n",
"\n",
"            with Image.open(image_file) as img:\n",
"                # NOTE(review): \".jpeg\" files always take the re-encode path\n",
"                # because only \".jpg\" is accepted as-is - confirm this is intended.\n",
"                needs_processing = (\n",
"                    ext != \".jpg\"\n",
"                    or img.mode != \"RGB\"\n",
"                    or max(img.size) > maxsize\n",
"                )\n",
"\n",
"                if needs_processing:\n",
"                    rgb = _flatten_to_rgb(img)\n",
"\n",
"                    # Resize if too big (thumbnail keeps the aspect ratio)\n",
"                    if max(rgb.size) > maxsize:\n",
"                        rgb.thumbnail((maxsize, maxsize), Image.BICUBIC)\n",
"\n",
"                    # Save as PNG\n",
"                    out_name = f\"{base_folder_name}_{crc}.png\"\n",
"                    rgb.save(output_path / out_name, \"PNG\")\n",
"\n",
"            if not needs_processing:\n",
"                # Move as-is, after the with-block so the image handle is\n",
"                # already closed when the file is relocated.\n",
"                out_name = f\"{base_folder_name}_{crc}{ext}\"\n",
"                shutil.move(image_file, output_path / out_name)\n",
"\n",
"            # Move the sibling .txt file if it exists\n",
"            txt_path = path.with_suffix(\".txt\")\n",
"            if txt_path.exists():\n",
"                shutil.move(txt_path, output_path / Path(out_name).with_suffix(\".txt\"))\n",
"\n",
"        except Exception as e:\n",
"            # Deliberate best-effort: report the file and continue with the rest.\n",
"            print(f\"Skipping {image_file}: {e}\")\n",
"\n",
"def main(directory, output_directory):\n",
"    \"\"\"Unzip, inventory and convert one dataset folder.\n",
"\n",
"    Args:\n",
"        directory: Source folder; its last path component becomes the output\n",
"            sub-folder name and the file-name prefix.\n",
"        output_directory: Parent folder that receives the converted images.\n",
"    \"\"\"\n",
"    # Drop trailing separators so basename() returns the folder name itself\n",
"    trimmed = directory.rstrip('/').rstrip('\\\\')\n",
"    dataset_name = os.path.basename(trimmed)\n",
"\n",
"    # Step 1: expand any archives in place\n",
"    unzip_files_in_directory(directory)\n",
"    print('unziped')\n",
"\n",
"    # Step 2: gather every image below the source folder\n",
"    found_images = list_image_files(directory)\n",
"    print(f\"Total number of images: {len(found_images)}\")\n",
"\n",
"    # Step 3: write the images out under CRC32-based names\n",
"    print('copy from', dataset_name)\n",
"    copy_images_with_crc32(found_images, output_directory, dataset_name)\n",
"    print('ok')\n",
"\n",
"# Example usage: convert one dataset folder into the shared output tree\n",
"directory_path = '/workspace/ds/eshooshoo2019'\n",
"output_directory_path = '/workspace/ds/2015-19'\n",
"main(directory=directory_path, output_directory=output_directory_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e047aa2a-f7cc-4332-91cf-17c373f61ea2",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
|