Commit
·
fe7005f
1
Parent(s):
c7411c1
update the datloading.ipynb file
Browse files- dataloading.ipynb +130 -926
dataloading.ipynb
CHANGED
|
@@ -2,123 +2,37 @@
|
|
| 2 |
"cells": [
|
| 3 |
{
|
| 4 |
"cell_type": "code",
|
| 5 |
-
"execution_count":
|
| 6 |
"id": "ae9bc87a",
|
| 7 |
"metadata": {},
|
| 8 |
"outputs": [],
|
| 9 |
"source": [
|
| 10 |
"from datasets import load_dataset\n",
|
| 11 |
-
"import datasets"
|
|
|
|
| 12 |
]
|
| 13 |
},
|
| 14 |
{
|
| 15 |
"cell_type": "code",
|
| 16 |
-
"execution_count":
|
| 17 |
-
"id": "
|
| 18 |
"metadata": {},
|
| 19 |
-
"outputs": [
|
| 20 |
-
{
|
| 21 |
-
"data": {
|
| 22 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 23 |
-
"model_id": "0ccb4dc0c6bf4c8f89a0be03b742598f",
|
| 24 |
-
"version_major": 2,
|
| 25 |
-
"version_minor": 0
|
| 26 |
-
},
|
| 27 |
-
"text/plain": [
|
| 28 |
-
"Resolving data files: 0%| | 0/119 [00:00<?, ?it/s]"
|
| 29 |
-
]
|
| 30 |
-
},
|
| 31 |
-
"metadata": {},
|
| 32 |
-
"output_type": "display_data"
|
| 33 |
-
},
|
| 34 |
-
{
|
| 35 |
-
"data": {
|
| 36 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 37 |
-
"model_id": "27ddf8817cfb45d1b4da85d0e14e6ad4",
|
| 38 |
-
"version_major": 2,
|
| 39 |
-
"version_minor": 0
|
| 40 |
-
},
|
| 41 |
-
"text/plain": [
|
| 42 |
-
"Loading dataset shards: 0%| | 0/64 [00:00<?, ?it/s]"
|
| 43 |
-
]
|
| 44 |
-
},
|
| 45 |
-
"metadata": {},
|
| 46 |
-
"output_type": "display_data"
|
| 47 |
-
}
|
| 48 |
-
],
|
| 49 |
"source": [
|
| 50 |
"ds = load_dataset(\"chainyo/rvl-cdip\")"
|
| 51 |
]
|
| 52 |
},
|
| 53 |
{
|
| 54 |
-
"cell_type": "
|
| 55 |
-
"
|
| 56 |
-
"id": "f5aa7605",
|
| 57 |
"metadata": {},
|
| 58 |
-
"outputs": [
|
| 59 |
-
{
|
| 60 |
-
"data": {
|
| 61 |
-
"text/plain": [
|
| 62 |
-
"DatasetDict({\n",
|
| 63 |
-
" train: Dataset({\n",
|
| 64 |
-
" features: ['image', 'label'],\n",
|
| 65 |
-
" num_rows: 319999\n",
|
| 66 |
-
" })\n",
|
| 67 |
-
" test: Dataset({\n",
|
| 68 |
-
" features: ['image', 'label'],\n",
|
| 69 |
-
" num_rows: 40000\n",
|
| 70 |
-
" })\n",
|
| 71 |
-
" val: Dataset({\n",
|
| 72 |
-
" features: ['image', 'label'],\n",
|
| 73 |
-
" num_rows: 40000\n",
|
| 74 |
-
" })\n",
|
| 75 |
-
"})"
|
| 76 |
-
]
|
| 77 |
-
},
|
| 78 |
-
"execution_count": 6,
|
| 79 |
-
"metadata": {},
|
| 80 |
-
"output_type": "execute_result"
|
| 81 |
-
}
|
| 82 |
-
],
|
| 83 |
"source": [
|
| 84 |
-
"
|
| 85 |
]
|
| 86 |
},
|
| 87 |
{
|
| 88 |
"cell_type": "code",
|
| 89 |
-
"execution_count":
|
| 90 |
-
"id": "9b19f0c5",
|
| 91 |
-
"metadata": {},
|
| 92 |
-
"outputs": [
|
| 93 |
-
{
|
| 94 |
-
"name": "stdout",
|
| 95 |
-
"output_type": "stream",
|
| 96 |
-
"text": [
|
| 97 |
-
"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]\n",
|
| 98 |
-
"16\n",
|
| 99 |
-
"ClassLabel(names=['advertisement', 'budget', 'email', 'file folder', 'form', 'handwritten', 'invoice', 'letter', 'memo', 'news article', 'presentation', 'questionnaire', 'resume', 'scientific publication', 'scientific report', 'specification'])\n",
|
| 100 |
-
"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]\n",
|
| 101 |
-
"16\n",
|
| 102 |
-
"ClassLabel(names=['advertisement', 'budget', 'email', 'file folder', 'form', 'handwritten', 'invoice', 'letter', 'memo', 'news article', 'presentation', 'questionnaire', 'resume', 'scientific publication', 'scientific report', 'specification'])\n",
|
| 103 |
-
"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]\n",
|
| 104 |
-
"16\n",
|
| 105 |
-
"ClassLabel(names=['advertisement', 'budget', 'email', 'file folder', 'form', 'handwritten', 'invoice', 'letter', 'memo', 'news article', 'presentation', 'questionnaire', 'resume', 'scientific publication', 'scientific report', 'specification'])\n"
|
| 106 |
-
]
|
| 107 |
-
}
|
| 108 |
-
],
|
| 109 |
-
"source": [
|
| 110 |
-
"for split in ds.keys():\n",
|
| 111 |
-
" unique_labels = ds[split].unique('label')\n",
|
| 112 |
-
" num_labels = len(unique_labels)\n",
|
| 113 |
-
" class_names = ds[split].features['label']\n",
|
| 114 |
-
" print(unique_labels)\n",
|
| 115 |
-
" print(num_labels)\n",
|
| 116 |
-
" print(class_names)"
|
| 117 |
-
]
|
| 118 |
-
},
|
| 119 |
-
{
|
| 120 |
-
"cell_type": "code",
|
| 121 |
-
"execution_count": 48,
|
| 122 |
"id": "936deafa",
|
| 123 |
"metadata": {},
|
| 124 |
"outputs": [
|
|
@@ -126,43 +40,35 @@
|
|
| 126 |
"name": "stdout",
|
| 127 |
"output_type": "stream",
|
| 128 |
"text": [
|
| 129 |
-
"
|
| 130 |
-
"
|
| 131 |
-
"\n",
|
| 132 |
-
"
|
| 133 |
]
|
| 134 |
},
|
| 135 |
{
|
| 136 |
"data": {
|
| 137 |
"application/vnd.jupyter.widget-view+json": {
|
| 138 |
-
"model_id": "
|
| 139 |
"version_major": 2,
|
| 140 |
"version_minor": 0
|
| 141 |
},
|
| 142 |
"text/plain": [
|
| 143 |
-
"
|
| 144 |
]
|
| 145 |
},
|
| 146 |
"metadata": {},
|
| 147 |
"output_type": "display_data"
|
| 148 |
},
|
| 149 |
-
{
|
| 150 |
-
"name": "stdout",
|
| 151 |
-
"output_type": "stream",
|
| 152 |
-
"text": [
|
| 153 |
-
"\n",
|
| 154 |
-
"🚀 Processing VAL split...\n"
|
| 155 |
-
]
|
| 156 |
-
},
|
| 157 |
{
|
| 158 |
"data": {
|
| 159 |
"application/vnd.jupyter.widget-view+json": {
|
| 160 |
-
"model_id": "
|
| 161 |
"version_major": 2,
|
| 162 |
"version_minor": 0
|
| 163 |
},
|
| 164 |
"text/plain": [
|
| 165 |
-
"
|
| 166 |
]
|
| 167 |
},
|
| 168 |
"metadata": {},
|
|
@@ -172,367 +78,177 @@
|
|
| 172 |
"name": "stdout",
|
| 173 |
"output_type": "stream",
|
| 174 |
"text": [
|
|
|
|
|
|
|
| 175 |
"\n",
|
| 176 |
-
"
|
| 177 |
]
|
| 178 |
},
|
| 179 |
{
|
| 180 |
"data": {
|
| 181 |
"application/vnd.jupyter.widget-view+json": {
|
| 182 |
-
"model_id": "
|
| 183 |
"version_major": 2,
|
| 184 |
"version_minor": 0
|
| 185 |
},
|
| 186 |
"text/plain": [
|
| 187 |
-
"Saving
|
| 188 |
]
|
| 189 |
},
|
| 190 |
"metadata": {},
|
| 191 |
"output_type": "display_data"
|
| 192 |
},
|
| 193 |
-
{
|
| 194 |
-
"name": "stderr",
|
| 195 |
-
"output_type": "stream",
|
| 196 |
-
"text": [
|
| 197 |
-
"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/TiffImagePlugin.py:950: UserWarning: Corrupt EXIF data. Expecting to read 2 bytes but only got 0. \n",
|
| 198 |
-
" warnings.warn(str(msg))\n"
|
| 199 |
-
]
|
| 200 |
-
},
|
| 201 |
-
{
|
| 202 |
-
"ename": "UnidentifiedImageError",
|
| 203 |
-
"evalue": "cannot identify image file <_io.BytesIO object at 0x3250f9d50>",
|
| 204 |
-
"output_type": "error",
|
| 205 |
-
"traceback": [
|
| 206 |
-
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
| 207 |
-
"\u001b[31mRemoteTraceback\u001b[39m Traceback (most recent call last)",
|
| 208 |
-
"\u001b[31mRemoteTraceback\u001b[39m: \n\"\"\"\nTraceback (most recent call last):\n File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/pool.py\", line 125, in worker\n result = (True, func(*args, **kwds))\n ^^^^^^^^^^^^^^^^^^^\n File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/utils/py_utils.py\", line 586, in _write_generator_to_queue\n for i, result in enumerate(func(**kwargs)):\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3697, in _map_single\n for i, batch in iter_outputs(shard_iterable):\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3647, in iter_outputs\n yield i, apply_function(example, i, offset=offset)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3570, in apply_function\n processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/var/folders/07/1hr7xxpj3sx52fsnpz87jfj40000gp/T/ipykernel_65139/2515864513.py\", line 27, in save_batch_raw\n images = batch['image']\n ~~~~~^^^^^^^^^\n File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/formatting/formatting.py\", line 285, in __getitem__\n value = self.format(key)\n ^^^^^^^^^^^^^^^^\n File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/formatting/formatting.py\", line 385, in format\n return self.formatter.format_column(self.pa_table.select([key]))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/formatting/formatting.py\", line 465, in format_column\n column = self.python_features_decoder.decode_column(column, pa_table.column_names[0])\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/formatting/formatting.py\", line 228, in decode_column\n self.features.decode_column(column, column_name, token_per_repo_id=self.token_per_repo_id)\n File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/features/features.py\", line 2130, in decode_column\n decode_nested_example(self[column_name], value, token_per_repo_id=token_per_repo_id)\n File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/features/features.py\", line 1414, in decode_nested_example\n return schema.decode_example(obj, token_per_repo_id=token_per_repo_id) if obj is not None else None\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/features/image.py\", line 192, in decode_example\n image = PIL.Image.open(BytesIO(bytes_))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/Image.py\", line 3580, in open\n raise UnidentifiedImageError(msg)\nPIL.UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x3250f9d50>\n\"\"\"",
|
| 209 |
-
"\nThe above exception was the direct cause of the following exception:\n",
|
| 210 |
-
"\u001b[31mUnidentifiedImageError\u001b[39m Traceback (most recent call last)",
|
| 211 |
-
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[48]\u001b[39m\u001b[32m, line 51\u001b[39m\n\u001b[32m 48\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m split \u001b[38;5;129;01min\u001b[39;00m splits:\n\u001b[32m 49\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m🚀 Processing \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msplit.upper()\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m split...\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m---> \u001b[39m\u001b[32m51\u001b[39m \u001b[43mds\u001b[49m\u001b[43m[\u001b[49m\u001b[43msplit\u001b[49m\u001b[43m]\u001b[49m\u001b[43m.\u001b[49m\u001b[43mmap\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 52\u001b[39m \u001b[43m \u001b[49m\u001b[43msave_batch_raw\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 53\u001b[39m \u001b[43m \u001b[49m\u001b[43mbatched\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 54\u001b[39m \u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m100\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Process 100 images per chunk\u001b[39;49;00m\n\u001b[32m 55\u001b[39m \u001b[43m \u001b[49m\u001b[43mwith_indices\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Needed for unique filenames\u001b[39;49;00m\n\u001b[32m 56\u001b[39m \u001b[43m \u001b[49m\u001b[43mnum_proc\u001b[49m\u001b[43m=\u001b[49m\u001b[43mNUM_PROC\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# <--- THIS IS THE SPEED BOOST\u001b[39;49;00m\n\u001b[32m 57\u001b[39m \u001b[43m \u001b[49m\u001b[43mfn_kwargs\u001b[49m\u001b[43m=\u001b[49m\u001b[43m{\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43msplit_name\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43msplit\u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 58\u001b[39m \u001b[43m \u001b[49m\u001b[43mdesc\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43mf\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mSaving \u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43msplit\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[33;43m\"\u001b[39;49m\n\u001b[32m 59\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 61\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m✅ DONE! Raw data saved to \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mos.path.abspath(OUTPUT_DIR)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n",
|
| 212 |
-
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py:562\u001b[39m, in \u001b[36mtransmit_format.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 555\u001b[39m self_format = {\n\u001b[32m 556\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mtype\u001b[39m\u001b[33m\"\u001b[39m: \u001b[38;5;28mself\u001b[39m._format_type,\n\u001b[32m 557\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mformat_kwargs\u001b[39m\u001b[33m\"\u001b[39m: \u001b[38;5;28mself\u001b[39m._format_kwargs,\n\u001b[32m 558\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mcolumns\u001b[39m\u001b[33m\"\u001b[39m: \u001b[38;5;28mself\u001b[39m._format_columns,\n\u001b[32m 559\u001b[39m \u001b[33m\"\u001b[39m\u001b[33moutput_all_columns\u001b[39m\u001b[33m\"\u001b[39m: \u001b[38;5;28mself\u001b[39m._output_all_columns,\n\u001b[32m 560\u001b[39m }\n\u001b[32m 561\u001b[39m \u001b[38;5;66;03m# apply actual function\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m562\u001b[39m out: Union[\u001b[33m\"\u001b[39m\u001b[33mDataset\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mDatasetDict\u001b[39m\u001b[33m\"\u001b[39m] = \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 563\u001b[39m datasets: \u001b[38;5;28mlist\u001b[39m[\u001b[33m\"\u001b[39m\u001b[33mDataset\u001b[39m\u001b[33m\"\u001b[39m] = \u001b[38;5;28mlist\u001b[39m(out.values()) \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(out, \u001b[38;5;28mdict\u001b[39m) \u001b[38;5;28;01melse\u001b[39;00m [out]\n\u001b[32m 564\u001b[39m \u001b[38;5;66;03m# re-apply format to the output\u001b[39;00m\n",
|
| 213 |
-
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py:3332\u001b[39m, in \u001b[36mDataset.map\u001b[39m\u001b[34m(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc, try_original_type)\u001b[39m\n\u001b[32m 3329\u001b[39m os.environ = prev_env\n\u001b[32m 3330\u001b[39m logger.info(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mSpawning \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnum_proc\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m processes\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m-> \u001b[39m\u001b[32m3332\u001b[39m \u001b[43m\u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mrank\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdone\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcontent\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43miflatmap_unordered\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 3333\u001b[39m \u001b[43m \u001b[49m\u001b[43mpool\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mDataset\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_map_single\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs_iterable\u001b[49m\u001b[43m=\u001b[49m\u001b[43munprocessed_kwargs_per_job\u001b[49m\n\u001b[32m 3334\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 3335\u001b[39m \u001b[43m \u001b[49m\u001b[43mcheck_if_shard_done\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrank\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdone\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcontent\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 3337\u001b[39m pool.close()\n",
|
| 214 |
-
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/utils/py_utils.py:626\u001b[39m, in \u001b[36miflatmap_unordered\u001b[39m\u001b[34m(pool, func, kwargs_iterable)\u001b[39m\n\u001b[32m 623\u001b[39m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[32m 624\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m pool_changed:\n\u001b[32m 625\u001b[39m \u001b[38;5;66;03m# we get the result in case there's an error to raise\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m626\u001b[39m [\u001b[43masync_result\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m0.05\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m async_result \u001b[38;5;129;01min\u001b[39;00m async_results]\n",
|
| 215 |
-
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/pool.py:774\u001b[39m, in \u001b[36mApplyResult.get\u001b[39m\u001b[34m(self, timeout)\u001b[39m\n\u001b[32m 772\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._value\n\u001b[32m 773\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m774\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;28mself\u001b[39m._value\n",
|
| 216 |
-
"\u001b[31mUnidentifiedImageError\u001b[39m: cannot identify image file <_io.BytesIO object at 0x3250f9d50>"
|
| 217 |
-
]
|
| 218 |
-
}
|
| 219 |
-
],
|
| 220 |
-
"source": [
|
| 221 |
-
"import os\n",
|
| 222 |
-
"import multiprocessing\n",
|
| 223 |
-
"\n",
|
| 224 |
-
"# 1. Configuration\n",
|
| 225 |
-
"OUTPUT_DIR = \"rvl_cdip\"\n",
|
| 226 |
-
"NUM_PROC = os.cpu_count() # Automatically use all CPU cores\n",
|
| 227 |
-
"\n",
|
| 228 |
-
"# 2. Pre-Calculate Class Names\n",
|
| 229 |
-
"# We do this once so workers don't have to look it up repeatedly\n",
|
| 230 |
-
"labels_feature = ds['train'].features['label']\n",
|
| 231 |
-
"idx_to_class = {idx: name for idx, name in enumerate(labels_feature.names)}\n",
|
| 232 |
-
"print(f\"✅ Using {NUM_PROC} workers to save RAW images.\")\n",
|
| 233 |
-
"\n",
|
| 234 |
-
"# 3. Pre-Create Directories\n",
|
| 235 |
-
"# Create all folders upfront to prevent collision errors\n",
|
| 236 |
-
"print(\"Creating directory structure...\")\n",
|
| 237 |
-
"splits = ['train', 'val', 'test']\n",
|
| 238 |
-
"for split in splits:\n",
|
| 239 |
-
" for class_name in idx_to_class.values():\n",
|
| 240 |
-
" os.makedirs(os.path.join(OUTPUT_DIR, split, class_name), exist_ok=True)\n",
|
| 241 |
-
"\n",
|
| 242 |
-
"# 4. The Worker Function (Raw Save)\n",
|
| 243 |
-
"def save_batch_raw(batch, indices, split_name):\n",
|
| 244 |
-
" \"\"\"\n",
|
| 245 |
-
" Saves a batch of images in their original, raw format.\n",
|
| 246 |
-
" \"\"\"\n",
|
| 247 |
-
" images = batch['image']\n",
|
| 248 |
-
" labels = batch['label']\n",
|
| 249 |
-
" \n",
|
| 250 |
-
" for img, label_idx, original_idx in zip(images, labels, indices):\n",
|
| 251 |
-
" class_name = idx_to_class[label_idx]\n",
|
| 252 |
-
" \n",
|
| 253 |
-
" # Define Path\n",
|
| 254 |
-
" filename = f\"{original_idx}.png\"\n",
|
| 255 |
-
" file_path = os.path.join(OUTPUT_DIR, split_name, class_name, filename)\n",
|
| 256 |
-
" \n",
|
| 257 |
-
" # Save RAW (No Resize)\n",
|
| 258 |
-
" # We only convert to RGB if absolutely necessary (e.g. CMYK/Transparency issues)\n",
|
| 259 |
-
" # otherwise we save as is.\n",
|
| 260 |
-
" if img.mode not in ['RGB', 'L']: # 'L' is standard grayscale\n",
|
| 261 |
-
" img = img.convert('RGB')\n",
|
| 262 |
-
" \n",
|
| 263 |
-
" img.save(file_path)\n",
|
| 264 |
-
" \n",
|
| 265 |
-
" return batch\n",
|
| 266 |
-
"\n",
|
| 267 |
-
"# 5. Execute Parallel Processing\n",
|
| 268 |
-
"for split in splits:\n",
|
| 269 |
-
" print(f\"\\n🚀 Processing {split.upper()} split...\")\n",
|
| 270 |
-
" \n",
|
| 271 |
-
" ds[split].map(\n",
|
| 272 |
-
" save_batch_raw,\n",
|
| 273 |
-
" batched=True,\n",
|
| 274 |
-
" batch_size=100, # Process 100 images per chunk\n",
|
| 275 |
-
" with_indices=True, # Needed for unique filenames\n",
|
| 276 |
-
" num_proc=NUM_PROC, # <--- THIS IS THE SPEED BOOST\n",
|
| 277 |
-
" fn_kwargs={'split_name': split},\n",
|
| 278 |
-
" desc=f\"Saving {split}\"\n",
|
| 279 |
-
" )\n",
|
| 280 |
-
"\n",
|
| 281 |
-
"print(f\"\\n✅ DONE! Raw data saved to {os.path.abspath(OUTPUT_DIR)}\")"
|
| 282 |
-
]
|
| 283 |
-
},
|
| 284 |
-
{
|
| 285 |
-
"cell_type": "code",
|
| 286 |
-
"execution_count": 51,
|
| 287 |
-
"id": "5645bccb",
|
| 288 |
-
"metadata": {},
|
| 289 |
-
"outputs": [
|
| 290 |
{
|
| 291 |
"name": "stdout",
|
| 292 |
"output_type": "stream",
|
| 293 |
"text": [
|
| 294 |
-
"
|
|
|
|
| 295 |
]
|
| 296 |
},
|
| 297 |
{
|
| 298 |
"data": {
|
| 299 |
"application/vnd.jupyter.widget-view+json": {
|
| 300 |
-
"model_id": "
|
| 301 |
"version_major": 2,
|
| 302 |
"version_minor": 0
|
| 303 |
},
|
| 304 |
"text/plain": [
|
| 305 |
-
"
|
| 306 |
]
|
| 307 |
},
|
| 308 |
"metadata": {},
|
| 309 |
"output_type": "display_data"
|
| 310 |
},
|
| 311 |
-
{
|
| 312 |
-
"name": "stderr",
|
| 313 |
-
"output_type": "stream",
|
| 314 |
-
"text": [
|
| 315 |
-
"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/TiffImagePlugin.py:950: UserWarning: Corrupt EXIF data. Expecting to read 2 bytes but only got 0. \n",
|
| 316 |
-
" warnings.warn(str(msg))\n"
|
| 317 |
-
]
|
| 318 |
-
},
|
| 319 |
-
{
|
| 320 |
-
"ename": "UnidentifiedImageError",
|
| 321 |
-
"evalue": "cannot identify image file <_io.BytesIO object at 0x10abe6f20>",
|
| 322 |
-
"output_type": "error",
|
| 323 |
-
"traceback": [
|
| 324 |
-
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
| 325 |
-
"\u001b[31mUnidentifiedImageError\u001b[39m Traceback (most recent call last)",
|
| 326 |
-
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[51]\u001b[39m\u001b[32m, line 18\u001b[39m\n\u001b[32m 15\u001b[39m current_ds = ds[split]\n\u001b[32m 16\u001b[39m skipped_count = \u001b[32m0\u001b[39m\n\u001b[32m---> \u001b[39m\u001b[32m18\u001b[39m \u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mi\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43menumerate\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mtqdm\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcurrent_ds\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdesc\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43mf\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mChecking \u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43msplit\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 19\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mtry\u001b[39;49;00m\u001b[43m:\u001b[49m\n\u001b[32m 20\u001b[39m \u001b[43m \u001b[49m\u001b[43mlabel_idx\u001b[49m\u001b[43m \u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mlabel\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n",
|
| 327 |
-
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/tqdm/notebook.py:250\u001b[39m, in \u001b[36mtqdm_notebook.__iter__\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 248\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 249\u001b[39m it = \u001b[38;5;28msuper\u001b[39m().\u001b[34m__iter__\u001b[39m()\n\u001b[32m--> \u001b[39m\u001b[32m250\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mobj\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mit\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 251\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# return super(tqdm...) will not catch exception\u001b[39;49;00m\n\u001b[32m 252\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01myield\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mobj\u001b[49m\n\u001b[32m 253\u001b[39m \u001b[38;5;66;03m# NB: except ... [ as ...] breaks IPython async KeyboardInterrupt\u001b[39;00m\n",
|
| 328 |
-
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/tqdm/std.py:1181\u001b[39m, in \u001b[36mtqdm.__iter__\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 1178\u001b[39m time = \u001b[38;5;28mself\u001b[39m._time\n\u001b[32m 1180\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1181\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mobj\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43miterable\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 1182\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01myield\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mobj\u001b[49m\n\u001b[32m 1183\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Update and possibly print the progressbar.\u001b[39;49;00m\n\u001b[32m 1184\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Note: does not call self.update(1) for speed optimisation.\u001b[39;49;00m\n",
|
| 329 |
-
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py:2483\u001b[39m, in \u001b[36mDataset.__iter__\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 2481\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(pa_subtable.num_rows):\n\u001b[32m 2482\u001b[39m pa_subtable_ex = pa_subtable.slice(i, \u001b[32m1\u001b[39m)\n\u001b[32m-> \u001b[39m\u001b[32m2483\u001b[39m formatted_output = \u001b[43mformat_table\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 2484\u001b[39m \u001b[43m \u001b[49m\u001b[43mpa_subtable_ex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2485\u001b[39m \u001b[43m \u001b[49m\u001b[32;43m0\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 2486\u001b[39m \u001b[43m \u001b[49m\u001b[43mformatter\u001b[49m\u001b[43m=\u001b[49m\u001b[43mformatter\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2487\u001b[39m \u001b[43m \u001b[49m\u001b[43mformat_columns\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_format_columns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2488\u001b[39m \u001b[43m \u001b[49m\u001b[43moutput_all_columns\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_output_all_columns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2489\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2490\u001b[39m \u001b[38;5;28;01myield\u001b[39;00m formatted_output\n\u001b[32m 2491\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n",
|
| 330 |
-
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/formatting/formatting.py:658\u001b[39m, in \u001b[36mformat_table\u001b[39m\u001b[34m(table, key, formatter, format_columns, output_all_columns)\u001b[39m\n\u001b[32m 656\u001b[39m python_formatter = PythonFormatter(features=formatter.features)\n\u001b[32m 657\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m format_columns \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m658\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mformatter\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpa_table\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mquery_type\u001b[49m\u001b[43m=\u001b[49m\u001b[43mquery_type\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 659\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m query_type == \u001b[33m\"\u001b[39m\u001b[33mcolumn\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m 660\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m format_columns:\n",
|
| 331 |
-
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/formatting/formatting.py:411\u001b[39m, in \u001b[36mFormatter.__call__\u001b[39m\u001b[34m(self, pa_table, query_type)\u001b[39m\n\u001b[32m 409\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, pa_table: pa.Table, query_type: \u001b[38;5;28mstr\u001b[39m) -> Union[RowFormat, ColumnFormat, BatchFormat]:\n\u001b[32m 410\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m query_type == \u001b[33m\"\u001b[39m\u001b[33mrow\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m411\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mformat_row\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpa_table\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 412\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m query_type == \u001b[33m\"\u001b[39m\u001b[33mcolumn\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m 413\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m.format_column(pa_table)\n",
|
| 332 |
-
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/formatting/formatting.py:460\u001b[39m, in \u001b[36mPythonFormatter.format_row\u001b[39m\u001b[34m(self, pa_table)\u001b[39m\n\u001b[32m 458\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m LazyRow(pa_table, \u001b[38;5;28mself\u001b[39m)\n\u001b[32m 459\u001b[39m row = \u001b[38;5;28mself\u001b[39m.python_arrow_extractor().extract_row(pa_table)\n\u001b[32m--> \u001b[39m\u001b[32m460\u001b[39m row = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mpython_features_decoder\u001b[49m\u001b[43m.\u001b[49m\u001b[43mdecode_row\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrow\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 461\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m row\n",
|
| 333 |
-
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/formatting/formatting.py:224\u001b[39m, in \u001b[36mPythonFeaturesDecoder.decode_row\u001b[39m\u001b[34m(self, row)\u001b[39m\n\u001b[32m 223\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mdecode_row\u001b[39m(\u001b[38;5;28mself\u001b[39m, row: \u001b[38;5;28mdict\u001b[39m) -> \u001b[38;5;28mdict\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m224\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mfeatures\u001b[49m\u001b[43m.\u001b[49m\u001b[43mdecode_example\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrow\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtoken_per_repo_id\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mtoken_per_repo_id\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.features \u001b[38;5;28;01melse\u001b[39;00m row\n",
|
| 334 |
-
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/features/features.py:2106\u001b[39m, in \u001b[36mFeatures.decode_example\u001b[39m\u001b[34m(self, example, token_per_repo_id)\u001b[39m\n\u001b[32m 2091\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mdecode_example\u001b[39m(\u001b[38;5;28mself\u001b[39m, example: \u001b[38;5;28mdict\u001b[39m, token_per_repo_id: Optional[\u001b[38;5;28mdict\u001b[39m[\u001b[38;5;28mstr\u001b[39m, Union[\u001b[38;5;28mstr\u001b[39m, \u001b[38;5;28mbool\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m]]] = \u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[32m 2092\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"Decode example with custom feature decoding.\u001b[39;00m\n\u001b[32m 2093\u001b[39m \n\u001b[32m 2094\u001b[39m \u001b[33;03m Args:\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 2102\u001b[39m \u001b[33;03m `dict[str, Any]`\u001b[39;00m\n\u001b[32m 2103\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m 2105\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m {\n\u001b[32m-> \u001b[39m\u001b[32m2106\u001b[39m column_name: \u001b[43mdecode_nested_example\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfeature\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtoken_per_repo_id\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtoken_per_repo_id\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2107\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m._column_requires_decoding[column_name]\n\u001b[32m 2108\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m value\n\u001b[32m 2109\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m column_name, (feature, value) \u001b[38;5;129;01min\u001b[39;00m zip_dict(\n\u001b[32m 2110\u001b[39m {key: value \u001b[38;5;28;01mfor\u001b[39;00m key, value \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m.items() \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m example}, example\n\u001b[32m 2111\u001b[39m )\n\u001b[32m 2112\u001b[39m }\n",
|
| 335 |
-
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/features/features.py:1414\u001b[39m, in \u001b[36mdecode_nested_example\u001b[39m\u001b[34m(schema, obj, token_per_repo_id)\u001b[39m\n\u001b[32m 1411\u001b[39m \u001b[38;5;66;03m# Object with special decoding:\u001b[39;00m\n\u001b[32m 1412\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(schema, \u001b[33m\"\u001b[39m\u001b[33mdecode_example\u001b[39m\u001b[33m\"\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(schema, \u001b[33m\"\u001b[39m\u001b[33mdecode\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mTrue\u001b[39;00m):\n\u001b[32m 1413\u001b[39m \u001b[38;5;66;03m# we pass the token to read and decode files from private repositories in streaming mode\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1414\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mschema\u001b[49m\u001b[43m.\u001b[49m\u001b[43mdecode_example\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtoken_per_repo_id\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtoken_per_repo_id\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m obj \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1415\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m obj\n",
|
| 336 |
-
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/features/image.py:192\u001b[39m, in \u001b[36mImage.decode_example\u001b[39m\u001b[34m(self, value, token_per_repo_id)\u001b[39m\n\u001b[32m 190\u001b[39m image = PIL.Image.open(bytes_)\n\u001b[32m 191\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m192\u001b[39m image = \u001b[43mPIL\u001b[49m\u001b[43m.\u001b[49m\u001b[43mImage\u001b[49m\u001b[43m.\u001b[49m\u001b[43mopen\u001b[49m\u001b[43m(\u001b[49m\u001b[43mBytesIO\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbytes_\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 193\u001b[39m image.load() \u001b[38;5;66;03m# to avoid \"Too many open files\" errors\u001b[39;00m\n\u001b[32m 194\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m image.getexif().get(PIL.Image.ExifTags.Base.Orientation) \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
|
| 337 |
-
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/Image.py:3580\u001b[39m, in \u001b[36mopen\u001b[39m\u001b[34m(fp, mode, formats)\u001b[39m\n\u001b[32m 3578\u001b[39m warnings.warn(message)\n\u001b[32m 3579\u001b[39m msg = \u001b[33m\"\u001b[39m\u001b[33mcannot identify image file \u001b[39m\u001b[38;5;132;01m%r\u001b[39;00m\u001b[33m\"\u001b[39m % (filename \u001b[38;5;28;01mif\u001b[39;00m filename \u001b[38;5;28;01melse\u001b[39;00m fp)\n\u001b[32m-> \u001b[39m\u001b[32m3580\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m UnidentifiedImageError(msg)\n",
|
| 338 |
-
"\u001b[31mUnidentifiedImageError\u001b[39m: cannot identify image file <_io.BytesIO object at 0x10abe6f20>"
|
| 339 |
-
]
|
| 340 |
-
}
|
| 341 |
-
],
|
| 342 |
-
"source": [
|
| 343 |
-
"import os\n",
|
| 344 |
-
"from tqdm.auto import tqdm\n",
|
| 345 |
-
"from PIL import UnidentifiedImageError\n",
|
| 346 |
-
"\n",
|
| 347 |
-
"# Configuration\n",
|
| 348 |
-
"OUTPUT_DIR = \"rvl_cdip\"\n",
|
| 349 |
-
"split = \"test\" \n",
|
| 350 |
-
"\n",
|
| 351 |
-
"# Get Class Mapping\n",
|
| 352 |
-
"labels_feature = ds['train'].features['label']\n",
|
| 353 |
-
"idx_to_class = {idx: name for idx, name in enumerate(labels_feature.names)}\n",
|
| 354 |
-
"\n",
|
| 355 |
-
"print(f\"🛠️ Repairing {split.upper()} split (with integrity check)...\")\n",
|
| 356 |
-
"\n",
|
| 357 |
-
"current_ds = ds[split]\n",
|
| 358 |
-
"skipped_count = 0\n",
|
| 359 |
-
"\n",
|
| 360 |
-
"for i, example in enumerate(tqdm(current_ds, desc=f\"Checking {split}\")):\n",
|
| 361 |
-
" try:\n",
|
| 362 |
-
" label_idx = example['label']\n",
|
| 363 |
-
" class_name = idx_to_class[label_idx]\n",
|
| 364 |
-
" \n",
|
| 365 |
-
" target_folder = os.path.join(OUTPUT_DIR, split, class_name)\n",
|
| 366 |
-
" filename = f\"{i}.png\"\n",
|
| 367 |
-
" file_path = os.path.join(target_folder, filename)\n",
|
| 368 |
-
" \n",
|
| 369 |
-
" # --- IMPROVED CHECK ---\n",
|
| 370 |
-
" # Only skip if file exists AND is not empty (larger than 0 bytes)\n",
|
| 371 |
-
" # This fixes the edge case where the crash left a 0-byte file\n",
|
| 372 |
-
" if os.path.exists(file_path) and os.path.getsize(file_path) > 0:\n",
|
| 373 |
-
" continue\n",
|
| 374 |
-
" \n",
|
| 375 |
-
" # If we reach here, the file is missing OR corrupt (empty). So we save it.\n",
|
| 376 |
-
" if not os.path.exists(target_folder):\n",
|
| 377 |
-
" os.makedirs(target_folder, exist_ok=True)\n",
|
| 378 |
-
"\n",
|
| 379 |
-
" image = example['image'] \n",
|
| 380 |
-
" if image.mode not in ['RGB', 'L']:\n",
|
| 381 |
-
" image = image.convert('RGB')\n",
|
| 382 |
-
" \n",
|
| 383 |
-
" image.save(file_path)\n",
|
| 384 |
-
"\n",
|
| 385 |
-
" except (UnidentifiedImageError, OSError) as e:\n",
|
| 386 |
-
" print(f\"\\n❌ SKIPPING CORRUPT IMAGE: Index {i}\")\n",
|
| 387 |
-
" skipped_count += 1\n",
|
| 388 |
-
"\n",
|
| 389 |
-
"print(f\"\\n✅ Repair Complete.\")\n",
|
| 390 |
-
"print(f\"Total corrupt/unreadable images skipped: {skipped_count}\")"
|
| 391 |
-
]
|
| 392 |
-
},
|
| 393 |
-
{
|
| 394 |
-
"cell_type": "code",
|
| 395 |
-
"execution_count": 57,
|
| 396 |
-
"id": "41f94f27",
|
| 397 |
-
"metadata": {},
|
| 398 |
-
"outputs": [
|
| 399 |
{
|
| 400 |
"name": "stdout",
|
| 401 |
"output_type": "stream",
|
| 402 |
"text": [
|
| 403 |
-
"
|
|
|
|
| 404 |
]
|
| 405 |
},
|
| 406 |
{
|
| 407 |
"data": {
|
| 408 |
"application/vnd.jupyter.widget-view+json": {
|
| 409 |
-
"model_id": "
|
| 410 |
"version_major": 2,
|
| 411 |
"version_minor": 0
|
| 412 |
},
|
| 413 |
"text/plain": [
|
| 414 |
-
"
|
| 415 |
]
|
| 416 |
},
|
| 417 |
"metadata": {},
|
| 418 |
"output_type": "display_data"
|
| 419 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 420 |
{
|
| 421 |
"name": "stdout",
|
| 422 |
"output_type": "stream",
|
| 423 |
"text": [
|
|
|
|
| 424 |
"\n",
|
| 425 |
-
"
|
| 426 |
-
"
|
| 427 |
-
"
|
| 428 |
-
"Skipped 1 corrupt files.\n"
|
| 429 |
]
|
| 430 |
}
|
| 431 |
],
|
| 432 |
"source": [
|
| 433 |
"import os\n",
|
| 434 |
"import io\n",
|
| 435 |
-
"from
|
| 436 |
"from PIL import Image, UnidentifiedImageError\n",
|
| 437 |
"\n",
|
| 438 |
-
"#
|
| 439 |
-
"
|
| 440 |
-
"
|
| 441 |
"\n",
|
| 442 |
-
"
|
| 443 |
-
"
|
| 444 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 445 |
"\n",
|
| 446 |
-
"
|
| 447 |
-
"
|
| 448 |
-
"
|
|
|
|
| 449 |
"\n",
|
| 450 |
-
"
|
| 451 |
-
"
|
| 452 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 453 |
"\n",
|
| 454 |
-
"
|
| 455 |
-
"
|
| 456 |
-
"
|
| 457 |
-
"
|
| 458 |
-
" example = current_ds[i] \n",
|
| 459 |
-
" \n",
|
| 460 |
-
" label_idx = example['label']\n",
|
| 461 |
-
" class_name = idx_to_class[label_idx]\n",
|
| 462 |
-
" \n",
|
| 463 |
-
" # B. Define Paths\n",
|
| 464 |
-
" target_folder = os.path.join(OUTPUT_DIR, split, class_name)\n",
|
| 465 |
-
" filename = f\"{i}.png\"\n",
|
| 466 |
-
" file_path = os.path.join(target_folder, filename)\n",
|
| 467 |
"\n",
|
| 468 |
-
"
|
| 469 |
-
"
|
| 470 |
-
"
|
| 471 |
-
"
|
| 472 |
-
" # D. Create Folder\n",
|
| 473 |
-
" if not os.path.exists(target_folder):\n",
|
| 474 |
-
" os.makedirs(target_folder, exist_ok=True)\n",
|
| 475 |
"\n",
|
| 476 |
-
"
|
| 477 |
-
"
|
| 478 |
-
"
|
| 479 |
" \n",
|
| 480 |
-
" #
|
| 481 |
-
"
|
| 482 |
-
"
|
| 483 |
-
"
|
| 484 |
-
"
|
| 485 |
-
"
|
| 486 |
-
"
|
| 487 |
-
"
|
| 488 |
-
"
|
| 489 |
-
"
|
| 490 |
-
"
|
| 491 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 492 |
"\n",
|
| 493 |
-
"
|
| 494 |
-
"
|
| 495 |
-
"
|
| 496 |
-
" skipped_count += 1\n",
|
| 497 |
"\n",
|
| 498 |
-
"
|
| 499 |
-
"
|
| 500 |
-
]
|
| 501 |
-
},
|
| 502 |
-
{
|
| 503 |
-
"cell_type": "code",
|
| 504 |
-
"execution_count": 29,
|
| 505 |
-
"id": "b829c704",
|
| 506 |
-
"metadata": {},
|
| 507 |
-
"outputs": [
|
| 508 |
-
{
|
| 509 |
-
"data": {
|
| 510 |
-
"text/plain": [
|
| 511 |
-
"Counter({13: 2572,\n",
|
| 512 |
-
" 12: 2537,\n",
|
| 513 |
-
" 5: 2532,\n",
|
| 514 |
-
" 3: 2527,\n",
|
| 515 |
-
" 2: 2516,\n",
|
| 516 |
-
" 0: 2515,\n",
|
| 517 |
-
" 4: 2506,\n",
|
| 518 |
-
" 1: 2505,\n",
|
| 519 |
-
" 14: 2498,\n",
|
| 520 |
-
" 8: 2492,\n",
|
| 521 |
-
" 10: 2489,\n",
|
| 522 |
-
" 6: 2477,\n",
|
| 523 |
-
" 15: 2472,\n",
|
| 524 |
-
" 7: 2464,\n",
|
| 525 |
-
" 9: 2463,\n",
|
| 526 |
-
" 11: 2435})"
|
| 527 |
-
]
|
| 528 |
-
},
|
| 529 |
-
"execution_count": 29,
|
| 530 |
-
"metadata": {},
|
| 531 |
-
"output_type": "execute_result"
|
| 532 |
-
}
|
| 533 |
-
],
|
| 534 |
-
"source": [
|
| 535 |
-
"Counter(ds[split]['label'])"
|
| 536 |
]
|
| 537 |
},
|
| 538 |
{
|
|
@@ -540,12 +256,12 @@
|
|
| 540 |
"id": "c8530c8e",
|
| 541 |
"metadata": {},
|
| 542 |
"source": [
|
| 543 |
-
"## Checking the
|
| 544 |
]
|
| 545 |
},
|
| 546 |
{
|
| 547 |
"cell_type": "code",
|
| 548 |
-
"execution_count":
|
| 549 |
"id": "2785360c",
|
| 550 |
"metadata": {},
|
| 551 |
"outputs": [
|
|
@@ -613,29 +329,29 @@
|
|
| 613 |
"from collections import Counter\n",
|
| 614 |
"import pandas as pd\n",
|
| 615 |
"\n",
|
| 616 |
-
"#
|
| 617 |
"splits = ['train', 'val', 'test']\n",
|
| 618 |
"label_feature = ds['train'].features['label']\n",
|
| 619 |
-
"int2str = label_feature.int2str
|
| 620 |
"\n",
|
| 621 |
"print(f\"{'SPLIT':<10} {'CLASS NAME':<25} {'COUNT':<10} {'STATUS'}\")\n",
|
| 622 |
"print(\"-\" * 60)\n",
|
| 623 |
"\n",
|
| 624 |
"for split in splits:\n",
|
| 625 |
-
" #
|
| 626 |
" # This is instant compared to loading images\n",
|
| 627 |
" labels = ds[split]['label']\n",
|
| 628 |
" \n",
|
| 629 |
-
" #
|
| 630 |
" counts = Counter(labels)\n",
|
| 631 |
" \n",
|
| 632 |
-
" #
|
| 633 |
" # We sort by class ID to keep it organized\n",
|
| 634 |
" for label_id in sorted(counts.keys()):\n",
|
| 635 |
" count = counts[label_id]\n",
|
| 636 |
" class_name = int2str(label_id)\n",
|
| 637 |
" \n",
|
| 638 |
-
" #
|
| 639 |
" # Train: 320k / 16 = 20,000\n",
|
| 640 |
" # Test/Val: 40k / 16 = 2,500\n",
|
| 641 |
" if split == 'train':\n",
|
|
@@ -655,12 +371,12 @@
|
|
| 655 |
"id": "5f7b75a2",
|
| 656 |
"metadata": {},
|
| 657 |
"source": [
|
| 658 |
-
"##
|
| 659 |
]
|
| 660 |
},
|
| 661 |
{
|
| 662 |
"cell_type": "code",
|
| 663 |
-
"execution_count":
|
| 664 |
"id": "059bfaa5",
|
| 665 |
"metadata": {},
|
| 666 |
"outputs": [
|
|
@@ -668,7 +384,7 @@
|
|
| 668 |
"name": "stdout",
|
| 669 |
"output_type": "stream",
|
| 670 |
"text": [
|
| 671 |
-
"📂 Scanning directory: /Users/arpit-zstch1557/Projects/
|
| 672 |
"SPLIT CLASS NAME FILES STATUS\n",
|
| 673 |
"-----------------------------------------------------------------\n",
|
| 674 |
"TRAIN advertisement 19963 ❌ MISMATCH (Exp: 20000)\n",
|
|
@@ -732,7 +448,7 @@
|
|
| 732 |
"import pandas as pd\n",
|
| 733 |
"\n",
|
| 734 |
"# Configuration\n",
|
| 735 |
-
"DATA_DIR = \"
|
| 736 |
"splits = ['train', 'val', 'test']\n",
|
| 737 |
"\n",
|
| 738 |
"print(f\"📂 Scanning directory: {os.path.abspath(DATA_DIR)}\")\n",
|
|
@@ -769,12 +485,10 @@
|
|
| 769 |
" # Determine Expected Count based on the paper\n",
|
| 770 |
" if split == 'train':\n",
|
| 771 |
" expected = 20000 \n",
|
| 772 |
-
" # Note: We know 'train' has 1 missing file in total from the source (319,999)\n",
|
| 773 |
" else:\n",
|
| 774 |
" expected = 2500\n",
|
| 775 |
"\n",
|
| 776 |
" # Status Check\n",
|
| 777 |
-
" # We allow a small tolerance because we know source data has noise\n",
|
| 778 |
" if file_count == expected:\n",
|
| 779 |
" status = \"✅ OK\"\n",
|
| 780 |
" elif abs(file_count - expected) < 5: \n",
|
|
@@ -790,520 +504,10 @@
|
|
| 790 |
"print(\"\\nAnalysis Complete.\")"
|
| 791 |
]
|
| 792 |
},
|
| 793 |
-
{
|
| 794 |
-
"cell_type": "code",
|
| 795 |
-
"execution_count": 10,
|
| 796 |
-
"id": "99ca1af8",
|
| 797 |
-
"metadata": {},
|
| 798 |
-
"outputs": [
|
| 799 |
-
{
|
| 800 |
-
"name": "stdout",
|
| 801 |
-
"output_type": "stream",
|
| 802 |
-
"text": [
|
| 803 |
-
"🚀 Starting RVL-CDIP Downloader\n",
|
| 804 |
-
" Target Folder: /Users/arpit-zstch1557/Projects/DL/Course 4/document-classification/rvl_cdip_data\n",
|
| 805 |
-
" Workers: 12\n",
|
| 806 |
-
" Loading dataset structure from Hugging Face...\n"
|
| 807 |
-
]
|
| 808 |
-
},
|
| 809 |
-
{
|
| 810 |
-
"data": {
|
| 811 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 812 |
-
"model_id": "cf6fb62548ea45ebad0d16066ad8a895",
|
| 813 |
-
"version_major": 2,
|
| 814 |
-
"version_minor": 0
|
| 815 |
-
},
|
| 816 |
-
"text/plain": [
|
| 817 |
-
"Resolving data files: 0%| | 0/119 [00:00<?, ?it/s]"
|
| 818 |
-
]
|
| 819 |
-
},
|
| 820 |
-
"metadata": {},
|
| 821 |
-
"output_type": "display_data"
|
| 822 |
-
},
|
| 823 |
-
{
|
| 824 |
-
"data": {
|
| 825 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 826 |
-
"model_id": "90ff7339d3014385987f7a133cea179a",
|
| 827 |
-
"version_major": 2,
|
| 828 |
-
"version_minor": 0
|
| 829 |
-
},
|
| 830 |
-
"text/plain": [
|
| 831 |
-
"Loading dataset shards: 0%| | 0/64 [00:00<?, ?it/s]"
|
| 832 |
-
]
|
| 833 |
-
},
|
| 834 |
-
"metadata": {},
|
| 835 |
-
"output_type": "display_data"
|
| 836 |
-
},
|
| 837 |
-
{
|
| 838 |
-
"name": "stdout",
|
| 839 |
-
"output_type": "stream",
|
| 840 |
-
"text": [
|
| 841 |
-
" Found 16 categories.\n",
|
| 842 |
-
" Configuring dataset for safe raw access...\n",
|
| 843 |
-
"\n",
|
| 844 |
-
"📦 Processing SPLIT: TRAIN\n"
|
| 845 |
-
]
|
| 846 |
-
},
|
| 847 |
-
{
|
| 848 |
-
"data": {
|
| 849 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 850 |
-
"model_id": "e83199d60f794dedb211c17775c27808",
|
| 851 |
-
"version_major": 2,
|
| 852 |
-
"version_minor": 0
|
| 853 |
-
},
|
| 854 |
-
"text/plain": [
|
| 855 |
-
"Saving train (num_proc=12): 0%| | 0/319999 [00:00<?, ? examples/s]"
|
| 856 |
-
]
|
| 857 |
-
},
|
| 858 |
-
"metadata": {},
|
| 859 |
-
"output_type": "display_data"
|
| 860 |
-
},
|
| 861 |
-
{
|
| 862 |
-
"name": "stderr",
|
| 863 |
-
"output_type": "stream",
|
| 864 |
-
"text": [
|
| 865 |
-
"Process ForkPoolWorker-17:\n",
|
| 866 |
-
"Process ForkPoolWorker-16:\n",
|
| 867 |
-
"Process ForkPoolWorker-18:\n",
|
| 868 |
-
"Process ForkPoolWorker-22:\n",
|
| 869 |
-
"Process ForkPoolWorker-23:\n",
|
| 870 |
-
"Process ForkPoolWorker-27:\n",
|
| 871 |
-
"Traceback (most recent call last):\n",
|
| 872 |
-
"Traceback (most recent call last):\n",
|
| 873 |
-
"Traceback (most recent call last):\n",
|
| 874 |
-
"Traceback (most recent call last):\n",
|
| 875 |
-
"Traceback (most recent call last):\n",
|
| 876 |
-
"Traceback (most recent call last):\n",
|
| 877 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 644, in _save\n",
|
| 878 |
-
" fh = fp.fileno()\n",
|
| 879 |
-
" ^^^^^^^^^\n",
|
| 880 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 644, in _save\n",
|
| 881 |
-
" fh = fp.fileno()\n",
|
| 882 |
-
" ^^^^^^^^^\n",
|
| 883 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 644, in _save\n",
|
| 884 |
-
" fh = fp.fileno()\n",
|
| 885 |
-
" ^^^^^^^^^\n",
|
| 886 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 644, in _save\n",
|
| 887 |
-
" fh = fp.fileno()\n",
|
| 888 |
-
" ^^^^^^^^^\n",
|
| 889 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 644, in _save\n",
|
| 890 |
-
" fh = fp.fileno()\n",
|
| 891 |
-
" ^^^^^^^^^\n",
|
| 892 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 644, in _save\n",
|
| 893 |
-
" fh = fp.fileno()\n",
|
| 894 |
-
" ^^^^^^^^^\n",
|
| 895 |
-
"AttributeError: '_idat' object has no attribute 'fileno'\n",
|
| 896 |
-
"AttributeError: '_idat' object has no attribute 'fileno'\n",
|
| 897 |
-
"AttributeError: '_idat' object has no attribute 'fileno'\n",
|
| 898 |
-
"AttributeError: '_idat' object has no attribute 'fileno'\n",
|
| 899 |
-
"AttributeError: '_idat' object has no attribute 'fileno'\n",
|
| 900 |
-
"AttributeError: '_idat' object has no attribute 'fileno'\n",
|
| 901 |
-
"\n",
|
| 902 |
-
"During handling of the above exception, another exception occurred:\n",
|
| 903 |
-
"\n",
|
| 904 |
-
"\n",
|
| 905 |
-
"During handling of the above exception, another exception occurred:\n",
|
| 906 |
-
"\n",
|
| 907 |
-
"\n",
|
| 908 |
-
"During handling of the above exception, another exception occurred:\n",
|
| 909 |
-
"\n",
|
| 910 |
-
"\n",
|
| 911 |
-
"During handling of the above exception, another exception occurred:\n",
|
| 912 |
-
"\n",
|
| 913 |
-
"\n",
|
| 914 |
-
"During handling of the above exception, another exception occurred:\n",
|
| 915 |
-
"\n",
|
| 916 |
-
"\n",
|
| 917 |
-
"During handling of the above exception, another exception occurred:\n",
|
| 918 |
-
"\n",
|
| 919 |
-
"Traceback (most recent call last):\n",
|
| 920 |
-
"Traceback (most recent call last):\n",
|
| 921 |
-
"Traceback (most recent call last):\n",
|
| 922 |
-
"Traceback (most recent call last):\n",
|
| 923 |
-
"Traceback (most recent call last):\n",
|
| 924 |
-
"Traceback (most recent call last):\n",
|
| 925 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 314, in _bootstrap\n",
|
| 926 |
-
" self.run()\n",
|
| 927 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 314, in _bootstrap\n",
|
| 928 |
-
" self.run()\n",
|
| 929 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 314, in _bootstrap\n",
|
| 930 |
-
" self.run()\n",
|
| 931 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 314, in _bootstrap\n",
|
| 932 |
-
" self.run()\n",
|
| 933 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 314, in _bootstrap\n",
|
| 934 |
-
" self.run()\n",
|
| 935 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 108, in run\n",
|
| 936 |
-
" self._target(*self._args, **self._kwargs)\n",
|
| 937 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 314, in _bootstrap\n",
|
| 938 |
-
" self.run()\n",
|
| 939 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 108, in run\n",
|
| 940 |
-
" self._target(*self._args, **self._kwargs)\n",
|
| 941 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 108, in run\n",
|
| 942 |
-
" self._target(*self._args, **self._kwargs)\n",
|
| 943 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 108, in run\n",
|
| 944 |
-
" self._target(*self._args, **self._kwargs)\n",
|
| 945 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 108, in run\n",
|
| 946 |
-
" self._target(*self._args, **self._kwargs)\n",
|
| 947 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/pool.py\", line 125, in worker\n",
|
| 948 |
-
" result = (True, func(*args, **kwds))\n",
|
| 949 |
-
" ^^^^^^^^^^^^^^^^^^^\n",
|
| 950 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 108, in run\n",
|
| 951 |
-
" self._target(*self._args, **self._kwargs)\n",
|
| 952 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/pool.py\", line 125, in worker\n",
|
| 953 |
-
" result = (True, func(*args, **kwds))\n",
|
| 954 |
-
" ^^^^^^^^^^^^^^^^^^^\n",
|
| 955 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/pool.py\", line 125, in worker\n",
|
| 956 |
-
" result = (True, func(*args, **kwds))\n",
|
| 957 |
-
" ^^^^^^^^^^^^^^^^^^^\n",
|
| 958 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/pool.py\", line 125, in worker\n",
|
| 959 |
-
" result = (True, func(*args, **kwds))\n",
|
| 960 |
-
" ^^^^^^^^^^^^^^^^^^^\n",
|
| 961 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/pool.py\", line 125, in worker\n",
|
| 962 |
-
" result = (True, func(*args, **kwds))\n",
|
| 963 |
-
" ^^^^^^^^^^^^^^^^^^^\n",
|
| 964 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/utils/py_utils.py\", line 586, in _write_generator_to_queue\n",
|
| 965 |
-
" for i, result in enumerate(func(**kwargs)):\n",
|
| 966 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 967 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/utils/py_utils.py\", line 586, in _write_generator_to_queue\n",
|
| 968 |
-
" for i, result in enumerate(func(**kwargs)):\n",
|
| 969 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 970 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/pool.py\", line 125, in worker\n",
|
| 971 |
-
" result = (True, func(*args, **kwds))\n",
|
| 972 |
-
" ^^^^^^^^^^^^^^^^^^^\n",
|
| 973 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/utils/py_utils.py\", line 586, in _write_generator_to_queue\n",
|
| 974 |
-
" for i, result in enumerate(func(**kwargs)):\n",
|
| 975 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 976 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/utils/py_utils.py\", line 586, in _write_generator_to_queue\n",
|
| 977 |
-
" for i, result in enumerate(func(**kwargs)):\n",
|
| 978 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 979 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/utils/py_utils.py\", line 586, in _write_generator_to_queue\n",
|
| 980 |
-
" for i, result in enumerate(func(**kwargs)):\n",
|
| 981 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 982 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3697, in _map_single\n",
|
| 983 |
-
" for i, batch in iter_outputs(shard_iterable):\n",
|
| 984 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 985 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3697, in _map_single\n",
|
| 986 |
-
" for i, batch in iter_outputs(shard_iterable):\n",
|
| 987 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 988 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3697, in _map_single\n",
|
| 989 |
-
" for i, batch in iter_outputs(shard_iterable):\n",
|
| 990 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 991 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/utils/py_utils.py\", line 586, in _write_generator_to_queue\n",
|
| 992 |
-
" for i, result in enumerate(func(**kwargs)):\n",
|
| 993 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 994 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3697, in _map_single\n",
|
| 995 |
-
" for i, batch in iter_outputs(shard_iterable):\n",
|
| 996 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 997 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3647, in iter_outputs\n",
|
| 998 |
-
" yield i, apply_function(example, i, offset=offset)\n",
|
| 999 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 1000 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3697, in _map_single\n",
|
| 1001 |
-
" for i, batch in iter_outputs(shard_iterable):\n",
|
| 1002 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 1003 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3647, in iter_outputs\n",
|
| 1004 |
-
" yield i, apply_function(example, i, offset=offset)\n",
|
| 1005 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 1006 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3697, in _map_single\n",
|
| 1007 |
-
" for i, batch in iter_outputs(shard_iterable):\n",
|
| 1008 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 1009 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3647, in iter_outputs\n",
|
| 1010 |
-
" yield i, apply_function(example, i, offset=offset)\n",
|
| 1011 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 1012 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3570, in apply_function\n",
|
| 1013 |
-
" processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)\n",
|
| 1014 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 1015 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3647, in iter_outputs\n",
|
| 1016 |
-
" yield i, apply_function(example, i, offset=offset)\n",
|
| 1017 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 1018 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3647, in iter_outputs\n",
|
| 1019 |
-
" yield i, apply_function(example, i, offset=offset)\n",
|
| 1020 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 1021 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3570, in apply_function\n",
|
| 1022 |
-
" processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)\n",
|
| 1023 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 1024 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3647, in iter_outputs\n",
|
| 1025 |
-
" yield i, apply_function(example, i, offset=offset)\n",
|
| 1026 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 1027 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3570, in apply_function\n",
|
| 1028 |
-
" processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)\n",
|
| 1029 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 1030 |
-
" File \"/var/folders/07/1hr7xxpj3sx52fsnpz87jfj40000gp/T/ipykernel_72359/180152894.py\", line 49, in save_image_worker\n",
|
| 1031 |
-
" img.save(file_path)\n",
|
| 1032 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3570, in apply_function\n",
|
| 1033 |
-
" processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)\n",
|
| 1034 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 1035 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3570, in apply_function\n",
|
| 1036 |
-
" processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)\n",
|
| 1037 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 1038 |
-
" File \"/var/folders/07/1hr7xxpj3sx52fsnpz87jfj40000gp/T/ipykernel_72359/180152894.py\", line 49, in save_image_worker\n",
|
| 1039 |
-
" img.save(file_path)\n",
|
| 1040 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py\", line 3570, in apply_function\n",
|
| 1041 |
-
" processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)\n",
|
| 1042 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 1043 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/Image.py\", line 2588, in save\n",
|
| 1044 |
-
" save_handler(self, fp, filename)\n",
|
| 1045 |
-
" File \"/var/folders/07/1hr7xxpj3sx52fsnpz87jfj40000gp/T/ipykernel_72359/180152894.py\", line 49, in save_image_worker\n",
|
| 1046 |
-
" img.save(file_path)\n",
|
| 1047 |
-
" File \"/var/folders/07/1hr7xxpj3sx52fsnpz87jfj40000gp/T/ipykernel_72359/180152894.py\", line 49, in save_image_worker\n",
|
| 1048 |
-
" img.save(file_path)\n",
|
| 1049 |
-
" File \"/var/folders/07/1hr7xxpj3sx52fsnpz87jfj40000gp/T/ipykernel_72359/180152894.py\", line 49, in save_image_worker\n",
|
| 1050 |
-
" img.save(file_path)\n",
|
| 1051 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/Image.py\", line 2588, in save\n",
|
| 1052 |
-
" save_handler(self, fp, filename)\n",
|
| 1053 |
-
" File \"/var/folders/07/1hr7xxpj3sx52fsnpz87jfj40000gp/T/ipykernel_72359/180152894.py\", line 49, in save_image_worker\n",
|
| 1054 |
-
" img.save(file_path)\n",
|
| 1055 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/PngImagePlugin.py\", line 1495, in _save\n",
|
| 1056 |
-
" ImageFile._save(\n",
|
| 1057 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/Image.py\", line 2588, in save\n",
|
| 1058 |
-
" save_handler(self, fp, filename)\n",
|
| 1059 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/Image.py\", line 2588, in save\n",
|
| 1060 |
-
" save_handler(self, fp, filename)\n",
|
| 1061 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/Image.py\", line 2588, in save\n",
|
| 1062 |
-
" save_handler(self, fp, filename)\n",
|
| 1063 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/PngImagePlugin.py\", line 1495, in _save\n",
|
| 1064 |
-
" ImageFile._save(\n",
|
| 1065 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/Image.py\", line 2588, in save\n",
|
| 1066 |
-
" save_handler(self, fp, filename)\n",
|
| 1067 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 648, in _save\n",
|
| 1068 |
-
" _encode_tile(im, fp, tile, bufsize, None, exc)\n",
|
| 1069 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/PngImagePlugin.py\", line 1495, in _save\n",
|
| 1070 |
-
" ImageFile._save(\n",
|
| 1071 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/PngImagePlugin.py\", line 1495, in _save\n",
|
| 1072 |
-
" ImageFile._save(\n",
|
| 1073 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 674, in _encode_tile\n",
|
| 1074 |
-
" errcode, data = encoder.encode(bufsize)[1:]\n",
|
| 1075 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 1076 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/PngImagePlugin.py\", line 1495, in _save\n",
|
| 1077 |
-
" ImageFile._save(\n",
|
| 1078 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/PngImagePlugin.py\", line 1495, in _save\n",
|
| 1079 |
-
" ImageFile._save(\n",
|
| 1080 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 648, in _save\n",
|
| 1081 |
-
" _encode_tile(im, fp, tile, bufsize, None, exc)\n",
|
| 1082 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 648, in _save\n",
|
| 1083 |
-
" _encode_tile(im, fp, tile, bufsize, None, exc)\n",
|
| 1084 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 648, in _save\n",
|
| 1085 |
-
" _encode_tile(im, fp, tile, bufsize, None, exc)\n",
|
| 1086 |
-
"KeyboardInterrupt\n",
|
| 1087 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 674, in _encode_tile\n",
|
| 1088 |
-
" errcode, data = encoder.encode(bufsize)[1:]\n",
|
| 1089 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 1090 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 648, in _save\n",
|
| 1091 |
-
" _encode_tile(im, fp, tile, bufsize, None, exc)\n",
|
| 1092 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 648, in _save\n",
|
| 1093 |
-
" _encode_tile(im, fp, tile, bufsize, None, exc)\n",
|
| 1094 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 674, in _encode_tile\n",
|
| 1095 |
-
" errcode, data = encoder.encode(bufsize)[1:]\n",
|
| 1096 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 1097 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 674, in _encode_tile\n",
|
| 1098 |
-
" errcode, data = encoder.encode(bufsize)[1:]\n",
|
| 1099 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 1100 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 674, in _encode_tile\n",
|
| 1101 |
-
" errcode, data = encoder.encode(bufsize)[1:]\n",
|
| 1102 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 1103 |
-
"KeyboardInterrupt\n",
|
| 1104 |
-
"KeyboardInterrupt\n",
|
| 1105 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 674, in _encode_tile\n",
|
| 1106 |
-
" errcode, data = encoder.encode(bufsize)[1:]\n",
|
| 1107 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 1108 |
-
"KeyboardInterrupt\n",
|
| 1109 |
-
"KeyboardInterrupt\n",
|
| 1110 |
-
"KeyboardInterrupt\n",
|
| 1111 |
-
"Process ForkPoolWorker-26:\n",
|
| 1112 |
-
"Process ForkPoolWorker-19:\n",
|
| 1113 |
-
"Process ForkPoolWorker-20:\n",
|
| 1114 |
-
"Process ForkPoolWorker-25:\n",
|
| 1115 |
-
"Traceback (most recent call last):\n",
|
| 1116 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 644, in _save\n",
|
| 1117 |
-
" fh = fp.fileno()\n",
|
| 1118 |
-
" ^^^^^^^^^\n",
|
| 1119 |
-
"Traceback (most recent call last):\n",
|
| 1120 |
-
"AttributeError: '_idat' object has no attribute 'fileno'\n",
|
| 1121 |
-
"Traceback (most recent call last):\n",
|
| 1122 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 644, in _save\n",
|
| 1123 |
-
" fh = fp.fileno()\n",
|
| 1124 |
-
" ^^^^^^^^^\n",
|
| 1125 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 644, in _save\n",
|
| 1126 |
-
" fh = fp.fileno()\n",
|
| 1127 |
-
" ^^^^^^^^^\n",
|
| 1128 |
-
"\n",
|
| 1129 |
-
"During handling of the above exception, another exception occurred:\n",
|
| 1130 |
-
"\n",
|
| 1131 |
-
"AttributeError: '_idat' object has no attribute 'fileno'\n",
|
| 1132 |
-
"Traceback (most recent call last):\n",
|
| 1133 |
-
"AttributeError: '_idat' object has no attribute 'fileno'\n",
|
| 1134 |
-
"Traceback (most recent call last):\n",
|
| 1135 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/PIL/ImageFile.py\", line 644, in _save\n",
|
| 1136 |
-
" fh = fp.fileno()\n",
|
| 1137 |
-
" ^^^^^^^^^\n",
|
| 1138 |
-
"\n",
|
| 1139 |
-
"During handling of the above exception, another exception occurred:\n",
|
| 1140 |
-
"\n",
|
| 1141 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 314, in _bootstrap\n",
|
| 1142 |
-
" self.run()\n",
|
| 1143 |
-
"\n",
|
| 1144 |
-
"During handling of the above exception, another exception occurred:\n",
|
| 1145 |
-
"\n",
|
| 1146 |
-
"AttributeError: '_idat' object has no attribute 'fileno'\n",
|
| 1147 |
-
"Traceback (most recent call last):\n",
|
| 1148 |
-
"Traceback (most recent call last):\n",
|
| 1149 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 108, in run\n",
|
| 1150 |
-
" self._target(*self._args, **self._kwargs)\n",
|
| 1151 |
-
"\n",
|
| 1152 |
-
"During handling of the above exception, another exception occurred:\n",
|
| 1153 |
-
"\n",
|
| 1154 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 314, in _bootstrap\n",
|
| 1155 |
-
" self.run()\n",
|
| 1156 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/pool.py\", line 125, in worker\n",
|
| 1157 |
-
" result = (True, func(*args, **kwds))\n",
|
| 1158 |
-
" ^^^^^^^^^^^^^^^^^^^\n",
|
| 1159 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 314, in _bootstrap\n",
|
| 1160 |
-
" self.run()\n",
|
| 1161 |
-
"Traceback (most recent call last):\n",
|
| 1162 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 108, in run\n",
|
| 1163 |
-
" self._target(*self._args, **self._kwargs)\n",
|
| 1164 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/utils/py_utils.py\", line 586, in _write_generator_to_queue\n",
|
| 1165 |
-
" for i, result in enumerate(func(**kwargs)):\n",
|
| 1166 |
-
" ^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
| 1167 |
-
" File \"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/process.py\", line 108, in run\n",
|
| 1168 |
-
" self._target(*self._args, **self._kwargs)\n"
|
| 1169 |
-
]
|
| 1170 |
-
},
|
| 1171 |
-
{
|
| 1172 |
-
"ename": "TimeoutError",
|
| 1173 |
-
"evalue": "",
|
| 1174 |
-
"output_type": "error",
|
| 1175 |
-
"traceback": [
|
| 1176 |
-
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
| 1177 |
-
"\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)",
|
| 1178 |
-
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/utils/py_utils.py:612\u001b[39m, in \u001b[36miflatmap_unordered\u001b[39m\u001b[34m(pool, func, kwargs_iterable)\u001b[39m\n\u001b[32m 611\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m612\u001b[39m \u001b[38;5;28;01myield\u001b[39;00m \u001b[43mqueue\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m0.05\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 613\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m Empty:\n",
|
| 1179 |
-
"\u001b[36mFile \u001b[39m\u001b[32m<string>:2\u001b[39m, in \u001b[36mget\u001b[39m\u001b[34m(self, *args, **kwds)\u001b[39m\n",
|
| 1180 |
-
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/managers.py:828\u001b[39m, in \u001b[36mBaseProxy._callmethod\u001b[39m\u001b[34m(self, methodname, args, kwds)\u001b[39m\n\u001b[32m 827\u001b[39m conn.send((\u001b[38;5;28mself\u001b[39m._id, methodname, args, kwds))\n\u001b[32m--> \u001b[39m\u001b[32m828\u001b[39m kind, result = \u001b[43mconn\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrecv\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 830\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m kind == \u001b[33m'\u001b[39m\u001b[33m#RETURN\u001b[39m\u001b[33m'\u001b[39m:\n",
|
| 1181 |
-
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/connection.py:253\u001b[39m, in \u001b[36m_ConnectionBase.recv\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 252\u001b[39m \u001b[38;5;28mself\u001b[39m._check_readable()\n\u001b[32m--> \u001b[39m\u001b[32m253\u001b[39m buf = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_recv_bytes\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 254\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m _ForkingPickler.loads(buf.getbuffer())\n",
|
| 1182 |
-
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/connection.py:433\u001b[39m, in \u001b[36mConnection._recv_bytes\u001b[39m\u001b[34m(self, maxsize)\u001b[39m\n\u001b[32m 432\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m_recv_bytes\u001b[39m(\u001b[38;5;28mself\u001b[39m, maxsize=\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[32m--> \u001b[39m\u001b[32m433\u001b[39m buf = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_recv\u001b[49m\u001b[43m(\u001b[49m\u001b[32;43m4\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 434\u001b[39m size, = struct.unpack(\u001b[33m\"\u001b[39m\u001b[33m!i\u001b[39m\u001b[33m\"\u001b[39m, buf.getvalue())\n",
|
| 1183 |
-
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/connection.py:398\u001b[39m, in \u001b[36mConnection._recv\u001b[39m\u001b[34m(self, size, read)\u001b[39m\n\u001b[32m 397\u001b[39m \u001b[38;5;28;01mwhile\u001b[39;00m remaining > \u001b[32m0\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m398\u001b[39m chunk = \u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mremaining\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 399\u001b[39m n = \u001b[38;5;28mlen\u001b[39m(chunk)\n",
|
| 1184 |
-
"\u001b[31mKeyboardInterrupt\u001b[39m: ",
|
| 1185 |
-
"\nDuring handling of the above exception, another exception occurred:\n",
|
| 1186 |
-
"\u001b[31mTimeoutError\u001b[39m Traceback (most recent call last)",
|
| 1187 |
-
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[10]\u001b[39m\u001b[32m, line 103\u001b[39m\n\u001b[32m 100\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m datasets.ImageFolder(root=\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mOUTPUT_DIR\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m/train\u001b[39m\u001b[33m'\u001b[39m\u001b[33m)\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 102\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[34m__name__\u001b[39m == \u001b[33m\"\u001b[39m\u001b[33m__main__\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m103\u001b[39m \u001b[43mmain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
|
| 1188 |
-
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[10]\u001b[39m\u001b[32m, line 84\u001b[39m, in \u001b[36mmain\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 81\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m split \u001b[38;5;129;01min\u001b[39;00m SPLITS:\n\u001b[32m 82\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m📦 Processing SPLIT: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msplit.upper()\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m---> \u001b[39m\u001b[32m84\u001b[39m \u001b[43mdataset\u001b[49m\u001b[43m[\u001b[49m\u001b[43msplit\u001b[49m\u001b[43m]\u001b[49m\u001b[43m.\u001b[49m\u001b[43mmap\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 85\u001b[39m \u001b[43m \u001b[49m\u001b[43msave_image_worker\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 86\u001b[39m \u001b[43m \u001b[49m\u001b[43mbatched\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 87\u001b[39m \u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m100\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Process 100 images per task\u001b[39;49;00m\n\u001b[32m 88\u001b[39m \u001b[43m \u001b[49m\u001b[43mwith_indices\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# We need the index for the filename\u001b[39;49;00m\n\u001b[32m 89\u001b[39m \u001b[43m \u001b[49m\u001b[43mnum_proc\u001b[49m\u001b[43m=\u001b[49m\u001b[43mNUM_PROC\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Parallel speed!\u001b[39;49;00m\n\u001b[32m 90\u001b[39m \u001b[43m \u001b[49m\u001b[43mfn_kwargs\u001b[49m\u001b[43m=\u001b[49m\u001b[43m{\u001b[49m\n\u001b[32m 91\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43msplit_name\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43msplit\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 92\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43moutput_root\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mOUTPUT_DIR\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 93\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43midx_to_class\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43midx_to_class\u001b[49m\n\u001b[32m 94\u001b[39m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 95\u001b[39m \u001b[43m \u001b[49m\u001b[43mdesc\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43mf\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mSaving \u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43msplit\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[33;43m\"\u001b[39;49m\n\u001b[32m 96\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 98\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m✅ Download and Extraction Complete!\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 99\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m You can now load this in PyTorch using:\u001b[39m\u001b[33m\"\u001b[39m)\n",
|
| 1189 |
-
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py:562\u001b[39m, in \u001b[36mtransmit_format.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 555\u001b[39m self_format = {\n\u001b[32m 556\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mtype\u001b[39m\u001b[33m\"\u001b[39m: \u001b[38;5;28mself\u001b[39m._format_type,\n\u001b[32m 557\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mformat_kwargs\u001b[39m\u001b[33m\"\u001b[39m: \u001b[38;5;28mself\u001b[39m._format_kwargs,\n\u001b[32m 558\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mcolumns\u001b[39m\u001b[33m\"\u001b[39m: \u001b[38;5;28mself\u001b[39m._format_columns,\n\u001b[32m 559\u001b[39m \u001b[33m\"\u001b[39m\u001b[33moutput_all_columns\u001b[39m\u001b[33m\"\u001b[39m: \u001b[38;5;28mself\u001b[39m._output_all_columns,\n\u001b[32m 560\u001b[39m }\n\u001b[32m 561\u001b[39m \u001b[38;5;66;03m# apply actual function\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m562\u001b[39m out: Union[\u001b[33m\"\u001b[39m\u001b[33mDataset\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mDatasetDict\u001b[39m\u001b[33m\"\u001b[39m] = \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 563\u001b[39m datasets: \u001b[38;5;28mlist\u001b[39m[\u001b[33m\"\u001b[39m\u001b[33mDataset\u001b[39m\u001b[33m\"\u001b[39m] = \u001b[38;5;28mlist\u001b[39m(out.values()) \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(out, \u001b[38;5;28mdict\u001b[39m) \u001b[38;5;28;01melse\u001b[39;00m [out]\n\u001b[32m 564\u001b[39m \u001b[38;5;66;03m# re-apply format to the output\u001b[39;00m\n",
|
| 1190 |
-
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/arrow_dataset.py:3332\u001b[39m, in \u001b[36mDataset.map\u001b[39m\u001b[34m(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc, try_original_type)\u001b[39m\n\u001b[32m 3329\u001b[39m os.environ = prev_env\n\u001b[32m 3330\u001b[39m logger.info(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mSpawning \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnum_proc\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m processes\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m-> \u001b[39m\u001b[32m3332\u001b[39m \u001b[43m\u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mrank\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdone\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcontent\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43miflatmap_unordered\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 3333\u001b[39m \u001b[43m \u001b[49m\u001b[43mpool\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mDataset\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_map_single\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs_iterable\u001b[49m\u001b[43m=\u001b[49m\u001b[43munprocessed_kwargs_per_job\u001b[49m\n\u001b[32m 3334\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 3335\u001b[39m \u001b[43m \u001b[49m\u001b[43mcheck_if_shard_done\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrank\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdone\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcontent\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 3337\u001b[39m pool.close()\n",
|
| 1191 |
-
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/datasets/utils/py_utils.py:626\u001b[39m, in \u001b[36miflatmap_unordered\u001b[39m\u001b[34m(pool, func, kwargs_iterable)\u001b[39m\n\u001b[32m 623\u001b[39m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[32m 624\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m pool_changed:\n\u001b[32m 625\u001b[39m \u001b[38;5;66;03m# we get the result in case there's an error to raise\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m626\u001b[39m [\u001b[43masync_result\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m0.05\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m async_result \u001b[38;5;129;01min\u001b[39;00m async_results]\n",
|
| 1192 |
-
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/lab_env/lib/python3.12/site-packages/multiprocess/pool.py:770\u001b[39m, in \u001b[36mApplyResult.get\u001b[39m\u001b[34m(self, timeout)\u001b[39m\n\u001b[32m 768\u001b[39m \u001b[38;5;28mself\u001b[39m.wait(timeout)\n\u001b[32m 769\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m.ready():\n\u001b[32m--> \u001b[39m\u001b[32m770\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTimeoutError\u001b[39;00m\n\u001b[32m 771\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m._success:\n\u001b[32m 772\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._value\n",
|
| 1193 |
-
"\u001b[31mTimeoutError\u001b[39m: "
|
| 1194 |
-
]
|
| 1195 |
-
}
|
| 1196 |
-
],
|
| 1197 |
-
"source": [
|
| 1198 |
-
"import os\n",
|
| 1199 |
-
"import io\n",
|
| 1200 |
-
"import multiprocessing\n",
|
| 1201 |
-
"from datasets import load_dataset, Image as HFImage\n",
|
| 1202 |
-
"from PIL import Image, UnidentifiedImageError\n",
|
| 1203 |
-
"\n",
|
| 1204 |
-
"# ================= CONFIGURATION =================\n",
|
| 1205 |
-
"OUTPUT_DIR = \"rvl_cdip_data\" # Where data will be saved\n",
|
| 1206 |
-
"NUM_PROC = os.cpu_count() # Use all available CPU cores\n",
|
| 1207 |
-
"SPLITS = ['train', 'val', 'test'] # Splits to process\n",
|
| 1208 |
-
"# =================================================\n",
|
| 1209 |
-
"\n",
|
| 1210 |
-
"def save_image_worker(batch, indices, split_name, output_root, idx_to_class):\n",
|
| 1211 |
-
" \"\"\"\n",
|
| 1212 |
-
" Worker function that runs on multiple CPU cores.\n",
|
| 1213 |
-
" Receives raw image bytes, decodes them safely, and saves to disk.\n",
|
| 1214 |
-
" \"\"\"\n",
|
| 1215 |
-
" # 1. Unpack batch\n",
|
| 1216 |
-
" # Since we used decode=False, 'image' contains a dict with 'bytes'\n",
|
| 1217 |
-
" images_data = batch['image'] \n",
|
| 1218 |
-
" labels = batch['label']\n",
|
| 1219 |
-
" \n",
|
| 1220 |
-
" for i, (img_data, label_idx, original_idx) in enumerate(zip(images_data, labels, indices)):\n",
|
| 1221 |
-
" try:\n",
|
| 1222 |
-
" # 2. Determine Paths\n",
|
| 1223 |
-
" class_name = idx_to_class[label_idx]\n",
|
| 1224 |
-
" target_folder = os.path.join(output_root, split_name, class_name)\n",
|
| 1225 |
-
" filename = f\"{original_idx}.png\"\n",
|
| 1226 |
-
" file_path = os.path.join(target_folder, filename)\n",
|
| 1227 |
-
" \n",
|
| 1228 |
-
" # 3. RESUME LOGIC (The \"Skip\" Check)\n",
|
| 1229 |
-
" # If file exists and is not empty, skip it.\n",
|
| 1230 |
-
" if os.path.exists(file_path) and os.path.getsize(file_path) > 0:\n",
|
| 1231 |
-
" continue\n",
|
| 1232 |
-
" \n",
|
| 1233 |
-
" # 4. Create Directory (Lazy Creation)\n",
|
| 1234 |
-
" # We do this here to ensure it exists before writing\n",
|
| 1235 |
-
" os.makedirs(target_folder, exist_ok=True)\n",
|
| 1236 |
-
" \n",
|
| 1237 |
-
" # 5. Decode Image Safely\n",
|
| 1238 |
-
" # We manually open the bytes. If this fails, we catch the error below.\n",
|
| 1239 |
-
" image_bytes = img_data['bytes']\n",
|
| 1240 |
-
" with Image.open(io.BytesIO(image_bytes)) as img:\n",
|
| 1241 |
-
" # Convert to RGB (standard for PyTorch ResNet)\n",
|
| 1242 |
-
" if img.mode != 'RGB':\n",
|
| 1243 |
-
" img = img.convert('RGB')\n",
|
| 1244 |
-
" \n",
|
| 1245 |
-
" # Save to disk\n",
|
| 1246 |
-
" img.save(file_path)\n",
|
| 1247 |
-
"\n",
|
| 1248 |
-
" except (UnidentifiedImageError, OSError, ValueError) as e:\n",
|
| 1249 |
-
" # 6. Error Handling\n",
|
| 1250 |
-
" # Instead of crashing the whole script, we just log this one failure.\n",
|
| 1251 |
-
" print(f\"[Worker] Skipping corrupt image ID {original_idx} in {split_name}: {e}\")\n",
|
| 1252 |
-
" \n",
|
| 1253 |
-
" return batch\n",
|
| 1254 |
-
"\n",
|
| 1255 |
-
"def main():\n",
|
| 1256 |
-
" print(f\"🚀 Starting RVL-CDIP Downloader\")\n",
|
| 1257 |
-
" print(f\" Target Folder: {os.path.abspath(OUTPUT_DIR)}\")\n",
|
| 1258 |
-
" print(f\" Workers: {NUM_PROC}\")\n",
|
| 1259 |
-
" \n",
|
| 1260 |
-
" # 1. Load Dataset\n",
|
| 1261 |
-
" # Assuming you are logged into Hugging Face or have access\n",
|
| 1262 |
-
" print(\" Loading dataset structure from Hugging Face...\")\n",
|
| 1263 |
-
" dataset = load_dataset(\"chainyo/rvl-cdip\") \n",
|
| 1264 |
-
"\n",
|
| 1265 |
-
" # 2. Setup Class Mapping\n",
|
| 1266 |
-
" labels_feature = dataset['train'].features['label']\n",
|
| 1267 |
-
" idx_to_class = {idx: name for idx, name in enumerate(labels_feature.names)}\n",
|
| 1268 |
-
" print(f\" Found {len(idx_to_class)} categories.\")\n",
|
| 1269 |
-
"\n",
|
| 1270 |
-
" # 3. CRITICAL: Disable Auto-Decoding\n",
|
| 1271 |
-
" # This prevents the Iterator from crashing when it hits a corrupt file.\n",
|
| 1272 |
-
" # We will handle decoding manually in the worker function.\n",
|
| 1273 |
-
" print(\" Configuring dataset for safe raw access...\")\n",
|
| 1274 |
-
" for split in SPLITS:\n",
|
| 1275 |
-
" dataset[split] = dataset[split].cast_column(\"image\", HFImage(decode=False))\n",
|
| 1276 |
-
"\n",
|
| 1277 |
-
" # 4. Execute Parallel Processing\n",
|
| 1278 |
-
" for split in SPLITS:\n",
|
| 1279 |
-
" print(f\"\\n📦 Processing SPLIT: {split.upper()}\")\n",
|
| 1280 |
-
" \n",
|
| 1281 |
-
" dataset[split].map(\n",
|
| 1282 |
-
" save_image_worker,\n",
|
| 1283 |
-
" batched=True,\n",
|
| 1284 |
-
" batch_size=100, # Process 100 images per task\n",
|
| 1285 |
-
" with_indices=True, # We need the index for the filename\n",
|
| 1286 |
-
" num_proc=NUM_PROC, # Parallel speed!\n",
|
| 1287 |
-
" fn_kwargs={\n",
|
| 1288 |
-
" 'split_name': split,\n",
|
| 1289 |
-
" 'output_root': OUTPUT_DIR,\n",
|
| 1290 |
-
" 'idx_to_class': idx_to_class\n",
|
| 1291 |
-
" },\n",
|
| 1292 |
-
" desc=f\"Saving {split}\"\n",
|
| 1293 |
-
" )\n",
|
| 1294 |
-
"\n",
|
| 1295 |
-
" print(f\"\\n✅ Download and Extraction Complete!\")\n",
|
| 1296 |
-
" print(f\" You can now load this in PyTorch using:\")\n",
|
| 1297 |
-
" print(f\" datasets.ImageFolder(root='{OUTPUT_DIR}/train')\")\n",
|
| 1298 |
-
"\n",
|
| 1299 |
-
"if __name__ == \"__main__\":\n",
|
| 1300 |
-
" main()"
|
| 1301 |
-
]
|
| 1302 |
-
},
|
| 1303 |
{
|
| 1304 |
"cell_type": "code",
|
| 1305 |
"execution_count": null,
|
| 1306 |
-
"id": "
|
| 1307 |
"metadata": {},
|
| 1308 |
"outputs": [],
|
| 1309 |
"source": []
|
|
@@ -1325,7 +529,7 @@
|
|
| 1325 |
"name": "python",
|
| 1326 |
"nbconvert_exporter": "python",
|
| 1327 |
"pygments_lexer": "ipython3",
|
| 1328 |
-
"version": "3.
|
| 1329 |
}
|
| 1330 |
},
|
| 1331 |
"nbformat": 4,
|
|
|
|
| 2 |
"cells": [
|
| 3 |
{
|
| 4 |
"cell_type": "code",
|
| 5 |
+
"execution_count": 7,
|
| 6 |
"id": "ae9bc87a",
|
| 7 |
"metadata": {},
|
| 8 |
"outputs": [],
|
| 9 |
"source": [
|
| 10 |
"from datasets import load_dataset\n",
|
| 11 |
+
"import datasets\n",
|
| 12 |
+
"from tqdm.notebook import tqdm"
|
| 13 |
]
|
| 14 |
},
|
| 15 |
{
|
| 16 |
"cell_type": "code",
|
| 17 |
+
"execution_count": null,
|
| 18 |
+
"id": "d5bc67fe",
|
| 19 |
"metadata": {},
|
| 20 |
+
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
"source": [
|
| 22 |
"ds = load_dataset(\"chainyo/rvl-cdip\")"
|
| 23 |
]
|
| 24 |
},
|
| 25 |
{
|
| 26 |
+
"cell_type": "markdown",
|
| 27 |
+
"id": "85f49eeb",
|
|
|
|
| 28 |
"metadata": {},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
"source": [
|
| 30 |
+
"## Creates the \"rvl_cdip_data\" dir"
|
| 31 |
]
|
| 32 |
},
|
| 33 |
{
|
| 34 |
"cell_type": "code",
|
| 35 |
+
"execution_count": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
"id": "936deafa",
|
| 37 |
"metadata": {},
|
| 38 |
"outputs": [
|
|
|
|
| 40 |
"name": "stdout",
|
| 41 |
"output_type": "stream",
|
| 42 |
"text": [
|
| 43 |
+
"🚀 Starting RVL-CDIP Downloader (Disk Optimized)\n",
|
| 44 |
+
" Target Folder: /Users/arpit-zstch1557/Projects/document-classification/rvl_cdip_data\n",
|
| 45 |
+
" Workers: 12\n",
|
| 46 |
+
" Loading dataset structure from Hugging Face...\n"
|
| 47 |
]
|
| 48 |
},
|
| 49 |
{
|
| 50 |
"data": {
|
| 51 |
"application/vnd.jupyter.widget-view+json": {
|
| 52 |
+
"model_id": "0a79c4079dd44915af9193231077adc9",
|
| 53 |
"version_major": 2,
|
| 54 |
"version_minor": 0
|
| 55 |
},
|
| 56 |
"text/plain": [
|
| 57 |
+
"Resolving data files: 0%| | 0/119 [00:00<?, ?it/s]"
|
| 58 |
]
|
| 59 |
},
|
| 60 |
"metadata": {},
|
| 61 |
"output_type": "display_data"
|
| 62 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
{
|
| 64 |
"data": {
|
| 65 |
"application/vnd.jupyter.widget-view+json": {
|
| 66 |
+
"model_id": "105602455af94c04a85e8dd5eed8e1bb",
|
| 67 |
"version_major": 2,
|
| 68 |
"version_minor": 0
|
| 69 |
},
|
| 70 |
"text/plain": [
|
| 71 |
+
"Loading dataset shards: 0%| | 0/64 [00:00<?, ?it/s]"
|
| 72 |
]
|
| 73 |
},
|
| 74 |
"metadata": {},
|
|
|
|
| 78 |
"name": "stdout",
|
| 79 |
"output_type": "stream",
|
| 80 |
"text": [
|
| 81 |
+
" Found 16 categories.\n",
|
| 82 |
+
" Configuring dataset for safe raw access...\n",
|
| 83 |
"\n",
|
| 84 |
+
"📦 Processing SPLIT: TRAIN\n"
|
| 85 |
]
|
| 86 |
},
|
| 87 |
{
|
| 88 |
"data": {
|
| 89 |
"application/vnd.jupyter.widget-view+json": {
|
| 90 |
+
"model_id": "59511ab60fd047758ad0d5671f5f6789",
|
| 91 |
"version_major": 2,
|
| 92 |
"version_minor": 0
|
| 93 |
},
|
| 94 |
"text/plain": [
|
| 95 |
+
"Saving train (num_proc=12): 0%| | 0/319999 [00:00<?, ? examples/s]"
|
| 96 |
]
|
| 97 |
},
|
| 98 |
"metadata": {},
|
| 99 |
"output_type": "display_data"
|
| 100 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
{
|
| 102 |
"name": "stdout",
|
| 103 |
"output_type": "stream",
|
| 104 |
"text": [
|
| 105 |
+
"\n",
|
| 106 |
+
"📦 Processing SPLIT: VAL\n"
|
| 107 |
]
|
| 108 |
},
|
| 109 |
{
|
| 110 |
"data": {
|
| 111 |
"application/vnd.jupyter.widget-view+json": {
|
| 112 |
+
"model_id": "19ac18978db046e0aea0cbf7da2748ba",
|
| 113 |
"version_major": 2,
|
| 114 |
"version_minor": 0
|
| 115 |
},
|
| 116 |
"text/plain": [
|
| 117 |
+
"Saving val (num_proc=12): 0%| | 0/40000 [00:00<?, ? examples/s]"
|
| 118 |
]
|
| 119 |
},
|
| 120 |
"metadata": {},
|
| 121 |
"output_type": "display_data"
|
| 122 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
{
|
| 124 |
"name": "stdout",
|
| 125 |
"output_type": "stream",
|
| 126 |
"text": [
|
| 127 |
+
"\n",
|
| 128 |
+
"📦 Processing SPLIT: TEST\n"
|
| 129 |
]
|
| 130 |
},
|
| 131 |
{
|
| 132 |
"data": {
|
| 133 |
"application/vnd.jupyter.widget-view+json": {
|
| 134 |
+
"model_id": "890739f28bd2495bacc55ea33099c2f2",
|
| 135 |
"version_major": 2,
|
| 136 |
"version_minor": 0
|
| 137 |
},
|
| 138 |
"text/plain": [
|
| 139 |
+
"Saving test (num_proc=12): 0%| | 0/40000 [00:00<?, ? examples/s]"
|
| 140 |
]
|
| 141 |
},
|
| 142 |
"metadata": {},
|
| 143 |
"output_type": "display_data"
|
| 144 |
},
|
| 145 |
+
{
|
| 146 |
+
"name": "stderr",
|
| 147 |
+
"output_type": "stream",
|
| 148 |
+
"text": [
|
| 149 |
+
"/Users/arpit-zstch1557/miniconda3/envs/lab_env/lib/python3.13/site-packages/PIL/TiffImagePlugin.py:949: UserWarning: Corrupt EXIF data. Expecting to read 2 bytes but only got 0. \n",
|
| 150 |
+
" warnings.warn(str(msg))\n"
|
| 151 |
+
]
|
| 152 |
+
},
|
| 153 |
{
|
| 154 |
"name": "stdout",
|
| 155 |
"output_type": "stream",
|
| 156 |
"text": [
|
| 157 |
+
"[Worker] Skipping corrupt image ID 34965 in test: cannot identify image file <_io.BytesIO object at 0x371a331a0>\n",
|
| 158 |
"\n",
|
| 159 |
+
"✅ Download and Extraction Complete!\n",
|
| 160 |
+
" You can now load this in PyTorch using:\n",
|
| 161 |
+
" datasets.ImageFolder(root='rvl_cdip_data/train')\n"
|
|
|
|
| 162 |
]
|
| 163 |
}
|
| 164 |
],
|
| 165 |
"source": [
|
| 166 |
"import os\n",
|
| 167 |
"import io\n",
|
| 168 |
+
"from datasets import load_dataset, Image as HFImage\n",
|
| 169 |
"from PIL import Image, UnidentifiedImageError\n",
|
| 170 |
"\n",
|
| 171 |
+
"OUTPUT_DIR = \"rvl_cdip_data\" # Where data will be saved\n",
|
| 172 |
+
"NUM_PROC = os.cpu_count() # Use all available CPU cores\n",
|
| 173 |
+
"SPLITS = ['train', 'val', 'test'] # Splits to process\n",
|
| 174 |
"\n",
|
| 175 |
+
"def save_image_worker(batch, indices, split_name, output_root, idx_to_class):\n",
|
| 176 |
+
" # Unpack batch\n",
|
| 177 |
+
" images_data = batch['image'] \n",
|
| 178 |
+
" labels = batch['label']\n",
|
| 179 |
+
" \n",
|
| 180 |
+
" for i, (img_data, label_idx, original_idx) in enumerate(zip(images_data, labels, indices)):\n",
|
| 181 |
+
" try:\n",
|
| 182 |
+
" # Determine Paths\n",
|
| 183 |
+
" class_name = idx_to_class[label_idx]\n",
|
| 184 |
+
" target_folder = os.path.join(output_root, split_name, class_name)\n",
|
| 185 |
+
" filename = f\"{original_idx}.png\"\n",
|
| 186 |
+
" file_path = os.path.join(target_folder, filename)\n",
|
| 187 |
+
" \n",
|
| 188 |
+
" if os.path.exists(file_path) and os.path.getsize(file_path) > 0:\n",
|
| 189 |
+
" continue\n",
|
| 190 |
+
" \n",
|
| 191 |
+
" # Create Directory\n",
|
| 192 |
+
" os.makedirs(target_folder, exist_ok=True)\n",
|
| 193 |
+
" \n",
|
| 194 |
+
" # 5. Decode Image Safely\n",
|
| 195 |
+
" image_bytes = img_data['bytes']\n",
|
| 196 |
+
" with Image.open(io.BytesIO(image_bytes)) as img:\n",
|
| 197 |
+
" if img.mode != 'RGB':\n",
|
| 198 |
+
" img = img.convert('RGB')\n",
|
| 199 |
+
" img.save(file_path)\n",
|
| 200 |
"\n",
|
| 201 |
+
" except (UnidentifiedImageError, OSError, ValueError) as e:\n",
|
| 202 |
+
" print(f\"[Worker] Skipping corrupt image ID {original_idx} in {split_name}: {e}\")\n",
|
| 203 |
+
" \n",
|
| 204 |
+
" return {}\n",
|
| 205 |
"\n",
|
| 206 |
+
"def main():\n",
|
| 207 |
+
" print(f\"🚀 Starting RVL-CDIP Downloader (Disk Optimized)\")\n",
|
| 208 |
+
" print(f\" Target Folder: {os.path.abspath(OUTPUT_DIR)}\")\n",
|
| 209 |
+
" print(f\" Workers: {NUM_PROC}\")\n",
|
| 210 |
+
" \n",
|
| 211 |
+
" # Load Dataset\n",
|
| 212 |
+
" print(\" Loading dataset structure from Hugging Face...\")\n",
|
| 213 |
+
" dataset = load_dataset(\"chainyo/rvl-cdip\") \n",
|
| 214 |
"\n",
|
| 215 |
+
" # Setup Class Mapping\n",
|
| 216 |
+
" labels_feature = dataset['train'].features['label']\n",
|
| 217 |
+
" idx_to_class = {idx: name for idx, name in enumerate(labels_feature.names)}\n",
|
| 218 |
+
" print(f\" Found {len(idx_to_class)} categories.\")\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
"\n",
|
| 220 |
+
" # Disable Auto-Decoding (Prevents crashes on corrupt files)\n",
|
| 221 |
+
" print(\" Configuring dataset for safe raw access...\")\n",
|
| 222 |
+
" for split in SPLITS:\n",
|
| 223 |
+
" dataset[split] = dataset[split].cast_column(\"image\", HFImage(decode=False))\n",
|
|
|
|
|
|
|
|
|
|
| 224 |
"\n",
|
| 225 |
+
" # Execute Parallel Processing\n",
|
| 226 |
+
" for split in SPLITS:\n",
|
| 227 |
+
" print(f\"\\n📦 Processing SPLIT: {split.upper()}\")\n",
|
| 228 |
" \n",
|
| 229 |
+
" # We use remove_columns to ensure the output dataset is empty\n",
|
| 230 |
+
" # This prevents the 50GB duplicate cache file.\n",
|
| 231 |
+
" dataset[split].map(\n",
|
| 232 |
+
" save_image_worker,\n",
|
| 233 |
+
" batched=True,\n",
|
| 234 |
+
" batch_size=100,\n",
|
| 235 |
+
" with_indices=True,\n",
|
| 236 |
+
" num_proc=NUM_PROC,\n",
|
| 237 |
+
" remove_columns=dataset[split].column_names, \n",
|
| 238 |
+
" fn_kwargs={\n",
|
| 239 |
+
" 'split_name': split,\n",
|
| 240 |
+
" 'output_root': OUTPUT_DIR,\n",
|
| 241 |
+
" 'idx_to_class': idx_to_class\n",
|
| 242 |
+
" },\n",
|
| 243 |
+
" desc=f\"Saving {split}\"\n",
|
| 244 |
+
" )\n",
|
| 245 |
"\n",
|
| 246 |
+
" print(f\"\\n✅ Download and Extraction Complete!\")\n",
|
| 247 |
+
" print(f\" You can now load this in PyTorch using:\")\n",
|
| 248 |
+
" print(f\" datasets.ImageFolder(root='{OUTPUT_DIR}/train')\")\n",
|
|
|
|
| 249 |
"\n",
|
| 250 |
+
"if __name__ == \"__main__\":\n",
|
| 251 |
+
" main()"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
]
|
| 253 |
},
|
| 254 |
{
|
|
|
|
| 256 |
"id": "c8530c8e",
|
| 257 |
"metadata": {},
|
| 258 |
"source": [
|
| 259 |
+
"## Checking the Data Imbalance in ds (from HF)"
|
| 260 |
]
|
| 261 |
},
|
| 262 |
{
|
| 263 |
"cell_type": "code",
|
| 264 |
+
"execution_count": 8,
|
| 265 |
"id": "2785360c",
|
| 266 |
"metadata": {},
|
| 267 |
"outputs": [
|
|
|
|
| 329 |
"from collections import Counter\n",
|
| 330 |
"import pandas as pd\n",
|
| 331 |
"\n",
|
| 332 |
+
"#Setup\n",
|
| 333 |
"splits = ['train', 'val', 'test']\n",
|
| 334 |
"label_feature = ds['train'].features['label']\n",
|
| 335 |
+
"int2str = label_feature.int2str \n",
|
| 336 |
"\n",
|
| 337 |
"print(f\"{'SPLIT':<10} {'CLASS NAME':<25} {'COUNT':<10} {'STATUS'}\")\n",
|
| 338 |
"print(\"-\" * 60)\n",
|
| 339 |
"\n",
|
| 340 |
"for split in splits:\n",
|
| 341 |
+
" # Get all labels (Load only the label column into memory)\n",
|
| 342 |
" # This is instant compared to loading images\n",
|
| 343 |
" labels = ds[split]['label']\n",
|
| 344 |
" \n",
|
| 345 |
+
" # Count frequencies\n",
|
| 346 |
" counts = Counter(labels)\n",
|
| 347 |
" \n",
|
| 348 |
+
" # Analyze each class\n",
|
| 349 |
" # We sort by class ID to keep it organized\n",
|
| 350 |
" for label_id in sorted(counts.keys()):\n",
|
| 351 |
" count = counts[label_id]\n",
|
| 352 |
" class_name = int2str(label_id)\n",
|
| 353 |
" \n",
|
| 354 |
+
" # Define Expected Counts based on the Paper\n",
|
| 355 |
" # Train: 320k / 16 = 20,000\n",
|
| 356 |
" # Test/Val: 40k / 16 = 2,500\n",
|
| 357 |
" if split == 'train':\n",
|
|
|
|
| 371 |
"id": "5f7b75a2",
|
| 372 |
"metadata": {},
|
| 373 |
"source": [
|
| 374 |
+
"## Checking the data imbalance in \"rvl_cdip_data\" dir"
|
| 375 |
]
|
| 376 |
},
|
| 377 |
{
|
| 378 |
"cell_type": "code",
|
| 379 |
+
"execution_count": 9,
|
| 380 |
"id": "059bfaa5",
|
| 381 |
"metadata": {},
|
| 382 |
"outputs": [
|
|
|
|
| 384 |
"name": "stdout",
|
| 385 |
"output_type": "stream",
|
| 386 |
"text": [
|
| 387 |
+
"📂 Scanning directory: /Users/arpit-zstch1557/Projects/document-classification/rvl_cdip_data\n",
|
| 388 |
"SPLIT CLASS NAME FILES STATUS\n",
|
| 389 |
"-----------------------------------------------------------------\n",
|
| 390 |
"TRAIN advertisement 19963 ❌ MISMATCH (Exp: 20000)\n",
|
|
|
|
| 448 |
"import pandas as pd\n",
|
| 449 |
"\n",
|
| 450 |
"# Configuration\n",
|
| 451 |
+
"DATA_DIR = \"rvl_cdip_data\" # Your directory name\n",
|
| 452 |
"splits = ['train', 'val', 'test']\n",
|
| 453 |
"\n",
|
| 454 |
"print(f\"📂 Scanning directory: {os.path.abspath(DATA_DIR)}\")\n",
|
|
|
|
| 485 |
" # Determine Expected Count based on the paper\n",
|
| 486 |
" if split == 'train':\n",
|
| 487 |
" expected = 20000 \n",
|
|
|
|
| 488 |
" else:\n",
|
| 489 |
" expected = 2500\n",
|
| 490 |
"\n",
|
| 491 |
" # Status Check\n",
|
|
|
|
| 492 |
" if file_count == expected:\n",
|
| 493 |
" status = \"✅ OK\"\n",
|
| 494 |
" elif abs(file_count - expected) < 5: \n",
|
|
|
|
| 504 |
"print(\"\\nAnalysis Complete.\")"
|
| 505 |
]
|
| 506 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 507 |
{
|
| 508 |
"cell_type": "code",
|
| 509 |
"execution_count": null,
|
| 510 |
+
"id": "4ef697a2",
|
| 511 |
"metadata": {},
|
| 512 |
"outputs": [],
|
| 513 |
"source": []
|
|
|
|
| 529 |
"name": "python",
|
| 530 |
"nbconvert_exporter": "python",
|
| 531 |
"pygments_lexer": "ipython3",
|
| 532 |
+
"version": "3.13.11"
|
| 533 |
}
|
| 534 |
},
|
| 535 |
"nbformat": 4,
|